package EPrints::Plugin::Sword::Import::RJ_Broker;

use strict;

use XML::LibXML 1.63;

#use Data::Dumper;
use EPrints::Plugin::Sword::Import;
our @ISA = qw/ EPrints::Plugin::Sword::Import /;

# MIME types this importer accepts.
our %SUPPORTED_MIME_TYPES = ( "application/zip" => 1, );

# Unpacker plugin to use for each accepted MIME type.
our %UNPACK_MIME_TYPES = ( "application/zip" => "Sword::Unpack::Sub_Zip", );

# Constructor: builds the parent plugin object, then sets the display name,
# the visibility and the kind of object this importer produces.
sub new {
    my ( $class, %params ) = @_;

    my $self = $class->SUPER::new(%params);
    $self->{name} =
      "SWORD Importer - Repository Junction Broker (Default EPrints)";

    # Make it visible on the import menu and elsewhere
    $self->{visible} = "all";
    $self->{produce} = ['dataobj/eprint'];
    return $self;
} ## end sub new

# Import one SWORD deposit (a .zip holding a METS manifest plus files).
#
# Options read from %opts:
#   file         - path of the deposited package
#   mime_type    - MIME type of the package
#   dataset_id   - id of the target dataset (also stored as eprint_status)
#   owner_id     - stored as the eprint's userid
#   depositor_id - stored as sword_depositor when defined
#   no_op        - when true, parse and validate only; nothing is created
#
# Returns the created EPrint object. On failure (and in no-op mode) it
# returns nothing/undef after recording a status code and a verbose message
# on the plugin.
sub input_file {
    my ( $plugin, %opts ) = @_;

    my $session = $plugin->{session};
    my $mime    = $opts{mime_type};
    my $file    = $opts{file};
    my $NO_OP   = $opts{no_op};

    my $log_dir = $session->config('archiveroot');
    my $LOGDIR  = "$log_dir/var";
    my $logging = "|Broker_Sword_Import";

    # barf unless we have been given a .zip file
    unless ( defined $mime && defined $SUPPORTED_MIME_TYPES{$mime} ) {
        $plugin->add_verbose("[ERROR] unknown MIME TYPE '$mime'.");
        $plugin->set_status_code(415);
        return undef;
    }

    my $unpacker = $UNPACK_MIME_TYPES{$mime};

    # Unpack the archive and work out which XML file is the METS manifest.
    # $tmp_dir must stay in scope until the import has finished: it is the
    # directory the .zip file has been unpacked into.
    my $tmp_dir = EPrints::TempDir->new( "swordXXXX", UNLINK => 1 );
    if ( !defined $tmp_dir ) {
        print STDERR
"\n[SWORD-DEPOSIT] [INTERNAL-ERROR] Failed to create the temp directory!";
        $plugin->add_verbose("[ERROR] failed to create the temp directory.");
        $plugin->set_status_code(500);
        return undef;
    }

    my $files = $plugin->unpack_files( $unpacker, $file, $tmp_dir );
    unless ( defined $files ) {
        $plugin->add_verbose("[ERROR] failed to unpack the files");
        return undef;
    }

    my $candidates = $plugin->get_files_to_import( $files, "text/xml" );
    unless ( defined $candidates && scalar(@$candidates) ) {
        $plugin->add_verbose("[ERROR] could not find the XML file");
        $plugin->set_status_code(400);
        return undef;
    }

    # $file now becomes the path & filename of the mets.xml file to import.
    $file = _choose_mets_file( $plugin, $candidates );

    my $dataset_id   = $opts{dataset_id};
    my $owner_id     = $opts{owner_id};
    my $depositor_id = $opts{depositor_id};

    # Three-argument open: the name comes from an unpacked archive, so it
    # must never be interpreted as an open mode or a command.
    my $fh;
    if ( !open( $fh, '<', $file ) ) {
        $plugin->add_verbose(
            "[ERROR] couldnt open the file: '$file' because '$!'");
        $plugin->set_status_code(500);
        return;
    }

    # Sometimes we end up with an extra directory in the stack (as in
    # $tmp_dir/content/stuff), so split the manifest path into the
    # directory it lives in ($unpack_dir) and its bare name ($fntmp).
    my $unpack_dir;
    my $fntmp = $file;
    if ( $fntmp =~ /^(.*)\/(.*)$/ ) {
        $unpack_dir = $1;
        $fntmp      = $2;
    }

    # Finally, we can read the xml from the file
    my $xml = do { local $/; <$fh> };
    close $fh;

    # $dataset is the EPrints object for where we're putting the deposit.
    # Barf if it cannot be found (this shouldn't happen through SWORD).
    my $dataset = $session->get_archive()->get_dataset($dataset_id);
    if ( !defined $dataset ) {
        print STDERR
"\n[SWORD-METS] [INTERNAL-ERROR] Failed to open the dataset '$dataset_id'.";
        $plugin->add_verbose(
            "[INTERNAL ERROR] failed to open the dataset '$dataset_id'");
        $plugin->set_status_code(500);
        return;
    }

    # Convert the xml text to a DOM object. EPrints only checks that the
    # document is well-formed (it does not validate); the parser throws
    # when it is not, hence the eval.
    my $dom_doc;
    eval { $dom_doc = EPrints::XML::parse_xml_string($xml); };
    if ( $@ || !defined $dom_doc ) {
        $plugin->add_verbose("[ERROR] failed to parse the xml: '$@'");
        $plugin->set_status_code(400);
        return;
    }

    # If the top element is not mets, barf: this is just in case someone
    # sends us a non-METS xml file.
    my $dom_top = $dom_doc->getDocumentElement;
    if ( lc $dom_top->tagName ne 'mets' ) {
        $plugin->set_status_code(400);
        $plugin->add_verbose(
            "[ERROR] failed to parse the xml: no <mets> tag found.");
        return;
    }

    #####
    # Now we dig down to get the xmldata from the METS dmdSec section
    #####

    # METS can have more than one dmdSec, and we can't assume that what's
    # pushed in conforms to what we want, so we look for the section that
    # is formally identified as containing epdcx.
    # (It doesn't guarantee anything, but it's a start!)
    my $dmd_id;
    my $md_wrap;
    my $found_wrapper = 0;
    foreach my $dmd_sec ( $dom_top->getElementsByTagName("dmdSec") ) {
        $md_wrap = ( $dmd_sec->getElementsByTagName("mdWrap") )[0];
        next if ( !defined $md_wrap );

        my $md_type    = $md_wrap->getAttribute("MDTYPE");
        my $other_type = $md_wrap->getAttribute("OTHERMDTYPE");
        next unless defined $md_type    && lc $md_type eq 'other';
        next unless defined $other_type && lc $other_type eq 'epdcx';

        $found_wrapper = 1;
        $dmd_id        = $dmd_sec->getAttribute("ID");
        last;
    }

    unless ($found_wrapper) {
        $plugin->set_status_code(400);
        $plugin->add_verbose(
            "[ERROR] failed to parse the xml: could not find epdcx section."
        );
        return;
    }

    #########
    # Now we get the (hopefully epdcx) xml from the xmlData section
    #########
    my $xml_data = ( $md_wrap->getElementsByTagName("xmlData") )[0];
    if ( !defined $xml_data ) {
        $plugin->set_status_code(400);
        $plugin->add_verbose(
            "[ERROR] failed to parse the xml: no <xmlData> tag found.");
        return;
    }

    ########
    # and parse it into a hash
    ########
    my $epdata = $plugin->parse_epdcx_xml_data($xml_data);
    return unless ( defined $epdata );

    # Add some useful (SWORD) info
    $epdata->{userid}          = $owner_id;
    $epdata->{sword_depositor} = $depositor_id if defined $depositor_id;
    $epdata->{eprint_status}   = $dataset_id;

    # We now want to deal with the associated files (if any).
    # This is not a straight "every file is a document" import: an EPrints
    # "document" can have multiple files. The Broker deals with this by
    # defining the structure in <structMap>, and putting each document
    # into a <div>. If there are multiple files, the main file is the
    # first file in the set.
    # The first thing to do is create a list of all the files & their IDs.
    my %files;
    foreach my $file_sec ( $dom_top->getElementsByTagName("fileSec") ) {

        # the <fileGrp> tag is optional; when present only the first
        # group is read
        my $file_grp = ( $file_sec->getElementsByTagName("fileGrp") )[0];
        my $container = defined $file_grp ? $file_grp : $file_sec;

        foreach my $file_div ( $container->getElementsByTagName("file") ) {
            my $file_id = $file_div->getAttribute("ID");
            next unless defined $file_id;

            my $file_loc = ( $file_div->getElementsByTagName("FLocat") )[0];
            next unless defined $file_loc;

            my $fn = $file_loc->getAttribute("href");

            # to accommodate the gdome XML library:
            $fn = $file_loc->getAttribute("xlink:href") unless defined $fn;
            next unless defined $fn;

            $files{$file_id} = $fn;    # list files to get locally
        }
    }

    unless ( scalar keys %files ) {
        $plugin->add_verbose(
            "[WARNING] no <fileSec> tag found: no files will be imported.");
    }

    # We also need to hang onto details about where the importer unpacked
    # stuff. It has nothing to do with files in the METS manifest, but
    # this hash is the one handed to process_struct_elements.
    $files{unpack_dir} = $unpack_dir;
    $files{fntmp}      = $fntmp;

    # If the SWORD connection has specified NO-OP, then don't actually go
    # any further. We need to send 200 Successful (the deposit handler
    # will generate the XML response).
    if ($NO_OP) {
        $plugin->add_verbose(
            "[OK] Plugin - import successful (but in No-Op mode).");
        $plugin->set_status_code(200);
        return;
    }

    my $eprint = $dataset->create_object( $plugin->{session}, $epdata );
    unless ( defined $eprint ) {
        $plugin->set_status_code(500);
        $plugin->add_verbose("[ERROR] failed to create the EPrint object.");
        return;
    }
    $logging .= "|" . $eprint->get_id;

    # Having created a base EPrint, and determined what files we have to
    # add to it, we need to create a "map" of what files go where from the
    # <structMap> element. $dmd_id contains the ID for the div we're
    # looking for.
    #
    # A complication: the embargo information is kept in the amdSec, so we
    # need to find any embargo dates there. The amdSec is optional, so
    # only parse it when it is present.
    my $amdSec = ( $dom_top->getElementsByTagName("amdSec") )[0];
    if ( defined $amdSec ) {
        my $rights_xml = ( $amdSec->getElementsByTagName("xmlData") )[0];
        $files{'embargo_hash'} = $plugin->parse_rights_data($rights_xml)
          if $rights_xml;
    }

    my ( $embargo, $security );
    foreach my $structMap ( $dom_top->getElementsByTagName("structMap") ) {

        # Find the div whose DMDID matches $dmd_id.
        # NOTE(review): the original comment asks for the first matching
        # div, but the loop has always kept the LAST match; that behaviour
        # is preserved here - confirm which one is intended.
        my $div;
        foreach my $d ( $structMap->getElementsByTagName("div") ) {
            next unless $d->hasAttribute('DMDID');
            my $v = $d->getAttribute('DMDID');
            next unless defined $v && defined $dmd_id && $v eq $dmd_id;
            $div = $d;
        }
        next unless $div;

        my $doc_data_ref = {};
        my @nodes        = $div->getElementsByTagName("div");

        # As we need the embargo details later on, we have our own
        # process_struct_elements routine that adds embargos and returns
        # the embargo date and the security restriction.
        my $document;
        ( $document, $embargo, $security ) =
          process_struct_elements( \@nodes, \%files, $doc_data_ref, $eprint,
            $session );
        if ( !defined $document ) {
            $plugin->add_verbose(
                "[WARNING] Failed to create Document object.");
        }
    }

    if ( $plugin->keep_deposited_file() ) {
        if (
            $plugin->attach_deposited_file(
                $eprint, $opts{file}, $opts{mime_type}, $embargo, $security
            )
          )
        {
            $plugin->add_verbose("[OK] attached deposited file.");
        }
        else {
            $plugin->add_verbose(
                "[WARNING] failed to attach the deposited file.");
        }
    }

    $plugin->add_verbose("[OK] EPrint object created.");

    # Keep a track of the logging. Only write when the log could actually
    # be opened.
    my $log_string = scalar(CORE::localtime) . "$logging";
    if ( open( my $log_fh, '>>', "$LOGDIR/transfer_log" ) ) {
        print {$log_fh} "$log_string\n";
        close($log_fh);
    }
    else {
        warn("could not open log file $LOGDIR/transfer_log");
    }

    # and let's put it in the error_log too since it's dead useful
    $plugin->{session}->log("logging|$log_string");

    return $eprint;
} ## end sub input_file

# Pick the METS manifest out of the XML files found in the archive.
# $candidates is a non-empty array ref of file names; it is not modified.
# A verbose warning is recorded on $plugin whenever a choice had to be made.
sub _choose_mets_file {
    my ( $plugin, $candidates ) = @_;

    return $candidates->[0] if scalar(@$candidates) == 1;

    # We have more than one possible xml file. Take 'mets.xml' if it exists.
    my @mets = grep { /\bmets\.xml$/ } @$candidates;

    if ( scalar(@mets) == 1 ) {
        $plugin->add_verbose(
"[WARNING] there were more than one XML file in this archive, however I'm using the one called 'mets.xml'"
        );
        return $mets[0];
    }

    if ( scalar(@mets) > 1 ) {

        # Shortest path first. The lengths must be compared numerically:
        # compared as strings, "10" sorts before "9".
        my @by_length = sort { length($a) <=> length($b) } @mets;
        $plugin->add_verbose(
"[WARNING] there were more than one mets.XML file in this archive, however I'm using the one closest to the top of the 'tree'"
        );
        return $by_length[0];
    }

    # NOTE(review): the message below says the name closest to 'z' is
    # used, but the first element of an ascending sort is the one closest
    # to 'A'. Behaviour and message are both kept as they were - confirm
    # which is intended.
    my @sorted = sort @$candidates;
    $plugin->add_verbose(
"[WARNING] there were more than one XML file in this archive. I am using the one with the 'highest' name, alphabetic-wise (closer to 'z' than 'A')"
    );
    return $sorted[0];
} ## end sub _choose_mets_file

# Work out the document file name and the import URL for one file location
# taken from the METS manifest.
#   $file       - the location as written in the manifest
#   $unpack_dir - directory the archive was unpacked into
# Returns ( $filename, $url ), or an empty list when $file is undefined or
# empty. http(s) locations are passed through untouched; anything else is
# turned into a file:// URL below $unpack_dir.
sub _resolve_file_location {
    my ( $file, $unpack_dir ) = @_;

    return unless defined $file && length $file;

    # The file name is everything after the last slash, or the whole
    # location when it contains no slash at all.
    my ($filename) = $file =~ m{/([^/]+)$};
    $filename = $file unless defined $filename;

    my $url = $file;
    unless ( $url =~ m{^https?://}i ) {
        my $base = defined $unpack_dir ? $unpack_dir : '';
        $url = "file://$base/$url";
    }

    return ( $filename, $url );
} ## end sub _resolve_file_location

# This is where we actually track & build the stuff that adds the files.
# It also sets any embargo period.
#
#   $nodes_ref     - array ref of structMap <div> elements to walk
#   $files_hashref - FILEID => location map, plus the 'unpack_dir' and
#                    'embargo_hash' entries added by input_file
#   $doc_data_ref  - accepted for compatibility; not used
#   $eprint        - the EPrint the documents belong to
#   $session       - the current session
#
# One document is created for every element that carries at least one
# resolvable <fptr>. As the embargo details are needed later on, it returns
# ( $document, $embargo, $security ): the last document created, the
# embargo date and the security restriction.
sub process_struct_elements {
    my ( $nodes_ref, $files_hashref, $doc_data_ref, $eprint, $session ) = @_;

    my $document;
    my $unpack_dir   = $files_hashref->{unpack_dir};
    my $embargo_hash = $files_hashref->{embargo_hash};

    # NOTE(review): $embargo is deliberately declared outside the loop, as
    # before, so a date found on one element is also applied to every
    # later element and is what gets returned - confirm this is intended.
    my $embargo;
    my $security = 'staffonly';

    # Go through all the children
    foreach my $element ( @{$nodes_ref} ) {
        my $ID = $element->getAttribute("ID");
        if (   defined $ID
            && ref $embargo_hash eq 'HASH'
            && exists $embargo_hash->{$ID} )
        {
            $embargo = $embargo_hash->{$ID}->{'date'};
        }

        my @fptrs = $element->getElementsByTagName("fptr");
        next unless scalar @fptrs;

        my $doc_data = {};
        foreach my $fptr (@fptrs) {
            my $fileid = $fptr->getAttribute("FILEID");
            next unless defined $fileid;

            # Skip pointers to files the manifest never declared rather
            # than importing a bogus location.
            my ( $filename, $url ) =
              _resolve_file_location( $files_hashref->{$fileid},
                $unpack_dir );
            next unless defined $filename;

            $doc_data->{_parent} = $eprint
              unless defined $doc_data->{_parent};
            $doc_data->{eprintid} = $eprint->get_id
              unless defined $doc_data->{eprintid};

            # the first file of the set is the main file
            $doc_data->{main} = $filename unless $doc_data->{main};
            $doc_data->{format} =
              $session->get_repository->call( 'guess_doc_type', $session,
                $filename )
              unless exists $doc_data->{format};

            if ($embargo) {
                $doc_data->{date_embargo} = $embargo;
                $doc_data->{security}     = $security;
            }

            push @{ $doc_data->{files} },
              { filename => $filename, url => $url };
        }

        # nothing usable was referenced: do not create an empty document
        next unless $doc_data->{files};

        my $doc_dataset = $session->get_repository->get_dataset("document");

        # allow the document to pull its files from URLs / local paths for
        # the duration of this iteration only
        local $session->get_repository->{config}->{enable_web_imports}  = 1;
        local $session->get_repository->{config}->{enable_file_imports} = 1;

        $document =
          EPrints::DataObj::Document->create_from_data( $session, $doc_data,
            $doc_dataset );
    }

    return ( $document, $embargo, $security );
} ## end sub process_struct_elements

# Build the data hash for a single-file document.
#   $eprint     - the EPrint the document belongs to
#   $hash_ref   - hash ref with 'file' (location) and 'session' entries
#   $unpack_dir - directory the archive was unpacked into
# Returns a hash ref for the document, or undef when no file name can be
# derived.
sub add_file {
    my ( $eprint, $hash_ref, $unpack_dir ) = @_;

    # If the file is not a web reference, 'file://' and the unpack path
    # are added to the location.
    my ( $filename, $url ) =
      _resolve_file_location( $hash_ref->{'file'}, $unpack_dir );
    return undef unless $filename;

    # NOTE(review): the original ran `s# #\ #g` on the file name here,
    # commented as replacing spaces with '\ '; in a substitution the
    # replacement '\ ' is just a space, so it never changed anything and
    # has been dropped.

    my $session = $hash_ref->{'session'};

    my %file_data = ( filename => $filename, url => "$url" );

    return {
        _parent  => $eprint,
        eprintid => $eprint->get_id,
        main     => $filename,
        format   => $session->get_repository->call(
            'guess_doc_type', $session, $filename
        ),
        files => [ \%file_data ],
    };
} ## end sub add_file

# The deposited package itself is always kept.
sub keep_deposited_file {
    return 1;
}

# Parse the epdcx <descriptionSet> found below $xml into an epdata hash.
# Each description is handed to the parser that matches its
# epdcx:resourceId prefix. Returns the hash ref, or nothing (after setting
# status 400) when there is no descriptionSet.
sub parse_epdcx_xml_data {
    my ( $plugin, $xml ) = @_;

    my $set = ( $xml->getElementsByTagName("descriptionSet") )[0];
    unless ( defined $set ) {
        $plugin->set_status_code(400);
        $plugin->add_verbose("ERROR: no <descriptionSet> tag found.");
        return;
    }

    my $epdata = {};

    foreach my $desc ( $set->getElementsByTagName("description") ) {
        my $att = $desc->getAttribute("epdcx:resourceId");
        next unless defined $att;

        if ( $att =~ m#^sword-mets-epdcx# ) {

            # title; abstract; identifier (publisherID);
            # creator; affiliated institution (from authors)
            # Note: we need to pass in the full $set so we can find
            # creator info
            $plugin->_parse_epdcx( $desc, $epdata, $set );
        }
        elsif ( $att =~ m#^sword-mets-expr# ) {

            # type; doi; date (yyyy-mm-dd); status; publisher;
            # copyright_holder;
            $plugin->_parse_expr( $desc, $epdata );
        }
        elsif ( $att =~ m#^sword-mets-manif# ) {

            # publication; issn; isbn; volume; issue; pages;
            $plugin->_parse_manif( $desc, $epdata, $set );
        }
    }

    return $epdata;
} ## end sub parse_epdcx_xml_data

# Return the <description> elements of $set whose epdcx:resourceId equals
# $resource_id.
sub _descriptions_for {
    my ( $set, $resource_id ) = @_;

    return grep {
        my $id = $_->getAttribute('epdcx:resourceId');
        defined $id && $id eq $resource_id
    } $set->getElementsByTagName("description");
}

# Add one creator to $epdata. $value is "name<TAB>resourceId" as returned
# by _parse_statement for a creator statement.
sub _add_creator {
    my ( $epdata, $set, $value ) = @_;

    # For creators, we have two sources of information:
    # 1) the name string here in the 'work' section
    # 2) an extended record the broker stores in a separate description
    #    whose resourceId matches the reference carried in $value
    my ( $name, $attr ) = split /\t/, ( defined $value ? $value : '' );
    my ( $family, $given, $email );

    # can we get the information from the Person description?
    if ($attr) {
        foreach my $d ( _descriptions_for( $set, $attr ) ) {
            foreach my $stat2 ( $d->getElementsByTagName("statement") ) {
                my ( $field2, $value2 ) = _parse_statement($stat2);
                next unless defined $field2;

                if    ( $field2 eq 'givenname' )  { $given  = $value2; }
                elsif ( $field2 eq 'familyname' ) { $family = $value2; }
                elsif ( $field2 eq 'email' )      { $email  = $value2; }
            }
        }
    }

    # if not, deduce it from the text here
    unless ($family) {
        $name = '' unless defined $name;

        # _parse_statement may hand the name back still wrapped in its
        # serialised <valueString> element; strip any markup first.
        $name =~ s/<[^>]*>//g;

        # "Family, Given": everything before the first comma is the
        # family name, so multi-word surnames stay intact.
        if ( $name =~ /^\s*([^,]+?)\s*,\s*(.*)$/ ) {
            $family = $1;
            $given  = $2;
        }
        else {
            $family = $name;
        }
    }

    # now add the information to the eprint
    # NOTE: eprints uses the index into arrays to synchronise names, email
    # addresses and other fields, so we have to add something, even if
    # it's NULL
    push @{ $epdata->{creators_name} },
      { family => $family, given => $given };
    push @{ $epdata->{creators_id} }, $email;
    return;
} ## end sub _add_creator

# Add one funder to $epdata->{funders}. $value is the resourceId of the
# funder's description.
# In EPrints, there is a "Funders" field, but no grant field. To note grant
# codes we bodge the string:
#   <funder>: <grant>; <grant>; ...
# which keeps it human readable and computer-parseable.
sub _add_funder {
    my ( $epdata, $set, $value ) = @_;

    return unless $value;

    my $funder = $value;
    $funder =~ s/^funder //;

    my @grants;
    foreach my $d ( _descriptions_for( $set, $value ) ) {
        foreach my $stat2 ( $d->getElementsByTagName("statement") ) {
            my ( $field2, $value2 ) = _parse_statement($stat2);
            next unless defined $field2 && $field2 eq 'grant_code';
            push @grants, $value2 if defined $value2;
        }
    }

    # appended once, after every matching description has been read
    $funder .= ': ' . join( '; ', @grants ) if scalar @grants;

    push @{ $epdata->{'funders'} }, $funder;
    return;
} ## end sub _add_funder

# Parser for the "work" level
# item type; title; abstract; identifier (publisherID);
# creator; affiliated institution (from authors)
# Funder (a ';' separated list); Grant codes (also a ';' separated list)
# epdata is passed in, therefore the routine modifies the actual hash
sub _parse_epdcx {
    my ( $plugin, $desc, $epdata, $set ) = @_;

    foreach my $stat ( $desc->getElementsByTagName("statement") ) {
        my ( $field, $value ) = _parse_statement($stat);
        next unless defined $field;

        if ( $field eq 'creator' ) {
            _add_creator( $epdata, $set, $value );
        }
        elsif ( $field eq 'title' ) {
            $epdata->{'title'} = $value;
        }
        elsif ( $field eq 'abstract' ) {
            $epdata->{'abstract'} = $value;
        }
        elsif ( $field eq 'id_number' || $field eq 'identifier' ) {

            # _parse_statement reports dc:identifier as 'id_number'
            $epdata->{'id_number'} = $value;
        }
        elsif ( $field eq 'funders' ) {
            _add_funder( $epdata, $set, $value );
        }
        elsif ( $field eq 'institution' || $field eq 'affiliatedInstitution' )
        {

            # _parse_statement reports affiliatedInstitution as
            # 'institution'. This becomes a pseudo-list (as EPrints
            # doesn't have this as a multiple field).
            if ( $epdata->{'institution'} ) {
                $value = join '; ', split( /; /, $epdata->{'institution'} ),
                  $value;
            }
            $epdata->{'institution'} = $value;
        }
    }
    return;
} ## end sub _parse_epdcx

# Parser for the "expression" level
# genre, citation, doi, status, publisher, editor (provenance for
# opendepot), date published
# epdata is passed in, therefore the routine modifies the actual hash
sub _parse_expr {
    my ( $plugin, $desc, $epdata ) = @_;

    foreach my $stat ( $desc->getElementsByTagName("statement") ) {
        my ( $field, $value ) = _parse_statement($stat);
        next unless defined $field;

        if ( $field eq 'type' ) {

            # This gets used once we have built the full record...
            $epdata->{'type'} = $value;
        }
        elsif ( $field eq 'date' ) {

            # date (published)
            $epdata->{ispublished} = 'pub';
            $epdata->{date}        = $value;
            $epdata->{date_type}   = 'published';
        }
        elsif ( $field eq 'identifier' ) {

            # NOTE(review): _parse_statement reports dc:identifier as
            # 'id_number', so this branch does not currently fire, and it
            # writes to an 'identifier' key - confirm the intended field
            # before enabling it.
            #
            # This could be either a local identifier, a local identifier
            # code, or a URI into the broker. We definitely don't want the
            # latter. Of the first two, take a doi in preference to a
            # simple identifier.
            next if $value =~ /edina\.ac\.uk/;    # skip edina references
            my $current = $epdata->{identifier};
            next if defined $current && $current =~ /doi\.org/;
            $epdata->{identifier} = $value;
        }
        elsif ( $field eq 'refereed' ) {

            # peer reviewed
            $epdata->{'refereed'} = $value;
        }
        elsif ( $field eq 'publisher' ) {

            # we need to come back to this: for a thesis, the broker
            # understands that theses are "institution; department"
            # whereas everything else is just "publisher".
            # stuff it in publisher for now, and sort it later
            $epdata->{'publisher'} = $value;
        }
        elsif ( $field eq 'provenance' ) {
            $epdata->{'provenance'} = $value;    # copyright_holder?
        }
    }
    return;
} ## end sub _parse_expr

# Decode a citation of the form
#   $publication [$vol[($num)]][, $pagerange|$pagecount]
# and store the parts in $epdata, never overwriting a value that is
# already there.
sub _apply_citation {
    my ( $epdata, $value ) = @_;

    my $cite = clean_text($value);
    my ( $vol, $num, $pps );

    # Start by pulling the page range or number of pages off the end.
    # A bare page count is stripped but not stored.
    if ( $cite =~ s/\, (\d+(?:\-\d+)?)$// ) {
        my $pages = $1;
        $pps = $pages if $pages =~ /\-/;
    }

    # Do we have an issue number in trailing brackets?
    if ( $cite =~ s/\(([^()]+)\)$// ) {
        $num = $1;
    }

    # Get the volume number
    if ( $cite =~ s/\s+(\d+)$// ) {
        $vol = $1;
    }

    my $pub = $cite;

    $epdata->{publication} = $pub
      if ( $pub && not exists $epdata->{publication} );
    $epdata->{volume} = $vol if ( $vol && not exists $epdata->{volume} );
    $epdata->{number} = $num if ( $num && not exists $epdata->{number} );
    $epdata->{pagerange} = $pps
      if ( $pps && not exists $epdata->{pagerange} );
    return;
} ## end sub _apply_citation

# Parser for the "manifest" level
# epdata is passed in, therefore the routine modifies the actual hash
sub _parse_manif {
    my ( $plugin, $desc, $epdata, $set ) = @_;

    foreach my $stat ( $desc->getElementsByTagName("statement") ) {
        my ( $field, $value ) = _parse_statement($stat);
        next unless defined $field;

        if ( $field eq 'cite' ) {
            _apply_citation( $epdata, $value );
        }

        # issn & isbn
        elsif ( $field eq 'issn' ) { $epdata->{'issn'} = $value; }
        elsif ( $field eq 'isbn' ) { $epdata->{'isbn'} = $value; }

        # publication, volume, issue, pages:
        # these trump anything from the citation
        elsif ( $field eq 'publication' ) {
            $epdata->{'publication'} = $value;
        }
        elsif ( $field eq 'volume' ) {
            $epdata->{'volume'} = $value;
        }
        elsif ( $field eq 'issue' ) {

            # NOTE(review): the citation parser stores the issue under
            # 'number' but this stores it under 'issue' - confirm which
            # key the repository expects.
            $epdata->{'issue'} = $value;
        }
        elsif ( $field eq 'pagerange' ) {
            $epdata->{'pagerange'} = $value;
        }
        elsif ( $field eq 'pages' ) {

            # Pages has to be a numeric value!
            $epdata->{'pages'} = $value + 0;
        }

        # Available URLS (DOI, official_url, related urls), e.g.
        # available-doi-1; available-official_url-1;
        # available-related_url-4; ...
        elsif ( $field eq 'available_url' ) {
            next unless $value;
            foreach my $d ( _descriptions_for( $set, $value ) ) {
                my $uri = $d->getAttribute('epdcx:resourceUrl');
                next unless $uri;

                if ( $value =~ /official/ ) {
                    $epdata->{'official_url'} = $uri;
                }
                elsif ( $value =~ /doi/ ) {
                    $epdata->{'id_number'} = $uri;
                }
                else {
                    push @{ $epdata->{'related_url'} },
                      { url => $uri, type => 'pub' };
                }
            }
        }
    }
    return;
} ## end sub _parse_manif
# Parser for the unknown level
# This is where we try to find the author/creator information.
# Only descriptions carrying a resourceId are parsed (creators have them);
# what is found is stored under $epdata->{unknown}->{<resourceId>}.
# epdata is passed in, therefore the routine modifies the actual hash
sub _parse_unknown {
    my ( $plugin, $desc, $epdata ) = @_;

    $epdata->{unknown} = {} unless exists $epdata->{unknown};

    my $resourceId = $desc->getAttribute("resourceId");
    $resourceId = $desc->getAttribute("epdcx:resourceId")
      unless defined $resourceId;

    # we only parse if we have a resourceId: creators have them
    return unless $resourceId;

    my $record = $epdata->{unknown}->{$resourceId} = {};
    foreach my $stat ( $desc->getElementsByTagName("statement") ) {
        my ( $field, $value ) = _parse_statement($stat);
        next unless defined $field;

        # The given name belongs in this record (it used to be written
        # into the eprint's 'pages' field).
        $record->{givenname} = $value if $field eq 'givenname';

        #familyname
        #institution
        #email
    }
    return;
} ## end sub _parse_unknown

# Ordered rules mapping a statement's property URI to the way its value is
# read. Each rule is [ URI, kind, field ]; the first rule whose URI occurs
# in the property wins. Kinds:
#   type    - mapped through _get_type
#   clean   - text of <valueString>, whitespace-normalised
#   text    - text of <valueString>, as is
#   creator - name text plus the epdcx:valueRef of the person record
#   status  - peer-review status taken from the value URI
#   pages   - page range or page count
#   ref     - the epdcx:valueRef attribute
#   grant   - epdcx:valueRef if present, otherwise <valueString> text
#   access  - last path segment of epdcx:valueRef
#   skip    - recognised but not parsed
my @STATEMENT_RULES = (
    [ 'http://purl.org/dc/elements/1.1/type',       'type' ],
    [ 'http://purl.org/dc/elements/1.1/title',      'clean', 'title' ],
    [ 'http://purl.org/dc/terms/abstract',          'clean', 'abstract' ],
    [ 'http://purl.org/dc/elements/1.1/creator',    'creator' ],
    [ 'http://purl.org/dc/elements/1.1/identifier', 'clean', 'id_number' ],
    [ 'http://purl.org/eprint/terms/status',        'status' ],
    [ 'http://purl.org/dc/elements/1.1/language',   'skip' ],
    [ 'http://purl.org/dc/terms/available',         'clean', 'date' ],
    [ 'http://purl.org/eprint/terms/copyrightHolder',   'skip' ],
    [ 'http://purl.org/dc/terms/bibliographicCitation', 'text', 'cite' ],
    [ 'http://purl.org/dc/elements/1.1/publisher', 'text', 'publisher' ],

    # This lot are OA-RJ extensions to cover blatantly missing fields
    [
        'http://opendepot.org/broker/elements/1.0/publication', 'text',
        'publication'
    ],
    [ 'http://opendepot.org/broker/elements/1.0/issn',   'text', 'issn' ],
    [ 'http://opendepot.org/broker/elements/1.0/isbn',   'text', 'isbn' ],
    [ 'http://opendepot.org/broker/elements/1.0/volume', 'text', 'volume' ],
    [ 'http://opendepot.org/broker/elements/1.0/issue',  'text', 'issue' ],
    [ 'http://opendepot.org/broker/elements/1.0/pages',  'pages' ],

    ## new record metadata
    [ 'http://purl.org/dc/elements/1.1/givenname',  'text', 'givenname' ],
    [ 'http://purl.org/dc/elements/1.1/familyname', 'text', 'familyname' ],
    [ 'http://xmlns.com/foaf/0.1/mbox',             'text', 'email' ],
    [
        'http://purl.org/eprint/terms/affiliatedInstitution', 'text',
        'institution'
    ],
    [ 'http://www.loc.gov/loc.terms/relators/EDT', 'text', 'provenance' ],
    [ 'http://www.loc.gov/loc.terms/relators/FND', 'ref',  'funders' ],
    [ 'http://purl.org/eprint/terms/grantNumber',  'grant' ],

    ## Alternate sources
    [ 'http://purl.org/eprint/terms/isAvailableAs', 'ref', 'available_url' ],

    ## Data for embargoes
    [ 'http://purl.org/dc/terms/accessRights', 'access' ],
);

# Return the serialised content of the statement's first <valueString>, or
# undef when the statement has none.
sub _statement_text {
    my ($stat) = @_;

    my $node = ( $stat->getElementsByTagName("valueString") )[0];
    return undef unless defined $node;
    return EPrints::XML::to_string( EPrints::XML::contents_of($node) );
}

# Return the statement's value URI, trying the bare attribute name first
# and the epdcx-prefixed one second (the same fallback that is used for
# the property URI).
sub _value_uri {
    my ($stat) = @_;

    my $uri = $stat->getAttribute("valueURI");
    $uri = $stat->getAttribute("epdcx:valueURI") unless defined $uri;
    return $uri;
}

# Translate one epdcx <statement> into a ( field, value ) pair.
# Returns an empty list when the property is unknown, deliberately not
# parsed, or its value is missing.
sub _parse_statement {
    my ($stat) = @_;

    my $property = $stat->getAttribute("propertyURI");
    $property = $stat->getAttribute("epdcx:propertyURI")
      unless defined $property;
    return unless defined $property;

    my ( $kind, $field );
    foreach my $rule (@STATEMENT_RULES) {
        next unless index( $property, $rule->[0] ) >= 0;
        ( $kind, $field ) = @{$rule}[ 1, 2 ];
        last;
    }
    return unless defined $kind;
    return if $kind eq 'skip';

    return _get_type($stat) if $kind eq 'type';

    if ( $kind eq 'clean' || $kind eq 'text' ) {
        my $text = _statement_text($stat);
        return unless defined $text;
        $text = clean_text($text) if $kind eq 'clean';
        return ( $field, $text );
    }

    if ( $kind eq 'creator' ) {

        # The name is the text of the valueString (not the DOM node
        # itself); the reference to the person's own description is added
        # after a tab so the caller can split the two apart again.
        my $name = _statement_text($stat);
        $name = defined $name ? clean_text($name) : '';
        my $ref = $stat->getAttribute("epdcx:valueRef");
        return ( "creator", $name . "\t" . ( defined $ref ? $ref : '' ) );
    }

    if ( $kind eq 'status' ) {
        my $uri = _value_uri($stat);
        return unless defined $uri && $uri =~ /status\/(.*)$/;
        my $status = $1;

        # is 'refereed' the proper field?
        return ( 'refereed', 'TRUE' )  if $status eq 'PeerReviewed';
        return ( 'refereed', 'FALSE' ) if $status eq 'NonPeerReviewed';
        return;
    }

    if ( $kind eq 'pages' ) {
        my $text = _statement_text($stat);
        return unless defined $text;

        # a dash means a range of pages rather than a page count
        return ( 'pagerange', $text ) if $text =~ /\-/;
        return ( 'pages', $text );
    }

    if ( $kind eq 'ref' ) {
        my $value = $stat->getAttribute("epdcx:valueRef");
        return ( $field, $value );
    }

    if ( $kind eq 'grant' ) {
        my $value;
        if ( $stat->hasAttribute("epdcx:valueRef") ) {
            $value = $stat->getAttribute("epdcx:valueRef");
        }
        else {
            $value = _statement_text($stat);
        }
        return ( "grant_code", $value );
    }

    if ( $kind eq 'access' ) {

        # keep only what follows the last slash of the reference
        my $value = $stat->getAttribute("epdcx:valueRef");
        if ( defined $value && $value =~ m{([^/]+)$} ) {
            $value = $1;
        }
        return ( "access", $value );
    }

    return;
} ## end sub _parse_statement

# Eprints type vocabulary term => EPrints item type.
# Reference for these mappings is:
# "http://www.ukoln.ac.uk/repositories/digirep/index/Eprints_Type_Vocabulary_Encoding_Scheme"
my %EPRINT_TYPE_FOR = (
    JournalArticle          => 'article',
    JournalItem             => 'article',
    SubmittedJournalArticle => 'article',
    WorkingPaper            => 'article',
    Book                    => 'book',
    BookItem                => 'book_section',
    ConferenceItem          => 'conference_item',
    ConferencePoster        => 'conference_item',
    ConferencePaper         => 'conference_item',
    Patent                  => 'patent',
    Report                  => 'monograph',         # I think?
    Thesis                  => 'thesis',
);

# Map the value URI of a dc:type statement to ( "type", <eprint type> ).
# Returns an empty list when there is no value URI or the type is not one
# we know; the user can still correct that piece of data afterwards.
sub _get_type {
    my ($stat) = @_;

    my $uri = _value_uri($stat);
    return unless defined $uri && $uri =~ /type\/(.*)$/;

    my $type = $EPRINT_TYPE_FOR{$1};
    return unless defined $type;
    return ( "type", $type );
} ## end sub _get_type

# Trim leading and trailing whitespace from every line of $text and join
# the lines with single spaces.
sub clean_text {
    my ($text) = @_;

    my @lines = split( "\n", defined $text ? $text : '' );
    foreach my $line (@lines) {
        $line =~ s/\s+$//;
        $line =~ s/^\s+//;
    }
    return join( " ", @lines );
} ## end sub clean_text

################
#
# We need to replace the default routine so we can apply
# an embargo to the deposited file.
# Attach the deposited package itself to the eprint as a document, applying
# the embargo date and security level when both are given.
#   $eprint   - the EPrint to attach to
#   $file     - path of the deposited package
#   $mime     - its MIME type, stored as the document format
#   $embargo  - embargo date (optional)
#   $security - security level to apply together with the embargo (optional)
# Returns 1 on success, 0 when the document could not be created.
sub attach_deposited_file {
    my ( $self, $eprint, $file, $mime, $embargo, $security ) = @_;

    my $session = $self->{session};

    # the document's file name is the last component of the path
    my $fn = $file;
    if ( $file =~ /^.*\/(.*)$/ ) {
        $fn = $1;
    }

    my %doc_data = (
        eprintid   => $eprint->get_id,
        format     => $mime,
        formatdesc => $session->phrase("Sword/Deposit:document_formatdesc"),
        main       => $fn,
        files      => [ { filename => $fn, url => "file://$file" } ],
        _parent    => $eprint,
    );

    if ( $embargo && $security ) {
        $doc_data{date_embargo} = $embargo;
        $doc_data{security}     = $security;
    }

    # file:// imports are enabled only until this sub returns
    local $session->get_repository->{config}->{enable_file_imports} = 1;

    my $doc_dataset = $session->get_repository->get_dataset("document");
    my $document =
      EPrints::DataObj::Document->create_from_data( $session, \%doc_data,
        $doc_dataset );
    return 0 unless ( defined $document );

    $document->make_thumbnails;
    $eprint->generate_static;
    $self->set_deposited_file_docid( $document->get_id );

    return 1;
} ## end sub attach_deposited_file

# Parse the rights <descriptionSet> found below $xml.
# Returns a hash ref of the form
#   { <epdcx:resourceId> => { <field> => <value>, ... }, ... }
# where the resourceId is the ID that ties the description to the structMap
# record. Returns nothing (after setting status 400) when there is no
# descriptionSet. Descriptions without a resourceId and statements that do
# not parse to a field are skipped.
sub parse_rights_data {
    my ( $plugin, $xml ) = @_;

    my $set = ( $xml->getElementsByTagName("descriptionSet") )[0];
    unless ( defined $set ) {
        $plugin->set_status_code(400);
        $plugin->add_verbose("ERROR: no <descriptionSet> tag found.");
        return;
    }

    my $data = {};

    foreach my $desc ( $set->getElementsByTagName("description") ) {

        # This is the ID that ties the description to the structMap record
        my $att = $desc->getAttribute("epdcx:resourceId");
        next unless defined $att;

        foreach my $stat ( $desc->getElementsByTagName("statement") ) {
            my ( $field, $value ) = _parse_statement($stat);

            # unknown / unparsed statements come back as an empty list
            next unless defined $field;

            $data->{$att}->{$field} = $value;
        }
    }

    return $data;
} ## end sub parse_rights_data

1;