package EPrints::Plugin::Sword::Import::METS; use strict; use EPrints::Plugin::Sword::Import; use EPrints::TempDir; our @ISA = qw/ EPrints::Plugin::Sword::Import /; our %SUPPORTED_MIME_TYPES = ( "application/zip" => 1, ); our %UNPACK_MIME_TYPES = ( "application/zip" => "Sword::Unpack::Zip", ); sub new { my( $class, %params ) = @_; my $self = $class->SUPER::new( %params ); $self->{name} = "SWORD Importer - METS/EPDCX (alpha)"; return $self; } ### $opts{file} = $file; ### $opts{mime_type} = $headers->{content_type}; ### $opts{dataset_id} = $target_collection; ### $opts{owner_id} = $owner->get_id; ### $opts{depositor_id} = $depositor->get_id if(defined $depositor); ### $opts{no_op} = is this a No-op? ### $opts{verbose} = is this verbosed? sub input_file { my ( $plugin, %opts ) = @_; my $session = $plugin->{session}; my $dir = $opts{dir}; my $mime = $opts{mime_type}; my $file = $opts{file}; my $NO_OP = $opts{no_op}; # Let's find the XML file to import: unless( defined $SUPPORTED_MIME_TYPES{$mime} ) { $plugin->add_verbose( "[ERROR] unknown MIME TYPE '$mime'." ); $plugin->set_status_code( 415 ); return undef; } my $unpacker = $UNPACK_MIME_TYPES{$mime}; my $tmp_dir; if( defined $unpacker ) { $tmp_dir = EPrints::TempDir->new( "swordXXXXX", UNLINK => 1 ); if( !defined $tmp_dir ) { print STDERR "\n[SWORD-DEPOSIT] [INTERNAL-ERROR] Failed to create the temp directory!"; $plugin->add_verbose( "[ERROR] failed to create the temp directory." ); $plugin->set_status_code( 500 ); return undef; } my $files = $plugin->unpack_files( $unpacker, $file, $tmp_dir ); unless(defined $files) { $plugin->add_verbose( "[ERROR] failed to unpack the files" ); return undef; } my $candidates = $plugin->get_files_to_import( $files, "text/xml" ); if(scalar(@$candidates) == 0) { $plugin->add_verbose( "[ERROR] could not find the XML file" ); $plugin->set_status_code( 400 ); return undef; } elsif(scalar(@$candidates) > 1) { $plugin->add_verbose( "[WARNING] there were more than one XML file in this archive. I am using the first one" ); } $file = $$candidates[0]; } my $dataset_id = $opts{dataset_id}; my $owner_id = $opts{owner_id}; my $depositor_id = $opts{depositor_id}; my $fh; if( !open( $fh, $file ) ) { $plugin->add_verbose( "[ERROR] couldnt open the file: '$file' because '$!'" ); $plugin->set_status_code( 500 ); return; } my $unpack_dir; my $fntmp; $fntmp = $file; if( $fntmp =~ /^(.*)\/(.*)$/ ) { $unpack_dir = $1; $fntmp = $2; } # needs to read the xml from the file: my $xml; while( my $d = <$fh> ) { $xml .= $d } close $fh; my $dataset = $session->get_archive()->get_dataset( $dataset_id ); if(!defined $dataset) { print STDERR "\n[SWORD-METS] [INTERNAL-ERROR] Failed to open the dataset '$dataset_id'."; $plugin->add_verbose( "[INTERNAL ERROR] failed to open the dataset '$dataset_id'" ); $plugin->set_status_code( 500 ); return; } my $dom_doc; eval { $dom_doc = EPrints::XML::parse_xml_string( $xml ); }; if($@ || !defined $dom_doc) { $plugin->add_verbose( "[ERROR] failed to parse the xml: '$@'" ); $plugin->set_status_code( 400 ); return; } if( !defined $dom_doc ) { $plugin->{status_code} = 400; $plugin->add_verbose( "[ERROR] failed to parse the xml." ); return; } my $dom_top = $dom_doc->getDocumentElement; if( lc $dom_top->tagName ne 'mets' ) { $plugin->set_status_code( 400 ); $plugin->add_verbose( "[ERROR] failed to parse the xml: no tag found." ); return; } # METS Headers (ignored) my $mets_hdr = ($dom_top->getElementsByTagName( "metsHdr" ))[0]; # METS Descriptive Metadata (main section for us) # need to loop on dmdSec: my @dmd_sections = $dom_top->getElementsByTagName( "dmdSec" ); my $md_wrap; my $found_wrapper = 0; foreach my $dmd_sec (@dmd_sections) { # need to extract xmlData from here $md_wrap = ($dmd_sec->getElementsByTagName( "mdWrap" ))[0]; next if( !defined $md_wrap ); next if(!( lc $md_wrap->getAttribute( "MDTYPE" ) eq 'other' && defined $md_wrap->getAttribute( "OTHERMDTYPE" ) && lc $md_wrap->getAttribute( "OTHERMDTYPE" ) eq 'epdcx' )); $found_wrapper = 1; last; } unless( $found_wrapper ) { $plugin->set_status_code( 400 ); $plugin->add_verbose( "[ERROR] failed to parse the xml: could not find epdcx section." ); return; } my $xml_data = ($md_wrap->getElementsByTagName( "xmlData" ))[0]; if(!defined $xml_data) { $plugin->set_status_code( 400 ); $plugin->add_verbose( "[ERROR] failed to parse the xml: no tag found." ); return; } my $epdata = $plugin->parse_epdcx_xml_data( $xml_data ); return unless( defined $epdata ); # File Section which will contain optional info about files to import: my @files; foreach my $file_sec ( $dom_top->getElementsByTagName( "fileSec" ) ) { my $file_grp = ($file_sec->getElementsByTagName( "fileGrp" ))[0]; $file_sec = $file_grp if( defined $file_grp ); # this is because the tag is optional foreach my $file_div ($file_sec->getElementsByTagName( "file" )) { my $file_loc = ($file_div->getElementsByTagName( "FLocat" ))[0]; if(defined $file_loc) { # yeepee we have a file (maybe) my $fn = $file_loc->getAttribute( "href" ); unless( defined $fn ) { # to accommodate the gdome XML library: $fn = $file_loc->getAttribute( "xlink:href" ); } next unless( defined $fn ); next if $fn =~ /^http/; # wget those files? push @files, $fn; } } } unless( scalar(@files) ) { $plugin->add_verbose( "[WARNING] no tag found: no files will be imported." ); } if( $NO_OP ) { # need to send 200 Successful (the deposit handler will generate the XML response) $plugin->add_verbose( "[OK] Plugin - import successful (but in No-Op mode)." ); $plugin->set_status_code( 200 ); return; } if(defined $depositor_id) { $epdata->{userid} = $owner_id; $epdata->{sword_depositor} = $depositor_id; } else { $epdata->{userid} = $owner_id; } $epdata->{eprint_status} = $dataset_id; my $eprint = $dataset->create_object( $plugin->{session}, $epdata ); unless(defined $eprint) { $plugin->set_status_code( 500 ); $plugin->add_verbose( "[ERROR] failed to create the EPrint object." ); return; } foreach my $file (@files) { my %doc_data; $doc_data{_parent} = $eprint; $doc_data{eprintid} = $eprint->get_id; $doc_data{format} = $session->get_repository->call( 'guess_doc_type', $session, $unpack_dir."/".$file ); $doc_data{main} = $file; local $session->get_repository->{config}->{enable_file_imports} = 1; my %file_data; $file_data{filename} = $file; $file_data{url} = "file://$unpack_dir/$file"; $doc_data{files} = [ \%file_data ]; my $doc_dataset = $session->get_repository->get_dataset( "document" ); my $document = EPrints::DataObj::Document->create_from_data( $session, \%doc_data, $doc_dataset ); if(!defined $document) { $plugin->add_verbose( "[WARNING] Failed to create Document object." ); } } if( $plugin->keep_deposited_file() ) { if( $plugin->attach_deposited_file( $eprint, $opts{file}, $opts{mime_type} ) ) { $plugin->add_verbose( "[OK] attached deposited file." ); } else { $plugin->add_verbose( "[WARNING] failed to attach the deposited file." ); } } $plugin->add_verbose( "[OK] EPrint object created." ); return $eprint; } sub keep_deposited_file { return 1; } sub parse_epdcx_xml_data { my ( $plugin, $xml ) = @_; my $set = ($xml->getElementsByTagName( "descriptionSet" ))[0]; unless( defined $set ) { $plugin->set_status_code( 400 ); $plugin->add_verbose( "ERROR: no tag found." ); return; } my $epdata = {}; foreach my $desc ($set->getElementsByTagName( "description" )) { foreach my $stat ($desc->getElementsByTagName( "statement" )) { my ($field, $value) = _parse_statement( $stat ); if( defined $field ) { if( $field eq 'creator' ) { $value = clean_text( $value ); if($value =~ /(\w+)\,\s?(.*)$/) { push @{$epdata->{creators_name}}, {family => $1, given=>$2}; } else { push @{$epdata->{creators_name}}, {family => $value}; } } else { $epdata->{$field} = $value; } } } } return $epdata; } sub _parse_statement { my ( $stat ) = @_; my $property = $stat->getAttribute( "propertyURI" ); unless( defined $property ) { $property = $stat->getAttribute( "epdcx:propertyURI" ); } if( $property eq 'http://purl.org/dc/elements/1.1/type' ) { if( $stat->getAttribute( "valueURI" ) =~ /type\/(.*)$/ ) { # then $1 = type # reference for these mappings is: # "http://www.ukoln.ac.uk/repositories/digirep/index/Eprints_Type_Vocabulary_Encoding_Scheme" my $type = $1; # no need to clean_text( $1 ) here if( $type eq 'JournalArticle' || $type eq 'JournalItem' || $type eq 'SubmittedJournalArticle' || $type eq 'WorkingPaper' ) { $type = 'article'; } elsif( $type eq 'Book' ) { $type = 'book'; } elsif( $type eq 'BookItem' ) { $type = 'book_section'; } elsif( $type eq 'ConferenceItem' || $type eq 'ConferencePoster' || $type eq 'ConferencePaper' ) { $type = 'conference_item'; } elsif( $type eq 'Patent' ) { $type = 'patent'; } elsif( $type eq 'Report' ) { $type = 'monograph'; # I think? } elsif( $type eq 'Thesis' ) { $type = 'thesis'; } else { return; # problem there! But the user can still correct this piece of data... } return ( "type", $type ); } return; } elsif( $property eq 'http://purl.org/dc/elements/1.1/title' ) { my $value = ($stat->getElementsByTagName( "valueString" ))[0]; my $title = EPrints::XML::to_string( EPrints::XML::contents_of( $value ) ); return ( "title", clean_text($title) ); } elsif( $property eq 'http://purl.org/dc/terms/abstract' ) { my $value = ($stat->getElementsByTagName( "valueString" ))[0]; my $abstract = EPrints::XML::to_string( EPrints::XML::contents_of( $value ) ); return ( "abstract", clean_text($abstract) ); } elsif( $property eq 'http://purl.org/dc/elements/1.1/creator' ) { my $value = ($stat->getElementsByTagName( "valueString" ))[0]; my $name = EPrints::XML::to_string( EPrints::XML::contents_of( $value ) ); return ( "creator", clean_text($name) ); } elsif( $property eq 'http://purl.org/dc/elements/1.1/identifier' ) { my $value = ($stat->getElementsByTagName( "valueString" ))[0]; my $id = EPrints::XML::to_string( EPrints::XML::contents_of( $value ) ); return ( "id_number", clean_text($id) ); } elsif( $property eq 'http://purl.org/eprint/terms/status' ) { if( $stat->getAttribute( "valueURI" ) =~ /status\/(.*)$/ ) { my $status = $1; if( $status eq 'PeerReviewed' ) { $status = 'TRUE'; } elsif( $status eq 'NonPeerReviewed' ) { $status = 'FALSE'; } else { return; } return ( 'refereed', $status ); # is this the proper field? } return; } elsif( $property eq 'http://purl.org/dc/elements/1.1/language' ) { # LANGUAGE (not parsed) } elsif( $property eq 'http://purl.org/dc/terms/available' ) { # DATE OF AVAILABILITY (not parsed) } elsif( $property eq 'http://purl.org/eprint/terms/copyrightHolder' ) { # COPYRIGHT HOLDER (not parsed) } return; } sub clean_text { my ( $text ) = @_; my @lines = split( "\n", $text ); foreach(@lines) { $_ =~ s/\s+$//; $_ =~ s/^\s+//; } return join(" ", @lines); } 1;