=head1 NAME

Jisc PubRouter RIOXX importer

Copyright (C) 2017 Jisc

This program is free software: you can redistribute it and/or modify
it under the terms of the GNU Lesser General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU Lesser General Public License for more details.

You should have received a copy of the GNU General Public License
along with this program.  If not, see <https://www.gnu.org/licenses/lgpl.txt>.

=cut

package EPrints::Plugin::Import::PubRouter;

use strict;

use EPrints::Plugin::Import::DefaultXML;
use File::Temp;
use Locale::Language;
use LWP::Simple;
use LWP::UserAgent;

our @ISA = qw/ EPrints::Plugin::Import::DefaultXML /;

# XML namespaces used when looking up elements in the incoming record.
my %namespaces = (
    'ali'        => 'http://www.niso.org/schemas/ali/1.0/',
    'dcterms'    => 'http://purl.org/dc/terms/',
    'rioxxterms' => 'http://www.rioxx.net/schema/v2.0/rioxx/',
    'pr'         => 'http://pubrouter.jisc.ac.uk/rioxxplus/',
);

# Incoming type label => value stored in the eprint "type" field.
my %types = (
    'Journal Article'                      => 'article',
    'Book'                                 => 'book',
    'Book chapter'                         => 'book_section',
    'Book edited'                          => 'book_section',
    'Conference Paper/Proceeding/Abstract' => 'conference_item',
    'Journal Article/Review'               => 'article',
    'Manual/Guide'                         => 'monograph',
    'Monograph'                            => 'monograph',
    'Policy briefing report'               => 'monograph',
    'Technical Report'                     => 'monograph',
    'Technical Standard'                   => 'monograph',
    'Thesis'                               => 'thesis',
    'Other'                                => 'other',
    'Consultancy Report'                   => 'monograph',
    'Working paper'                        => 'monograph',
);

# Incoming type label => value stored in "monograph_type" (only consulted
# when the mapped type is "monograph").
my %monograph_types = (
    'Manual/Guide'           => 'manual',
    'Policy briefing report' => 'other',
    'Technical Report'       => 'technical_report',
    'Technical Standard'     => 'technical_report',
    'Consultancy Report'     => 'project_report',
    'Working paper'          => 'working_paper',
);

# Incoming version code => value stored in the document "content" field.
my %content = (
    'AO'   => 'draft',
    'SMUR' => 'submitted',
    'AM'   => 'accepted',
    'VoR'  => 'published',
    'CVoR' => 'updated',
    'EVoR' => 'updated',
);

# Licence URL => value stored in the document "license" field.
# Only the http:// form is listed; the https:// form of every entry is
# added by the loop below.
my %license_urls = (
    "http://creativecommons.org/licenses/by-nd/3.0/"    => 'cc_by_nd',
    "http://creativecommons.org/licenses/by/3.0/"       => 'cc_by',
    "http://creativecommons.org/licenses/by-nc/3.0/"    => 'cc_by_nc',
    "http://creativecommons.org/licenses/by-nc-nd/3.0/" => 'cc_by_nc_nd',
    "http://creativecommons.org/licenses/by-nc-sa/3.0/" => 'cc_by_nc_sa',
    "http://creativecommons.org/licenses/by-sa/3.0/"    => 'cc_by_sa',
    "http://creativecommons.org/licenses/by-nd/4.0/"    => 'cc_by_nd_4',
    "http://creativecommons.org/licenses/by/4.0/"       => 'cc_by_4',
    "http://creativecommons.org/licenses/by-nc/4.0/"    => 'cc_by_nc_4',
    "http://creativecommons.org/licenses/by-nc-nd/4.0/" => 'cc_by_nc_nd_4',
    "http://creativecommons.org/licenses/by-nc-sa/4.0/" => 'cc_by_nc_sa_4',
    "http://creativecommons.org/licenses/by-sa/4.0/"    => 'cc_by_sa_4',
    "http://creativecommons.org/publicdomain/zero/1.0/legalcode/" => 'cc_public_domain',
    "http://www.gnu.org/licenses/gpl.html"              => 'cc_gnu_gpl',
    "http://www.gnu.org/licenses/lgpl.html"             => 'cc_gnu_lgpl',
);
foreach my $http_url ( keys %license_urls )
{
    ( my $https_url = $http_url ) =~ s{^http:}{https:};
    $license_urls{$https_url} = $license_urls{$http_url};
}

# Constructor: sets the plugin name, visibility and the MIME types /
# object types this importer accepts and produces.
sub new
{
    my( $class, %params ) = @_;

    my $self = $class->SUPER::new( %params );

    $self->{name} = "Jisc PubRouter RIOXX importer";
    $self->{visible} = "all";
    $self->{produce} = [ 'list/eprint', 'dataobj/eprint' ];
    $self->{accept} = [ qw( application/atom+xml application/vnd.rioxx2.data+xml ) ];

    return $self;
}

# Reads the whole of $opts{fh} and, if the content starts with an XML
# declaration, imports it via input_fh_xml().
#
# Returns an EPrints::List; the list is empty when the input is not XML
# or nothing was imported.
sub input_fh
{
    my( $plugin, %opts ) = @_;

    my $fh = $opts{"fh"};
    my $xml = join "", <$fh>;

    my $list;
    if( $xml =~ /^<\?xml/ )
    {
        $list = $plugin->input_fh_xml( $xml, %opts );
    }

    $list ||= EPrints::List->new(
        dataset => $opts{dataset},
        session => $plugin->{session},
        ids => [] );

    return $list;
}

# Parses $xml, turns its document element into a data object with
# xml_to_dataobj() and returns an EPrints::List holding that object's id
# (or no ids when no object was created).
sub input_fh_xml
{
    my( $plugin, $xml, %opts ) = @_;

    my $doc = EPrints::XML::parse_xml_string( $xml );

    my $dataobj = $plugin->xml_to_dataobj( $opts{dataset}, $doc->documentElement );

    EPrints::XML::dispose( $doc );

    return EPrints::List->new(
        dataset => $opts{dataset},
        session => $plugin->{session},
        ids => [ defined( $dataobj ) ? $dataobj->get_id : () ] );
}

# Builds the eprint data hash from a PubRouter RIOXX record.
#
#   $dataset - dataset the record is imported into; queried with
#              has_field()/field() to decide where values can be stored
#   $xml     - DOM element of the record
#
# Values that have no dedicated field are appended to the "note" field via
# appendToNote(). Every <pr:download_link> that resolves is downloaded by
# getDocData() and added to $epdata->{documents}.
#
# Returns the epdata hash reference. Dies (from getDocData) when a
# resolvable download link cannot be fetched.
sub xml_to_epdata
{
    my( $plugin, $dataset, $xml ) = @_;

    my $session = $plugin->{session};

    my $epdata = {};
    my $docdata = {}; #metadata applied to every primary document

    #note
    my @note_text;
    foreach my $n ( $xml->getElementsByTagNameNS( $namespaces{'pr'}, 'note' ) )
    {
        push @note_text, $plugin->xml_to_text( $n );
    }
    $epdata->{note} = "";
    if( scalar @note_text > 0 )
    {
        $epdata->{note} = join( " " . $plugin->phrase( "note_separator" ), @note_text ) . " ";
    }

    #comment
    $epdata->{suggestions} = $plugin->getNameSpaceValue( $xml, $namespaces{'pr'}, 'comment' );

    #subjects
    my @keywords;
    foreach my $s ( $xml->getElementsByTagNameNS( $namespaces{'dcterms'}, 'subject' ) )
    {
        push @keywords, $plugin->xml_to_text( $s );
    }
    $epdata->{keywords} = join( ", ", @keywords );

    #related urls
    $epdata->{related_url} = [];
    foreach my $r ( $xml->getElementsByTagNameNS( $namespaces{'pr'}, 'relation' ) )
    {
        push @{$epdata->{related_url}}, {
            url => $r->getAttribute( "url" ),
            type => $plugin->xml_to_text( $r ),
        };
    }

    #type - prefer rioxxterms:type, fall back to dcterms:type
    my $type = $plugin->getNameSpaceValue( $xml, $namespaces{'rioxxterms'}, 'type' );
    if( !_has_value( $type ) )
    {
        $type = $plugin->getNameSpaceValue( $xml, $namespaces{'dcterms'}, 'type' );
    }
    $epdata->{type} = $types{$type} if defined $type;
    $epdata->{type} ||= 'other';

    #source, volume and number
    my $source = $xml->getElementsByTagNameNS( $namespaces{'pr'}, 'source' )->item(0);
    if( defined $source )
    {
        if( $epdata->{type} eq "book_section" )
        {
            $epdata->{book_title} = $plugin->xml_to_text( $source );
        }
        else
        {
            $epdata->{publication} = $plugin->xml_to_text( $source );
        }
        $epdata->{volume} = $source->getAttribute( "volume" );
        $epdata->{number} = $source->getAttribute( "issue" );
    }

    #source id
    my $issn_set = 0; #true once an eissn has been stored
    my @sourceid_notes;
    foreach my $sourceid ( $xml->getElementsByTagNameNS( $namespaces{'pr'}, 'source_id' ) )
    {
        my $sourceid_type = $sourceid->getAttribute( "type" );
        if( $sourceid_type eq "eissn" )
        {
            $epdata->{issn} = $plugin->xml_to_text( $sourceid );
            $issn_set = 1; #used to prioritise eissn value
        }
        elsif( ( $sourceid_type eq "issn" || $sourceid_type eq "pissn" ) && !$issn_set )
        {
            #print ISSNs must never replace an eissn that was already stored
            $epdata->{issn} = $plugin->xml_to_text( $sourceid );
        }
        elsif( $sourceid_type eq "isbn" )
        {
            $epdata->{isbn} = $plugin->xml_to_text( $sourceid );
        }

        #Add all types of source_id values to Additional Information field
        push @sourceid_notes, "$sourceid_type " . $plugin->xml_to_text( $sourceid );
    }
    if( scalar @sourceid_notes > 0 )
    {
        $epdata->{note} = $plugin->appendToNote( $epdata->{note}, "Journal IDs", join( "; ", @sourceid_notes ) );
    }

    #publisher
    $epdata->{publisher} = $plugin->getNameSpaceValue( $xml, $namespaces{'dcterms'}, 'publisher' );

    #title
    $epdata->{title} = $plugin->getNameSpaceValue( $xml, $namespaces{'dcterms'}, 'title' );

    #if monograph work out the monograph type
    if( $epdata->{type} eq "monograph" )
    {
        $epdata->{monograph_type} = $monograph_types{$type};
    }

    #version
    my $version = $plugin->getNameSpaceValue( $xml, $namespaces{'rioxxterms'}, 'version' );
    if( $version ne "" )
    {
        my $c = $content{$version};
        $docdata->{content} = $c if defined $c;
    }

    #page range - use pr:page_range, otherwise build it from start/end page
    my $pagerange = $plugin->getNameSpaceValue( $xml, $namespaces{'pr'}, 'page_range' );
    if( !_has_value( $pagerange ) )
    {
        my $start_page = $plugin->getNameSpaceValue( $xml, $namespaces{'pr'}, 'start_page' );
        my $end_page = $plugin->getNameSpaceValue( $xml, $namespaces{'pr'}, 'end_page' );
        $pagerange = $start_page;
        if( _has_value( $start_page ) && _has_value( $end_page ) )
        {
            $pagerange = $start_page . '-' . $end_page;
        }
    }
    $epdata->{pagerange} = $pagerange;

    #pages
    $epdata->{pages} = $plugin->getNameSpaceValue( $xml, $namespaces{'pr'}, 'num_pages' );

    #language
    my $code = $plugin->getNameSpaceValue( $xml, $namespaces{'dcterms'}, 'language' );
    if( length $code > 2 )
    {
        $code = "en"; #assume english
    }
    $docdata->{language} = $code;

    #description
    $epdata->{abstract} = $plugin->getNameSpaceValue( $xml, $namespaces{'dcterms'}, 'abstract' );

    #identifier - a DOI always wins the id_number field
    my @identifiers = $xml->getElementsByTagNameNS( $namespaces{'pr'}, 'identifier' );
    foreach my $id ( @identifiers )
    {
        my $identifier_type = $id->getAttribute( "type" );
        if( $identifier_type eq "doi" )
        {
            $epdata->{id_number} = $plugin->xml_to_text( $id );
        }
    }

    #now loop around again, now that DOI has been handled
    my @id_notes;
    foreach my $id ( @identifiers )
    {
        my $identifier_type = $id->getAttribute( "type" );
        next if $identifier_type eq "doi";

        if( defined $epdata->{id_number} )
        {
            push @id_notes, "$identifier_type: " . $plugin->xml_to_text( $id );
        }
        else
        {
            $epdata->{id_number} = $plugin->xml_to_text( $id );
        }
    }
    if( scalar @id_notes > 0 )
    {
        $epdata->{note} = $plugin->appendToNote( $epdata->{note}, "Article IDs", join( "; ", @id_notes ) );
    }

    #version_of_record
    my $vor = $plugin->getNameSpaceValue( $xml, $namespaces{'rioxxterms'}, 'version_of_record' );
    $epdata->{id_number} = $vor if $vor ne "";

    #dateAccepted
    my $acceptance_set = 0;
    my $acceptance_date = $plugin->getNameSpaceValue( $xml, $namespaces{'dcterms'}, 'dateAccepted' );
    if( $acceptance_date )
    {
        if( $dataset->has_field( "dates" ) )
        {
            $epdata->{dates} ||= [];
            push @{$epdata->{dates}}, {
                date => $acceptance_date,
                date_type => "accepted",
            };
            $acceptance_set = 1;
        }
        elsif( $dataset->has_field( "rioxx2_dateAccepted_input" ) )
        {
            $epdata->{rioxx2_dateAccepted_input} = $acceptance_date;
        }
        else
        {
            $epdata->{note} = $plugin->appendToNote( $epdata->{note}, "Accepted", _format_date_dmy( $acceptance_date ) );
        }
    }

    #publication date
    #if DatesDatesDates is present use that...
    my $publication_set = 0;
    my $publication_date = $plugin->getNameSpaceValue( $xml, $namespaces{'rioxxterms'}, 'publication_date' );
    if( $publication_date )
    {
        if( $dataset->has_field( "dates" ) )
        {
            $epdata->{dates} ||= [];
            push @{$epdata->{dates}}, {
                date => $publication_date,
                date_type => "published",
            };
        }
        else #use ordinary date field
        {
            $epdata->{date} = $publication_date;
            $epdata->{date_type} = "published";
        }
        $publication_set = 1;
    }

    #history dates
    my @date_notes;
    foreach my $date ( $xml->getElementsByTagNameNS( $namespaces{'pr'}, 'history_date' ) )
    {
        my $d = $plugin->xml_to_text( $date );
        my $date_string = _format_date_dmy( $d );
        my $date_type = $date->getAttribute( "type" );

        #accepted/published dates stored above take precedence
        my $already_stored =
            ( $date_type eq "accepted" && $acceptance_set ) ||
            ( $date_type eq "published" && $publication_set );

        if( !$already_stored )
        {
            if( $dataset->has_field( "dates" ) )
            {
                my $dates_field = $dataset->field( "dates_date_type" );
                my $types = $dates_field->property( "options" );
                if( grep { $date_type eq $_ } @{$types} )
                {
                    $epdata->{dates} ||= [];
                    push @{$epdata->{dates}}, {
                        date => $d,
                        date_type => $date_type,
                    };
                }
            }
            elsif( grep { $date_type eq $_ } @{$dataset->field( "date_type" )->property( "options" )} )
            {
                $epdata->{date} = $d;
                $epdata->{date_type} = $date_type;
            }
        }

        #always add date to Additional Information field
        push @date_notes, "$date_type $date_string";
    }
    if( scalar @date_notes > 0 )
    {
        $epdata->{note} = $plugin->appendToNote( $epdata->{note}, "History", join( "; ", @date_notes ) );
    }

    #status
    if( $publication_set )
    {
        $epdata->{ispublished} = "pub";
    }

    #project and funders
    $epdata->{projects} = [];
    $epdata->{funders} = [];
    foreach my $project ( $xml->getElementsByTagNameNS( $namespaces{rioxxterms}, "project" ) )
    {
        #set project value
        my $project_id = $plugin->xml_to_text( $project );
        if( defined $project_id )
        {
            push @{$epdata->{projects}}, $project_id;
        }

        #funder names
        my $funder_name = $project->getAttribute( "funder_name" );
        if( defined $project_id && defined $funder_name )
        {
            push @{$epdata->{funders}}, $funder_name;
        }

        #RIOXX and funder ids
        my @funder_ids = split /; /, $project->getAttribute( "funder_id" );
        foreach my $funder_id ( @funder_ids )
        {
            #remove label from funder id
            $funder_id = substr $funder_id, index( $funder_id, ":" ) + 1;
            if( $dataset->has_field( "rioxx2_project_input" ) )
            {
                $epdata->{rioxx2_project_input} ||= [];
                push @{$epdata->{rioxx2_project_input}}, {
                    project => $project_id,
                    funder_name => $funder_name,
                    funder_id => $funder_id,
                };
            }
        }
    }

    #free_to_read
    my $free_to_read = $xml->getElementsByTagNameNS( $namespaces{ali}, "free_to_read" )->item(0);
    if( $free_to_read && $dataset->has_field( "rioxx2_free_to_read_input" ) )
    {
        $epdata->{rioxx2_free_to_read_input} = {
            free_to_read => "Yes",
            start_date => $free_to_read->getAttribute( "start_date" ),
            end_date => $free_to_read->getAttribute( "end_date" ),
        };
    }

    #license
    my @licenses = $xml->getElementsByTagNameNS( $namespaces{pr}, "license" );

    #create the additional information content
    my %license_notes;  #start date => note text (notes sharing a date are joined)
    my %license_url_of; #start date => licence url
    foreach my $license ( @licenses )
    {
        #get license data
        my $license_start_date = $license->getAttribute( "start_date" );
        my $date_string = _format_date_dmy( $license_start_date );
        my $license_url = $license->getAttribute( "url" );
        my $license_version = $license->getAttribute( "version" );
        my $license_desc = $plugin->xml_to_text( $license );

        #licences without a start date are keyed on the empty string
        my $date_key = defined $license_start_date ? $license_start_date : "";

        #store simple license data
        $license_url_of{$date_key} = $license_url;

        #store license data as note
        my @license_string;
        push( @license_string, "starting on $date_string" ) if defined $license_start_date;
        push( @license_string, $license_desc ) if defined $license_desc;
        push( @license_string, $license_url ) if defined $license_url;
        push( @license_string, "license version $license_version" ) if defined $license_version;
        my $license_note = join( ", ", @license_string );

        if( exists $license_notes{$date_key} )
        {
            $license_notes{$date_key} .= "; $license_note";
        }
        else
        {
            $license_notes{$date_key} = $license_note;
        }
    }

    #try to add earliest license to the document - only the earliest is considered
    my $added_to_doc = 0;
    my( $earliest ) = sort keys %license_url_of;
    if( defined $earliest )
    {
        my $earliest_url = $license_url_of{$earliest};
        if( defined $earliest_url && exists $license_urls{$earliest_url} )
        {
            $added_to_doc = 1;
            $docdata->{license} = $license_urls{$earliest_url};
        }
    }

    if( scalar @licenses > 1 ) #add all notes to Additional Information if more than one license
    {
        my @license_info = map { $license_notes{$_} } sort keys %license_notes;
        $epdata->{note} = $plugin->appendToNote( $epdata->{note}, "Licenses for this article", join( "; ", @license_info ) );
    }
    elsif( scalar @licenses == 1 && !$added_to_doc ) #add to additional information
    {
        foreach my $date ( sort keys %license_notes )
        {
            $epdata->{note} = $plugin->appendToNote( $epdata->{note}, "License for this article", $license_notes{$date} );
        }
    }

    #embargo date
    my $embargo = $xml->getElementsByTagNameNS( $namespaces{'pr'}, 'embargo' )->item(0);
    if( $embargo )
    {
        my $embargo_start_date = $embargo->getAttribute( "start_date" );
        my $embargo_end_date = $embargo->getAttribute( "end_date" );
        my $duration = $embargo->getAttribute( "duration" );

        #only note the embargo when it does not simply start on publication
        if( $embargo_start_date && $embargo_start_date ne $publication_date )
        {
            #add embargo note to Additional Information
            my $embargo_note = "starting on " . _format_date_dmy( $embargo_start_date );
            if( $embargo_end_date )
            {
                $embargo_note .= ", ending on " . _format_date_dmy( $embargo_end_date );
            }
            if( $duration )
            {
                $embargo_note .= ", duration $duration ";
                $embargo_note .= ( $duration > 1 ) ? "months" : "month";
            }
            $epdata->{note} = $plugin->appendToNote( $epdata->{note}, "Embargo", $embargo_note );
        }

        if( $embargo_end_date )
        {
            $docdata->{date_embargo} = $embargo_end_date;
            $docdata->{security} = "staffonly";
        }
    }

    #authors
    $epdata->{creators} = [];
    $epdata->{corp_creators} = [];
    foreach my $creator ( $xml->getElementsByTagNameNS( $namespaces{'pr'}, 'author' ) )
    {
        push @{$epdata->{creators}}, $plugin->_person_data( $dataset, $creator, "creators_orcid" );

        #org_name
        my $org_name = $plugin->getNameSpaceValue( $creator, $namespaces{'pr'}, 'org_name' );
        push @{$epdata->{corp_creators}}, $org_name if _has_value( $org_name );
    }

    #contributors
    $epdata->{contributors} = [];
    foreach my $contributor ( $xml->getElementsByTagNameNS( $namespaces{'pr'}, 'contributor' ) )
    {
        my $contributordata = $plugin->_person_data( $dataset, $contributor, "contributors_orcid" );

        #type
        $contributordata->{type} = $plugin->getNameSpaceValue( $contributor, $namespaces{'pr'}, 'type' );
        push @{$epdata->{contributors}}, $contributordata;

        #org_name
        my $org_name = $plugin->getNameSpaceValue( $contributor, $namespaces{'pr'}, 'org_name' );
        push @{$epdata->{corp_creators}}, $org_name if _has_value( $org_name );
    }

    ####OLD####
    #apc
    #if( $dataset->has_field( "rioxx2_apc_input" ) )
    #{
    #    $epdata->{rioxx2_apc_input} = $plugin->getNameSpaceValue( $xml, $namespaces{'rioxxterms'}, 'apc' );
    #
    #}

    #coverage
    #if( $dataset->has_field( "rioxx2_coverage_input" ) )
    #{
    #    $epdata->{rioxx2_coverage_input} = [];
    #    my @coverage = $xml->getElementsByTagNameNS( $namespaces{'dc'}, 'coverage' );
    #    foreach my $coverage (@coverage)
    #    {
    #        push @{$epdata->{rioxx2_coverage_input}}, $plugin->xml_to_text( $coverage );
    #    }
    #}

    #dateSubmitted (not official RIOXX)
    #my $submission_date = $plugin->getNameSpaceValue( $xml, $namespaces{'dcterms'}, 'dateSubmitted' );
    #if( $submission_date && $dataset->has_field( "dates" ) )
    #{
    #    $submission_date =~ /(^[0-9]{4}-(0[1-9]|1[0-2])-(0[1-9]|[1-2][0-9]|3[0-1]))/;
    #    $epdata->{dates} ||= [];
    #    push @{$epdata->{dates}}, {
    #        date => $1,
    #        date_type => "submitted",
    #    };
    #}

    #source
    #$epdata->{book_title} = $source;
    #$epdata->{event_title} = $source;

    #use Data::Dumper;
    #my $url="http://citations.eprints-hosting.org/124095/14/hello.pdf";
    #my $tmp_file = new File::Temp;
    #EPrints::Utils::wget( $session, $url, $tmp_file );
    #seek($tmp_file,0,0);
    #my $size = (-s $tmp_file);
    #print STDERR "size.....>$size\n";

    #documents...
    #$epdata->{documents} = [];

    #add XML as a document
    #my $tmp_file = new File::Temp;
    #binmode($tmp_file, ":utf8");
    #print $tmp_file $session->xml->to_string( $xml );
    #seek($tmp_file,0,0);
    #push @{$epdata->{documents}}, {
    #    mime_type => 'application/pdf',
    #    main => 'hello.pdf',
    #    files => [{
    #        filename => 'hello.pdf',
    #        filesize => (-s $tmp_file),
    #        mime_type => 'application/pdf',
    #        _content => $tmp_file,
    #    }],
    #};

    #loop through all the download links, processing those marked as primary first
    #and merge in existing docdata retrieved from the metadata for each new document
    my @secondary;
    foreach my $doc ( $xml->getElementsByTagNameNS( $namespaces{'pr'}, 'download_link' ) )
    {
        my $is_primary = $doc->getAttribute( "primary" );
        if( !( defined $is_primary && $is_primary eq "true" ) )
        {
            push @secondary, $doc; #process these documents later
            next;
        }

        my $docFile = $plugin->getDocData( $session, $doc );
        next if !$docFile;

        #merge doc information with new file; record-level metadata wins
        my %merge = ( %{$docFile}, %{$docdata} );
        if( !$merge{content} && defined $merge{mime_type} && $merge{mime_type} eq "application/zip" )
        {
            $merge{content} = "other";
        }
        push @{$epdata->{documents}}, \%merge;
    }

    #now process non-primary urls - these only inherit the language
    foreach my $doc ( @secondary )
    {
        my $docFile = $plugin->getDocData( $session, $doc );
        next if !$docFile;

        my %merge = ( %{$docFile}, language => $code );
        if( defined $merge{mime_type} && $merge{mime_type} eq "application/zip" )
        {
            $merge{content} = "other";
        }
        push @{$epdata->{documents}}, \%merge;
    }

    return $epdata;
}

# Unimplemented placeholder; kept so the method name stays available.
# Accepts ( $session, $docdata, $doc ) and returns nothing.
sub mergeDoc
{
    my( $plugin, $session, $docdata, $doc ) = @_;

    return;
}

# Downloads the file referenced by a <pr:download_link> element.
#
#   $session - session handed on to EPrints::Utils::wget
#   $doc     - the download_link DOM element; its url, public, filename
#              and format attributes are read
#
# Unless the link is marked public="true", the plugin's "api_key"
# parameter is appended to the URL.
#
# Returns a document data hash (format, mime_type, main, files) when the
# URL answers a HEAD request, or 0 when it does not. Dies when the HEAD
# request succeeds but the download itself fails.
sub getDocData
{
    my( $plugin, $session, $doc ) = @_;

    #get document metadata from the XML
    my $public = $doc->getAttribute( "public" );
    my $link_url = $doc->getAttribute( "url" );

    my $url = $link_url;
    if( !( defined $public && $public eq "true" ) )
    {
        $url .= "?api_key=" . $plugin->param( "api_key" );
    }

    #check url resolves, giving up after 10 seconds
    my $ua = LWP::UserAgent->new;
    $ua->timeout( 10 );
    $ua->env_proxy; #LWP::Simple, used here previously, honours proxy settings too
    if( !$ua->head( $url )->is_success ) #URL doesn't resolve
    {
        return 0;
    }

    #add metadata and file to the document
    my $filename = $doc->getAttribute( "filename" );
    my $mime_type = $doc->getAttribute( "format" );

    my $format = "text";
    if( defined $mime_type && $mime_type eq "application/zip" )
    {
        $format = "archive";
    }

    #download the file
    my $tmp_file = File::Temp->new;
    my $r = EPrints::Utils::wget( $session, $url, $tmp_file );

    #check we have managed to access the file, if we can't access the file abort the whole process
    if( !$r->is_success )
    {
        #report the link as given in the record so the api key is not exposed
        die "Error retrieving $link_url";
    }
    seek( $tmp_file, 0, 0 );

    return {
        format => $format,
        mime_type => $mime_type,
        main => $filename,
        files => [{
            filename => $filename,
            mime_type => $mime_type,
            filesize => ( -s $tmp_file ),
            _content => $tmp_file,
        }],
    };
}

# Returns the text of the first $field element in namespace $ns found
# under $xml, or the empty string when there is no such element.
#
# The empty string (rather than undef) is deliberate: callers in this
# file compare the result against "".
sub getNameSpaceValue
{
    my( $plugin, $xml, $ns, $field ) = @_;

    my $value = $xml->getElementsByTagNameNS( $ns, $field )->item(0);
    return $plugin->xml_to_text( $value ) if defined $value;

    return "";
}

# Splits a personal name into its parts.
#
# Returns an array reference with the family name at the end of the
# array, or nothing when the name cannot be split into more than one part.
sub processName
{
    my( $plugin, $name ) = @_;

    #process names according to http://www.rioxx.net/profiles/v2-0-final/
    #("Family, Given" - move the family name to the end)
    my @names = split( ', ', $name );
    push @names, shift @names;
    return \@names if scalar @names > 1;

    #else try format used at https://github.com/JiscPER/jper-sword-out/blob/develop/docs/system/XWALK.md
    @names = split( ' ', $name );
    return \@names if scalar @names > 1;

    return;
}

# Returns $note with "<note_separator phrase><label>: <append> " added
# to the end.
sub appendToNote
{
    my( $plugin, $note, $label, $append ) = @_;

    return $note . $plugin->phrase( "note_separator" ) . $label . ": " . $append . " ";
}

# Collects the fields shared by <pr:author> and <pr:contributor> elements.
#
#   $dataset     - dataset, asked whether $orcid_field exists
#   $person      - the author/contributor DOM element
#   $orcid_field - name of the field that must exist for an ORCID to be kept
#
# Returns a hash reference with "id" (the email), "name" (family, given,
# honourific) and, when available and storable, "orcid".
sub _person_data
{
    my( $plugin, $dataset, $person, $orcid_field ) = @_;

    my $data = {};

    #id
    foreach my $id ( $person->getElementsByTagNameNS( $namespaces{'pr'}, 'id' ) )
    {
        my $id_type = $id->getAttribute( "type" );
        if( defined $id_type && $id_type eq "orcid" && $dataset->has_field( $orcid_field ) )
        {
            $data->{orcid} = $plugin->xml_to_text( $id );
        }
    }

    #email
    $data->{id} = $plugin->getNameSpaceValue( $person, $namespaces{'pr'}, 'email' );

    #name
    my $name = {};
    $name->{family} = $plugin->getNameSpaceValue( $person, $namespaces{'pr'}, 'surname' );
    $name->{given} = $plugin->getNameSpaceValue( $person, $namespaces{'pr'}, 'firstnames' );
    $name->{honourific} = $plugin->getNameSpaceValue( $person, $namespaces{'pr'}, 'suffix' );
    $data->{name} = $name;

    return $data;
}

# True when $value is defined and not the empty string.
sub _has_value
{
    my( $value ) = @_;

    return defined $value && $value ne "";
}

# Reverses the first three "-" separated parts of a date, turning
# "YYYY-MM-DD" into "DD-MM-YYYY" (and "YYYY-MM" into "MM-YYYY").
# Returns the empty string for an undefined or empty date.
sub _format_date_dmy
{
    my( $date ) = @_;

    return "" if !defined $date;

    my @parts = split /-/, $date;
    splice( @parts, 3 ) if scalar @parts > 3;

    return join( "-", reverse @parts );
}

1;

=head1 COPYRIGHT

=for COPYRIGHT BEGIN

Copyright 2000-2011 University of Southampton.

=for COPYRIGHT END

=for LICENSE BEGIN

This file is part of EPrints L<http://www.eprints.org/>.

EPrints is free software: you can redistribute it and/or modify it
under the terms of the GNU Lesser General Public License as published
by the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.

EPrints is distributed in the hope that it will be useful, but WITHOUT
ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
License for more details.

You should have received a copy of the GNU Lesser General Public
License along with EPrints.  If not, see L<http://www.gnu.org/licenses/>.

=for LICENSE END