=head1 NAME

EPrints::Plugin::Import::OREResource

=head1 DESCRIPTION

Import plugin that creates eprints from OAI-ORE resource maps.

The input is either a single resource map (input starting with an XML
declaration) or the URL of a plain-text list of resource map URLs, one
per line.

For each resource map the plugin looks for an C<rdf:Description> that
C<dcterms:conformsTo> the OAI_DC namespace, downloads that resource and
hands it to the C<Import::XSLT::OAI_Dublin_Core_XML> plugin. Aggregated
resources that declare a C<format> containing a "/" are then downloaded
and attached to the resulting eprint as documents.

=cut

package EPrints::Plugin::Import::OREResource;

use strict;

our $RDF_NS = "http://www.w3.org/1999/02/22-rdf-syntax-ns#";
our $ORE_NS = "http://www.openarchives.org/ore/terms/";
our $DC_NS = "http://purl.org/dc/elements/1.1/";
our $DCTERMS_NS = "http://purl.org/dc/terms/";
our $OAI_DC_NS = "http://www.openarchives.org/OAI/2.0/oai_dc/";

use File::Temp;

use EPrints::Plugin::Import::DefaultXML;

our @ISA = qw/ EPrints::Plugin::Import::DefaultXML /;

# Constructor. Registers the plugin name, visibility and what it produces.
# If MIME::Types cannot be loaded the plugin hides itself and records the
# reason in $self->{error}.
sub new
{
	my( $class, %params ) = @_;

	my $self = $class->SUPER::new(%params);

	$self->{name} = "OAI-ORE Resource";
	$self->{visible} = "all";
	$self->{produce} = [ 'list/eprint', 'dataobj/eprint' ];

	my $rc = EPrints::Utils::require_if_exists("MIME::Types");
	unless( $rc )
	{
		$self->{visible} = "";
		$self->{error} = "Failed to load required module MIME::Types";
	}

	return $self;
}

# Entry point: reads everything from $opts{fh} and dispatches on its content.
# Input starting with an XML declaration is treated as one resource map,
# anything else as the URL of a list of resource maps.
# Always returns an EPrints::List (empty when nothing was imported).
sub input_fh
{
	my( $plugin, %opts ) = @_;

	my $fh = $opts{"fh"};
	my $xml = join "", <$fh>;

	my $list;
	if( $xml =~ /^<\?xml/ )
	{
		$list = $plugin->input_fh_xml( $xml, %opts );
	}
	else
	{
		$list = $plugin->input_fh_list( $xml, %opts );
	}

	$list ||= EPrints::List->new(
		dataset => $opts{dataset},
		session => $plugin->{session},
		ids => [] );

	return $list;
}

# Imports a single resource map supplied as an XML string.
# Returns an EPrints::List holding the id of the created object, or an
# empty list when xml_to_dataobj() produced nothing.
sub input_fh_xml
{
	my( $plugin, $xml, %opts ) = @_;

	my $doc = EPrints::XML::parse_xml_string( $xml );

	my $dataobj = $plugin->xml_to_dataobj( $opts{dataset}, $doc->documentElement );

	EPrints::XML::dispose( $doc );

	# xml_to_dataobj() returns nothing when the map has no OAI_DC
	# description or when running in parse_only mode.
	my @ids = defined $dataobj ? ( $dataobj->get_id ) : ();

	return EPrints::List->new(
		dataset => $opts{dataset},
		session => $plugin->{session},
		ids => \@ids );
}

# Imports resource maps from a list. $url (whitespace is stripped) points to
# a text resource with one resource map URL per line; lines that do not start
# with "http" are ignored. At most $max_records objects are imported.
# Returns an EPrints::List of the imported ids, or nothing if the list itself
# could not be fetched.
sub input_fh_list
{
	my( $plugin, $url, %opts ) = @_;

	my $max_records = 10;

	$url =~ s/\s+//g;

	my $tmpfile = File::Temp->new;

	my $r = EPrints::Utils::wget( $plugin->{session}, $url, $tmpfile );
	seek($tmpfile,0,0);

	if( $r->is_error )
	{
		$plugin->error( "Error reading resource map list from $url: ".$r->code." ".$r->message );
		return;
	}

	my @ids;

	while( my $map_url = <$tmpfile> )
	{
		$map_url =~ s/\s+//g;
		next unless $map_url =~ /^http/;

		my $doc = eval { EPrints::XML::parse_url( $map_url ) };
		if( $@ || !defined $doc )
		{
			$plugin->warning( "Error parsing resource map: $map_url\n" );
			# there is no document to process, move on to the next map
			next;
		}

		my $dataobj = $plugin->xml_to_dataobj( $opts{dataset}, $doc->documentElement );

		EPrints::XML::dispose( $doc );

		next unless defined $dataobj;

		push @ids, $dataobj->get_id;

		# stop once exactly $max_records objects have been imported
		last if @ids >= $max_records;
	}

	return EPrints::List->new(
		dataset => $opts{dataset},
		session => $plugin->{session},
		ids => \@ids );
}

# Turns one resource map into an eprint.
#
#   $dataset - dataset the eprint is created in
#   $xml     - root element of the resource map; its rdf:Description
#              descendants are inspected
#
# Returns the eprint, or nothing when the map has no OAI_DC description,
# when the OAI_DC record cannot be fetched or imported, or when the plugin
# is in parse_only mode.
sub xml_to_dataobj
{
	my( $plugin, $dataset, $xml ) = @_;

	my $session = $plugin->{session};

	my $epdata = {};

	my $baseURI = $xml->getAttribute( "xml:base" );

	my @descs = $xml->getElementsByTagNameNS( $RDF_NS, "Description" );

	my $ore_resource_uri; # first description that aggregates something
	my $oai_dc;           # URI of the description conforming to OAI_DC
	my %resources;        # resource URI => declared format (MIME-like only)
	my %aggregates;       # URIs named as the target of ore:aggregates

	foreach my $desc (@descs)
	{
		my $uri = $desc->getAttributeNS( $RDF_NS, "about" );
		if( defined($baseURI) )
		{
			$uri = URI->new_abs( $uri, $baseURI );
		}

		foreach my $aggregate ( $desc->getElementsByTagNameNS( $ORE_NS, "aggregates" ) )
		{
			$ore_resource_uri ||= $uri;

			my $resource = $aggregate->getAttributeNS( $RDF_NS, "resource" );
			next unless defined $resource;

			# resolve exactly like the rdf:about URIs above, otherwise the
			# keys of %aggregates and %resources never match for relative URIs
			if( defined($baseURI) )
			{
				$resource = URI->new_abs( $resource, $baseURI );
			}
			$aggregates{"$resource"} = 1;
		}

		my @dc_formats = $desc->getElementsByTagNameNS( $DCTERMS_NS, "format" );
		push @dc_formats, $desc->getElementsByTagNameNS( $DC_NS, "format" );
		foreach my $format_node ( @dc_formats )
		{
			my $format = EPrints::Utils::tree_to_utf8( $format_node );
			# only formats containing a "/" are recorded
			if( defined $uri && $format =~ /\// )
			{
				$resources{"$uri"} = $format;
			}
		}

		foreach my $conforms_to ( $desc->getElementsByTagNameNS( $DCTERMS_NS, "conformsTo" ) )
		{
			my $rdf_resource = $conforms_to->getAttributeNS( $RDF_NS, "resource" );
			$rdf_resource ||= EPrints::Utils::tree_to_utf8( $conforms_to );
			$rdf_resource =~ s/\/?$/\//; # fix for bug in Export
			if( $rdf_resource eq $OAI_DC_NS )
			{
				# $uri is already resolved against xml:base
				$oai_dc = $uri;
			}
		}
	}

	if( !$oai_dc )
	{
		if( $ore_resource_uri )
		{
			$plugin->warning( "No OAI_DC found in resource map for $ore_resource_uri: ignoring!\n" );
		}
		return;
	}

	my $tmpfile = File::Temp->new;
	my $dc_response = EPrints::Utils::wget( $session, $oai_dc, "$tmpfile" );
	if( $dc_response->is_error )
	{
		$plugin->warning( "Error reading OAI_DC from $oai_dc: ".$dc_response->code." ".$dc_response->message."\n" );
		return;
	}
	seek($tmpfile,0,0);

	$plugin->handler->parsed( $epdata );
	return if( $plugin->{parse_only} );

	my $dc_plugin = $session->plugin( "Import::XSLT::OAI_Dublin_Core_XML",
		processor => $plugin->{processor},
		dataset => $dataset,
	);
	if( !defined $dc_plugin )
	{
		$plugin->warning( "Could not load Import::XSLT::OAI_Dublin_Core_XML to import $oai_dc\n" );
		return;
	}

	# Drop date elements that format_dc_date() does not accept before
	# handing the record to the Dublin Core importer.
	my $dc_xml = join "", <$tmpfile>;
	$dc_xml =~ s/
		(<(?:\w+:)?date\s*>)([^>]+)(<\s*\/(?:\w+:)?date>)
	/format_dc_date($1,$2,$3)/exg;

	my $tmpfile2 = File::Temp->new;
	print $tmpfile2 $dc_xml;
	seek($tmpfile2,0,0);

	my $list = $dc_plugin->input_fh( fh => $tmpfile2, dataset => $dataset );
	return unless defined $list;

	my( $eprint ) = $list->get_records( 0, 1 );
	if( !defined $eprint )
	{
		$plugin->warning( "No eprint could be created from $oai_dc\n" );
		return;
	}

	my $mime_types; # created on first use, shared by all resources

	# sorted so documents are attached in a reproducible order
	foreach my $uri ( sort keys %resources )
	{
		next unless $aggregates{$uri};

		my $cnt_file = File::Temp->new();
		my $r = EPrints::Utils::wget( $session, $uri, $cnt_file );
		next unless $r->is_success;

		my $content_type = $r->header( "Content-Type" ) or next;
		# keep only the media type, without parameters such as charset
		($content_type) = split /;/, $content_type;
		next unless defined $content_type;
		$content_type =~ s/^\s+//;
		$content_type =~ s/\s+$//;
		next unless length $content_type;

		$mime_types ||= MIME::Types->new;
		my $mime_type = $mime_types->type( $content_type ) or next;

		my $doc = EPrints::DataObj::Document->create_from_data(
			$session,
			{
				eprintid => $eprint->get_id,
				format => $content_type,
			},
			$session->dataset( "document" ) );
		next unless defined $doc;

		my( $ext ) = $mime_type->extensions;
		$ext = "" unless defined $ext;
		$doc->upload( $cnt_file, "main.$ext" );
	}

	$plugin->handler->object( $dataset, $eprint );

	return $eprint;
}

# Helper for the date clean-up in xml_to_dataobj().
# $open and $close are the opening and closing date tags, $date the text
# between them. Whitespace is stripped from $date; if the result looks like
# YYYY, YYYY-MM or YYYY-MM-DD the rebuilt element is returned, otherwise the
# empty string (which removes the element from the record).
sub format_dc_date
{
	my( $open, $date, $close ) = @_;

	$date =~ s/\s+//g;

	if( $date =~ /^\d{4}(-\d{2}(-\d{2})?)?$/ )
	{
		return "$open$date$close";
	}

	return "";
}

1;

=head1 COPYRIGHT

=for COPYRIGHT BEGIN

Copyright 2000-2011 University of Southampton.

=for COPYRIGHT END

=for LICENSE BEGIN

This file is part of EPrints L<http://www.eprints.org/>.

EPrints is free software: you can redistribute it and/or modify it
under the terms of the GNU Lesser General Public License as published
by the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
EPrints is distributed in the hope that it will be useful, but WITHOUT
ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
License for more details.

You should have received a copy of the GNU Lesser General Public
License along with EPrints.  If not, see L<http://www.gnu.org/licenses/>.

=for LICENSE END