package EPrints::Plugin::Import::OPFXML;

# Import plug-in for XML documents in the OPF namespace
# (http://www.idpf.org/2007/opf).
#
# The plug-in accepts either an OPF XML document directly, or a URL pointing
# at a plain-text resource that lists one OPF document URL per line.  The
# <metadata> element of each OPF document is handed to the Import::XSLT::DC
# plug-in, which produces the eprint.

use strict;

# File::Temp is used for the scratch files below; load it explicitly rather
# than relying on another module having loaded it first.
use File::Temp;

# Declare Namespaces
our $DC_NS = "http://purl.org/dc/elements/1.1/";
our $DCTERMS_NS = "http://purl.org/dc/terms/";
our $OPF_NS = "http://www.idpf.org/2007/opf";

# This is just a variant of the DefaultXML plug-in
use EPrints::Plugin::Import::DefaultXML;
our @ISA = qw/ EPrints::Plugin::Import::DefaultXML /;

# Constructor.
#
# Takes the usual plug-in parameters (passed straight to the parent
# constructor) and returns the configured plug-in object.  If MIME::Types
# cannot be loaded the plug-in is hidden and an error message is recorded
# in $self->{error}.
sub new
{
	my( $class, %params ) = @_;

	my $self = $class->SUPER::new( %params );

	$self->{name} = "OPF Resource";
	# Make it visible on the import menu and elsewhere
	$self->{visible} = "all";
	$self->{produce} = [ 'list/eprint', 'dataobj/eprint' ];

	# Functionality to recognise XML types on import by recognising the base
	# namespace, works with the sword packaging format, dc:conformsTo or
	# similar.
	$self->{xmlns} = $OPF_NS;

	my $rc = EPrints::Utils::require_if_exists( "MIME::Types" );
	unless( $rc )
	{
		$self->{visible} = "";
		$self->{error} = "Failed to load required module MIME::Types";
	}

	return $self;
}

# Input File Handle Method, for when files are uploaded.
#
# Options used: fh (file handle to read from) and dataset.
# The whole handle is read into memory.  Content that starts with markup is
# parsed as an OPF document; anything else is treated as a URL to fetch.
# Always returns an EPrints::List (empty when nothing was imported).
sub input_fh
{
	my( $plugin, %opts ) = @_;

	my $fh = $opts{"fh"};
	my $xml = join "", <$fh>;

	my $list;
	# The XML declaration is optional, so look for any leading markup rather
	# than insisting on "<?xml".  A URL never starts with "<".
	if( $xml =~ /^\s*</ )
	{
		$list = $plugin->input_fh_xml( $xml, %opts );
	}
	else
	{
		$list = $plugin->input_fh_list( $xml, %opts );
	}

	# input_fh_list returns nothing on a download error; hand back an empty
	# list in that case so callers always receive a list object.
	$list ||= EPrints::List->new(
			dataset => $opts{dataset},
			session => $plugin->{session},
			ids => [] );

	return $list;
}

# Handle direct XML input.
#
# $xml is the document as a string.  Returns an EPrints::List holding the id
# of the created eprint, or an empty list if no eprint was produced.
sub input_fh_xml
{
	my( $plugin, $xml, %opts ) = @_;

	# Leading whitespace before an XML declaration is not well-formed, so
	# drop it before parsing.
	$xml =~ s/^\s+//;

	my $doc = EPrints::XML::parse_xml_string( $xml );

	my $dataobj = $plugin->xml_to_dataobj( $opts{dataset}, $doc->documentElement );

	EPrints::XML::dispose( $doc );

	my @ids = defined( $dataobj ) ? ( $dataobj->get_id ) : ();

	return EPrints::List->new(
			dataset => $opts{dataset},
			session => $plugin->{session},
			ids => \@ids );
}

# Go grab input from a URL.
#
# $url points at a resource containing one document URL per line.  Each line
# starting with "http" is fetched, parsed and converted to an eprint.
# Returns an EPrints::List of the created eprint ids, or nothing (after
# reporting via $plugin->error) if the resource itself cannot be downloaded.
sub input_fh_list
{
	my( $plugin, $url, %opts ) = @_;

	$url =~ s/\s+//g;

	my $tmpfile = File::Temp->new;

	my $r = EPrints::Utils::wget( $plugin->{session}, $url, $tmpfile );
	seek( $tmpfile, 0, 0 );

	if( $r->is_error )
	{
		$plugin->error( "Error reading resource from $url: ".$r->code." ".$r->message );
		return;
	}

	my @ids;

	while( my $record_url = <$tmpfile> )
	{
		$record_url =~ s/\s+//g;
		next unless $record_url =~ /^http/;

		my $doc;
		eval { $doc = EPrints::XML::parse_url( $record_url ) };
		if( $@ || !defined $doc )
		{
			$plugin->warning( "Error parsing: $record_url\n" );
			# There is no document to work with; carry on with the next
			# URL rather than aborting the whole import.
			next;
		}

		my $dataobj = $plugin->xml_to_dataobj( $opts{dataset}, $doc->documentElement );

		EPrints::XML::dispose( $doc );

		if( defined $dataobj )
		{
			push @ids, $dataobj->get_id;
		}
	}

	return EPrints::List->new(
			dataset => $opts{dataset},
			session => $plugin->{session},
			ids => \@ids );
}

# Translate this XML into an EPrint.
#
# $xml is the root element of the OPF document.  Its first <metadata>
# element (in the OPF namespace) is serialised to a temporary file and
# passed to the Import::XSLT::DC plug-in.  Returns the first record of the
# list that plug-in produces, or nothing if there is no metadata element,
# the DC plug-in is unavailable, or it produced no list.
sub xml_to_dataobj
{
	my( $plugin, $dataset, $xml ) = @_;

	my $session = $plugin->{session};

	# Locate the metadata element
	my $metadata = $xml->getElementsByTagNameNS( $OPF_NS, "metadata" )->[0];
	if( !defined $metadata )
	{
		$plugin->warning( "No metadata element found in OPF document\n" );
		return;
	}

	# Load the DC plugin
	my $dc_plugin = $session->plugin( "Import::XSLT::DC",
			processor => $plugin->{processor},
			dataset => $dataset,
		);
	if( !defined $dc_plugin )
	{
		$plugin->warning( "Could not load the Import::XSLT::DC plugin\n" );
		return;
	}

	$dc_plugin->{Handler} = $plugin->{Handler};
	$dc_plugin->{parse_only} = $plugin->{parse_only};

	# Spew the metadata element to a temp file
	my $tmpfile2 = File::Temp->new;
	print $tmpfile2 $metadata->toString();
	# Rewind so the DC plug-in reads from the start of the file.
	seek( $tmpfile2, 0, 0 );

	# Parse the file using the plug-in to get back a list of eprints
	my $list = $dc_plugin->input_fh( fh => $tmpfile2, dataset => $dataset );
	return unless defined $list;

	my( $eprint ) = $list->get_records( 0, 1 );

	return $eprint;
}

1;