package EPrints::Plugin::Import::OpenAlex;

use strict;
use warnings;

use base 'EPrints::Plugin::Import';

use LWP::UserAgent;
use JSON qw(decode_json);
use URI::Escape qw(uri_escape);

# Build a polite, predictable HTTP client for OpenAlex requests.
# Honours proxy environment variables, uses a 30s timeout and a
# configurable User-Agent (conf: openalex -> user_agent) so the
# repository is identifiable upstream.
sub _ua
{
    my( $plugin ) = @_;

    my $ua = LWP::UserAgent->new;
    $ua->env_proxy;
    $ua->timeout( 30 );

    my $repo = $plugin->{session};
    my $ua_str = $repo->get_conf( "openalex", "user_agent" )
        || "EPrints OpenAlex Importer";

    $ua->default_header( 'Accept' => 'application/json' );
    $ua->default_header( 'User-Agent' => $ua_str );

    return $ua;
}

sub new
{
    my( $class, %params ) = @_;

    my $self = $class->SUPER::new( %params );

    $self->{name} = "OpenAlex (DOI / Title / OpenAlex ID)";
    $self->{visible} = "all";

    # We create eprints (new records). If you later want "update existing
    # eprint by eprintid", implement a separate tool/screen action.
    $self->{produce} = [ 'list/eprint', 'dataobj/eprint' ];

    # Make it usable from Import Items
    $self->{accept} = [ "text/plain" ];

    return $self;
}

sub can_input
{
    my( $plugin, $type ) = @_;

    return 1 if $type eq "textarea";
    return 1 if $type eq "file";
    return 0;
}

# Read one query per line (DOI, OpenAlex work ID or title), look each up
# against the OpenAlex API and create an eprint for every match.
# Unmatched / unconvertible lines produce a warning and are skipped.
# Returns an EPrints::List of the created eprint ids.
sub input_fh
{
    my( $plugin, %opts ) = @_;

    my $session = $plugin->{session};
    my $fh = $opts{fh};

    my @ids;
    while( my $line = <$fh> )
    {
        $line =~ s/\r?\n$//;
        $line =~ s/^\s+|\s+$//g;
        next if $line eq "";

        my $work = $plugin->_lookup_work( $line );
        if( !defined $work )
        {
            $plugin->warning( "OpenAlex: no match for '$line'" );
            next;
        }

        my $epdata = $plugin->_work_to_epdata( $work );
        if( !defined $epdata )
        {
            $plugin->warning( "OpenAlex: could not convert result for '$line'" );
            next;
        }

        my $dataobj = $plugin->epdata_to_dataobj( $opts{dataset}, $epdata );
        push @ids, $dataobj->get_id if defined $dataobj;
    }

    return EPrints::List->new(
        dataset => $opts{dataset},
        session => $session,
        ids     => \@ids );
}

# -----------------------------
# OpenAlex lookup logic
# -----------------------------

# Resolve a single user-supplied query (OpenAlex work ID, DOI or title)
# to an OpenAlex "work" hashref, or undef if nothing matched or the
# request/JSON failed.
sub _lookup_work
{
    my( $plugin, $query ) = @_;

    my $api = $plugin->{session}->get_conf( "openalex", "api" )
        || "https://api.openalex.org";
    my $api_key = $plugin->{session}->get_conf( "openalex", "api_key" );
    my $mailto  = $plugin->{session}->get_conf( "openalex", "mailto" );

    # Trim wrappers around DOI URLs etc.
    $query =~ s/^https?:\/\/(dx\.)?doi\.org\///i;
    $query =~ s/^doi:\s*//i;

    # OpenAlex Work ID formats:
    #   - W123...
    #   - https://openalex.org/W123...
    if( $query =~ m#openalex\.org/(W\d+)#i )
    {
        $query = $1;
    }

    my $url;
    if( $query =~ /^W\d+$/ )
    {
        $url = "$api/works/$query";
    }
    elsif( $query =~ /^10\.\S+\/\S+/ )
    {
        # DOI lookup
        $url = "$api/works/doi:$query";
    }
    else
    {
        # Title (search). We pull a few and pick the best by heuristic.
        my $search = uri_escape( $query );
        $url = "$api/works?search=$search&per-page=10";
    }

    # Select only needed fields to reduce payload.
    # (OpenAlex select works for list endpoints; singleton also accepts select.)
    # NOTE(review): host_venue is deprecated by OpenAlex in favour of
    # primary_location.source - confirm the API still accepts it in select;
    # _work_to_epdata already falls back to primary_location.source.
    my @select = qw(
        id doi title display_name publication_year publication_date type
        cited_by_count abstract_inverted_index authorships host_venue biblio
        primary_location concepts keywords
    );
    $url .= ( $url =~ /\?/ ? "&" : "?" ) . "select=" . join( ",", @select );

    # Add auth params if configured
    if( defined $api_key && $api_key ne "" )
    {
        $url .= "&api_key=" . uri_escape( $api_key );
    }
    if( defined $mailto && $mailto ne "" )
    {
        $url .= "&mailto=" . uri_escape( $mailto );
    }

    my $ua = $plugin->_ua;
    my $res = $ua->get( $url );
    return undef if !$res->is_success;

    # charset => 'none' still undoes any Content-Encoding (gzip etc.) but
    # leaves the body as raw UTF-8 octets, which is what decode_json()
    # expects. Plain decoded_content() would charset-decode to a character
    # string and decode_json() would then mangle non-ASCII text.
    my $data = eval {
        decode_json( $res->decoded_content( charset => 'none' ) )
    };
    return undef if !$data;

    # If search endpoint, pick best match.
    if( defined $data->{results} && ref($data->{results}) eq "ARRAY" )
    {
        return $plugin->_pick_best_search_match( $query, $data->{results} );
    }

    # Singleton
    return $data if defined $data->{id};

    return undef;
}

# Score each search result against the query (normalised-title equality,
# containment, presence of a DOI, slight citation-count boost) and return
# the highest-scoring work, or undef if the result list is empty.
sub _pick_best_search_match
{
    my( $plugin, $query, $results ) = @_;

    return undef if !@$results;

    my $q = _norm( $query );

    my $best;
    my $best_score = -1;
    foreach my $w ( @$results )
    {
        next if !defined $w->{title} && !defined $w->{display_name};
        my $title = _norm( $w->{title} || $w->{display_name} || "" );

        # Simple scoring: exact match beats containment
        my $score = 0;
        $score += 100 if $title eq $q;
        $score += 60 if index( $title, $q ) >= 0 || index( $q, $title ) >= 0;

        # Prefer items that have a DOI (usually cleaner metadata)
        $score += 10 if defined $w->{doi} && $w->{doi} ne "";

        # Prefer higher cited_by_count slightly
        $score += int( ( $w->{cited_by_count} || 0 ) / 100 );

        if( $score > $best_score )
        {
            $best_score = $score;
            $best = $w;
        }
    }

    # If our best score is too low, still return first result (OpenAlex
    # relevance is usually ok), but you can tighten this if you want
    # strict imports.
    return $best || $results->[0];
}

# Lower-case and collapse everything that is not a letter/digit to single
# spaces, so titles compare independently of punctuation and spacing.
sub _norm
{
    my( $s ) = @_;

    $s //= "";
    $s = lc( $s );
    $s =~ s/[^\p{L}\p{N}]+/ /g;
    $s =~ s/^\s+|\s+$//g;
    return $s;
}

# -----------------------------
# Convert OpenAlex work -> epdata
# -----------------------------

# Map an OpenAlex work hashref to an epdata hashref suitable for
# epdata_to_dataobj(). Returns undef for missing/id-less input.
sub _work_to_epdata
{
    my( $plugin, $w ) = @_;

    return undef if !defined $w || !defined $w->{id};

    my %ep;

    # Title
    $ep{title} = $w->{title} || $w->{display_name};

    # Abstract (OpenAlex ships it as abstract_inverted_index, not text)
    if( defined $w->{abstract_inverted_index}
        && ref($w->{abstract_inverted_index}) eq "HASH" )
    {
        my $abs = $plugin->_abstract_from_inverted_index(
            $w->{abstract_inverted_index} );
        $ep{abstract} = $abs if defined $abs && $abs ne "";
    }

    # DOI (OpenAlex returns DOI as a URL)
    if( defined $w->{doi} && $w->{doi} ne "" )
    {
        my $doi = $w->{doi};
        $doi =~ s/^https?:\/\/doi\.org\///i;
        $ep{doi} = $doi if $doi =~ /^10\./;
    }

    # Publication date/year
    if( defined $w->{publication_date} && $w->{publication_date} ne "" )
    {
        $ep{date} = $w->{publication_date};
    }
    elsif( defined $w->{publication_year} )
    {
        $ep{date} = $w->{publication_year} . "-01-01";
    }

    # Type mapping (basic; tweak for your repo types).
    # OpenAlex types are like: journal-article, book, book-chapter,
    # proceedings-article, dissertation, report, dataset, etc.
    if( defined $w->{type} )
    {
        my %type_map = (
            "journal-article"     => "article",
            "proceedings-article" => "conference_item",
            "book"                => "book",
            "book-chapter"        => "book_section",
            "report"              => "monograph",
            "dataset"             => "dataset",
        );
        $ep{type} = $type_map{ $w->{type} } || "other";
    }

    # Citation count + OpenAlex ID.
    # These require you to add custom fields to the eprint dataset.
    $ep{openalex_id} = $w->{id};
    $ep{openalex_cited_by_count} = $w->{cited_by_count} || 0;

    # Authors
    if( defined $w->{authorships} && ref($w->{authorships}) eq "ARRAY" )
    {
        my @creators;
        foreach my $a ( @{$w->{authorships}} )
        {
            next if !defined $a->{author}
                 || !defined $a->{author}->{display_name};

            my $display = $a->{author}->{display_name};
            my $name = $plugin->_parse_name( $display );
            my %creator = ( name => $name );

            # ORCID if available (OpenAlex author.orcid is typically a URL)
            if( defined $a->{author}->{orcid} && $a->{author}->{orcid} ne "" )
            {
                my $orcid = $a->{author}->{orcid};
                $orcid =~ s/^https?:\/\/orcid\.org\///i;
                $creator{id} = $orcid
                    if $orcid =~ /^\d{4}-\d{4}-\d{4}-\d{3}[\dX]$/;
            }

            push @creators, \%creator;
        }

        # Typical EPrints datasets use the compound 'creators' field
        # (rendered as creators_name/creators_id in XML).
        $ep{creators} = \@creators if @creators;
    }

    # Publication details: journal/conference container.
    # host_venue is OpenAlex's legacy field; fall back to
    # primary_location.source, its documented replacement.
    my $venue;
    if( defined $w->{host_venue} && ref($w->{host_venue}) eq "HASH" )
    {
        $venue = $w->{host_venue};
    }
    elsif( defined $w->{primary_location}
        && ref($w->{primary_location}) eq "HASH"
        && ref($w->{primary_location}->{source}) eq "HASH" )
    {
        $venue = $w->{primary_location}->{source};
    }
    if( defined $venue )
    {
        # journal title
        $ep{publication} = $venue->{display_name}
            if defined $venue->{display_name};

        # ISSN (many repos have issn multiple)
        $ep{issn} = $venue->{issn}
            if defined $venue->{issn} && ref($venue->{issn}) eq "ARRAY";
    }

    # Biblio: volume, issue, first_page, last_page
    if( defined $w->{biblio} && ref($w->{biblio}) eq "HASH" )
    {
        my $b = $w->{biblio};
        $ep{volume} = $b->{volume} if defined $b->{volume};
        $ep{number} = $b->{issue}  if defined $b->{issue};

        if( defined $b->{first_page} || defined $b->{last_page} )
        {
            my $fp = $b->{first_page} // "";
            my $lp = $b->{last_page}  // "";
            if( $fp ne "" && $lp ne "" )
            {
                $ep{pagerange} = "$fp-$lp";
            }
            elsif( $fp ne "" )
            {
                $ep{pagerange} = $fp;
            }
        }
    }

    # Official URL (landing page of the primary/best location)
    if( defined $w->{primary_location}
        && ref($w->{primary_location}) eq "HASH" )
    {
        my $landing = $w->{primary_location}->{landing_page_url};
        $ep{official_url} = $landing if defined $landing && $landing ne "";
    }

    # Keywords: combine OpenAlex keywords + concepts (top concepts).
    # This requires you to decide where to store them:
    #   - 'keywords' (text) is common in EPrints
    #   - 'subjects' is controlled, avoid auto-mapping unless you have a
    #     mapping table
    my @kw;

    # OpenAlex keywords (if present in your API account/version)
    if( defined $w->{keywords} && ref($w->{keywords}) eq "ARRAY" )
    {
        foreach my $k ( @{$w->{keywords}} )
        {
            next if !defined $k->{display_name};
            push @kw, $k->{display_name};
        }
    }

    # Concepts (broad classification)
    my @concepts;
    if( defined $w->{concepts} && ref($w->{concepts}) eq "ARRAY" )
    {
        foreach my $c ( @{$w->{concepts}} )
        {
            next if !defined $c->{display_name};
            push @concepts, $c->{display_name};
        }

        # Also push the top five concepts into keywords for discoverability
        push @kw, @concepts[ 0 .. ( $#concepts < 4 ? $#concepts : 4 ) ]
            if @concepts;
    }

    # De-duplicate keywords (case-insensitively, keeping first spelling)
    if( @kw )
    {
        my %seen;
        my @uniq = grep { defined $_ && $_ ne "" && !$seen{lc($_)}++ } @kw;

        # If your repo uses 'keywords' as a single text field:
        #   $ep{keywords} = join( ", ", @uniq );
        # If your repo uses 'keywords' multiple:
        $ep{keywords} = \@uniq;
    }

    # Store full concept list in openalex_concepts (recommended new field)
    $ep{openalex_concepts} = \@concepts if @concepts;

    # Default workflow status for imports
    $ep{eprint_status} = "inbox" if !defined $ep{eprint_status};

    return \%ep;
}

# Split a display name into { family, given }.
# "Family, Given" is honoured; otherwise the last whitespace-separated
# token is taken as family and the rest as given.
sub _parse_name
{
    my( $plugin, $display ) = @_;

    $display //= "";

    # If "Family, Given". Non-greedy [^,]+? so whitespace before the comma
    # is not captured into the family name.
    if( $display =~ /^\s*([^,]+?)\s*,\s*(.+?)\s*$/ )
    {
        return { family => $1, given => $2 };
    }

    # Else: last token as family, rest given
    my @parts = split /\s+/, $display;
    if( @parts <= 1 )
    {
        return { family => $display, given => "" };
    }
    my $family = pop @parts;
    my $given  = join( " ", @parts );
    return { family => $family, given => $given };
}

# Rebuild a plain-text abstract from OpenAlex's inverted index
# ({ word => [pos1, pos2, ...], ... }) by placing each word at its
# position(s) and joining in order. Returns "" when the index is empty.
sub _abstract_from_inverted_index
{
    my( $plugin, $inv ) = @_;

    my %pos_to_word;
    foreach my $word ( keys %$inv )
    {
        my $positions = $inv->{$word};
        next if ref($positions) ne "ARRAY";
        foreach my $p ( @$positions )
        {
            $pos_to_word{$p} = $word if defined $p;
        }
    }
    return "" if !%pos_to_word;

    my @words = map { $pos_to_word{$_} }
                sort { $a <=> $b } keys %pos_to_word;

    # Basic clean-up: no space before trailing punctuation
    my $abs = join( " ", @words );
    $abs =~ s/\s+([.,;:!?])/$1/g;
    return $abs;
}

1;