package EPrints::Plugin::Import::OpenAlex;

use strict;
use warnings;

use base 'EPrints::Plugin::Import';

use LWP::UserAgent;
use JSON qw(decode_json);
use URI::Escape qw(uri_escape);

# Build a polite, predictable HTTP client for OpenAlex requests.
# Honours proxy environment variables, uses a 30s timeout and a
# configurable User-Agent (conf: openalex -> user_agent) so the
# repository is identifiable upstream.
sub _ua
{
    my( $plugin ) = @_;

    my $ua = LWP::UserAgent->new;
    $ua->env_proxy;
    $ua->timeout( 30 );

    my $repo = $plugin->{session};
    my $ua_str = $repo->get_conf( "openalex", "user_agent" )
        || "EPrints OpenAlex Importer";

    $ua->default_header( 'Accept' => 'application/json' );
    $ua->default_header( 'User-Agent' => $ua_str );

    return $ua;
}

sub new
{
    my( $class, %params ) = @_;

    my $self = $class->SUPER::new( %params );

    $self->{name} = "OpenAlex (DOI / Title / OpenAlex ID)";
    $self->{visible} = "all";

    # We create eprints (new records). If you later want "update existing
    # eprint by eprintid", implement a separate tool/screen action.
    $self->{produce} = [ 'list/eprint', 'dataobj/eprint' ];

    # Make it usable from Import Items
    $self->{accept} = [ "text/plain" ];

    return $self;
}

sub can_input
{
    my( $plugin, $type ) = @_;

    return 1 if $type eq "textarea";
    return 1 if $type eq "file";
    return 0;
}

# Read one query per line (DOI, OpenAlex work ID or title), look each up
# against the OpenAlex API and create an eprint for every match.
# Unmatched / unconvertible lines produce a warning and are skipped.
# Returns an EPrints::List of the created eprint ids.
sub input_fh
{
    my( $plugin, %opts ) = @_;

    my $session = $plugin->{session};
    my $fh = $opts{fh};

    my @ids;
    while( my $line = <$fh> )
    {
        $line =~ s/\r?\n$//;
        $line =~ s/^\s+|\s+$//g;
        next if $line eq "";

        my $work = $plugin->_lookup_work( $line );
        if( !defined $work )
        {
            $plugin->warning( "OpenAlex: no match for '$line'" );
            next;
        }

        my $epdata = $plugin->_work_to_epdata( $work );
        if( !defined $epdata )
        {
            $plugin->warning( "OpenAlex: could not convert result for '$line'" );
            next;
        }

        my $dataobj = $plugin->epdata_to_dataobj( $opts{dataset}, $epdata );
        push @ids, $dataobj->get_id if defined $dataobj;
    }

    return EPrints::List->new(
        dataset => $opts{dataset},
        session => $session,
        ids     => \@ids );
}

# -----------------------------
# OpenAlex lookup logic
# -----------------------------

# Resolve a single user-supplied query (OpenAlex work ID, DOI or title)
# to an OpenAlex "work" hashref, or undef if nothing matched or the
# request/JSON failed.
sub _lookup_work
{
    my( $plugin, $query ) = @_;

    my $api = $plugin->{session}->get_conf( "openalex", "api" )
        || "https://api.openalex.org";
    my $api_key = $plugin->{session}->get_conf( "openalex", "api_key" );
    my $mailto  = $plugin->{session}->get_conf( "openalex", "mailto" );

    # Trim wrappers around DOI URLs etc.
    $query =~ s/^https?:\/\/(dx\.)?doi\.org\///i;
    $query =~ s/^doi:\s*//i;

    # OpenAlex Work ID formats:
    #   - W123...
    #   - https://openalex.org/W123...
    if( $query =~ m#openalex\.org/(W\d+)#i )
    {
        $query = $1;
    }

    my $url;
    if( $query =~ /^W\d+$/ )
    {
        $url = "$api/works/$query";
    }
    elsif( $query =~ /^10\.\S+\/\S+/ )
    {
        # DOI lookup
        $url = "$api/works/doi:$query";
    }
    else
    {
        # Title (search). We pull a few and pick the best by heuristic.
        my $search = uri_escape( $query );
        $url = "$api/works?search=$search&per-page=10";
    }

    # Select only needed fields to reduce payload.
    # (OpenAlex select works for list endpoints; singleton also accepts select.)
    # NOTE(review): host_venue is deprecated by OpenAlex in favour of
    # primary_location.source - confirm the API still accepts it in select;
    # _work_to_epdata already falls back to primary_location.source.
    my @select = qw(
        id doi title display_name publication_year publication_date type
        cited_by_count abstract_inverted_index authorships host_venue biblio
        primary_location concepts keywords
    );
    $url .= ( $url =~ /\?/ ? "&" : "?" ) . "select=" . join( ",", @select );

    # Add auth params if configured
    if( defined $api_key && $api_key ne "" )
    {
        $url .= "&api_key=" . uri_escape( $api_key );
    }
    if( defined $mailto && $mailto ne "" )
    {
        $url .= "&mailto=" . uri_escape( $mailto );
    }

    my $ua = $plugin->_ua;
    my $res = $ua->get( $url );
    return undef if !$res->is_success;

    # charset => 'none' still undoes any Content-Encoding (gzip etc.) but
    # leaves the body as raw UTF-8 octets, which is what decode_json()
    # expects. Plain decoded_content() would charset-decode to a character
    # string and decode_json() would then mangle non-ASCII text.
    my $data = eval {
        decode_json( $res->decoded_content( charset => 'none' ) )
    };
    return undef if !$data;

    # If search endpoint, pick best match.
    if( defined $data->{results} && ref($data->{results}) eq "ARRAY" )
    {
        return $plugin->_pick_best_search_match( $query, $data->{results} );
    }

    # Singleton
    return $data if defined $data->{id};

    return undef;
}

# Score each search result against the query (normalised-title equality,
# containment, presence of a DOI, slight citation-count boost) and return
# the highest-scoring work, or undef if the result list is empty.
sub _pick_best_search_match
{
    my( $plugin, $query, $results ) = @_;

    return undef if !@$results;

    my $q = _norm( $query );

    my $best;
    my $best_score = -1;
    foreach my $w ( @$results )
    {
        next if !defined $w->{title} && !defined $w->{display_name};
        my $title = _norm( $w->{title} || $w->{display_name} || "" );

        # Simple scoring: exact match beats containment
        my $score = 0;
        $score += 100 if $title eq $q;
        $score += 60 if index( $title, $q ) >= 0 || index( $q, $title ) >= 0;

        # Prefer items that have a DOI (usually cleaner metadata)
        $score += 10 if defined $w->{doi} && $w->{doi} ne "";

        # Prefer higher cited_by_count slightly
        $score += int( ( $w->{cited_by_count} || 0 ) / 100 );

        if( $score > $best_score )
        {
            $best_score = $score;
            $best = $w;
        }
    }

    # If our best score is too low, still return first result (OpenAlex
    # relevance is usually ok), but you can tighten this if you want
    # strict imports.
    return $best || $results->[0];
}

# Lower-case and collapse everything that is not a letter/digit to single
# spaces, so titles compare independently of punctuation and spacing.
sub _norm
{
    my( $s ) = @_;

    $s //= "";
    $s = lc( $s );
    $s =~ s/[^\p{L}\p{N}]+/ /g;
    $s =~ s/^\s+|\s+$//g;
    return $s;
}

# -----------------------------
# Convert OpenAlex work -> epdata
# -----------------------------

# Map an OpenAlex work hashref to an epdata hashref suitable for
# epdata_to_dataobj(). Returns undef for missing/id-less input.
sub _work_to_epdata
{
    my( $plugin, $w ) = @_;

    return undef if !defined $w || !defined $w->{id};

    my %ep;

    # Title
    $ep{title} = $w->{title} || $w->{display_name};

    # Abstract (OpenAlex ships it as abstract_inverted_index, not text)
    if( defined $w->{abstract_inverted_index}
        && ref($w->{abstract_inverted_index}) eq "HASH" )
    {
        my $abs = $plugin->_abstract_from_inverted_index(
            $w->{abstract_inverted_index} );
        $ep{abstract} = $abs if defined $abs && $abs ne "";
    }

    # DOI (OpenAlex returns DOI as a URL)
    if( defined $w->{doi} && $w->{doi} ne "" )
    {
        my $doi = $w->{doi};
        $doi =~ s/^https?:\/\/doi\.org\///i;
        $ep{doi} = $doi if $doi =~ /^10\./;
    }

    # Publication date/year
    if( defined $w->{publication_date} && $w->{publication_date} ne "" )
    {
        $ep{date} = $w->{publication_date};
    }
    elsif( defined $w->{publication_year} )
    {
        $ep{date} = $w->{publication_year} . "-01-01";
    }

    # Type mapping (basic; tweak for your repo types).
    # OpenAlex types are like: journal-article, book, book-chapter,
    # proceedings-article, dissertation, report, dataset, etc.
    if( defined $w->{type} )
    {
        my %type_map = (
            "journal-article"     => "article",
            "proceedings-article" => "conference_item",
            "book"                => "book",
            "book-chapter"        => "book_section",
            "report"              => "monograph",
            "dataset"             => "dataset",
        );
        $ep{type} = $type_map{ $w->{type} } || "other";
    }

    # Citation count + OpenAlex ID.
    # These require you to add custom fields to the eprint dataset.
    $ep{openalex_id} = $w->{id};
    $ep{openalex_cited_by_count} = $w->{cited_by_count} || 0;

    # Authors
    if( defined $w->{authorships} && ref($w->{authorships}) eq "ARRAY" )
    {
        my @creators;
        foreach my $a ( @{$w->{authorships}} )
        {
            next if !defined $a->{author}
                 || !defined $a->{author}->{display_name};

            my $display = $a->{author}->{display_name};
            my $name = $plugin->_parse_name( $display );
            my %creator = ( name => $name );

            # ORCID if available (OpenAlex author.orcid is typically a URL)
            if( defined $a->{author}->{orcid} && $a->{author}->{orcid} ne "" )
            {
                my $orcid = $a->{author}->{orcid};
                $orcid =~ s/^https?:\/\/orcid\.org\///i;
                $creator{id} = $orcid
                    if $orcid =~ /^\d{4}-\d{4}-\d{4}-\d{3}[\dX]$/;
            }

            push @creators, \%creator;
        }

        # Typical EPrints datasets use the compound 'creators' field
        # (rendered as creators_name/creators_id in XML).
        $ep{creators} = \@creators if @creators;
    }

    # Publication details: journal/conference container.
    # host_venue is OpenAlex's legacy field; fall back to
    # primary_location.source, its documented replacement.
    my $venue;
    if( defined $w->{host_venue} && ref($w->{host_venue}) eq "HASH" )
    {
        $venue = $w->{host_venue};
    }
    elsif( defined $w->{primary_location}
        && ref($w->{primary_location}) eq "HASH"
        && ref($w->{primary_location}->{source}) eq "HASH" )
    {
        $venue = $w->{primary_location}->{source};
    }
    if( defined $venue )
    {
        # journal title
        $ep{publication} = $venue->{display_name}
            if defined $venue->{display_name};

        # ISSN (many repos have issn multiple)
        $ep{issn} = $venue->{issn}
            if defined $venue->{issn} && ref($venue->{issn}) eq "ARRAY";
    }

    # Biblio: volume, issue, first_page, last_page
    if( defined $w->{biblio} && ref($w->{biblio}) eq "HASH" )
    {
        my $b = $w->{biblio};
        $ep{volume} = $b->{volume} if defined $b->{volume};
        $ep{number} = $b->{issue}  if defined $b->{issue};

        if( defined $b->{first_page} || defined $b->{last_page} )
        {
            my $fp = $b->{first_page} // "";
            my $lp = $b->{last_page}  // "";
            if( $fp ne "" && $lp ne "" )
            {
                $ep{pagerange} = "$fp-$lp";
            }
            elsif( $fp ne "" )
            {
                $ep{pagerange} = $fp;
            }
        }
    }

    # Official URL (landing page of the primary/best location)
    if( defined $w->{primary_location}
        && ref($w->{primary_location}) eq "HASH" )
    {
        my $landing = $w->{primary_location}->{landing_page_url};
        $ep{official_url} = $landing if defined $landing && $landing ne "";
    }

    # Keywords: combine OpenAlex keywords + concepts (top concepts).
    # This requires you to decide where to store them:
    #   - 'keywords' (text) is common in EPrints
    #   - 'subjects' is controlled, avoid auto-mapping unless you have a
    #     mapping table
    my @kw;

    # OpenAlex keywords (if present in your API account/version)
    if( defined $w->{keywords} && ref($w->{keywords}) eq "ARRAY" )
    {
        foreach my $k ( @{$w->{keywords}} )
        {
            next if !defined $k->{display_name};
            push @kw, $k->{display_name};
        }
    }

    # Concepts (broad classification)
    my @concepts;
    if( defined $w->{concepts} && ref($w->{concepts}) eq "ARRAY" )
    {
        foreach my $c ( @{$w->{concepts}} )
        {
            next if !defined $c->{display_name};
            push @concepts, $c->{display_name};
        }

        # Also push the top five concepts into keywords for discoverability
        push @kw, @concepts[ 0 .. ( $#concepts < 4 ? $#concepts : 4 ) ]
            if @concepts;
    }

    # De-duplicate keywords (case-insensitively, keeping first spelling)
    if( @kw )
    {
        my %seen;
        my @uniq = grep { defined $_ && $_ ne "" && !$seen{lc($_)}++ } @kw;

        # If your repo uses 'keywords' as a single text field:
        #   $ep{keywords} = join( ", ", @uniq );
        # If your repo uses 'keywords' multiple:
        $ep{keywords} = \@uniq;
    }

    # Store full concept list in openalex_concepts (recommended new field)
    $ep{openalex_concepts} = \@concepts if @concepts;

    # Default workflow status for imports
    $ep{eprint_status} = "inbox" if !defined $ep{eprint_status};

    return \%ep;
}

# Split a display name into { family, given }.
# "Family, Given" is honoured; otherwise the last whitespace-separated
# token is taken as family and the rest as given.
sub _parse_name
{
    my( $plugin, $display ) = @_;

    $display //= "";

    # If "Family, Given". Non-greedy [^,]+? so whitespace before the comma
    # is not captured into the family name.
    if( $display =~ /^\s*([^,]+?)\s*,\s*(.+?)\s*$/ )
    {
        return { family => $1, given => $2 };
    }

    # Else: last token as family, rest given
    my @parts = split /\s+/, $display;
    if( @parts <= 1 )
    {
        return { family => $display, given => "" };
    }
    my $family = pop @parts;
    my $given  = join( " ", @parts );
    return { family => $family, given => $given };
}

# Rebuild a plain-text abstract from OpenAlex's inverted index
# ({ word => [pos1, pos2, ...], ... }) by placing each word at its
# position(s) and joining in order. Returns "" when the index is empty.
sub _abstract_from_inverted_index
{
    my( $plugin, $inv ) = @_;

    my %pos_to_word;
    foreach my $word ( keys %$inv )
    {
        my $positions = $inv->{$word};
        next if ref($positions) ne "ARRAY";
        foreach my $p ( @$positions )
        {
            $pos_to_word{$p} = $word if defined $p;
        }
    }
    return "" if !%pos_to_word;

    my @words = map { $pos_to_word{$_} }
                sort { $a <=> $b } keys %pos_to_word;

    # Basic clean-up: no space before trailing punctuation
    my $abs = join( " ", @words );
    $abs =~ s/\s+([.,;:!?])/$1/g;
    return $abs;
}

1;