package EPrints::Plugin::Event::UpdateTweetStreamAbstracts;

use Date::Calc qw/ Week_of_Year Delta_Days Add_Delta_Days /;
use Storable qw/ store retrieve /;
use Number::Bytes::Human qw/ format_bytes /;

use EPrints::Plugin::Event::LockingEvent;
our @ISA = qw( EPrints::Plugin::Event::LockingEvent );

use strict;

#opts
#
# update_from_zero --> deletes the cache and regenerates everything

sub action_update_tweetstream_abstracts
{
	my ($self, %opts) = @_;

	$self->{log_data}->{start_time} = scalar localtime time;

	my $repo = $self->repository;

	if ($self->is_locked)
	{
		$self->repository->log( (ref $self) . " is locked. Unable to run.\n");
		return;
	}
	$self->create_lock;

	$self->{cache_file} = $repo->config('archiveroot') . '/var/' . 'tweetstream_update.cache';

	if ($opts{update_from_zero})
	{
		#remove the cache
		unlink $self->{cache_file} if -e $self->{cache_file};
		$self->{update_from_zero} = 1;
	}

	if ($opts{verbose})
	{
		$self->{verbose} = 1;
	}

	if ($opts{recommit_tweets})
	{
		$self->{recommit_tweets} = 1;
	}

	#global cache of this update's profile_image_urls
	$self->{profile_image_urls} = $self->read_cache_data('profile_image_urls');
	$self->{profile_image_urls} = {} unless $self->{profile_image_urls}; #initialise if unset

	$self->update_tweetstream_abstracts();

	$self->remove_lock;

	$self->{log_data}->{end_time} = scalar localtime time;
	$self->write_log;
}

sub generate_log_string
{
	my ($self) = @_;

	my $l = $self->{log_data};
	my @r;

	push @r, '===========================================================================';
	push @r, '';
	push @r, "Aggregation started at: " . $l->{start_time};
	push @r, "Tweetstream abstracts updated " . join(',', sort {$a <=> $b} @{$l->{tweetstreams_updated}});
	push @r, '';
	push @r, "Iterated over " . $l->{iterate_tweet_count} . " tweets";
	push @r, "Iteration Low ID " . $l->{lowest_tweetid};
	push @r, "Iteration High ID " . ( $l->{highest_tweetid} ? $l->{highest_tweetid} : 'none');
	push @r, "Started iteration at " . $l->{iterate_start_time};
	push @r, "Finished iteration at " . $l->{iterate_end_time};
	push @r, '';
	push @r, "Updating Objects started at " . $l->{update_objects_start_time};
	push @r, "Updating Objects finished at " . $l->{update_objects_end_time};
	push @r, "Number of sleeps while blocked " . $l->{update_tweetstreams_sleeps};
	push @r, '';
	push @r, '';
	push @r, '';

	my $size = $l->{start_cache_file_size};
	$size = 0 unless $size;
	push @r, "Cache size at start $size (" . format_bytes($size) . ")";

	$size = $l->{end_cache_file_size};
	$size = 0 unless $size;
	push @r, "Cache size at end $size (" . format_bytes($size) . ")";

	push @r, '';
	push @r, "Aggregation finished at: " . $l->{end_time};
	push @r, '';
	push @r, '===========================================================================';

	return join("\n", @r);
}
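# The subs below are driven by two pieces of archive configuration.  As a rough,
# illustrative sketch only (the real keys live in the archive's cfg.d and may differ;
# the 'hashtags' entry here is purely an example), the expected shape is assumed to be:
#
#   $c->{update_tweetstream_abstracts}->{fieldmap} = {
#       #tweet fieldname => how it is summarised on the tweetstream object
#       from_user  => { fieldname => 'top_from_users', subname => 'from_user', tidy => 1 },
#       hashtags   => { fieldname => 'top_hashtags',   subname => 'hashtag',   tidy => 1 },
#       created_at => {}, #handled specially: becomes frequency_period / frequency_values
#   };
#
#   $c->{tweetstream_tops}->{top_from_users}->{n} = 25; #how many entries to keep for display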
sub update_tweetstream_abstracts
{
	my ($self) = @_;

	$self->output_status('Starting to update abstracts');

	my $repo = $self->repository;
	my $tweet_ds = $repo->dataset('tweet');
	my $tweetstream_ds = $repo->dataset('tweetstream');

	my $high_id = $self->get_highest_tweetid(); #the first thing we do!
	if (!$high_id)
	{
		$repo->log("Couldn't find highest tweet id\n");
		$high_id = 1;
	}

	#set the low ID to the previous update's high ID
	my $low_id = $self->read_cache_data('highest_tweet_processed');
	$low_id = 0 unless $low_id;
	$low_id = 0 if $self->{update_from_zero}; #should be unnecessary as update_from_zero will remove the cache
	$low_id += 1; #start at the *next* ID

	$self->{log_data}->{lowest_tweetid} = $low_id;
	$self->{log_data}->{iterate_start_time} = scalar localtime time;

	$self->output_status("Iterating from $low_id to $high_id");

	my $page_size = 100000; #number of tweets we process before tidying the data
	my $i = 0;
	my $data = {};
	my $tweet_count = 0;

	foreach my $tweetid ($low_id..$high_id)
	{
		$self->wait; #let blocked_by plugins run their course

		my $tweet = $tweet_ds->dataobj($tweetid);
		next unless $tweet;
		next unless $tweet->is_set('tweetstreams');

		#this will reprocess the json of the tweet
		if ($self->{recommit_tweets})
		{
			$tweet->set_value('newborn', 'TRUE');
			$tweet->commit;
		}

		$tweet_count++; #number of processed tweets, for logging
		$self->{log_data}->{highest_tweetid} = $tweetid; #highest ID processed, for logging

		my $tweet_data = $self->tweet_to_data($tweet);
		my $tsids = $tweet->value('tweetstreams');

		foreach my $tsid (@{$tsids})
		{
			$data->{$tsid} = {} if (!$data->{$tsid}); #make sure we have an entry in the hash for this tweetstream
			$self->merge_in($data->{$tsid}, $tweet_data);
		}

		#tidy the accumulated data after a page of tweets
		$i++;
		$self->output_status('10000 processed') if $i % 10000 == 0;
		if ($i > $page_size)
		{
			$self->output_status("Page completed. Currently on id $tweetid");
			#remove the least significant bits of count data to cut down on memory use
			$self->tidy_tweetstream_data($data);
			$i = 0;
		}
	}
	#one more tidy, for the last page
	$self->tidy_tweetstream_data($data);

	$self->output_status('Iteration Complete, starting updating of dataobjs');

	$self->{log_data}->{iterate_end_time} = scalar localtime time;
	$self->{log_data}->{iterate_tweet_count} = $tweet_count;
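	# At this point $data is assumed to look roughly like this (illustrative keys and
	# counts only; the inner keys are whichever tweet fields the fieldmap names):
	#
	#   {
	#       $tweetstreamid => {
	#           from_user  => { 'some_user'  => 12, 'other_user' => 3 },
	#           created_at => { '2012-01-01' => 40, '2012-01-02' => 55 },
	#       },
	#   }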
	#cache the profile_image_urls stored in each tweetstream, as we will be merging new values with old ones.
	foreach my $tsid (sort keys %{$data})
	{
		my $tweetstream = $tweetstream_ds->dataobj($tsid);
		next unless $tweetstream;
		next unless $tweetstream->is_set('top_from_users');

		my $tfu = $tweetstream->value('top_from_users');
		foreach my $val (@{$tfu})
		{
			my $user = $val->{from_user};
			next if ($self->{profile_image_urls}->{$user});
			$self->{profile_image_urls}->{$user} = $val->{profile_image_url};
		}
	}

	$self->{log_data}->{update_objects_start_time} = scalar localtime time;
	$self->{log_data}->{update_tweetstreams_sleeps} = 0;

	my @updated_tweetstreams;
	foreach my $tsid (sort keys %{$data})
	{
		#prepare the data
		my $ts_data = $data->{$tsid};
		my $cached_ts_data = $self->read_cache_data('tweetstreams', $tsid);
		if ($cached_ts_data)
		{
			$self->merge_in($ts_data, $cached_ts_data);
		}

		$self->wait; #let blocked_by plugins run their course

		my $tweetstream = $tweetstream_ds->dataobj($tsid);
		next unless $tweetstream;

		push @updated_tweetstreams, $tsid;
		$self->update_tweetstream($tweetstream, $ts_data);
	}
	$self->{log_data}->{tweetstreams_updated} = \@updated_tweetstreams;
	$self->{log_data}->{update_objects_end_time} = scalar localtime time;

	$self->output_status('Updating Complete, tidying up');

	foreach my $tsid (keys %{$data})
	{
		$self->write_cache_data($data->{$tsid}, 'tweetstreams', $tsid);
	}
	$self->write_cache_data($high_id, 'highest_tweet_processed');
	$self->write_cache_data($self->{profile_image_urls}, 'profile_image_urls');

	$self->tidy_cache;
	$self->write_cache;

	$self->output_status('Finished');
}

sub tweet_to_data
{
	my ($self, $tweet) = @_;

	my $repo = $self->repository;
	my $tweet_ds = $repo->dataset('tweet');

	#tweet fieldnames are keys, tweetstream fieldnames are values
	my $fieldmap = $repo->config('update_tweetstream_abstracts','fieldmap');

	my $data = {};

	#handle multiple and non-multiple simple fields
	foreach my $field (keys %{$fieldmap})
	{
		next unless $tweet->is_set($field);
		my $val = $tweet->value($field);

		if ($field eq 'created_at')
		{
			#convert from a datetime to a date
			my ($date, $time) = split(/ /, $val);
			$val = $date;
		}
		if ($field eq 'retweeted_status')
		{
		}

		if (ref $val eq 'ARRAY')
		{
			foreach my $v (@{$val})
			{
				$data->{$field}->{$v}++;
			}
		}
		else
		{
			$data->{$field}->{$val}++;
		}
	}

	#a bit of a hack, but store the profile_image_urls for each from user at the top level of the object
	my $from_user = $tweet->value('from_user');
	if ($from_user && !$self->{profile_image_urls}->{$from_user})
	{
		$self->{profile_image_urls}->{$from_user} = $tweet->value('profile_image_url');
	}

	return $data;
}

sub merge_in
{
	my ($self, $destination_hashref, $new_data_hashref) = @_;

	foreach my $data_category (keys %{$new_data_hashref})
	{
		foreach my $data_point (keys %{$new_data_hashref->{$data_category}})
		{
			$destination_hashref->{$data_category}->{$data_point} += $new_data_hashref->{$data_category}->{$data_point};
		}
	}
}

sub get_highest_tweetid
{
	my ($self) = @_;

	my $db = $self->repository->database;

	my $sql = 'SELECT MAX(tweetid) FROM tweet';
	my $sth = $db->prepare( $sql );
	$sth->execute;

	return $sth->fetchrow_arrayref->[0];
}

#remove data that is no longer needed (cached stuff for expired tweetstreams)
sub tidy_cache
{
	my ($self) = @_;

	my $ts_ds = $self->repository->dataset('tweetstream');

	my $ts_cache = $self->read_cache_data('tweetstreams');
	foreach my $tsid (keys %{$ts_cache})
	{
		my $ts = $ts_ds->dataobj($tsid);
		if (
			!$ts #no tweetstream in database
			|| ($ts->value('status') ne 'active') #inactive or retired, cache no longer needed
		) {
			delete $ts_cache->{$tsid};
		}
	}
}
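# The cache helpers below store and fetch values under an arbitrary key path, e.g.
# (calls as used in update_tweetstream_abstracts above):
#
#   $self->write_cache_data($data->{$tsid}, 'tweetstreams', $tsid); #lands in $self->{cache}->{tweetstreams}->{$tsid}
#   my $cached = $self->read_cache_data('tweetstreams', $tsid);
#   my $high   = $self->read_cache_data('highest_tweet_processed');  #undef if never written
#
# write_cache then serialises $self->{cache} to disk with Storable.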
sub write_cache_data
{
	my ($self, $data, @path) = @_;

	if (!$self->{cache})
	{
		$self->load_cache;
	}

	$self->_insert_into_hashref($data, $self->{cache}, @path);
}

sub read_cache_data
{
	my ($self, @path) = @_;

	if (!$self->{cache})
	{
		$self->load_cache;
	}

	my $c = $self->{cache};
	foreach my $k (@path)
	{
		if (exists $c->{$k})
		{
			$c = $c->{$k};
		}
		else
		{
			return undef;
		}
	}
	return $c;
}

#write the cache to the disk
sub write_cache
{
	my ($self) = @_;

	my $repo = $self->repository;
	my $cache_file = $self->{cache_file};

	store($self->{cache}, $cache_file)
		or $repo->log("Error updating tweetstream. Couldn't write to $cache_file\n");

	$self->{log_data}->{end_cache_file_size} = -s $cache_file;
}

sub _insert_into_hashref
{
	my ($self, $data, $hashref, @path) = @_;

	my $c = $hashref;
	my $last_key = pop @path;

	foreach my $k (@path)
	{
		if (!exists $c->{$k})
		{
			$c->{$k} = {};
		}
		$c = $c->{$k};
	}
	$c->{$last_key} = $data;
}

#read the whole cache from disk, initialising an empty one if there's no readable cache file
sub load_cache
{
	my ($self) = @_;

	my $repo = $self->repository;
	my $cache_file = $self->{cache_file};

	if (!-e $cache_file)
	{
		$self->{log_data}->{start_cache_file_size} = 0;
		$self->{cache} = {};
		return;
	}

	my $cache_data = retrieve($cache_file);
	if (!defined $cache_data)
	{
		$repo->log("Error updating tweetstream. Couldn't read from $cache_file\n");
		$self->{log_data}->{start_cache_file_size} = 0;
		$self->{cache} = {};
		return;
	}

	$self->{log_data}->{start_cache_file_size} = -s $cache_file;
	$self->{cache} = $cache_data;
}

sub update_tweetstream
{
	my ($self, $tweetstream, $data) = @_;

	my $repo = $self->repository;

	#tweet fieldnames are keys, tweetstream fieldnames are values
	my $fieldmap = $repo->config('update_tweetstream_abstracts','fieldmap');

	foreach my $fieldname (keys %{$fieldmap})
	{
		if ($fieldname eq 'created_at')
		{
			my ($period, $pairs) = $self->date_data_to_field_data($data->{$fieldname});
			$tweetstream->set_value('frequency_period',$period);
			$tweetstream->set_value('frequency_values',$pairs);
		}
		else
		{
			my $ts_fieldname = $fieldmap->{$fieldname}->{fieldname};
			my $subname = $fieldmap->{$fieldname}->{subname};
			my $n = $repo->config('tweetstream_tops',$ts_fieldname, 'n');

			my $val = $self->counts_to_field_data($subname, $data->{$fieldname}, $n);
			$tweetstream->set_value($ts_fieldname, $val);
		}
	}
	$tweetstream->commit;
}

sub date_data_to_field_data
{
	my ($self, $date_counts) = @_;

	my @sorted_dates = sort {$a cmp $b} keys %{$date_counts};
	my $first = $sorted_dates[0];
	my $last = $sorted_dates[$#sorted_dates];

	return (undef,undef) unless ($first && $last); #no dates to plot

	my $delta_days = Delta_Days($self->parse_datestring($first),$self->parse_datestring($last));
	return (undef,undef) unless $delta_days; #we won't bother generating graphs based on hours or minutes

	#maximum day delta in each period class
	my $thresholds = {
		daily => (30*1),
		weekly => (52*7),
		monthly => (48*30),
	};

	my $period = 'yearly';
	foreach my $period_candidate (qw/ monthly weekly daily /)
	{
		$period = $period_candidate if $delta_days <= $thresholds->{$period_candidate};
	}

	my $label_values = {};
	my $pairs = [];
	$self->initialise_date_structures($label_values, $pairs, $first, $last, $period);

	foreach my $date (@sorted_dates)
	{
		my $label = $self->YMD_to_label($self->parse_datestring($date), $period);
		$label_values->{$label}->{value} += $date_counts->{$date};
	}

	return ($period, $pairs);
}
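# Worked example (illustrative figures): if the first and last tweet dates are
# '2012-01-01' and '2012-01-21' then $delta_days is 20, which is within the 30-day
# 'daily' threshold, so $period becomes 'daily' and $pairs is a list of
# { label => 'YYYY-MM-DD', value => count } entries, one per day in that range
# (days with no tweets stay at 0).  A spread of around 200 days would fall through
# to 'weekly' and use "Week $week, $year" labels instead.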
sub initialise_date_structures
{
	my ($self, $label_values, $pairs, $first_date, $last_date, $period) = @_;

	my $current_date = $first_date;
	my $current_label = $self->YMD_to_label($self->parse_datestring($current_date),$period);
	my $last_label = $self->YMD_to_label($self->parse_datestring($last_date),$period);

	my ($year, $month, $day) = $self->parse_datestring($first_date);

	while ($current_label ne $last_label)
	{
		$label_values->{$current_label}->{label} = $current_label;
		$label_values->{$current_label}->{value} = 0;
		push @{$pairs}, $label_values->{$current_label};

		($year, $month, $day, $current_label) = $self->next_YMD_and_label($year, $month, $day, $current_label, $period);
	}
	$label_values->{$last_label}->{label} = $last_label;
	$label_values->{$last_label}->{value} = 0;
	push @{$pairs}, $label_values->{$last_label};
}

sub next_YMD_and_label
{
	my ($self, $year, $month, $day, $label, $period) = @_;

	my $new_label = $label;
	while ($new_label eq $label)
	{
		($year, $month, $day) = Add_Delta_Days($year, $month, $day, 1);
		$new_label = $self->YMD_to_label($year, $month, $day, $period);
	}
	return ($year, $month, $day, $new_label);
}

sub YMD_to_label
{
	my ($self, $year, $month, $day, $period) = @_;

	return $year if $period eq 'yearly';
	return join('-',(sprintf("%04d",$year), sprintf("%02d",$month))) if $period eq 'monthly';
	return join('-',(sprintf("%04d",$year), sprintf("%02d",$month), sprintf("%02d",$day))) if $period eq 'daily';

	if ($period eq 'weekly')
	{
		my ($week, $wyear) = Week_of_Year($year, $month, $day);
		return "Week $week, $wyear";
	}

	return undef;
}

sub parse_datestring
{
	my ($self, $date) = @_;

	my ($year,$month,$day) = split(/[- ]/,$date);
	return ($year,$month,$day);
}

#takes a hashref of the form { 'foo' => 403, 'bar' => 600 ...}
#returns an ordered arrayref of the form [ { 'fieldid' => 'foo', count => '403', } ...]
#size is an optional argument that will trim the array to a specific size
sub counts_to_field_data
{
	my ($self, $fieldid, $data, $size) = @_;

	my @r;
	foreach my $k (sort {$data->{$b} <=> $data->{$a}} keys %{$data})
	{
		my $h = {
			$fieldid => $k,
			'count' => $data->{$k},
		};
		if ($fieldid eq 'from_user')
		{
			$h->{profile_image_url} = $self->{profile_image_urls}->{$k};
		}
		push @r, $h;
	}

	if ($size && (scalar @r > $size))
	{
		my @n = @r[0 .. ($size-1)];
		@r = @n;
	}

	return \@r;
}
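# For example (illustrative values): $self->counts_to_field_data('hashtag', { 'eprints' => 7, 'oa' => 3, 'misc' => 1 }, 2)
# is expected to return [ { hashtag => 'eprints', count => 7 }, { hashtag => 'oa', count => 3 } ],
# i.e. sorted by descending count and trimmed to the two biggest entries.  When $fieldid is
# 'from_user', each entry also carries the cached profile_image_url for that user.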
#throw away the data that probably doesn't matter as we're processing lots and don't want to hammer the ram.
sub tidy_tweetstream_data
{
	my ($self, $data) = @_;

	my $repo = $self->repository;
	my $ts_ds = $repo->dataset('tweetstream');
	my $fieldmap = $repo->config('update_tweetstream_abstracts','fieldmap');

	TWEETSTREAM: foreach my $ts_id (keys %{$data})
	{
		my $tweetstream = $ts_ds->dataobj($ts_id);
		next TWEETSTREAM unless $tweetstream;

		my $tweet_count = $tweetstream->value('tweet_count');
		my $ts_data = $data->{$ts_id};

		COUNTSET: foreach my $fieldname (keys %{$ts_data})
		{
			next COUNTSET unless $fieldmap->{$fieldname}->{tidy};

			my $counts = $ts_data->{$fieldname};
			my @values = keys %{$counts};

			#how many shall we hold on to? 10% of the number of tweets + 10 times the number we will display.
			#bigger set for bigger streams and big enough sets for very small streams
			#this may need tweaking
			my $ts_fieldname = $fieldmap->{$fieldname}->{fieldname}; #tweetstream fieldname, as used by update_tweetstream
			my $n = $repo->config('tweetstream_tops',$ts_fieldname, 'n');
			$n = 50 unless $n;
			my $max = $n * 10;
			$max += int ($tweet_count / 10);

			next COUNTSET unless scalar @values > $max;

			@values = sort { $ts_data->{$fieldname}->{$b} <=> $ts_data->{$fieldname}->{$a} } @values;
			my @to_remove = @values[$max..$#values]; #take from index $max to the end

			foreach my $key (@to_remove)
			{
				delete $counts->{$key};
				#if we're removing a user, also remove the user's image URL (stored at object level)
				#note: $fieldname is the *tweet* fieldname here, so compare against 'from_user'
				if ($fieldname eq 'from_user')
				{
					delete $self->{profile_image_urls}->{$key};
				}
			}
		}
	}
}

1;
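# A rough sketch of how this action might be queued (EPrints::DataObj::EventQueue is the
# standard EPrints event queue API; the exact trigger script is down to the archive):
#
#   EPrints::DataObj::EventQueue->create_unique( $repo, {
#       pluginid => 'Event::UpdateTweetStreamAbstracts',
#       action   => 'action_update_tweetstream_abstracts',
#       #params  => [ update_from_zero => 1 ], #optional; flattened into %opts by the action
#   } );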