#!/usr/bin/perl -w

# Imports a tweetstream package into an EPrints repository.
#
# The package is a ZIP file expected to contain one tweetstream description
# (tweetstreamNNN/tweetstream.xml) and one or more JSON files of tweets.
# A tweetstream object is created from the XML, assigned to the given user,
# and every tweet found in the JSON files is attached to it.
#
# Arguments: repositoryid  username-or-userid  zipfile
#
# TODO: this needs to be moved into a locking event so that it can give way

use strict;
use warnings;

use EPrints;
use Archive::Zip qw( :ERROR_CODES :CONSTANTS );
use JSON;
use Archive::Zip::MemberRead;
#bugfix
#sub Archive::Zip::MemberRead::opened { 1 }
use Data::Dumper;

# Counters (and, later, the new tweetstream id) reported at the end of the run.
my $STATE = { tweets_created => 0, tweets_existing => 0 };

my ($repoid, $userid, $filename) = @ARGV;
die "export_tweetstream_packages.pl *repositoryid* *username/userid* *file*\n" unless $filename;
chomp $repoid;
chomp $filename;

die "file $filename doesn't exist\n" unless -e $filename;

my $ep = EPrints->new;
my $repo = $ep->repository($repoid);
die "couldn't create repository for '$repoid'\n" unless $repo;

# An all-digit argument is a user id; anything else is looked up as a username.
# The pattern is anchored at both ends so that a username which merely starts
# with a digit is not mistaken for an id.
my $user;
if ( $userid =~ m/\A\d+\z/ ) {
    $user = $repo->dataset('user')->dataobj($userid);
}
else {
    $user = EPrints::DataObj::User::user_with_username( $repo, $userid );
}
if ( !defined $user ) {
    die "Can't find user with userid/username [$userid]\n";
}

# Read the ZIP file and index its members by file extension:
# $files->{xml} and $files->{json} are array refs of member names.
my $files = {};
my $zip = Archive::Zip->new();
unless ( $zip->read($filename) == AZ_OK ) {
    die "Unable to read $filename as ZIP";
}

foreach my $member ( $zip->members ) {
    next if $member->isDirectory;

    my $member_name = $member->fileName;

    # Capture in list context so a failed match yields undef instead of a
    # stale $1 from some earlier match.
    my ($extension) = $member_name =~ m/\.([^\.]*)$/;
    next unless $extension;

    push @{ $files->{$extension} }, $member_name;
}

#validate presence of correctly named XML file:
my $xml_name = $files->{xml} ? $files->{xml}->[0] : undef;
die "no object XML file in zip file\n"
    unless defined $xml_name
    and $xml_name =~ m/tweetstream[0-9]+\/tweetstream\.xml/;

#check that we have some json files
die "no JSON tweet files in zip file\n"
    unless $files->{json} and scalar @{ $files->{json} } > 0;

my $plugin = $repo->plugin('Import::XML');
die "Couldn't load import plugin\n" unless $plugin;

if ( $plugin->broken ) {
    print STDERR "Plugin Import::XML could not run because:\n";
    print STDERR $plugin->error_message . "\n";
    $repo->terminate;
    exit 1;
}

my $ts = create_tweetstream_dataobj( $zip, $files, $repo );
die "problem creating tweetstream object\n" unless $ts;
$STATE->{new_tweetstreamid} = $ts->id;

add_tweets_to_tweetstream( $zip, $files, $ts );

print "Tweetstream package imported successfully:\n";
print "\tTweetstream ID: " . $STATE->{new_tweetstreamid} . "\n";
print $STATE->{tweets_created} . " tweets created\n";
print $STATE->{tweets_existing} . " existing tweets in this stream\n";

print "Now run update_tweetstream_abstracts to generate the new tweetstream's abstract page\n";
if ( $STATE->{tweets_existing} ) {
    print "Note that some tweets were already existing.  You may need to run update_tweetstream_abstracts with the 'update_from_zero' option, though be aware this may take some time as it removes all cached data and regenerates all tweetstream abstracts from scratch.";
}

# Reads every JSON file of the package (in numeric filename order), creates a
# tweet object for each tweet that is not already in the repository, and adds
# each tweet to the tweetstream.  Updates the counters in $STATE.
#
#   $zip   - the opened Archive::Zip object
#   $files - hash ref of member names by extension; $files->{json} is used
#   $ts    - the tweetstream object the tweets are added to
#
# If a JSON file cannot be parsed, the tweetstream object is removed again and
# the script exits with a non-zero status.
sub add_tweets_to_tweetstream {
    my ($zip, $files, $ts) = @_;

    my $repo = $ts->repository;
    my $json = JSON->new->allow_nonref->utf8;

    foreach my $filename ( sort sort_json_filenames @{ $files->{json} } ) {
        my $fh = file_in_zip_to_fh( $filename, $zip );

        my $json_text = do { local $/; <$fh> };
        $json_text = '' unless defined $json_text;

        my $tweets = eval { $json->decode($json_text) };
        if ($@) {
            _rollback_and_exit( $ts, "problem parsing $filename in zip file:\n$@" );
        }

        # allow_nonref means decode may hand back something that is not a
        # hash; catch that here rather than crashing half way through.
        # A hash without a 'tweets' key is treated as containing no tweets.
        my $tweet_list = ref $tweets eq 'HASH' ? $tweets->{tweets} : undef;
        $tweet_list = [] if ref $tweets eq 'HASH' and !defined $tweet_list;
        unless ( ref $tweet_list eq 'ARRAY' ) {
            _rollback_and_exit( $ts,
                "problem parsing $filename in zip file:\nexpected a JSON object with a 'tweets' array\n" );
        }

        foreach my $json_tweet ( @{$tweet_list} ) {
            my $twitterid = $json_tweet->{id};

            #We might want to do something with the added value fields, but for now
            # we won't.
            #In future it will be particularly important to make a sensible decision about this
            # if we have some fully resolved URLs (i.e. we've followed the short URL redirects)
            #In any case, we need to delete it from the hash before creating the tweet object
            # and do something sensible with the data on the created (or already existing) object
            delete $json_tweet->{eprints_value_added};

            #check to see if we already have a tweet with this twitter id in this repository
            my $tweetobj = EPrints::DataObj::Tweet::tweet_with_twitterid( $repo, $twitterid );
            if ( !defined $tweetobj ) {
                $tweetobj = EPrints::DataObj::Tweet->create_from_data(
                    $repo,
                    {
                        twitterid   => $twitterid,
                        json_source => $json_tweet,
                        # tweetstreams => [$ts->id],
                    }
                );
                die "failed to create tweet object from $filename in zip file\n"
                    unless defined $tweetobj;
                $STATE->{tweets_created} += 1;
            }
            else {
                $STATE->{tweets_existing} += 1;
            }
            $tweetobj->commit;

            #this is an interim solution.  It would be better to collect a few hundred and then add them.
            $ts->add_tweets( [$tweetobj] );
        }
    }

    return;
}

# Reports $message on STDERR, removes the tweetstream object created by this
# run and terminates the script with a failure status.
sub _rollback_and_exit {
    my ($ts, $message) = @_;

    print STDERR $message;
    print STDERR "rolling back changes (deleting tweetstream object)\n";
    $ts->remove;
    print STDERR "rollback successful\n";

    exit 1;
}

# sort() comparator: orders JSON member names by the number immediately in
# front of ".json".  Names without such a number count as 0; ties are broken
# by comparing the names as strings so the order is always deterministic.
sub sort_json_filenames {
    my ($a_int) = $a =~ m/([0-9]+)\.json/;
    my ($b_int) = $b =~ m/([0-9]+)\.json/;

    $a_int = 0 unless defined $a_int;
    $b_int = 0 unless defined $b_int;

    return ( $a_int <=> $b_int ) || ( $a cmp $b );
}

# Creates the tweetstream object from the first XML member of the package and
# assigns it to the user selected on the command line.
#
#   $zip   - the opened Archive::Zip object
#   $files - hash ref of member names by extension; $files->{xml}->[0] is used
#   $repo  - the repository object
#
# Uses the file-level $plugin (Import::XML) and $user.  Returns the committed
# tweetstream object, or nothing if the plugin produced no item.
sub create_tweetstream_dataobj {
    my ($zip, $files, $repo) = @_;

    my $fh = file_in_zip_to_fh( $files->{xml}->[0], $zip );
    $fh = wrap_with_tag( 'tweetstreams', $fh );

    my $ds = $repo->dataset('tweetstream');
    my $list = $plugin->input_fh( dataset => $ds, fh => $fh );
    return unless $list;

    my $ts = $list->item(0);
    return unless $ts;

    $ts->set_value( 'userid', $user->id );
    $ts->commit;

    return $ts;
}

##now create the tweet objects from the json data, rebuilding the abstract with the data gleaned from that.
#The zip library's own function to provide a handle to a zipped file
#doesn't seem to work, so we'll write to a temp file and give a handle to that
#
#   $filename - name of the member inside the zip file
#   $zip      - the opened zip object (must provide memberNamed)
#
# Returns a File::Temp handle positioned at the start of the extracted data.
# Dies if the member does not exist or cannot be extracted.
sub file_in_zip_to_fh {
    my ($filename, $zip) = @_;

    # Nothing else in this script loads File::Temp explicitly.
    require File::Temp;

    my $member = $zip->memberNamed($filename);
    die "no member named '$filename' in zip file\n" unless defined $member;

    my $tmp_fh = File::Temp->new( TEMPLATE => "ep-ts-import_unzipXXXXX", TMPDIR => 1 );

    my $status = $member->extractToFileHandle($tmp_fh);
    die "unable to extract '$filename' from zip file\n"
        unless $status == Archive::Zip::AZ_OK();

    #move to start of file
    seek( $tmp_fh, 0, 0 ) or die "unable to rewind temporary file: $!\n";

    return $tmp_fh;
}

#the plugin is expecting a list, so make the XML look like it's a list
#
#   $tagname - name of the element to wrap the content in
#   $fh      - readable handle positioned at the start of the content
#
# Returns a File::Temp handle, positioned at the start, whose content is
# <$tagname> + everything read from $fh + </$tagname>.
#
# NOTE(review): if the wrapped file starts with an XML declaration it will end
# up inside the wrapper element -- confirm the packaged files never have one.
sub wrap_with_tag {
    my ($tagname, $fh) = @_;

    # Nothing else in this script loads File::Temp explicitly.
    require File::Temp;

    my $tmp_fh = File::Temp->new( TEMPLATE => "ep-ts-import_unzipXXXXX", TMPDIR => 1 );

    print {$tmp_fh} "<$tagname>";
    while ( defined( my $line = <$fh> ) ) {
        print {$tmp_fh} $line;
    }
    # Close the wrapper element; without this the result is not well-formed.
    print {$tmp_fh} "</$tagname>";

    #move to start of file
    seek( $tmp_fh, 0, 0 ) or die "unable to rewind temporary file: $!\n";

    return $tmp_fh;
}