package EPrints::Plugin::Export::WordleLink; use EPrints::Plugin::Export; @ISA = ( "EPrints::Plugin::Export" ); use strict; sub new { my( $class, %opts ) = @_; my( $self ) = $class->SUPER::new( %opts ); $self->{name} = "Wordle Link"; $self->{accept} = [ 'list/eprint', 'dataobj/eprint', 'dataobj/tweetstream' ]; $self->{visible} = "all"; $self->{mimetype} = 'text/html; charset=utf-8'; $self->{extension} = '.html'; #Global vars, set in functions $self->{item_dataset_id} = ''; $self->{wordle_data} = {}; $self->{stopwords} = $self->initialise_stopwords(); $self->{tweetstream_search_string} = ''; #to store for preprocessing #aggregate options #concatenate -- all the data in a single string -- field rendered and tree_to_utf8 called #values -- full values as rendered and tree_to_utf8'ed #words -- Render and tree_to_utf8 called, then similar values folded together and the most popular one used (e.g. MIT = M.I.T = mit) #barevalues -- values as stored in the database, rather than rendered values #note that this operates on the fields of the tweet objects $self->{wordle_configs}->{tweetstream} = [ { id => 'text', aggregate => 'concatenate', fields => ['text'], force_lower => 1, title => 'Text', remove_urls => 1, }, { id => 'textw', aggregate => 'words', fields => ['text'], force_lower => 1, title => 'Text (word frequency)', remove_urls => 1, }, { id => 'tweeters', aggregate => 'barevalues', fields => ['from_user'], force_lower => 0, title => 'Tweeters', remove_urls => 1, }, { id => 'hashtags', aggregate => 'barevalues', fields => ['hashtags'], force_lower => 1, title => 'Hashtags', remove_urls => 1, }, ]; $self->{wordle_configs}->{eprint} = [ { id => 'title', aggregate => 'words', fields => ['title'], force_lower => 1, title => 'Titles (word frequency)' }, { id => 'titlew', aggregate => 'concatenate', fields => ['title'], force_lower => 1, title => 'Titles (all text)' }, { id => 'people', aggregate => 'values', fields => ['creators_name', 'editors_name', 'contributors_name'], force_lower => 0, limit => 5000, title => 'People' }, { id => 'abstract', aggregate => 'words', fields => ['abstract'], force_lower => 1, limit => 5000, title => 'Abstracts (word frequency)' }, ]; return $self; } sub generate_html_output { my ($self) = @_; my $html = ' Wordle Links

Generate a Wordle

Please submit one of the textboxes below to wordle. Feel free to tweak the data if you want to merge synonyms. Stop words and words in the search field of the twitter stream have been removed (they make the wordle uninteresting).

'; my @boxes; foreach my $wordle (@{$self->{wordle_configs}->{$self->{item_dataset_id}}}) { push @boxes, $self->generate_wordle_input($wordle); } my $i = 0; while ($boxes[$i]) { $html .= ''; if ($boxes[$i+1]) { $html .= ''; } $html .= ''; $i+=2; } $html .= '
'. $boxes[$i] . ''. $boxes[$i+1] . '
'; return $html; } sub generate_wordle_input { my ($self, $wordle) = @_; my $html; my $title = $wordle->{title}; $html .= "

$title

"; $html .= '
'; if ($wordle->{aggregate} eq 'concatenate') { $html .= '
'; return $html; } sub generate_wordle_input_text { my ($self, $wordle) = @_; my $text = ''; my $agg = $wordle->{aggregate}; my $wordleid = $wordle->{id}; if ($agg eq 'concatenate') { my @words = split(/[\s\.\,()\!?]/,$self->{wordle_data}->{$wordleid}); my $filtered_words = $self->remove_stop_words(\@words); $text = join(' ', @{$filtered_words}); } elsif ($agg eq 'values' || $agg eq 'barevalues' || $agg eq 'words') { my $r; my $vals = {}; if ($agg eq 'words') { my $wordsets = $self->{wordle_data}->{$wordleid}; foreach my $wordset (values %{$wordsets}) { my ($word, $count) = $self->collapse_wordset($wordset); $vals->{$word} = $count; } } else { $vals = $self->{wordle_data}->{$wordleid}; } my @words = keys %{$vals}; my $filtered_words = $self->remove_stop_words(\@words); my %OK; foreach my $w (@{$filtered_words}) { $OK{$w} = 1; } my $i = 0; foreach my $v (sort {$vals->{$b} <=> $vals->{$a}} keys %{$vals}) { next unless $OK{$v}; $r .= $v . ':' . $vals->{$v} . "\n"; if ($wordle->{limit}) { $i++; last if $i >= $wordle->{limit}; } } $text = $r; } return $text; #should never happen } sub collapse_wordset { my ($self, $wordset) = @_; # wordset looks like this: # { # 'total count' => 53, # 'MIT' => 20, # 'M.I.T' => 23, # 'mit.' => 5, # 'mit,' => 5, # } #find most common instance my $word_actual; my $max = 0; foreach my $word (keys %{$wordset}) { next if $word eq 'total count'; if ($wordset->{$word} > $max) { $word_actual = $word; $max = $wordset->{$word}; } } return ($word_actual,$wordset->{'total count'}); } sub generate_field_data { my ($self, $dataobj, $fieldid, $wordle) = @_; return unless $dataobj->exists_and_set($fieldid); if ($wordle->{aggregate} eq 'values' || $wordle->{aggregate} eq 'barevalues') { my $vals; if ($wordle->{aggregate} eq 'values') { $vals = $self->get_field_values($dataobj, $fieldid); } else { $vals = $self->get_field_barevalues($dataobj, $fieldid); } foreach my $v (@{$vals}) { $v = lc($v) if $wordle->{force_lower}; $v =~ s/https?:\/\/[^\s]*//g if $wordle->{remove_urls}; $self->{wordle_data}->{$wordle->{id}}->{$v}++; } } elsif ($wordle->{aggregate} eq 'concatenate' || $wordle->{aggregate} eq 'words') { my $html = $dataobj->render_value($fieldid); my $text = EPrints::Utils::tree_to_utf8($html); $text = lc($text) if $wordle->{force_lower}; #quick and dirty $text =~ s/https?:\/\/[^\s]*//g if $wordle->{remove_urls}; if ($wordle->{aggregate} eq 'concatenate') { $self->{wordle_data}->{$wordle->{id}} .= ' ' . $text; } elsif ($wordle->{aggregate} eq 'words') { my @words = split(/[\s\.\,()\!?]/, $text); foreach my $word (@words) { next unless $word =~ m/\w/; my $cv = $self->generate_compareval($word); $self->{wordle_data}->{$wordle->{id}}->{$cv}->{$word}++; $self->{wordle_data}->{$wordle->{id}}->{$cv}->{'total count'}++; #cheeky, but there's a space so there won't be a collision } } } } sub generate_compareval { my ($self, $word) = @_; my $val = $word; $val = lc($val); $val =~ s/[^\w]//g; return $val; } sub get_field_barevalues { my ($self, $dataobj, $fieldid) = @_; my $val = $dataobj->value($fieldid); return $val if ref $val eq 'ARRAY'; return [ $val ]; } sub get_field_values { my ($self, $dataobj, $fieldid) = @_; my $r = []; #put the values in here. my $f = $dataobj->dataset->field($fieldid); $f = $f->clone; if ($f->property('multiple')) { $f->set_property('multiple',0); my $vals = $dataobj->value($fieldid); foreach my $val (@{$vals}) { my $html = $f->render_value($self->repository,$val); push @{$r}, EPrints::Utils::tree_to_utf8($html); } } else { my $html = $dataobj->render_value($fieldid); push @{$r}, EPrints::Utils::tree_to_utf8($html); } return $r; } sub generate_dataobj_data { my ($self, $item, %opts) = @_; $self->{item_dataset_id} = $item->get_dataset_id unless $self->{item_dataset_id}; #only set it once my $wordle_configs = $self->{wordle_configs}->{$self->{item_dataset_id}}; if ($item->get_dataset_id eq 'tweetstream') { #don't process if it's to big! return if ($item->value('tweet_count') > 100000); $self->{tweetstream_search_string} = $item->value('search_string'); my $tweetlist = $item->tweets; $tweetlist->map(sub{ my ($session, $dataset, $dataobj, $info) = @_; my $plugin = $info->{plugin}; my %opts = %{$info->{opts}}; $plugin->generate_dataobj_data($dataobj, %opts); }, { plugin => $self, opts => \%opts }); } else { foreach my $wordle (@{$self->{wordle_configs}->{$self->{item_dataset_id}}}) { foreach my $fieldid (@{$wordle->{fields}}) { $self->generate_field_data($item, $fieldid, $wordle); } } } } #remove stop words sub remove_stop_words { my ($self, $words) = @_; my %stopwords = %{$self->{stopwords}}; if ($self->{item_dataset_id} eq 'tweetstream') { $stopwords{rt} = 1; #get rid of retweet labels my @search_string_stops = split(/[\s\.\,()\!?]/,$self->{tweetstream_search_string}); foreach my $stop (@search_string_stops) { my $stop = lc($stop); next unless $stop =~ m/[a-z0-9]/; $stop =~ s/[^a-z0-9]//g; #remove non alphanumeric $stopwords{$stop} = 1; } } my $filtered_words = []; foreach my $word (@{$words}) { my $cmpval = lc($word); $cmpval =~ s/[^a-z0-9]//g; #remove non alphanumeric push @{$filtered_words}, $word unless $stopwords{$cmpval}; } return $filtered_words; } sub output_list { my( $self, %opts ) = @_; $opts{list}->map( sub { my( $session, $dataset, $item ) = @_; $self->generate_dataobj_data( $item, %opts ); } ); my $html = $self->generate_html_output; if( defined $opts{fh} ) { print {$opts{fh}} $html; return; } return $html; } sub output_dataobj { my( $self, $dataobj, %opts ) = @_; $self->generate_dataobj_data($dataobj); my $html = $self->generate_html_output; if( defined $opts{fh} ) { print {$opts{fh}} $html; return; } return $html; } sub initialise_stopwords { #apostrophes have been removed for ease of processing my %StopWords = map { lc $_, 1 } qw( a about above after again against all am an and any are arent as at be because been before being below between both but by cant cannot could couldnt did didnt do does doesnt doing dont down during each few for from further had hadnt has hasnt have havent having he hed hell hes her here heres hers herself him himself his how hows i id ill im ive if in into is isnt it its its itself lets me more most mustnt my myself no nor not of off on once only or other ought our ours ourselves out over own same shant she shed shell shes should shouldnt so some such than that thats the their theirs them themselves then there theres these they theyd theyll theyre theyve this those through to too under until up very was wasnt we wed well were weve were werent what whats when whens where wheres which while who whos whom why whys with wont would wouldnt you youd youll youre youve your yours yourself yourselves ); return \%StopWords; } 1;