User:TotoBaggins

My name

My name is Sean, and I sign my posts as such.

Names of note

Random stuff

Articles with 30 or more disambiguators


Wikiblame

Here's a hack to find who put some nefarious text into an article. Please let me know if you have any comments or suggestions.
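It works by binary-searching the article's revision history for the first revision that contains the text, so it only needs to fetch about log2(N) revisions.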

 #!/usr/bin/perl -w
 # FILE: wiki-blame.pl
 # AUTHOR: TotoBaggins on en.wikipedia.org
 # 
 # LICENSE: GPL
 #
 # DESCRIPTION: This program outputs a URL of the first revision of an article
 # to contain a particular bit of text.  We only download about log_2(N)
 # revisions, so it's reasonably fast and kind to the servers.
 # 
 # USAGE:
 #
 #    ./wiki-blame.pl {article or history URL} {offending text}
 #
 # The URL must be url-encoded.
 # The offending text should be quoted as a single argument.
 # 
 # EXAMPLES:
 #
 # Find which revision inserted some text at or before an old revision
 # of the C++ article:
 # 
 #   ./wiki-blame.pl 'http://en.wikipedia.org/w/index.php?title=C%2B%2B&oldid=101608911' 'Evil Text'
 #
 #
 # Find which revision inserted some text at or before the current revision
 # of the C++ article:
 #
 #   ./wiki-blame.pl 'http://en.wikipedia.org/wiki/C%2B%2B' 'Evil Text'
 #
 # 
 # BUGS:
 #
 # -- We only look back 2000 edits' worth.
 # -- We could be politer and faster if we stepped back through
 #    history exponentially.
 # -- We are too dependent on wikipedia.org's URL layout.
 # 
 
 use strict;
 use LWP::UserAgent;
 use HTML::Parser;
 use HTML::LinkExtor;
 use Carp;
 
 my $WebAgent = LWP::UserAgent->new(keep_alive => 1);
 
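 # Fetch a URL and return the page body, dying with the HTTP status on failure.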
 sub get_page
 {
     my $url = shift;
     my $response = $WebAgent->get($url);
     $response->is_success() or croak $response->status_line();
     return $response->content();
 }
 
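 # Fetch the page at $url and return the targets of all its links as
 # absolute URLs.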
 sub get_links
 {
     my $url = shift;
     my $url_front = $url;
     $url_front =~ s,^([^/]+//[^/]+)/.*,$1,;
     my $page = get_page($url);
     my $linky = HTML::LinkExtor->new();
     $linky->utf8_mode(1);
     $linky->parse($page) or croak "Can't parse: $page";
     my @urls;
     foreach my $link ($linky->links())
     {
         my ($tag, %links) = @$link;
         my $url = $links{href} or next;
         # Make site-relative links absolute; leave absolute links alone.
         $url = "$url_front$url" if $url =~ m,^/,;
         push @urls, $url;
     }
     return @urls;
 }
 
 
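 # Markers used by the binary search below: a revision either contains the
 # offending text (YES_MARKER) or it doesn't (NO_MARKER).  url_has_text()
 # caches the marker per URL so each revision is fetched at most once.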
 use constant YES_MARKER => 100;
 use constant NO_MARKER  => 0;
 
 {
     my %MarkerCache;
     sub url_has_text
     {
         my ($url, $text) = @_;
         unless (defined $MarkerCache{$url})
         {
             my $page = get_page($url);
             $MarkerCache{$url} = index($page, $text) >= 0
                                     ? YES_MARKER : NO_MARKER;
         }
         return $MarkerCache{$url};
     }
 }
 
 # This is from List::Search, which had a bug.  It can be
 # removed when "List::Search::nlist_search(2, [2, 2, 2])" returns 0 and not 1
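 # For example, given markers (0, 0, 0, 100, 100) and key YES_MARKER, this
 # returns index 3, the first element that compares equal to the key.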
 sub custom_list_search
 {
     my ($cmp_code, $key, $array_ref) = @_;
 
     my $max_index = scalar(@$array_ref) - 1;
 
     my $low  = 0;
     my $mid  = undef;
     my $high = $max_index;
 
     while ($low <= $high)
     {
         $mid = int($low + (($high - $low) / 2));
         my $mid_val = $array_ref->[$mid];
 
         my $cmp_result = $cmp_code->($key, $mid_val);
 
         if ($cmp_result > 0)
         {
             $low = $mid + 1;
         }
         else
         {
             $high = $mid - 1;
         }
     }
 
     # Look at the values here and work out what to return.
     # Perhaps there are no matches in the array
     return -1 if $cmp_code->($key, $array_ref->[-1]) == 1;
 
     # Perhaps $mid is just before the best match
     return $mid + 1 if $cmp_code->($key, $array_ref->[$mid]) == 1;
 
     # $mid is correct
     return $mid;
 }
 
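 # Fetch the article's history listing and return its revision (oldid)
 # URLs, oldest first.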
 sub snarf_history_urls
 {
     my ($article_url, $limit) = @_;
 
     my $idx_url = $article_url;
     $idx_url =~ s/\&oldid=(\d+)$/\&action=history&limit=$limit/
         ||
     $idx_url =~ s,/wiki/(.+),/w/index.php?title=$1&limit=$limit&action=history,
         or die $idx_url;
 
     my @all_urls = get_links($idx_url);
 
     my @history_urls;
     foreach my $url (@all_urls)
     {
         # only old article urls
         next unless $url =~ m,/index.php\?title=[^&]+&oldid=\d+$,;
 
         push @history_urls, $url;
     }
     # make chronological
     @history_urls = reverse @history_urls;
 
     return @history_urls;
 }
 
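 # Resolve an article URL to a revision URL: URLs that already name an
 # oldid are returned as-is; otherwise we use the most recent revision.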
 sub get_first_history_url
 {
     my $url = shift;
     return $url if $url =~ /&oldid=\d/;
     my @history_urls = snarf_history_urls($url, 1);
     return $history_urls[0];
 }
 
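 # Binary-search the revisions at or before $offending_history_url for
 # the earliest one that contains $offending_text.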
 sub find_rev_that_added
 {
     my ($offending_history_url, $offending_text) = @_;
 
     # FIXME: we limit it to 2000 urls to be kind to the wikiservers
     # We should really bite off pieces of history stepwise.
     my $max_urls = 2000;
     my @history_urls = snarf_history_urls($offending_history_url, $max_urls);
 
     $offending_history_url =~ /\&oldid=(\d+)/ or die $offending_history_url;
     my $offending_id = $1;
 
     my @before_offender_urls;
     foreach my $url (@history_urls)
     {
         push @before_offender_urls, $url;
         last if $url =~ /\&oldid=$offending_id\b/;
     }
 
     my $url2marker = sub {
         my ($key, $url) = @_;
         my $has_it = url_has_text($url, $offending_text);
         return $key <=> $has_it;
     };
     my $first_with = custom_list_search($url2marker, YES_MARKER,
                                         \@before_offender_urls);
     return unless $first_with >= 0;
 
     # If the match is the oldest revision we fetched and we hit the fetch
     # limit, the real insertion may lie even further back.
     if ($first_with == 0 && @history_urls >= $max_urls)
     {
         warn "Warning: It looks like that edit occurred further in "
              . "the past than I feel comfortable looking (beyond "
              . "$max_urls edits).\n";
         return;
     }
 
     return $before_offender_urls[$first_with];
 }
 
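 # Main: resolve the starting revision, search, and report.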
 @ARGV == 2 or die "usage: $0 {article or history URL} {offending text}\n";
 my $url = shift;
 my $offending_text = shift;
 
 my $offending_history_url = get_first_history_url($url);
 
 if (my $found_url = find_rev_that_added($offending_history_url,
                                         $offending_text))
 {
     if ($found_url eq $offending_history_url)
     {
         print "No earlier revisions found.\n";
     }
     else
     {
         print "Earliest revision: $found_url\n";
     }
 }
 else
 {
     print "Not found\n";
 }