User:TotoBaggins
My name
My name is Sean, and I sign my posts as such.
Names of note
Random stuff
Articles with 30 or more disambiguators
Wikiblame
Here's a hack to find who put some nefarious text into an article. Please let me know if you have any comments or suggestions.
#!/usr/bin/perl -w
# FILE: wiki-blame.pl
# AUTHOR: TotoBaggins on en.wikipedia.org
#
# LICENSE: GPL
#
# DESCRIPTION: This program outputs a URL of the first revision of an article
# to contain a particular bit of text.  We only download log_2 N articles,
# so it's reasonably fast and kind to the servers.
#
# USAGE:
#
#     ./wiki-blame.pl {article or history URL} {offending text}
#
#     The URL must be url-encoded.
#     The offending text should be quoted as a single argument.
#
# EXAMPLES:
#
#   Find which revision inserted some text at or before an old revision
#   of the C++ article:
#
#     ./wiki-blame.pl 'http://en.wikipedia.org/w/index.php?title=C%2B%2B&oldid=101608911' 'Evil Text'
#
#   Find which revision inserted some text at or before the current revision
#   of the C++ article:
#
#     ./wiki-blame.pl 'http://en.wikipedia.org/wiki/C%2B%2B' 'Evil Text'
#
# BUGS:
#
#   -- We only look back 2000 edits worth.
#   -- We could be politer and faster if we stepped back through
#      history exponentially.
#   -- We are too dependent on wikipedia.org's URL layout.
#

use strict;

use LWP::UserAgent;
use HTML::Parser;
use HTML::LinkExtor;
use Carp;
use Data::Dumper;

my $WebAgent = LWP::UserAgent->new(keep_alive => 1);

sub get_page {
    my $url = shift;

    my $response = $WebAgent->get($url);
    $response->is_success() or croak $response->status_line();
    return $response->content();
}

sub get_links {
    my $url = shift;

    # keep just the scheme and host, for turning relative hrefs into full URLs
    my $url_front = $url;
    $url_front =~ s,^([^/]+//[^/]+)/.*,$1,;

    my $page = get_page($url);

    my $linky = HTML::LinkExtor->new();
    $linky->utf8_mode(1);
    $linky->parse($page) or croak "Can't parse: $page";

    my @urls;
    foreach my $link ($linky->links()) {
        my ($tag, %links) = @$link;
        my $url = $links{href} or next;
        push @urls, "$url_front$url";
    }
    return @urls;
}

use constant YES_MARKER => 100;
use constant NO_MARKER  => 0;

{
    my %MarkerCache;

    # Returns YES_MARKER if the revision at $url contains $text, NO_MARKER
    # otherwise.  Results are cached so each revision is fetched only once.
    sub url_has_text {
        my ($url, $text) = @_;

        unless (defined $MarkerCache{$url}) {
            my $page = get_page($url);

            # debugging aid: keep a local copy of each revision we examine
            use File::Slurp;
            $url =~ /oldid=(\d+)/;
            write_file("oldid-$1.html", $page);

            $MarkerCache{$url} = index($page, $text) >= 0 ? YES_MARKER : NO_MARKER;
        }
        return $MarkerCache{$url};
    }
}

# This is from List::Search, which had a bug.  It can be removed when
# "List::Search::nlist_search(2, [2, 2, 2])" returns 0 and not 1.
sub custom_list_search {
    my ($cmp_code, $key, $array_ref) = @_;

    my $max_index = scalar(@$array_ref) - 1;

    my $low  = 0;
    my $mid  = undef;
    my $high = $max_index;

    my $lowest_match = undef;

    while ($low <= $high) {
        $mid = int($low + (($high - $low) / 2));
        my $mid_val = $array_ref->[$mid];

        my $cmp_result = $cmp_code->($key, $mid_val);

        if ($cmp_result > 0) {
            $low = $mid + 1;
        }
        else {
            if ($cmp_result == 0
                && (!defined($lowest_match) || $lowest_match > $mid)) {
                $lowest_match = $mid;
            }
            $high = $mid - 1;
        }
    }

    # Look at the values here and work out what to return.

    # Perhaps there are no matches in the array
    return -1 if $cmp_code->($key, $array_ref->[-1]) == 1;

    # Perhaps $mid is just before the best match
    return $mid + 1 if $cmp_code->($key, $array_ref->[$mid]) == 1;

    # $mid is correct
    return $mid;
}

# Fetch the history page for an article (or oldid) URL and return the
# revision URLs in chronological order, oldest first.
sub snarf_history_urls {
    my ($article_url, $limit) = @_;

    my $idx_url = $article_url;
    $idx_url =~ s/\&oldid=(\d+)$/\&action=history&limit=$limit/
        || $idx_url =~ s,/wiki/(.+),/w/index.php?title=$1&limit=$limit&action=history,
        or die $idx_url;

    my @all_urls = get_links($idx_url);

    my @history_urls;
    foreach my $url (@all_urls) {
        # only old article urls
        next unless $url =~ m,/index.php\?title=[^&]+&oldid=\d+$,;
        push @history_urls, $url;
    }

    # make chronological
    @history_urls = reverse @history_urls;

    return @history_urls;
}

sub get_first_history_url {
    my $url = shift;

    return $url if $url =~ /&oldid=\d/;

    my @history_urls = snarf_history_urls($url, 1);
    return $history_urls[0];
}

sub find_rev_that_added {
    my ($offending_history_url, $offending_text) = @_;

    my $history_index_url = $offending_history_url;

    # FIXME: we limit it to 2000 urls to be kind to the wikiservers.
    # We should really bite off pieces of history stepwise.
    my $max_urls = 2000;

    my @history_urls = snarf_history_urls($offending_history_url, $max_urls);

    $offending_history_url =~ /\&oldid=(\d+)/ or die $offending_history_url;
    my $offending_id = $1;

    # my %url2index;
    my $saw_offender;
    my @before_offender_urls;
    foreach my $url (@history_urls) {
        # $url2index{$url} = @before_offender_urls;
        push @before_offender_urls, $url;
        last if $url =~ /\&oldid=$offending_id\b/;
    }

    my $url2marker = sub {
        my ($key, $url) = @_;
        my $has_it = url_has_text($url, $offending_text);
        my $ret = $key <=> $has_it;
        # warn "has($has_it), ret($ret), u2i($url2index{$url}), $url\n";
        return $key <=> $has_it;
    };

    my $first_with = custom_list_search($url2marker, YES_MARKER, \@before_offender_urls);

    return unless $first_with >= 0;

    # If the match landed on the oldest revision of a full window, the text
    # may have been introduced even earlier than we are willing to look.
    if ($first_with == 0 && @history_urls == $max_urls) {
        warn "Warning: It looks like that edit occurred further in "
           . "the past than I feel comfortable looking (beyond "
           . "$max_urls edits).\n";
        return;
    }

    return $before_offender_urls[$first_with];
}

@ARGV == 2 or die "usage: $0 {article or history URL} {offending text}\n";

my $url            = shift;
my $offending_text = shift;

my $offending_history_url = get_first_history_url($url);

if (my $found_url = find_rev_that_added($offending_history_url, $offending_text)) {
    if ($found_url eq $offending_history_url) {
        print "No earlier revisions found.\n";
    }
    else {
        print "Earliest revision: $found_url\n";
    }
}
else {
    print "Not found\n";
}
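
The BUGS list above mentions stepping back through history exponentially instead of grabbing 2000 revisions up front. Here's a rough, untested sketch of how that could look. It assumes the snarf_history_urls(), url_has_text(), and custom_list_search() helpers and the YES_MARKER constant from the script above are in scope; the sub name find_rev_exponentially and the 50/5000 window sizes are made up, and for simplicity it searches back from the current revision rather than from a given oldid.

# SKETCH ONLY -- not part of wiki-blame.pl.  Doubles the history window
# until the oldest fetched revision no longer contains the text, then
# binary-searches inside that window.
sub find_rev_exponentially {
    my ($article_url, $offending_text) = @_;

    my $limit    = 50;      # revisions to fetch on the first pass
    my $hard_cap = 5000;    # absolute ceiling, to stay kind to the servers

    while ($limit <= $hard_cap) {
        my @urls = snarf_history_urls($article_url, $limit);   # oldest first
        return unless @urls;

        if (url_has_text($urls[0], $offending_text) == YES_MARKER) {
            # If the window didn't fill up, this is the article's very
            # first revision, so the text has always been there.
            return $urls[0] if @urls < $limit;

            # Otherwise the culprit is older than our window; widen it.
            $limit *= 2;
            next;
        }

        # The oldest revision lacks the text, so the transition happens
        # inside this window; reuse the binary search from the main script.
        my $url2marker = sub {
            my ($key, $url) = @_;
            return $key <=> url_has_text($url, $offending_text);
        };
        my $first_with = custom_list_search($url2marker, YES_MARKER, \@urls);
        return if $first_with < 0;    # text not present in any revision
        return $urls[$first_with];
    }

    return;    # gave up past $hard_cap edits
}

For an article with a long history but a recent vandal edit, this should keep both the history fetches and the revision downloads small, while the flat 2000-revision grab only pays off when the text really is ancient.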

