From Wikipedia, the free encyclopedia
#!/usr/bin/perl
# Wikipedia bot to fix redirects to lists.
# e.g., if [[Random Thing 3]] is a redirect to [[List of random things]],
# which has a section #Random Thing 3, change all instances of
# [[Random Thing 3]] to [[List of random things#Random Thing 3]]
# Usage: listredir.pl [--nostart] Name of list
# --nostart causes it to not edit the user page to mark start; use when restarting after crash
# Copyright (C) 2005 Nick Tarleton (Wikipedia username: Nickptar)
# This code may be redistributed, modified or unmodified, without restriction,
# provided that this copyright notice and license remain in place.
# We apologize for the ad-hoc-ness.
use warnings;
use strict;
use LWP;
use HTTP::Cookies;
use POSIX qw(strftime);
# modify as necessary
my $wikihost = 'en.wikipedia.org';
my $wikipath = 'w/index.php';
my $wikiuser = 'N-Bot';
my $wikipass = '';
my $sleeptime = 30;
my $jar = HTTP::Cookies->new();
my %sectionsbyredir; # contains redirect->section header mappings
my %redirsbypage; # contains pages needing fixing and the redirects on them
my %pagesbysection; # contains section->page mappings, for talk page message
$| = 1; # why not
# format: [[4 August]] [[2005]]
my $date = strftime('[[%d %B]] [[%Y]]', gmtime);
$date =~ s/\[\[0/[[/; # get rid of stupid leading zero on day
my $nostart = 0;
if($ARGV[0] eq "--nostart") {
$nostart = 1;
shift @ARGV;
}
my $listname = join(' ', @ARGV);
die "Usage: $0 listname" unless $listname =~ /\S/;
print "listredir.pl - by Nick Tarleton (User:Nickptar)\n";
print "processing $listname\n----\n";
login();
getMap();
getPages();
setWorking();
fixPages();
writeNote();
setDone();
# make and initialize a UserAgent
sub get_ua {
my $ua = LWP::UserAgent->new();
$ua->agent("listredir.pl operated by User:$wikiuser");
$ua->cookie_jar($jar);
push @{ $ua->requests_redirectable }, 'POST';
return $ua;
}
my $starttime;
my $edittime;
my $edittoken;
my $editingpage;
my $editingpage_orig;
# get the text of a page, and be prepared to write it back
sub checkout {
$editingpage_orig = $editingpage = shift;
$editingpage =~ s/ /_/g;
$editingpage =~ s/&/%26/g;
print "Retrieving $editingpage_orig... ";
my $ua = get_ua();
my $addr = "http://$wikihost/$wikipath?title=$editingpage&action=edit";
my $response = $ua->get($addr);
die $response->status_line unless $response->is_success;
$response->content =~ /<textarea[^>]*>([^<]*)<\/textarea>/s
or die "syntax error: no textarea";
my $result = $1;
$result =~ s/</</g;
$result =~ s/>/>/g;
$result =~ s/"/"/g;
$result =~ s/&/&/g;
$response->content =~ /<input type='hidden' value="([^"]*)" name="wpStarttime" \/>/
or die "syntax error: no starttime";
$starttime = $1;
$response->content =~ /<input type='hidden' value="([^"]*)" name="wpEdittime" \/>/
or die "syntax error: no edittime";
$edittime = $1;
$response->content =~ /<input type='hidden' value="([^"]*)" name="wpEditToken" \/>/
or die "syntax error: no edittoken";
$edittoken = $1;
print "Success.\n";
return $result;
}
# save the page being edited
sub checkin {
my ($text, $summary, $minor) = @_;
print "Saving... ";
my $ua = get_ua();
my $addr = "http://$wikihost/$wikipath?title=$editingpage&action=submit";
my $response = $ua->post($addr,
Content_Type => 'form-data',
Content => {
'wpTextbox1' => $text,
'wpSummary' => $summary,
'wpSave' => 'Save page',
'wpMinoredit' => ($minor ? '1' : ''),
'wpSection' => '',
'wpStarttime' => $starttime,
'wpEdittime' => $edittime,
'wpEditToken' => $edittoken
}
);
die $response->status_line unless $response->is_success;
# crappy crappy success check
$editingpage_orig =~ s/\(/\\(/g;
$editingpage_orig =~ s/\)/\\)/g;
$editingpage_orig =~ s/&/&/g;
unless($response->content =~ /<title>$editingpage_orig/) {
open OUT,">results" or die "could not open results";
print OUT $response->content;
close OUT;
die "error. Dumped to results."
}
print "Success. zzz...\n";
sleep $sleeptime;
}
# add a section to a talk page
sub addsection {
my ($page, $text, $title) = @_;
my $page_orig = $page;
$page =~ s/ /_/g;
$page =~ s/&/%26/g;
print "Adding to $page_orig... ";
my $ua = get_ua();
my $addr = "http://$wikihost/$wikipath?title=$page&action=edit";
my $response = $ua->get($addr);
die $response->status_line unless $response->is_success;
$response->content =~ /<input type='hidden' value="([^"]*)" name="wpEdittime" \/>/
or die "syntax error: no edittime";
my $section_edittime = $1;
$response->content =~ /<input type='hidden' value="([^"]*)" name="wpEditToken" \/>/
or die "syntax error: no edittoken";
my $section_edittoken = $1;
$addr = "http://$wikihost/$wikipath?title=$page&action=submit";
$response = $ua->post($addr,
Content_Type => 'form-data',
Content => {
'wpSummary' => $title,
'wpTextbox1' => $text,
'wpSave' => 'Save page',
'wpSection' => 'new',
'wpEdittime' => $section_edittime,
'wpEditToken' => $section_edittoken
}
);
die $response->status_line unless $response->is_success;
$page_orig =~ s/\(/\\(/g;
$page_orig =~ s/\)/\\)/g;
$page_orig =~ s/&/&/g;
unless($response->content =~ /<title>$page_orig/) {
open OUT,">results" or die "could not open results";
print OUT $response->content;
close OUT;
die "error. Dumped to results."
}
print "Success. zzz...\n";
sleep $sleeptime;
}
# log in
sub login {
print "Logging in $wikiuser... ";
my $ua = get_ua();
my $addr = "http://$wikihost/$wikipath?title=Special:Userlogin&action=submitlogin";
my $response = $ua->post($addr,
{
'wpName' => $wikiuser,
'wpPassword' => $wikipass
}
);
die $response->status_line unless $response->is_success;
die "Login failed." unless $response->content =~ /You are now logged in/;
print "Success.\n";
}
my $whatlinkshere;
# populate %sectionsbyredir
sub getMap {
my $listtext = checkout($listname);
my @headings;
push @headings, $1 while $listtext =~ /^=+ *([^=]+) *=+/gm;
# get whatlinkshere
{
print "Retreiving whatlinkshere... ";
my $ua = get_ua();
my $addr = "http://$wikihost/$wikipath/Special:Whatlinkshere/$listname?limit=5000";
$addr =~ s/ /_/g;
my $response = $ua->get($addr);
die $response->status_line unless $response->is_success;
print "Success.\n\n";
$whatlinkshere = $response->content;
$whatlinkshere =~ s/</</g;
$whatlinkshere =~ s/>/>/g;
$whatlinkshere =~ s/"/"/g;
$whatlinkshere =~ s/&/&/g;
}
while($whatlinkshere =~ />([^<]+)<\/a> \(redirect page\)$/mg) {
my $redir = $1;
# straightforward
if(grep(/^$redir$/i, @headings) > 0) {
print "$redir -> $redir\n";
# fix regexp special characters; more may be necessary
$redir =~ s/\(/\\(/g;
$redir =~ s/\)/\\)/g;
$sectionsbyredir{$redir} = $redir;
} else {
# handle plurals
if($redir =~ /^(.*)s$/) {
my $nonplural = $1;
if(grep(/^$nonplural$/i, @headings) > 0) {
print "$redir -> $nonplural\n";
# fix regexp special characters; more may be necessary
$redir =~ s/\(/\\(/g;
$redir =~ s/\)/\\)/g;
$sectionsbyredir{$redir} = $nonplural;
next;
}
}
# handle disambig-names like Foo (Fictional Universe)
if($redir =~ /^(.*) \(.*\)$/) {
my $realname = $1;
if(grep(/^$realname$/i, @headings) > 0) {
print "$redir -> $realname\n";
# fix regexp special characters; more may be necessary
$redir =~ s/\(/\\(/g;
$redir =~ s/\)/\\)/g;
$sectionsbyredir{$redir} = $realname;
next;
}
}
# catch-all for forms like:
# First Last -> Last, First
# Mr. Foo -> Mr. and Mrs. Foo
# (and more?)
# get all subnames of the redirect
my @names = split(/ /, $redir);
my @filter = @headings;
# find headings containing all of the subnames
foreach my $name (@names) {
# fix regexp special characters; more may be necessary
$name =~ s/\(/\\(/g;
$name =~ s/\)/\\)/g;
@filter = grep(/$name/i, @filter);
last if @filter == 1;
# don't let anything eliminate it too early
@filter = @headings if @filter == 0;
}
if(@filter == 1) {
print "$redir -> " . $filter[0] . "\n";
# fix regexp special characters; more may be necessary
$redir =~ s/\(/\\(/g;
$redir =~ s/\)/\\)/g;
$sectionsbyredir{$redir} = $filter[0];
} elsif(@filter == 0 || @filter == @headings) {
# keep everything from being listed
print "$redir -> ------------nothing\n";
} else {
print "$redir ->\n";
print "\t$_\n" foreach @filter;
}
}
}
return if $nostart;
print "Is this OK (y/n)? ";
$_ = <STDIN>;
exit unless /^y/;
print "\n";
}
# populate %redirsbypage and %pagesbysection
sub getPages {
while(my ($redir, $section) = each %sectionsbyredir) {
# handle redirects with no links thereto
next if $whatlinkshere =~ />$redir<\/a> \(redirect page\)\n<\/li>/;
$whatlinkshere =~ />$redir<\/a> \(redirect page\)\n(.+?)<\/ul>\n<\/li>/s
or die "$redir not found properly in whatlinkshere";
my $curpages = $1;
my @pagesthissection;
while($curpages =~ />([^<]+?)<\/a><\/li>$/mg) {
my $page = $1;
# filter out non-main namespace
next if $page =~ /^\w*:/;
next if $page =~ /^\w* talk:/;
push @{$redirsbypage{$page}}, $redir;
push @{$pagesbysection{$section}}, $page;
}
}
}
# mark user page as working
sub setWorking {
return if $nostart;
my $text = checkout("User:$wikiuser");
my $message = "[[$listname]], started ~~~~~";
$text =~ s/^('''Currently working on:''').*$/$1 $message/m;
checkin $text, "Mark as working on: [[$listname]]";
}
# do the work
sub fixPages {
foreach my $page (keys %redirsbypage) {
my $text = checkout($page);
# handle editing the list itself
my $presection = ($page eq $listname) ? '' : $listname;
foreach my $redir (@{$redirsbypage{$page}}) {
my $section = $sectionsbyredir{$redir};
$text =~ s/\[\[ *($redir) *\]\]/[[$presection#$section|$1]]/gi;
$text =~ s/\[\[ *$redir *\| *([^\]]*?) *\]\]/[[$presection#$section|$1]]/gi;
}
checkin $text, "Changing links-to-redirects-to-lists to links-to-list-sections. See userpage for more info.", 1;
}
}
# leave a note on the talk page
sub writeNote {
my ($exampleRedir, $exampleSection) = each %sectionsbyredir;
my $note = <<EOM;
This page has been processed by [[User:$wikiuser|$wikiuser]], which, for browsing convenience, changes links to redirects to lists to links to the relevant list sections: e.g. <nowiki>[[$exampleRedir]] is changed to [[$listname#$exampleSection|$exampleRedir]]</nowiki>.
As a result, anyone who intends to split a section out of this page should be aware that, as of $date, the following sections were linked to from the following pages:
EOM
foreach my $section (keys %pagesbysection) {
$note .= "* [[$listname#$section|$section]]: ";
foreach my $page (@{$pagesbysection{$section}}) {
$note .= "[[$page]], ";
}
chop $note; # take off last ", ";
chop $note;
$note .= "\n";
}
$note .= "\n~~~~";
addsection "Talk:$listname", $note, 'Note to anyone intending on splitting off a section';
}
# mark user page as done
sub setDone {
my $text = checkout("User:$wikiuser");
$text =~ s/^('''Currently working on:''').*$/$1 nothing/m;
{
# chomp off all trailing newlines
local $/ = '';
chomp $text;
}
$text .= "\n# [[$listname]] (finished $date)";
checkin $text, "Mark as done."
}