User:N-Bot/listredir.pl

From Wikipedia, the free encyclopedia

#!/usr/bin/perl

# Wikipedia bot to fix redirects to lists.
# e.g., if [[Random Thing 3]] is a redirect to [[List of random things]],
# which has a section #Random Thing 3, change all instances of
# [[Random Thing 3]] to [[List of random things#Random Thing 3]]

# Usage: listredir.pl [--nostart] Name of list
# --nostart causes it to not edit the user page to mark start; use when restarting after crash

# Copyright (C) 2005 Nick Tarleton (Wikipedia username: Nickptar)
# This code may be redistributed, modified or unmodified, without restriction,
# provided that this copyright notice and license remain in place.

# We apologize for the ad-hoc-ness.

use warnings;
use strict;

use LWP;
use HTTP::Cookies;
use POSIX qw(strftime);

# modify as necessary

my $wikihost = 'en.wikipedia.org';
my $wikipath = 'w/index.php';
my $wikiuser = 'N-Bot';
my $wikipass = '';
my $sleeptime = 30;

my $jar = HTTP::Cookies->new();

my %sectionsbyredir; # contains redirect->section header mappings
my %redirsbypage; # contains pages needing fixing and the redirects on them
my %pagesbysection; # contains section->page mappings, for talk page message

$| = 1; # why not

# format: [[4 August]] [[2005]]
my $date = strftime('[[%d %B]] [[%Y]]', gmtime);
$date =~ s/\[\[0/[[/; # get rid of stupid leading zero on day

my $nostart = 0;
if($ARGV[0] eq "--nostart") {
        $nostart = 1;
        shift @ARGV;
}

my $listname = join(' ', @ARGV);
die "Usage: $0 listname" unless $listname =~ /\S/;

print "listredir.pl - by Nick Tarleton (User:Nickptar)\n";
print "processing $listname\n----\n";

login();
getMap();
getPages();
setWorking();
fixPages();
writeNote();
setDone();

# make and initialize a UserAgent
sub get_ua {
        my $ua = LWP::UserAgent->new();
        $ua->agent("listredir.pl operated by User:$wikiuser");
        $ua->cookie_jar($jar);
        push @{ $ua->requests_redirectable }, 'POST';
        return $ua;
}

my $starttime;
my $edittime;
my $edittoken;
my $editingpage;
my $editingpage_orig;

# get the text of a page, and be prepared to write it back
sub checkout {
        $editingpage_orig = $editingpage = shift;
        $editingpage =~ s/ /_/g;
        $editingpage =~ s/&/%26/g;
        print "Retrieving $editingpage_orig... ";

        my $ua = get_ua();
        my $addr = "http://$wikihost/$wikipath?title=$editingpage&action=edit";
        my $response = $ua->get($addr);
        die $response->status_line unless $response->is_success;

        $response->content =~ /<textarea[^>]*>([^<]*)<\/textarea>/s
                or die "syntax error: no textarea";
        my $result = $1;
        $result =~ s/&lt;/</g;
        $result =~ s/&gt;/>/g;
        $result =~ s/&quot;/"/g;
        $result =~ s/&amp;/&/g;

        $response->content =~ /<input type='hidden' value="([^"]*)" name="wpStarttime" \/>/
                or die "syntax error: no starttime";
        $starttime = $1;
        $response->content =~ /<input type='hidden' value="([^"]*)" name="wpEdittime" \/>/
                or die "syntax error: no edittime";
        $edittime = $1;
        $response->content =~ /<input type='hidden' value="([^"]*)" name="wpEditToken" \/>/
                or die "syntax error: no edittoken";
        $edittoken = $1;

        print "Success.\n";
        return $result;
}

# save the page being edited
sub checkin {
        my ($text, $summary, $minor) = @_;
        print "Saving... ";

        my $ua = get_ua();
        my $addr = "http://$wikihost/$wikipath?title=$editingpage&action=submit";
        my $response = $ua->post($addr,
                Content_Type => 'form-data',
                Content => {
                        'wpTextbox1' => $text,
                        'wpSummary' => $summary,
                        'wpSave' => 'Save page',
                        'wpMinoredit' => ($minor ? '1' : ''),
                        'wpSection' => '',
                        'wpStarttime' => $starttime,
                        'wpEdittime' => $edittime,
                        'wpEditToken' => $edittoken
                }
        );
        die $response->status_line unless $response->is_success;
        # crappy crappy success check
        $editingpage_orig =~ s/\(/\\(/g;
        $editingpage_orig =~ s/\)/\\)/g;
        $editingpage_orig =~ s/&/&amp;/g;
        unless($response->content =~ /<title>$editingpage_orig/) {
                open OUT,">results" or die "could not open results";
                print OUT $response->content;
                close OUT;
                die "error. Dumped to results."
        }
        print "Success. zzz...\n";
        sleep $sleeptime;
}

# add a section to a talk page
sub addsection {
        my ($page, $text, $title) = @_;
        my $page_orig = $page;
        $page =~ s/ /_/g;
        $page =~ s/&/%26/g;
        print "Adding to $page_orig... ";

        my $ua = get_ua();
        my $addr = "http://$wikihost/$wikipath?title=$page&action=edit";
        my $response = $ua->get($addr);
        die $response->status_line unless $response->is_success;

        $response->content =~ /<input type='hidden' value="([^"]*)" name="wpEdittime" \/>/
                or die "syntax error: no edittime";
        my $section_edittime = $1;
        $response->content =~ /<input type='hidden' value="([^"]*)" name="wpEditToken" \/>/
                or die "syntax error: no edittoken";
        my $section_edittoken = $1;

        $addr = "http://$wikihost/$wikipath?title=$page&action=submit";
        $response = $ua->post($addr,
                Content_Type => 'form-data',
                Content => {
                        'wpSummary' => $title,
                        'wpTextbox1' => $text,
                        'wpSave' => 'Save page',
                        'wpSection' => 'new',
                        'wpEdittime' => $section_edittime,
                        'wpEditToken' => $section_edittoken
                }
        );
        die $response->status_line unless $response->is_success;
        $page_orig =~ s/\(/\\(/g;
        $page_orig =~ s/\)/\\)/g;
        $page_orig =~ s/&/&amp;/g;
        unless($response->content =~ /<title>$page_orig/) {
                open OUT,">results" or die "could not open results";
                print OUT $response->content;
                close OUT;
                die "error. Dumped to results."
        }
        print "Success. zzz...\n";
        sleep $sleeptime;
}

# log in
sub login {
        print "Logging in $wikiuser... ";

        my $ua = get_ua();
        my $addr = "http://$wikihost/$wikipath?title=Special:Userlogin&action=submitlogin";
        my $response = $ua->post($addr,
                {
                        'wpName' => $wikiuser,
                        'wpPassword' => $wikipass
                }
        );
        die $response->status_line unless $response->is_success;
        die "Login failed." unless $response->content =~ /You are now logged in/;
        print "Success.\n";
}

my $whatlinkshere;

# populate %sectionsbyredir
sub getMap {
        my $listtext = checkout($listname);
        my @headings;
        push @headings, $1 while $listtext =~ /^=+ *([^=]+) *=+/gm;

        # get whatlinkshere
        {
                print "Retreiving whatlinkshere... ";
                my $ua = get_ua();
                my $addr = "http://$wikihost/$wikipath/Special:Whatlinkshere/$listname?limit=5000";
                $addr =~ s/ /_/g;
                my $response = $ua->get($addr);
                die $response->status_line unless $response->is_success;
                print "Success.\n\n";
                $whatlinkshere = $response->content;
                $whatlinkshere =~ s/&lt;/</g;
                $whatlinkshere =~ s/&gt;/>/g;
                $whatlinkshere =~ s/&quot;/"/g;
                $whatlinkshere =~ s/&amp;/&/g;
        }

        while($whatlinkshere =~ />([^<]+)<\/a> \(redirect page\)$/mg) {
                my $redir = $1;
                # straightforward
                if(grep(/^$redir$/i, @headings) > 0) {
                        print "$redir -> $redir\n";
                        # fix regexp special characters; more may be necessary
                        $redir =~ s/\(/\\(/g;
                        $redir =~ s/\)/\\)/g;
                        $sectionsbyredir{$redir} = $redir;
                } else {
                        # handle plurals
                        if($redir =~ /^(.*)s$/) {
                                my $nonplural = $1;
                                if(grep(/^$nonplural$/i, @headings) > 0) {
                                        print "$redir -> $nonplural\n";
                                        # fix regexp special characters; more may be necessary
                                        $redir =~ s/\(/\\(/g;
                                        $redir =~ s/\)/\\)/g;
                                        $sectionsbyredir{$redir} = $nonplural;
                                        next;
                                }
                        }

                        # handle disambig-names like Foo (Fictional Universe)
                        if($redir =~ /^(.*) \(.*\)$/) {
                                my $realname = $1;
                                if(grep(/^$realname$/i, @headings) > 0) {
                                        print "$redir -> $realname\n";
                                        # fix regexp special characters; more may be necessary
                                        $redir =~ s/\(/\\(/g;
                                        $redir =~ s/\)/\\)/g;
                                        $sectionsbyredir{$redir} = $realname;
                                        next;
                                }
                        }

                        # catch-all for forms like:
                        # First Last -> Last, First
                        # Mr. Foo -> Mr. and Mrs. Foo
                        # (and more?)

                        # get all subnames of the redirect
                        my @names = split(/ /, $redir);
                        my @filter = @headings;
                        # find headings containing all of the subnames
                        foreach my $name (@names) {
                                # fix regexp special characters; more may be necessary
                                $name =~ s/\(/\\(/g;
                                $name =~ s/\)/\\)/g;
                                @filter = grep(/$name/i, @filter);
                                last if @filter == 1;
                                # don't let anything eliminate it too early
                                @filter = @headings if @filter == 0;
                        }
                        if(@filter == 1) {
                                print "$redir -> " . $filter[0] . "\n";
                                # fix regexp special characters; more may be necessary
                                $redir =~ s/\(/\\(/g;
                                $redir =~ s/\)/\\)/g;
                                $sectionsbyredir{$redir} = $filter[0];
                        } elsif(@filter == 0 || @filter == @headings) {
                                # keep everything from being listed
                                print "$redir -> ------------nothing\n";
                        } else {
                                print "$redir ->\n";
                                print "\t$_\n" foreach @filter;
                        }
                }
        }

        return if $nostart;

        print "Is this OK (y/n)? ";
        $_ = <STDIN>;
        exit unless /^y/;
        print "\n";
}

# populate %redirsbypage and %pagesbysection
sub getPages {
        while(my ($redir, $section) = each %sectionsbyredir) {
                # handle redirects with no links thereto
                next if $whatlinkshere =~ />$redir<\/a> \(redirect page\)\n<\/li>/;
                $whatlinkshere =~ />$redir<\/a> \(redirect page\)\n(.+?)<\/ul>\n<\/li>/s
                        or die "$redir not found properly in whatlinkshere";
                my $curpages = $1;
                my @pagesthissection;
                while($curpages =~ />([^<]+?)<\/a><\/li>$/mg) {
                        my $page = $1;
                        # filter out non-main namespace
                        next if $page =~ /^\w*:/;
                        next if $page =~ /^\w* talk:/;
                        push @{$redirsbypage{$page}}, $redir;
                        push @{$pagesbysection{$section}}, $page;
                }
        }
}

# mark user page as working
sub setWorking {
        return if $nostart;
        my $text = checkout("User:$wikiuser");
        my $message = "[[$listname]], started ~~~~~";
        $text =~ s/^('''Currently working on:''').*$/$1 $message/m;
        checkin $text, "Mark as working on: [[$listname]]";
}

# do the work
sub fixPages {
        foreach my $page (keys %redirsbypage) {
                my $text = checkout($page);
                # handle editing the list itself
                my $presection = ($page eq $listname) ? '' : $listname;
                foreach my $redir (@{$redirsbypage{$page}}) {
                        my $section = $sectionsbyredir{$redir};
                        $text =~ s/\[\[ *($redir) *\]\]/[[$presection#$section|$1]]/gi;
                        $text =~ s/\[\[ *$redir *\| *([^\]]*?) *\]\]/[[$presection#$section|$1]]/gi;
                }
                checkin $text, "Changing links-to-redirects-to-lists to links-to-list-sections. See userpage for more info.", 1;
        }
}

# leave a note on the talk page
sub writeNote {
        my ($exampleRedir, $exampleSection) = each %sectionsbyredir;
        my $note = <<EOM;
This page has been processed by [[User:$wikiuser|$wikiuser]], which, for browsing convenience, changes links to redirects to lists to links to the relevant list sections: e.g. <nowiki>[[$exampleRedir]] is changed to [[$listname#$exampleSection|$exampleRedir]]</nowiki>.

As a result, anyone who intends to split a section out of this page should be aware that, as of $date, the following sections were linked to from the following pages:

EOM

        foreach my $section (keys %pagesbysection) {
                $note .= "* [[$listname#$section|$section]]: ";
                foreach my $page (@{$pagesbysection{$section}}) {
                        $note .= "[[$page]], ";
                }
                chop $note; # take off last ", ";
                chop $note;
                $note .= "\n";
        }

        $note .= "\n~~~~";
        addsection "Talk:$listname", $note, 'Note to anyone intending on splitting off a section';
}

# mark user page as done
sub setDone {
        my $text = checkout("User:$wikiuser");
        $text =~ s/^('''Currently working on:''').*$/$1 nothing/m;
        {
                # chomp off all trailing newlines
                local $/ = '';
                chomp $text;
        }
        $text .= "\n# [[$listname]] (finished $date)";
        checkin $text, "Mark as done."
}