User:Coren/csb2.js

From Wikipedia, the free encyclopedia

Note: After saving, you have to bypass your browser's cache to see the changes. In Internet Explorer and Firefox, hold down the Ctrl key and click the Refresh or Reload button. Opera users have to clear their caches through Tools→Preferences, see the instructions for Opera. Konqueror and Safari users can just click the Reload button.

#! /usr/bin/perl
 
use LWPx::ParanoidAgent;
use HTTP::Cookies;
use URI::Escape;
use Text::Align::WagnerFischer;
 
 
# Shared HTTP client used by every request helper in this script.
$ua = LWPx::ParanoidAgent->new(
    timeout => 20,
);
$ua->agent("CorenSearchBot/1.0 ");

# Cookie jar backed by a file in the user's home directory (autosave on),
# attached to the shared client so the login session is reused.
$cookie_jar = HTTP::Cookies->new(
    file     => "$ENV{'HOME'}/lwp_cookies.dat",
    autosave => 1,
);
$ua->cookie_jar($cookie_jar);
 
# Print a progress message on STDOUT, prefixed with a tab and terminated
# with a newline.
#
# Parameter: the message text.
sub Doing($) {
    my $message = shift;
    print "\t" . $message . "\n";
}
 
# Select the lines of a normalized text that carry enough prose to be worth
# searching for.
#
# Parameter: a string; it is split on "\n".
# Returns: the list of lines that
#   - do not contain " Category " / " Categories ",
#   - do not contain "align", and
#   - score at least 3, where the score is the number of all-lowercase words
#     of five or more letters minus two for every "*" that sits between two
#     word characters.
sub significant($) {
    my @kept;
    LINE:
    for my $line (split "\n", $_[0]) {
        next LINE if $line =~ m/ Categor(y|ies) /;
        next LINE if $line =~ m/align/;

        # Count matches by evaluating the global match in list context.
        my $long_words = () = $line =~ m/\b[a-z]{5,}\b/g;
        my $stars      = () = $line =~ m/\b\*\b/g;

        my $words = $long_words - 2 * $stars;
        push @kept, $line unless $words < 3;
    }
    return @kept;
}
 
# Split a text into lines and drop the ones that mention a category.
#
# Parameter: a string; it is split on "\n".
# Returns: every line that does not contain " Category " or " Categories "
# (with the surrounding spaces), in the original order.
sub complete($) {
    my ($text) = @_;
    my @kept = grep { $_ !~ m/ Categor(y|ies) / } split("\n", $text);
    return @kept;
}
 
# Break lines into space-separated tokens and keep the longer ones.
#
# Parameters: a list of strings.
# Returns: a flat list of every piece (split on a single space) that is
# longer than three characters, in input order.
sub tokenize(@) {
    my @tokens = grep { length($_) > 3 }
                 map  { split / /, $_ }
                 @_;
    return @tokens;
}
 
# Reduce a text to bare word sequences, roughly one statement per line.
#
# Parameter: the text to clean up.
# Returns: the cleaned text.
#
# The work is done on a private copy; the caller's $_ is left untouched
# (the previous version assigned to the global $_).
sub statementize($) {
    my ($text) = @_;

    # Runs of two or more dashes become a single space.
    $text =~ s/---*/ /g;

    # NOTE(review): "!-?" is a character RANGE ('!' through '?', i.e. ASCII
    # 0x21-0x3F), so this blanks out all digits and most punctuation,
    # including '.' and '*'.  The period/asterisk rules below can therefore
    # never match.  If only '!', '-' and '?' were meant, the hyphen needs
    # escaping -- kept as is because the scoring thresholds were presumably
    # tuned against the current behaviour; confirm before changing.
    $text =~ tr/!-?/ /;

    #s/  */ /g;
    $text =~ s/^ *//g;
    $text =~ s/ *$//g;
    $text =~ s/\*([^ .])/$1/g;
    $text =~ s/\.  */.\n/g;
    #while(s/([^. \n]) *([A-Z][a-zA-Z0-9_]*)/\1 */gs) { }
    #while(s/\*  *\*/* /gs) { }
    $text =~ s/\.([A-Z])/\n$1/sg;
    $text =~ s/  *\././g;

    # Collapse runs of newlines into one.
    $text =~ s/\n\n*/\n/gs;
    $text =~ s/\.\n/\n/gs;
    return $text;
}
 
# Strip wiki markup from (XML-escaped) article source and hand the result to
# statementize().
#
# Parameter: article wikitext in which '<' and '>' still appear as the
# entities &lt; and &gt;.
# Returns: whatever statementize() returns for the stripped text.
#
# Works on a private copy instead of the global $_ so callers' $_ survives.
sub normalizewikitext($) {
    my ($text) = @_;

    # List markers become ':'.
    $text =~ tr/*#/::/;

    # Drop footnotes, then any remaining escaped tags, then other entities.
    $text =~ s/&lt;ref&gt;.*?&lt;\/ref&gt;/ /igs;
    $text =~ s/&lt;.*?&gt;/ /igs;
    $text =~ s/&[^;]*;/ /gs;

    # Unwrap ''italic'' / '''bold''' until none is left.
    1 while $text =~ s/('''*)(.*?)\1/ $2 /gs;

    # [[target]] -> target, [[target|label]] -> label,
    # [url label] -> label, any other [...] is removed.
    $text =~ s/\[\[([^|\]]*)]]/ $1 /gs;
    $text =~ s/\[\[.*?\|(.*?)]]/ $1 /gs;
    $text =~ s/\[[^ ]* (.*?)]/ $1 /gs;
    $text =~ s/\[.*?]/ /gs;

    # Section headings: keep the title, terminated like a sentence.  /m makes
    # '^' match at every line start; without it only a heading on the very
    # first line of the article was recognised.
    $text =~ s/^(===*)(.*?)\1/$2. /gm;

    # Templates are dropped.  The braces are escaped because recent perls
    # reject or warn about some unescaped literal '{' in patterns.
    $text =~ s/\{\{.*?}}/ /gs;

    return statementize($text);
}
 
# Strip HTML tags and character entities from a web page and hand the result
# to statementize().
#
# Parameter: raw page content.
# Returns: whatever statementize() returns for the stripped text.
#
# Works on a private copy instead of the global $_ so callers' $_ survives.
sub normalizewebtext($) {
    my ($html) = @_;
    $html =~ s/<.*?>/ /igs;     # tags
    $html =~ s/\&.*?;/ /gs;     # entities
    return statementize($html);
}
 
# POST a query to the English Wikipedia api.php endpoint.
#
# Parameters: a list of already-encoded "key=value" strings; they are joined
# with '&' to form the request body.
# Returns: the response body on HTTP success, undef otherwise.
sub WPRequest(@) {
    my @params = @_;

    my $request = HTTP::Request->new(POST => 'http://en.wikipedia.org/w/api.php');
    $request->content_type('application/x-www-form-urlencoded');
    $request->content(join('&', @params));

    my $response = $ua->request($request);
    return undef unless $response->is_success;
    return $response->content;
}
 
# Log in to en.wikipedia.org through the Special:Userlogin form so that the
# shared cookie jar holds a session.
#
# Parameters: $uname - account name; $pwd - account password.
# Returns: the string "Ok" unconditionally -- the response is not examined
# to find out whether the login was accepted.
sub WPLogin($$) {
    my ($uname, $pwd) = @_;

    # Both values are placed in a form-urlencoded body, so both are escaped.
    $uname = uri_escape($uname);
    $pwd   = uri_escape($pwd);

    my $req = HTTP::Request->new(POST => 'http://en.wikipedia.org/w/index.php?title=Special:Userlogin&action=submitlogin&type=login');
    $req->content_type('application/x-www-form-urlencoded');
    $req->content("wpName=$uname&wpPassword=$pwd&wpRemember=1&wpLoginattempt=Log+in");
    my $res = $ua->request($req);

    # Set-Cookie headers live on the response; passing the request here (as
    # the code used to) never stored anything.
    $cookie_jar->extract_cookies($res);
    return "Ok";
}
 
# Fetch the edit form of a page and extract what is needed to save it again.
#
# Parameter: the page title (not URL-escaped).
# Returns, when the HTTP request succeeds, the list
#   (URL-escaped title, edit token, extra form fields, current wikitext)
# where "extra form fields" is a string of "&wpStarttime=..." and/or
# "&wpEdittime=..." pairs (undef when neither field is present).
# Returns undef when the request fails.
sub WPStartEdit($) {
    my $page = uri_escape($_[0]);

    my $response = $ua->request(
        HTTP::Request->new(GET => "http://en.wikipedia.org/w/index.php?title=$page&action=edit")
    );
    return undef unless $response->is_success;

    my $html = $response->content;

    # Value of a hidden <input> of the edit form, or undef when it is absent.
    my $hidden = sub {
        my ($name) = @_;
        return $1 if $html =~ m/<input type='hidden' value="([^"]*?)" name="$name" \/>/s;
        return undef;
    };

    # Current page source, as shown in the form's textarea.
    my $wikitext;
    if ($html =~ m/<textarea[^>]*>(.*)<\/textarea>/s) {
        $wikitext = $1;
    }
    # Undo the HTML escaping; '&amp;' goes last so that it cannot create new
    # entities for the earlier rules.
    foreach my $pair (['&lt;', '<'], ['&gt;', '>'], ['&quot;', '"'], ['&amp;', '&']) {
        my ($entity, $char) = @$pair;
        $wikitext =~ s/\Q$entity\E/$char/gs;
    }

    my $token = $hidden->('wpEditToken');

    my $extra;
    foreach my $field ('wpStarttime', 'wpEdittime') {
        my $value = $hidden->($field);
        $extra .= "&$field=" . uri_escape($value) if defined $value;
    }

    return ($page, $token, $extra, $wikitext);
}
 
# Submit an edit prepared with WPStartEdit().
#
# Parameters:
#   $title - URL-escaped page title (as returned by WPStartEdit)
#   $et    - edit token
#   $more  - extra, already-encoded form fields (may be undef)
#   $txt   - new page text
#   $es    - edit summary
# Returns: undef when the response contains an edit form again (a
# "<textarea"), which is taken to mean the save was refused; 1 otherwise.
#
# NOTE(review): the HTTP status is not consulted, so a transport failure
# whose body has no textarea is reported as success.  Left unchanged because
# every caller retries in a loop without delay -- confirm before tightening.
sub WPTryEdit($$$$$) {
    my($title, $et, $more, $txt, $es) = @_;
    my $req = HTTP::Request->new(POST => "http://en.wikipedia.org/w/index.php?title=$title&action=submit");
    $req->content_type('application/x-www-form-urlencoded');
    $req->content(
          'wpSection='
        . '&wpSummary='.uri_escape($es)
        . '&wpSave=wpSave'
        . '&wpEditToken='.uri_escape($et)
        . '&wpTextbox1='.uri_escape($txt)
        . $more
    );
    my $res = $ua->request($req);

    # (A token was formerly re-parsed from the response here and stored in a
    # variable nobody read; that dead match has been removed.)
    return undef  if $res->content =~ m/<textarea/;
    return 1;
}
 
# Fetch the current source of a page through the API.
#
# Parameter: the page title (not URL-escaped).
# Returns: the text found inside the <rev> element of the XML reply, still
# XML-escaped; undef when the request failed or the reply holds no revision
# (previously the whole XML reply was returned as if it were the article).
sub WPArticle($) {
    my($title) = @_;
    my $art = WPRequest('action=query',
                        'prop=revisions',
                        'titles='.uri_escape($title),
                        'rvprop=content',
                        'rvlimit=1',
                        'format=xml');
    return undef unless defined $art;

    # Accept "<rev>" as well as "<rev attr=...>"; the \s keeps the pattern
    # from matching the enclosing "<revisions>" element.
    return $1 if $art =~ m/<rev(?:\s[^>]*)?>(.*?)<\/rev>/s;
    return undef;
}
 
# List article-namespace pages created since the previous call.
#
# Reads up to 500 recent changes and collects the titles of entries with
# type="1" until it meets a revision id that is not newer than the global
# $last_revid.  $last_revid is then advanced to the highest id collected.
#
# Returns: the list of titles, in the order the API reported them.
sub WPNewPages() {
    my $list = WPRequest('action=query',
                         'list=recentchanges',
                         'rclimit=500',
                         'rcnamespace=0',
                         'format=xml');
    my @news;
    my $maxrid = 0;
    while(defined $list
          and $list =~ m/<rc type="1" .*? title="([^"]*)" .*? revid="([0-9]+)"/g) {
        my ($title, $revid) = ($1, $2);
        last if $revid <= $last_revid;
        $maxrid = $revid if $revid > $maxrid;

        # The title comes out of an XML attribute, so undo the escaping;
        # otherwise "Children&#039;s ..." would be looked up literally.
        # '&amp;' is handled last so it cannot produce new entities.
        $title =~ s/&quot;/"/g;
        $title =~ s/&#0*39;/'/g;
        $title =~ s/&lt;/</g;
        $title =~ s/&gt;/>/g;
        $title =~ s/&amp;/&/g;

        push @news, $title;
    }
    $last_revid = $maxrid  if $maxrid>$last_revid;
    return @news;
}
 
# Find out who made the first revision of a page.
#
# Parameter: the page title (not URL-escaped).
# Returns: the user name from the oldest revision (rvdir=newer, rvlimit=1),
# or undef when the reply contains no matching <rev user="..." /> element.
sub WPCreator($) {
    my($title) = @_;
    my $art = WPRequest('action=query',
                        'prop=revisions',
                        'titles='.uri_escape($title),
                        'rvprop=user',
                        'rvlimit=1',
                        'rvdir=newer',
                        'format=xml');
    return undef unless defined $art;
    return undef unless $art =~ m/<rev user="([^"]*?)" \/>/s;

    # The name comes out of an XML attribute, so undo the escaping before it
    # is compared with the allies list or used as a talk page title.
    # '&amp;' is handled last so it cannot produce new entities.
    my $user = $1;
    $user =~ s/&quot;/"/g;
    $user =~ s/&#0*39;/'/g;
    $user =~ s/&lt;/</g;
    $user =~ s/&gt;/>/g;
    $user =~ s/&amp;/&/g;
    return $user;
}
 
 
# Run a web search through the Yahoo search API.
#
# Parameter: the query string.
# Returns: the list of result URLs (at most five are requested), each with a
# single trailing slash removed.  <Cache> sections of the reply are dropped
# first so that their URLs are not returned.
sub YahooFind($) {
    my $query = join(' ', @_);
    my $req = HTTP::Request->new(GET => 'http://search.yahooapis.com/WebSearchService/V1/webSearch?appid=SANITIZED&query='.uri_escape($query).'&results=5&language=en');
    my $res = $ua->request($req);
    my $r = $res->content;
    $r =~ s/<Cache>.*?<\/Cache>//sg;
    my @re = $r =~ m/<Url>([^<]*?)\/?<\/Url>/gs;

    # Report the real count; "$#re+1" inside a string printed e.g. "4+1".
    Doing("Search \"$query\" found " . scalar(@re) . " results");
    return @re;
}
 
# Search for a query and file the first three hits as candidate sources.
#
# Parameter: the search query.
# Side effects (no meaningful return value):
#   - hits on en.wikipedia.org/wiki/... are converted to page titles and
#     appended to @enwiki;
#   - other hits are appended to @web unless their host matches one of the
#     patterns in @exclude;
#   - PDF links and entries already present are skipped;
#   - processing stops once @web holds more than six entries.
# @web and @enwiki are the caller's (findmatches localizes them).
sub top3($) {
    my($q) = @_;

    # This used to read "my @uri, YahooFind($q);", which declares @uri and
    # throws the search results away, so nothing was ever examined.
    my @uri = YahooFind($q);
    $#uri=2 if $#uri>2;
    SITE:
    foreach my $uri (@uri) {
        next if $uri =~ m/\.[pP][Dd][Ff]/;
        foreach my $seen (@web) {
            next SITE if $seen eq $uri;
        }

        # Host part of the URL.  No trailing '/' is required: YahooFind
        # strips it, and bare "http://host" results must still be checked
        # against the exclusion list.
        my $site;
        $site = $1 if $uri =~ m{^[^:]*://([^/]*)};
        if($site eq 'en.wikipedia.org' and $uri=~m{/wiki/}) {
            $uri =~ s{.*/wiki/(.*)}{$1};
            $uri = uri_unescape($uri);
            $uri =~ tr/_/ /;
            foreach my $seen (@enwiki) {
                next SITE if $seen eq $uri;
            }
            push @enwiki, $uri;
            next SITE;
        }
        foreach my $re (@exclude) {
            next SITE  if $site =~ $re;
        }
        push @web, $uri;
        return if $#web > 5;
    }
}
                                                                                                                                                                        sub findmatches($) {
    my $article = WPArticle($_[0]);
    my @atokens = tokenize complete normalizewikitext $article;
#print "article <", join(' ', @atokens), ">\n";
    my @paras = significant normalizewikitext $article;
 
    my $why = undef;
    my $score = $config{MinScore};
    my $what = undef;
    my $what_ok;
    my $score_ok = 50000;
 
    local @web;
    local @enwiki;
 
    return undef if $#atokens < 5;
    $#atokens = 200 if $#atokens > 200;
 
    my @uri;
    my $ln = 0;
 
    my $title = $_[0];
    $title =~ s/\(.*?\) *//;
    foreach my $l (@paras) {
        if($ln==1 or $ln==7 or $ln==($#paras-1)) {
            if($l =~ m/ (.*)\.?/) {
                my @tq = split ' ', $1;
                my @q;
                my $num = 0;
                foreach my $w (@tq) {
                    push @q, $w if $w =~ m/[a-zA-Z0-9*]/;
                    $num++ if not $w eq '*';
                    last if $num > 9;
                }
                my $q = join ' ', @q;
                top3 "\"$title\" $q";
            }
        }
        $ln++;
    }
    return undef if $#paras < 0;                                                                                                                                            top3 "\"$title\"";
 
    foreach my $uri (@web) {
        Doing "checking $uri";
        my @src = eval {
            local $SIG{ALRM} = sub { die "alarm\n" };
            alarm 25;
            my $req = HTTP::Request->new(GET => $uri);
            alarm 0;
            my $res = $ua->request($req);
            if($res->is_success) {
                my @src = tokenize complete normalizewebtext $res->content;
#print "webpage <", join(' ', @src), ">\n";
                return @src if $#src > 9;
            }
            return undef;
        };
        next if $#src < 10;
        next if $@ eq "alarm\n";
 
        $#src = 100000/$#atokens  if $#src*$#atokens > 100000;
 
        my $alignment = Text::Align::WagnerFischer->new(
                                                    left => \@src,
                                                    right => \@atokens,
                                                    weights => [0,1,2]
                                                   );
 
        my $maybe = 'pageincluded';
        my $dif = abs ($#src-$#atokens);
        $sina = ($alignment->cost()-$dif)*1000/$#src;
        $ains = ($alignment->cost()-$dif)*1000/$#atokens;
        Doing "$#src/$#atokens $dif gives cost ".($alignment->cost()-$dif)." for $sina/$ains";
        if($ains > $sina) {
            $maybe = 'pageincludes';
            $sina = $ains;
        }
        my $need = $config{MinScore};
        $need = ($need*$#atokens)/30 if $#atokens<30;
        if($sina < $need and $sina < $score) {
            $why = $maybe;                                                                                                                                                          $score = $sina;
            $what = $uri;
        }
        if($sina < $score_ok) {
            $score_ok = $sina;
            $what_ok = $uri;
        }
    }
 
    foreach $uri (@enwiki) {
        next if $uri eq $_[0];
        my $test = WPArticle($uri);
        my @src = tokenize complete normalizewikitext $test;
        next if $#src < 10;
        my $alignment = Text::Align::WagnerFischer->new(
                                                        left => \@src,
                                                        right => \@atokens,
                                                        weights => [-1,1,2]
                                                       );
        $sina = $alignment->cost()*1000/$#src;
        $ains = $alignment->cost()*1000/$#atokens;
        $sina = $ains if $ains < $sina;
        if($sina<-400 and $sina < $score) {
            $why = 'wikipage';
            $what = $uri;
            $score = $sina;
        }
        if($sina < $score_ok) {
            $score_ok = $sina;
            $what_ok = $uri;
        }
    }
 
    return ($why, $what, ($score)/10) if $score < $config{MinScore};
    Doing "Best match was $what_ok with $score_ok";
    return ('', '', 1000);
}
 
# Tag an article as a suspected copy, notify its creator and list it on the
# report page.
#
# Parameters:
#   $title - article title
#   $type  - kind of match; used in the template names and compared with
#            'wikipage' to choose the report format
#   $what  - URL or page title of the suspected source
# Returns: a short reason string when nothing was tagged ("creator trusted",
# "already tagged", ...), undef once the tag has been placed.
#
# Signatures and timestamps are written as "~~~~" / "~~~~~" so that the wiki
# fills them in at save time.  The strings used to contain a fixed
# "22:41, 18 August 2007 (UTC)" (and a personal signature), which looks like
# that markup having been expanded when this source was saved on a wiki page.
sub TagPage($$$) {
    my($title, $type, $what) = @_;
    my $tag = "{{csb-$type|1=$what}}";
    my $user = WPCreator($title);
    foreach my $ally (@allies) {
        return "creator trusted" if $user eq $ally;
    }
    $user = "User talk:$user" if defined $user;

    # Retry until the tag is saved or the page turns out not to need it.
    while(1) {
        my($ttl, $token, $more, $text) = WPStartEdit($title);
        return "article is (now) a redirect"    if $text =~ m/^#REDIRECT/;
        return "attributed"                     if $text =~ m/\{\{DANFS}}/i;
        return "attributed"                     if $text =~ m/\{\{[cC]atholic}}/i;
        return "speedied"                       if $text =~ m/\{\{db/;
        return "marked copyvio"                 if $text =~ m/\{\{copyvio/;
        return "already tagged"                 if $text =~ m/\{\{csb-/;
        return "page gone"                      if length($text)<20;

        $text = "$tag\n\n" . $text;
        if(WPTryEdit($ttl, $token, $more, $text, "Tagging for copyvio of $what"))
          {
            # Leave a notice on the creator's talk page.
            while(defined $user) {
                ($ttl, $token, $more, $text) = WPStartEdit($user);
                $text .= "\n{{subst:csb-notice-$type|$title|url=$what}} ~~~~\n";
                last if WPTryEdit($ttl, $token, $more, $text, "Notifying user of copyvio on $title");
            }
            # Add the article to the report page unless it is listed already.
            while(1) {
                ($ttl, $token, $more, $text) = WPStartEdit($config{ReportTo});
                # \Q..\E: titles may contain regex metacharacters such as
                # parentheses.
                my $re = qr/\[\[\Q$title\E]]/s;
                last if $text =~ $re;
                if($type eq 'wikipage') {
                    $text .= "* [[$title]] &mdash; [[$what]]. Reported by [[User:CorenSearchBot|CSBot]] at ~~~~~\n";
                } else {
                    $text .= "* [[$title]] &mdash; [$what $what]. Reported by [[User:CorenSearchBot|CSBot]] at ~~~~~\n";
                }
                last if WPTryEdit($ttl, $token, $more, $text, "Adding violation on $title");
            }
            return undef;
          }
    }
}
                                                                                                                                                                        sub configstatus() {
    undef %config;
    undef @exclude;
    undef @allies;
    foreach $l (split "\n", WPArticle("User:CorenSearchBot/config")) {
        $config{$1} = $2  if $l =~ m/ *([A-Za-z]+)=(.*)/;
    }
    foreach $l (split "\n", WPArticle("User:CorenSearchBot/exclude")) {
        push @exclude, qr/$1$/i  if $l =~ m/ *([^=]*\.[a-z]{2,4})$/;
    }
    foreach $l (split "\n", WPArticle("User:CorenSearchBot/allies")) {
        push @allies, $1  if $l =~ m/  *([^=]*)$/;
    }
}
 
# Start-up: log in, load the configuration and report what was read.
# The value returned by WPLogin is stored but not examined.
my $ok = WPLogin('CorenSearchBot', SANITIZED);

configstatus();
print "Configuration read.\n"
    . "(" . scalar(@exclude) . " exclusions)\n"
    . "(" . scalar(@allies) . " allies)\n"
    . "Report to '$config{ReportTo}'\n"
    . "Is a copy below $config{MinScore}\n"
    . "\n";

# Queue of pages to check, seeded with the titles given on the command line.
my @npq = @ARGV;

# Manually requested pages still waiting to be checked.
my @manuals;
 
# Main loop: keep the queue of new pages filled, check one page per pass,
# and serve manual requests (or sleep) when there is nothing new.
#
# Timestamps in the result templates are written as "~~~~~" so that the wiki
# fills them in at save time.  They used to be a fixed
# "22:41, 18 August 2007 (UTC)", which looks like that markup having been
# expanded when this source was saved on a wiki page.
while(1) {
    if($#npq < 1) {
        print "Fetching new pages\n";
        push @npq, WPNewPages();
        print $#npq+1, " page(s) to check. (last revid $last_revid)\n";
        if($#npq<0) {
            if($#manuals<0) {
                # Collect pending manual requests: lines ending in [[Title]].
                foreach my $line (split "\n", WPArticle("User:CorenSearchBot/manual")) {
                    push @manuals, $1  if $line =~ m/\[\[([^]]*)]]$/;
                }
                # Once requests have been picked up, blank the pending
                # section; retry until the save goes through.
                while($#manuals >= 0) {
                    my ($ttl, $token, $more, $text) = WPStartEdit("User:CorenSearchBot/manual");
                    $text =~ s/==Unprocessed requests==.*==Recent Results==/==Unprocessed requests==\n\n==Recent Results==/s;
                    last if WPTryEdit($ttl, $token, $more, $text, "Removing pending requests");
                }
            }
            if($#manuals>=0) {
                my $page = pop @manuals;
                my $result = "{{User:CorenSearchBot/result-no|$page|~~~~~}}\n";
                print "Manually checking [[$page]]\n";
                my($why, $what, $score) = findmatches($page);
                $score = int(100-$score);
                $result = "{{User:CorenSearchBot/result-unknown|$page|~~~~~}}\n" if $score>-10;
                if(defined $why and not $why eq '') {
                    print "\t\033[31;1m[[$page]] is $why of [$what] with confidence $score\033[0m\n";
                    $result = "{{User:CorenSearchBot/result-yes|$page|$score|~~~~~|url=$what}}\n";
                }
                # Append the verdict to the results page; retry until saved.
                while(1) {
                    my ($ttl, $token, $more, $text) = WPStartEdit("User:CorenSearchBot/results");
                    $text .= $result;
                    last if WPTryEdit($ttl, $token, $more, $text, "Posting result of manual check");
                }
            } else {
                # Nothing to do at all: wait, then refresh the configuration.
                print "Sleeping.\n";
                sleep 20;
                configstatus();
            }
        }
    }
    if($#npq >= 0) {
        my $page = shift @npq;
        print "Checking [[$page]]\n";
        my($why, $what, $score) = findmatches($page);
        if(defined $why and not $why eq '') {
            $score = int(100-$score);
            print "\t\033[31;1m[[$page]] is $why of [$what] with confidence $score\033[0m\n";
            my $res = TagPage($page, $why, $what);
            if(defined $res) {
                print "\tTagging: $res\n";
            } else {
                print "\tTags placed\n";
            }
        }
    }
}