User:Lar/ClassificationTableGen
From Wikipedia, the free encyclopedia
Old version at: User:Lar/ClassificationTableGen/Backlev
Perl code: This code generated User:Lar/Sandbox2 (version 8). There is a lot of work to do on it yet, but if you stumble across this, feedback is welcome. Not ready for public release yet (if ever).
Signature shows update time/date ++Lar: t/c 03:34, 27 April 2006 (UTC)
note: requires Perl 5.8; only tested on a Wintel platform.
[edit] Helper module, based on Pearle Wisebot code
... this module needs some better commenting and trimming out of code not needed for what I need. It ALSO needs the user/cookie/token stuff made generic. It is hard-coded to use my userid, so it may not work for anyone else (no, you can't have my cookie file; read the User:Pearle pages on how to set up your own, then look for Lar in there and change it to your userid).
Filename WP_util_pearlebot.pm
package WP_util_pearlebot; # assumes WP_util_pearlebot.pm
# based on boilerplate module declaration found here:
# http://perldoc.perl.org/perlmod.html#Perl-Modules-module
#
# based on code that is part of the "Pearle Wisebot"
# http://en.wikipedia.org/wiki/User:Pearle
# http://en.wikipedia.org/wiki/User:Pearle/pearle-documentation.txt
# http://en.wikipedia.org/wiki/User:Pearle/pearle.pl
# which was created by [[User:Beland]]:
# http://en.wikipedia.org/wiki/User:Beland
# Mods by Larry Pieniazek ( [[user:Lar]] )
use strict;
use warnings;
use Data::Dumper;
use Getopt::Std;
use Time::HiRes;
use utf8;
#use encoding 'utf8';
# Initialization
use LWP;
use HTTP::Cookies;
use HTTP::Request::Common qw(POST);
use HTML::Entities;
BEGIN {
    use Exporter ();
    our ($VERSION, @ISA, @EXPORT, @EXPORT_OK, %EXPORT_TAGS);
    # Set the version for version checking.
    $VERSION = 1.00;
    # if using RCS/CVS, this may be preferred
    # $VERSION = sprintf "%d.%03d", q$Revision: 1.1 $ =~ /(\d+)/g;
    @ISA = qw(Exporter);
    # Export the subs this module actually defines.  The boilerplate
    # previously exported &func1 &func2 &func4, which do not exist in
    # this file, so nothing usable was ever exported and calling an
    # imported name would die with "Undefined subroutine" at runtime.
    @EXPORT = qw(&myLog &getPage &postPage &retry &printWikitext &test);
    %EXPORT_TAGS = ( ); # eg: TAG => [ qw!name1 name2! ],
    # your exported package globals go here,
    # as well as any optionally exported functions
    # @EXPORT_OK = qw($Var1 %Hashit &func3);
    # we do not have any externals
}
our @EXPORT_OK;
# exported package globals go here
# our $Var1;
# our %Hashit;
# non-exported package globals go here
# initialize package globals, first exported ones
# then the others (which are still accessible as $WP_util_pearlebot::stuff)
# all file-scoped lexicals must be created before
# the functions below that use them.
# file-private lexicals go here
# here's a file-private function as a closure,
# callable as &$priv_func; it cannot be prototyped.
# my $priv_func = sub {
# stuff goes here.
# };
# make all your functions, whether exported or not;
# remember to put something interesting in the {} stubs
# Forward declarations for the subs defined below.
sub myLog;
sub getPage;
sub postPage;
sub retry;
sub printWikitext;
sub test;
END { } # module clean-up code here (global destructor)
## YOUR CODE GOES HERE
# LWP:UserAgent is a library which allows us to create a "user agent"
# object that handles the low-level details of making HTTP requests.
# NOTE(review): $::ua, $::speedLimit, $::retryDelay and $::nullOK are
# main-package globals, so this module and any script using it share
# one HTTP session and one throttle state.
$::ua = LWP::UserAgent->new(timeout => 300);
$::ua->agent("LarUtil/0.1");
# Cookie jar persists the login session to disk between runs
# (autosave => 1 writes it out when the jar is destroyed).
$::ua->cookie_jar(HTTP::Cookies->new(file => "cookies.lar.txt",
autosave => 1));
# $::ua->cookie_jar->load();
# $::ua->
#
# $ua = LWP::UserAgent->new;
# $req = HTTP::Request->new(GET => 'http://www.linpro.no/secret/');
# $req->authorization_basic('aas', 'mypassword');
# print $ua->request($req)->as_string;
# Hot pipes
# Unbuffered STDOUT so progress messages appear immediately.
$| = 1;
#set default speedlimit
# Minimum seconds between edits; retry() raises this when the wiki is slow.
$::speedLimit = 10;
##---
## test();
# Smoke test: fetch Wikipedia:Sandbox, append a signature line, post it
# back as a non-minor edit, then die() to halt the run.  Performs a LIVE
# network edit -- for manual use only.
sub test
{
my ($target, $text, $editTime, $startTime, $token);
#$target = "Special:Userlogin";
#($text, $editTime, $startTime, $token) = getPage($target);
# temporary
# Tell getPage() an empty page body is acceptable (the sandbox may be blank).
$::nullOK = "yes";
$target = "Wikipedia:Sandbox";
($text, $editTime, $startTime, $token) = getPage($target);
print $text;
# die ("nopost Test complete.");
$text .= "\nEat my electrons! -- testing Pearle clone ([[User:Lar]]) \n";
print "---\n";
# "najor" is deliberately anything-but-"yes" so postPage does NOT mark the edit minor.
postPage ($target, $editTime, $startTime, $token, $text, "Test 028", "najor"); # (no it is not minor)
die ("Test complete.");
}
##---
# getPage($target)
# Fetch the edit form for wiki page $target and scrape out the pieces
# needed to later save an edit.  Returns a 4-element list:
#   ($text, $editTime, $startTime, $token)
# where $text is the decoded wikitext, and the other three are the hidden
# wpEdittime / wpStarttime / wpEditToken form fields that postPage() must
# echo back.  Dies on a blank target, on login loss, on unparseable text,
# and on unhandled HTTP errors; 50X errors are routed through retry().
# NOTE(review): the scraping regexes are hard-wired to the 2006-era
# MediaWiki edit-form HTML and to user Lar's "My talk" link; any skin or
# software change breaks them.
sub getPage
{
my ($target, $request, $response, $reply, $text, $text2,
$editTime, $startTime, $attemptStartTime, $attemptFinishTime,
$token);
$target = $_[0];
# Refuse an empty/whitespace-only page title.
if ($target =~ m/^\s*$/)
{
myLog("getPage: Null target.");
die("getPage: Null target.");
}
# urlSafe ($target);
# Monitor wiki server responsiveness
$attemptStartTime = Time::HiRes::time();
# Create a request-object
print "GET http://en.wikipedia.org/w/wiki.phtml?title=${target}&action=edit\n";
myLog("GET http://en.wikipedia.org/w/wiki.phtml?title=${target}&action=edit\n");
$request = HTTP::Request->new(GET => "http://en.wikipedia.org/w/wiki.phtml?title=${target}&action=edit");
$response = $::ua->request($request);
if ($response->is_success)
{
$reply = $response->content;
# Monitor wiki server responsiveness
$attemptFinishTime = Time::HiRes::time();
# Report timing so retry() can adjust $::speedLimit.
retry ("success", "getPage", sprintf("%.3f", $attemptFinishTime-$attemptStartTime));
# This detects whether or not we're logged in.
unless ($reply =~ m%<a href="/wiki/User_talk:Lar">My talk</a>%)
{
# We've lost our identity.
myLog ("Wiki server is not recognizing me (1).\n---\n${reply}\n---\n");
die ("Wiki server is not recognizing me (1).\n");
}
my $saveReply=$reply;
myLog ("Dump reply prior to regex processing in getPage... \n---\n${saveReply}\n---\n");
# Pull the wikitext out of the edit form's textarea.
# NOTE(review): $1 is used without checking the match succeeded; if the
# HTML layout changes, $text is left undef and only the later sanity
# checks catch it.
$reply =~ m%<textarea\s+tabindex='1'\s+accesskey=","\s+name="wpTextbox1"\s+id="wpTextbox1"\s+rows='25'\s+cols='80'\s+>(.*?)</textarea>%s;
$text = $1;
# $reply =~ m%<textarea\s+tabindex='1'\s+accesskey=","\s+name="wpTextbox1"\s+id="wpTextbox1"\s+rows='25'\s+cols='80'\s+>(.*?)</textarea>%s;
# $reply =~ m%<textarea\s*tabindex='1'\s*accesskey=","\s*name="wpTextbox1"\s*rows='25'\s*cols='80'\s*>(.*?)</textarea>%s;
# $text = $1;
# print "debug: 1: ".$1."\n";
# Hidden form fields the server requires on save.
$reply =~ m/value="(\d+)" name="wpEdittime"/;
$editTime = $1;
# Added 22 Aug 2005 to correctly handle articles that have
# been undeleted
$reply =~ m/value="(\d+)" name="wpStarttime"/;
$startTime = $1;
# Added 9 Mar 2005 after recent software change.
$reply =~ m/value="(\w+)" name="wpEditToken"/;
$token = $1;
###
# Empty text is fatal unless the caller opted in via $::nullOK.
if (($text =~ m/^\s*$/)
and ($::nullOK ne "yes"))
{
myLog ("getPage($target): Null text!\n");
myLog "\n---\n$reply\n---\n";
die ("getPage($target): Null text!\n");
}
if (($editTime =~ m/^\s*$/)
and ($::nullOK ne "yes"))
{
myLog ("getPage($target): Null time!\n");
myLog "\n---\n$reply\n---\n";
die ("getPage($target): Null time!\n");
}
# Raw angle brackets mean the textarea regex grabbed HTML, not wikitext.
if (($text =~ m/>/) or
($text =~ m/</))
{
print $text;
myLog "\n---\n$text\n---\n";
myLog ("getPage($target): Bad text suck!\n");
die ("getPage($target): Bad text suck!\n");
}
# Change ( &quot; -> " ) etc
# This function is from HTML::Entities.
decode_entities($text);
# This may or may not actually work
$::ua->cookie_jar->save();
return ($text, $editTime, $startTime, $token);
}
else
{
myLog ("getPage($target): HTTP ERR (".$response->status_line.") http://en.wikipedia.org/w/wiki.phtml?title=${target}&action=edit\n".$response->content."\n");
print ("getPage($target): HTTP ERR (".$response->status_line.") http://en.wikipedia.org/w/wiki.phtml?title=${target}&action=edit\n".$response->content."\n");
# 50X HTTP errors mean there is a problem connecting to the wiki server
if (($response->status_line =~ m/^500/)
or ($response->status_line =~ m/^502/)
or ($response->status_line =~ m/^503/))
{
# Back off and re-invoke getPage with the same arguments.
return(retry("getPage", @_));
}
else
{
# Unhandled HTTP response
die ("getPage($target): HTTP ERR (".$response->status_line.") http://en.wikipedia.org/w/wiki.phtml?title=${target}&action=edit\n");
}
}
}
# postPage($pageName, $editTime, $startTime, $editToken, $textToPost,
#          $summaryEntry, $minor)
# Submit an edit to the wiki.  $editTime/$startTime/$editToken must be the
# values scraped by getPage() for the same page (edit-conflict detection).
# $summaryEntry is truncated to 200 chars and defaults to "Automated
# editing."; the edit is marked minor only when $minor eq "yes".
# Returns the response body on success.  Dies on deleted-article
# conflicts, unexpected status lines / redirect targets, and edit
# conflicts; 50X errors are routed through retry().
sub postPage
{
my ($request, $response, $pageName, $textToPost, $summaryEntry,
$editTime, $startTime, $actual, $expected, $attemptStartTime,
$attemptFinishTime, $date, $editToken, $minor);
$pageName = $_[0];
$editTime = $_[1];
$startTime = $_[2];
$editToken = $_[3];
$textToPost = $_[4];
$summaryEntry = $_[5]; # Max 200 chars!
$minor = $_[6];
$summaryEntry = substr($summaryEntry, 0, 200);
if ($pageName eq "")
{
myLog ("postPage(): Empty pageName.\n");
die ("postPage(): Empty pageName.\n");
}
if ($summaryEntry eq "")
{
$summaryEntry = "Automated editing.";
}
# Monitor server responsiveness
$attemptStartTime = Time::HiRes::time();
# Build the form POST; the only difference between the two branches is
# the wpMinoredit checkbox.
if ($minor eq "yes")
{
$request = POST "http://en.wikipedia.org/w/wiki.phtml?title=${pageName}&action=submit",
[wpTextbox1 => $textToPost,
wpSummary => $summaryEntry,
wpSave => "Save page",
wpMinoredit => "on",
wpEditToken => $editToken,
wpStarttime => $startTime,
wpEdittime => $editTime];
# Optional: wpWatchthis
}
else
{
$request = POST "http://en.wikipedia.org/w/wiki.phtml?title=${pageName}&action=submit",
[wpTextbox1 => $textToPost,
wpSummary => $summaryEntry,
wpSave => "Save page",
wpEditToken => $editToken,
wpStarttime => $startTime,
wpEdittime => $editTime];
# Optional: wpWatchthis, wpMinoredit
}
# ---
## If posts are failing, you can uncomment the below to see what
## HTTP request is being made.
# myLog($request->as_string());
# print $request->as_string(); $::speedLimit = 60 * 10;
# print $::ua->request($request)->as_string;
# ---
myLog("POSTing...");
print "POSTing...";
# Pass request to the user agent and get a response back
$response = $::ua->request($request);
myLog("POSTed.\n");
print "POSTed.\n";
# NOTE(review): this pattern looks like it is missing the word "you"
# ("confirm that you really want"), so it may never match the actual
# MediaWiki message -- TODO confirm against the live site.
if ($response->content =~ m/Please confirm that really want to recreate this article./)
{
myLog ($response->content."\n");
die ("Deleted article conflict! See log!");
}
# Check the outcome of the response
if (($response->is_success) or ($response->is_redirect))
{
# Monitor server responsiveness
$attemptFinishTime = Time::HiRes::time();
retry ("success", "postPage", sprintf("%.3f", $attemptFinishTime-$attemptStartTime));
# A successful save normally answers with a 302 redirect to the page;
# a 200 is also tolerated below.
$expected = "302 Moved Temporarily";
$actual = $response->status_line;
if (($expected ne $actual)
and ($actual ne "200 OK"))
{
myLog ("postPage(${pageName}, $editTime)#1 - expected =! actual\n");
myLog ($request->as_string());
myLog ("EXPECTED: '${expected}'\n");
myLog (" ACTUAL: '${actual}'\n");
die ("postPage(${pageName}, $editTime)#1 - expected =! actual - see log\n");
}
# The redirect should point back at the article; ' and * must be
# escaped the same way the server does before comparing.
$expected = "http://en.wikipedia.org/wiki/${pageName}";
$expected =~ s/\'/%27/g;
$expected =~ s/\*/%2A/g;
# $expected = urlEncode($expected);
$actual = $response->headers->header("Location");
if (($expected ne $actual)
and !(($actual eq "") and ($response->status_line eq "200 OK")))
{
myLog ("postPage(${pageName}, $editTime)#2 - expected =! actual\n");
myLog ("EXPECTED: '${expected}'\n");
myLog (" ACTUAL: '${actual}'\n");
die ("postPage(${pageName}, $editTime)#2 - expected =! actual - see log\n");
}
if ($response->content =~ m/<h1 class="firstHeading">Edit conflict/)
{
myLog ("Edit conflict on '$pageName' at '$editTime'!\n");
die ("Edit conflict on '$pageName' at '$editTime'!\n");
}
$::ua->cookie_jar->save();
return ($response->content);
}
else
{
# Shell out for a human-readable timestamp for the log.
$date = `date`;
$date =~ s/\n//g;
myLog ("Bad response to POST to $pageName at $date.\n".$response->status_line."\n".$response->content."\n");
# 50X HTTP errors mean there is a problem connecting to the wiki server
if (($response->status_line =~ m/^500/)
or ($response->status_line =~ m/^502/)
or ($response->status_line =~ m/^503/))
{
print "Bad response to POST to $pageName at $date.\n".$response->status_line."\n".$response->content."\n";
# Back off and re-invoke postPage with the same arguments.
return(retry("postPage", @_));
}
else
{
# Unhandled HTTP response
die ("Bad response to POST to $pageName at $date.\n".$response->status_line."\n");
}
}
}
# myLog($message)
# Append a single message string to the bot's log file
# ("pearle-wisebot.ersatz.log.txt" in the current directory).  The file
# is opened and closed on every call so a crashed run still leaves a
# complete log.  Dies if the log cannot be opened or flushed.
sub myLog
{
    my ($message) = @_;
    # Lexical filehandle + 3-arg open (the original used a bareword LOG
    # handle with 2-arg open, which is unsafe if the filename ever changes).
    open my $log, '>>', 'pearle-wisebot.ersatz.log.txt'
        or die "Could not append to log!";
    print {$log} $message;
    # Buffered write errors only surface at close, so check it.
    close $log
        or die "Could not close log!";
}
# A call to this recursive function handles any retries necessary to
# wait out network or server problems. It's a bit of a hack.
#
# Two calling conventions share this sub:
#   retry("success", $caller, $seconds)  -- report a successful request's
#       response time; resets $::retryDelay and tunes $::speedLimit.
#   retry($callType, @origArgs)          -- a request failed; escalate the
#       backoff ($::retryDelay: 10s -> 1m -> 10m -> 1h, then stays at 1h),
#       sleep it out with a countdown, and recursively re-invoke the
#       original sub with its original arguments, returning its result.
# State lives in the globals $::retryDelay and $::speedLimit.
sub retry
{
my ($callType, @args, $i, $normalDelay, $firstRetry,
$secondRetry, $thirdRetry);
($callType, @args) = @_;
### ATTENTION ###
# Increasing the speed of the bot to faster than 1 edit every 10
# seconds violates English Wikipedia rules as of April, 2005, and
# will cause your bot to be banned. So don't change $normalDelay
# unless you know what you are doing. Other sites may have
# similar policies, and you are advised to check before using your
# bot at the default speed.
#################
# HTTP failures are usually an indication of high server load.
# The retry settings here are designed to give human editors
# priority use of the server, by allowing it ample recovering time
# when load is high.
# Time to wait before retry on failure, in seconds
$normalDelay = 10; # Normal interval between edits is 10 seconds
$firstRetry = 60; # First delay on fail is 1 minute
$secondRetry = 60 * 10; # Second delay on fail is 10 minutes
$thirdRetry = 60 * 60; # Third delay on fail is 1 hour
# SUCCESS CASE
# e.g. retry ("success", "getPage", "0.23");
if ($callType eq "success")
{
myLog("Response time for ".$args[0]." (sec): ".$args[1]."\n");
# Reset the failure backoff; the last request went through.
$::retryDelay = $normalDelay;
# Only postPage timings drive the speed limit (posts load the server most).
if ($args[0] eq "postPage")
{
# If the response time is greater than 20 seconds...
if ($args[1] > 20)
{
print "Wikipedia is very slow. Increasing minimum wait to 10 min...\n";
myLog("Wikipedia is very slow. Increasing minimum wait to 10 min...\n");
$::speedLimit = 60 * 10;
}
# If the response time is between 10 and 20 seconds...
elsif ($args[1] > 10)
{
print "Wikipedia is somewhat slow. Setting minimum wait to 60 sec...\n";
myLog("Wikipedia is somewhat slow. Setting minimum wait to 60 sec...\n");
$::speedLimit = 60;
}
# If the response time is less than 10 seconds...
else
{
if ($::speedLimit > 10)
{
print "Returning to normal minimum wait time.\n";
myLog("Returning to normal minimum wait time.\n");
$::speedLimit = 10;
}
}
}
return();
}
# e.g. retry ("getPage", "George_Washington")
# FAILURE CASES
# Escalation ladder keyed off the exact current value of $::retryDelay;
# the 0 case covers a never-initialized $::retryDelay.
elsif (($::retryDelay == $normalDelay)
or ($::retryDelay == 0))
{
print "First retry for ".$args[0]."\n";
myLog("First retry for ".$args[0]."\n");
$::retryDelay = $firstRetry;
$::speedLimit = 60 * 10;
}
elsif ($::retryDelay == $firstRetry)
{
print "Second retry for ".$args[0]."\n";
myLog("Second retry for ".$args[0]."\n");
$::retryDelay = $secondRetry;
$::speedLimit = 60 * 10;
}
elsif ($::retryDelay == $secondRetry)
{
print "Third retry for ".$args[0]."\n";
myLog("Third retry for ".$args[0]."\n");
$::retryDelay = $thirdRetry;
$::speedLimit = 60 * 10;
}
elsif ($::retryDelay == $thirdRetry)
{
# Fourth and later failures keep retrying hourly forever.
print "Nth retry for ".$args[0]."\n";
myLog("Nth retry for ".$args[0]."\n");
$::retryDelay = $thirdRetry;
$::speedLimit = 60 * 10;
}
else
{
die ("retry(): Internal error - unknown delay factor '".$::retryDelay."'\n");
}
# DEFAULT TO FAILURE CASE HANDLING
# Sleep out the backoff one second at a time, printing a countdown
# (\r keeps it on one line on STDERR).
$i = $::retryDelay;
while ($i >= 0)
{
sleep (1);
print STDERR "Waiting $i seconds for retry...\r";
$i--;
}
print " \r";
# DO THE ACTUAL RETRY
# Dispatch back to the sub that originally failed.
# NOTE(review): getCategoryArticles, getSubcategories and getURL are not
# defined in this file -- presumably they exist in the full Pearle bot
# this module was extracted from; verify before relying on those paths.
if ($callType eq "getPage")
{
return(getPage(@args));
}
elsif ($callType eq "postPage")
{
return(postPage(@args));
}
elsif ($callType eq "getCategoryArticles")
{
return(getCategoryArticles(@args));
}
elsif ($callType eq "getSubcategories")
{
return(getSubcategories(@args));
}
elsif ($callType eq "getURL")
{
return(getURL(@args));
}
else
{
myLog ("retry(): Unknown callType: $callType\n");
die ("retry(): Unknown callType: $callType\n");
}
}
# printWikitext($target)
# Fetch article $target (surrounding [[...]] brackets are stripped if
# present) and save its wikitext to ./$target, plus an HTML rendering
# fetched via wget to ./$target.html.
# Invoked as: perl pearle.pl PRINT_WIKITEXT Article_you_want_to_get
## Warning: Saves to a file in the current directory with the same name
## as the article, plus another file with the .html extension.
sub printWikitext
{
    my ($target) = @_;
    # Accept either a bare title or a [[:wiki-link]] form.
    $target =~ s/^\[\[://;
    $target =~ s/\]\]$//;
    my ($text, $editTime, $startTime, $token) = getPage($target);
    # Save the wikicode version to a file.  (The original used an
    # unchecked bareword-handle open; failures were silently ignored.)
    open my $wikitext, '>', "./${target}"
        or die "printWikitext: cannot write ./${target}: $!";
    print {$wikitext} $text;
    close $wikitext
        or die "printWikitext: close failed for ./${target}: $!";
    # Save the HTML version to a file.
    # SECURITY NOTE(review): ${target} is interpolated into a shell
    # command, so an article name containing shell metacharacters could
    # inject commands.  Left as-is to preserve behavior, but consider a
    # list-form system("wget", ...) call instead.
    print `wget http://en.wikipedia.org/wiki/${target} -O ./${target}.html`;
}
1; # don't forget to return a true value from the file
[edit] Main code
Invoke as (for example)
perl genClassTable.pl -d 2 -C Wikipedia:WikiProject_The_Beatles/Categories -a leaveOrdered.txt -q C:\shortprg\AWB\enwiki-20060303-categorylinks.sql -o bigone2c.txt >runlog2c.txt
Filename: genClassTable.pl
#!/usr/bin/perl -w
#---------------------------------------------------------------------------#
# process files and generate a category table
# Author: Larry Pieniazek (IBM/Ascential Software) as hobby project
# Adapted from stuff I cribbed from all over.
# (c)Larry Pieniazek 2006. This library is free software; you can redistribute it
# and/or modify it under the same terms as Perl itself.
# additionally, can be redistributed and modified under GFDL or CC-SA as you choose
#
# Abstract:
# This perlscript is designed to parse category SQL dumps from wikipedia
# which are found here: http://download.wikimedia.org/enwiki/
# For example the 23 March dump is called
# http://download.wikimedia.org/enwiki/20060323/enwiki-20060323-categorylinks.sql.gz
#
# The parsing is to generate article classification tables such as those found at
# http://en.wikipedia.org/wiki/Wikipedia_talk:WikiProject_The_Beatles/Article_Classification
#
# In addition to the dump (currently must have been converted to linefeed delimited tuples)
# the other input is a list of categories of interest, one per line.
#
#---------------------------------------------------------------------------#
use strict;
use warnings;   # was missing; catches undef use and typo'd globals at runtime
use WP_util_pearlebot;
use WP_util_ClassTable;
use Getopt::Std;
#---------------------------------------------------------------------------#
# Main routine -
#   process options
#   read in categories desired
#   build hash of articles by parsing SQL file
#   write out table file using hash
#---------------------------------------------------------------------------#
# main
my $rc = 0;
# Parse command-line switches into the module's shared %options hash;
# malformed input falls through to the usage message.
# (-h help, -v verbose, -d debug level, -q SQL dump file, -a article
#  special-key list file, -C wiki category page, -c category file,
#  -o output file)
getopts('hvd:q:a:C:c:o:', \%WP_util_ClassTable::options) or WP_util_ClassTable::Usage(); # debug also d
WP_util_ClassTable::ProcessOptions();
if ($WP_util_ClassTable::debug > 1) { print "post process, pre read cat\n"; }
# Optional -a file maps articles to special sort keys; default to empty
# so later lookups are safe.
if (defined $WP_util_ClassTable::options{'a'}) { # using page with art special key list
    WP_util_ClassTable::ReadArtKeyFile();
} else { # no list to read so make it empty
    %WP_util_ClassTable::artSpecialKeyHash = ();
}
# The category list comes either from a live wiki page (-C) or from a
# local file (-c / default).
if (defined $WP_util_ClassTable::options{'C'}) { # using page with cat lists
    $rc = WP_util_ClassTable::FetchCatPage($WP_util_ClassTable::catArtPage);
    if ($rc) { die "error fetching category list from Wikipedia\n"; }
} else {
    $rc = WP_util_ClassTable::ReadCatFile();
    if ($rc) { die "error reading category list\n"; }
}
# Walk the SQL dump and build the article/category structure.
$rc = WP_util_ClassTable::ParseSQL();
if ($rc) { die "error reading SQL or building structure\n"; }
# Emit the classification table to the -o output file.
$rc = WP_util_ClassTable::WriteTable();
if ($rc) { die "error building table\n"; }
exit 0;

