User:Lar/ClassificationTableGen/Backlev
From Wikipedia, the free encyclopedia
Perl code: This code generated User:Lar/Sandbox2 (version 6).. There is a lot of work to do on it yet but if you stumble across this, feedback welcome. Not ready for public release yet (if ever).
Updated as of ++Lar: t/c 05:06, 27 March 2006 (UTC)
#!/usr/bin/perl -w
#---------------------------------------------------------------------------#
# process files and generate a category table
# Author: Larry Pieniazek (IBM/Ascential Software) as hobby project
# Adapted from stuff I cribbed from all over.
# (c)Larry Pieniazek 2006. This library is free software; you can redistribute it
# and/or modify it under the same terms as Perl itself.
# additionally, can be redistributed and modified under GFDL or CC-SA as you choose
#
# Abstract:
# This perlscript is designed to parse category SQL dumps from wikipedia
# which are found here: http://download.wikimedia.org/enwiki/
# For example the 23 March dump is called
# http://download.wikimedia.org/enwiki/20060323/enwiki-20060323-categorylinks.sql.gz
#
# The parsing is to generate article classification tables such as those found at
# http://en.wikipedia.org/wiki/Wikipedia_talk:WikiProject_The_Beatles/Article_Classification
#
# In addition to the dump (currently must have been converted to linefeed delimited tuples)
# the other input is a list of categories of interest, one per line.
#
#---------------------------------------------------------------------------#
use strict;
use Data::Dumper;
use Getopt::Std;
# things we may want to use at some point
# use File::Spec::Functions;
#---------------------------------------------------------------------------#
# Subroutine prototypes: #
#---------------------------------------------------------------------------#
# setup
sub Usage; # print info message about how to use this
sub ProcessOptions; # Process Command Line Options.
# utility
sub ScoreToBlank; # underscores to blanks
sub BlankToScore; # blanks to underscores
sub FlipComma; # reverse a reversed comma string. "Lennon, John" -> "John Lennon"
sub UnEscape; # remove escapes with a clever rexp
# general
sub ReadCatFile; # read the category file into the catArray and is_catHash
sub ParseSQL; # parse the big SQL file and build the article data (hashref $collect)
sub WriteTable; # create the output
sub WriteTableHeader; # used by above, create header
sub WriteTableSecBreak; # used by above, create a section break (when the leading char changes)
# ------ option switches and related ---------
my(%options); # hash of switches, values
# ----- logging NOT IMPLEMENTED YET (ever?) --
my($logging); # Flag to denote we are writing to log.
my($log_dir); # Log Directory.
my($lfh); # Log File Handle.
my($LOG_FILE_NAME); # Name of Log File to be written -l value or default
# my($verbose); # -v Flag to denote verbose messaging.
my($debug); # -d Flag to denote REALLY verbose messaging.
my($sqlFileName); # -q <file name of SQL file to parse> (or 'enwiki-20060303-categorylinks.sql')
my($catFileName); # -c <file name of categories> (or 'categoryList.txt')
my($tableFileName); # -o <table file to create> (or 'tables.txt')
# ------ Data structures ---------------------
my $inCats=(); # what cats is the article in?
my $nameVersions=(); # what are the versions of the name (lex orders)
my $rec={}; # ref to one article's record
my $collect={}; # ref to all the articles keyed on the $artKey var
# what the data will look like
# my $rec={
# key => "178234", # numeric key from first tuple value (article key, believed unique)
# artLink => "link text" # text to use for link not same as sort
# sortKey => "sort text" # sort text (what order should article come out)
# inCats => [@inCats], # array of categories the article is in
# nameVersions => [@nameVersions] # array of version of the name of the article
# # this one may not be used for anything
# }; # one article's record
#
# my %collect={
# key => $rec
# }; # all the articles keyed on the $artKey var
# ------ work vars ----------------------------
my @catList;
my @catArray;
my %is_catHash;
# file handles
my $sqlH;
my $tableH;
my $cfH;
#---------------------------------------------------------------------------#
# Usage - Print Usage Information and exit.
#---------------------------------------------------------------------------#
sub Usage {
print <<END_USAGE;
Usage: $0 [-h] [-v] [-d] [-q <sqlFile>] [-c <catFile>] [-o <tableFile>]
Switch meanings:
-h --help print this help message.
-v --version print version message.
-d <0|1|2|3> debug:
0: quiet
1: Verbose Mode
2: REALLY verbose mode
3: Every frigging detail.
File switches:
-q <file name of SQL file to parse>
(or 'enwiki-20060303-categorylinks_sample.sql' by default)
-c <file name of categories>
(or 'categoryList.txt' by default)
-o <table file to create>
(or 'tables.txt' by default)
END_USAGE
print "Status: 99\n";
exit(99);
} # End of Usage.
sub Version {
print "\nfilterCategories version 0.04 - 26 March 2006, Larry Pieniazek."
." \n -- released under GFDL and CC-SA -- \n\n";
# really should print something else
}
# this stuff isn't quite right at the moment
# required for getopts to support --help and --version
sub HELP_MESSAGE{
&Usage();
}
# required for getopts to support --help and --version
sub VERSION_MESSAGE{
&Version();
}
#---------------------------------------------------------------------------#
# ProcessOptions - Process Command Line Options.
#---------------------------------------------------------------------------#
sub ProcessOptions {
&Version if ($options{'v'});
&Usage if ($options{'h'});
my %debugHash = (
'0'=>"silent" ,
'1'=>"normal trace",
'2'=>"very chatty",
'3'=>"insanely chatty" );
if (defined $options{'d'}) {
$debug=$options{'d'};
if ($debugHash{$debug}) {
print"...debug switch was ".$options{'d'}." giving setting: ".$debugHash{$debug}."\n"
unless 0 == $options{'d'} ; # if 0, then REALLY quiet
} else {
$debug=1;
print"...debug switch was ".$options{'d'}." defaulting debug to 1 - normal trace\n";
} # recognised option
} else { # default, no switch
$debug=1;
print"...debug switch not found, defaulting debug to 1 - normal trace\n";
}
if (defined $options{'q'}) {
$sqlFileName=$options{'q'};
} else {
$sqlFileName="enwiki-20060303-categorylinks_sample.sql";
}
if (defined $options{'c'}) {
$catFileName=$options{'c'};
} else {
$catFileName="categoryList.txt";
}
if (defined $options{'o'}) {
$tableFileName=$options{'o'};
} else {
$tableFileName="tables.txt";
}
} # End of ProcessOptions.
#---------------------------------------------------------------------------#
# ReadCatFile - read in categories to build article tracking tables for
#---------------------------------------------------------------------------#
sub ReadCatFile {
my $rc=0;
# $catFileName = $_[0]; # now set processOptions()
if ($debug>2) {
stat($catFileName);
print "Exists\n" if -e _;
print "Readable\n" if -r _;
print "Writable\n" if -w _;
print "Executable\n" if -x _;
print "Setuid\n" if -u _;
print "Setgid\n" if -g _;
print "Sticky\n" if -k _;
print "Text\n" if -T _;
print "Binary\n" if -B _;
}
if (( -e $catFileName ) && ( -r $catFileName )) {
if (!open $cfH, "<", $catFileName){ warn "can't open ".$catFileName."\n"; $rc=99; return $rc; }
} else {
print "error with ".$catFileName." ... does not exist or not readable \n";
$rc= 99;
return $rc;
}
%is_catHash = ();
if ($debug>0) {print "reading ".$catFileName."\n";}
# @catList=<$cfH>;
my $catListItem;
for (;;) {
undef $!;
unless (defined( $catListItem = <$cfH> )) {
die $! if $!;
last; # reached EOF
}
chomp $catListItem;
$catListItem=ScoreToBlank($catListItem);
push @catList, $catListItem;
# set up searchable hash...
$is_catHash{$catListItem} = 1;
}
if ($debug>0) {
print "\nCategories to process: \n";
for my $fe(@catList) {print( $fe."\n");};
print "\n";
}
if ($debug>1) {
print "\n\n... corresponding hash values: \n";
while (my ($key, $value) = each %is_catHash) {
print "$key = $value\n";
}
print "\n";
} # end chatty trace
$rc=0;
return $rc;
}
#---------------------------------------------------------------------------#
# ScoreToBlank - convert underscores to blanks
#---------------------------------------------------------------------------#
sub ScoreToBlank {
my $str=$_[0];
if ($debug>3) {print "ScoreToBlank \$str IN: $str\n";}
$str=~ s/_/ /g;
if ($debug>3) {print "ScoreToBlank \$str OUT: $str\n";}
return $str;
} # there
#---------------------------------------------------------------------------#
# BlankToScore - convert blanks to underscores
#---------------------------------------------------------------------------#
sub BlankToScore {
my $str=$_[0];
$str=~ s/ /_/g;
return $str;
} # and back again
#---------------------------------------------------------------------------#
# FlipComma - take a phrase with comma (and 1 blank) and flip it,
# "Lennon, John" -> "John Lennon"
#---------------------------------------------------------------------------#
sub FlipComma {
my $str=$_[0];
my ($first,$second)= split(/, /,$str,2);
if (length($second)>0) { # there is something there to flip
$str=$second." ".$first;
}
return $str;
} # round and round we go
#---------------------------------------------------------------------------#
# StripLeadTrail - strip leading s/^\s+// and trailing s/\s+$// blanks
#---------------------------------------------------------------------------#
sub StripLeadTrail {
my $str=$_[0];
$str=~ s/^\s+//;
$str=~ s/\s+$//;
return $str;
} # and back again
#---------------------------------------------------------------------------#
# UnEscape - remove escape chars unless they're escaped
# this code lifted from John Alden's Escape Delimiters
# http://search.cpan.org/src/JOHNA/Text-EscapeDelimiters-1.004/lib/Text/EscapeDelimiters.pm
# Text::EscapeDelimiters v1.004
# (c) John Alden 2005. This library is free software; you can redistribute it
# and/or modify it under the same terms as Perl itself.
#---------------------------------------------------------------------------#
sub UnEscape {
my($string) = $_[0];
my $eseq = "\\";
return $string unless($eseq); #no-op
#Remove escape characters apart from double-escapes
$string =~ s/\Q$eseq\E(?!\Q$eseq\E)//gs;
#Fold double-escapes down to single escapes
$string =~ s/\Q$eseq$eseq\E/$eseq/gs;
return $string;
}
#---------------------------------------------------------------------------#
# ParseSQL - read through the SQL file and build the data structures
# - read in one tuple at a time (currently one line but change to
# buffered read later)
# - for each tuple parse out the pieces we need
# - add or update record in $collect hash, recording category and lexical key
# (if we find a comma reversed version of the article name, it's probably
# a better lexical key than we have so take it.)
# - update lexical key, article name, category seen
# - possibly strip blanks, change _ to blanks, remove \ escapes,
# and reverse comma fields. (future: use list of articles with commas
# in their names as refinement)
#---------------------------------------------------------------------------#
sub ParseSQL {
my $rc=0;
if (( -e $sqlFileName ) && ( -r $sqlFileName )) {
open ($sqlH, "<", $sqlFileName) or die "can't open ".$sqlFileName." for reading \n";
} else {
print "error with ".$sqlFileName." ... does not exist or not readable \n";
$rc=99;
return $rc;
}
if ($debug>0) {print "reading ".$sqlFileName."\n"; }
my $sqlLine;
my $sqlLC=0;
$Data::Dumper::Indent = 2; # pretty print (3 is with array indices
$Data::Dumper::Useqq = 1; # print strings in double quotes
$Data::Dumper::Pair = " : "; # specify hash key/value separator
$Data::Dumper::Purity = 1; # fill in the holes for eval
$Data::Dumper::Maxdepth = 3; # no deeper than 3 refs down
$Data::Dumper::Deepcopy = 1; # deep copy
for (;;) {
undef $!;
unless (defined( $sqlLine = <$sqlH> )) {
die $! if $!;
last; # reached EOF
}
# we have to process lines that look like any of these
# (12731,'Catholics_not_in_communion_with_Rome','George Harrison',20060228150212),
# ordinary
# (12731,'Deaths_by_lung_cancer','Harrison, George',20050904074730),
# sort order is different (the article name is probably the first 12731 that doesn't
# have a comma in the article name
# (12731,'George_Harrison','',20060303000936),
# self ref... the category contains an article named the same thing
# (2246703,'The_Beatles_songs','Don\'t Pass Me By',20050719071328),
# embedded escaped ' will screw up parse if not careful.
# safe to process line as we got a line
$sqlLC++;
if ($debug>2) { print "line ".$sqlLC." was ".$sqlLine."\n"; }
chomp $sqlLine;
my($firstP, $secondP) = split(/',/, $sqlLine,2);
if ($debug>2) { print "firstP: >".$firstP."< secondP: >".$secondP."< \n";}
my($artKey, $catName) = split(/,'/,$firstP,2);
$artKey=substr($artKey,1);
$catName=ScoreToBlank($catName);
if ($debug>2) { print "artKey: >".$artKey."< catName: >".$catName."< \n"; }
my($artName, $timeStamp)=split(/',/,substr($secondP,1),2);
# $timeStamp=split(/),/,$timeStamp,1);
$timeStamp=substr($timeStamp,0,-2);
if ($debug>2) {print "artName: >".$artName."< timeStamp: >".$timeStamp."< \n";}
if (0==length($artName)) { # empty, this is the case of matching art/cat names
$artName=$catName;
} else {
$artName=StripLeadTrail(UnEscape($artName));
}
my $sortKey="";
my $skHasComma=0;
my $anHasComma=0;
if (exists($is_catHash{$catName}) ) {
if ($debug>1) {
print "artName: >".$artName.
"<\n timeStamp: >".$timeStamp.
"<\n artKey: >".$artKey.
"<\n catName: >".$catName."< \n";
print " ... one of our cats! \n";
}
if (exists($collect->{$artKey}) ) {
if ($debug>1) { print " ... and we have the article already\n"; };
$rec = $collect->{$artKey}; # get ref to existing one
$inCats= $rec->{inCats}; # and to the arrays it carries
$nameVersions = $rec->{nameVersions};
} else {
$rec={}; # make an empty one
$rec->{key}=$artKey; # uses same key
$inCats=();
$nameVersions=();
}
$inCats->{$catName}=1;
$nameVersions->{$artName}=1;
$rec->{'inCats'}=$inCats;
$rec->{'nameVersions'}=$nameVersions;
# put logic to handle making sure name of article for link is non comma
$anHasComma= ( $artName =~/,/ );
my $artNameSave=$artName;
if ($anHasComma) { # if article has comma flip it and save that as name
$artNameSave=FlipComma($artName);
}
if (!(exists($rec->{artLink}))) {
$rec->{artLink}=$artNameSave;
}
if ($debug>1) {print "\$artName: $artName \$artNameSave: $artNameSave \n"}
# put logic for sort key here
if (exists($rec->{sortKey})) {
$sortKey=$rec->{sortKey};
if ($debug>1) {print "sortKey: $sortKey\n"; }
if ($sortKey ne $artName) { # If the keys are the same do nothing
$skHasComma= ( $sortKey =~/,/ );
if ($debug>1) {print "anHasComma: $anHasComma skHasComma: $skHasComma\n";}
if ($anHasComma eq $skHasComma) {
# if neither has a comma, or both have a comma take whichever one is earlier in the alphabet
if ($sortKey gt $artName) {
$rec->{sortKey}= $artName;
} # else not needed because sortKey already earlier, leave it.
} else {
# If the new key has a comma in it, use that one, it's probably the sort key
if ($anHasComma) {
$rec->{sortKey}= $artName;
} # else not needed, leave as is
}
if ($debug>1) {print "sortKey now is ".$rec->{sortKey}."\n"; }
} # end of handling different keys
} else { # we don't have it, save it away
$rec->{sortKey}=$artName; # since it's new, the sort key is the name we found
if ($debug>1) {print "added sortKey: $rec->{sortKey}\n"; }
} # end if sortKey does/doesn't exist
$collect->{$artKey}=$rec;
} # end if category is one we care about
} # end for (;;) (the read loop)
if ($debug>0) {
print "...collect: \n";
print Dumper($collect);
}
if ($debug>0) {print "finished parsing SQL\n"; }
return $rc;
} # end ParseSQL
#---------------------------------------------------------------------------#
# WriteTableHeader - create output table header
#---------------------------------------------------------------------------#
sub WriteTableHeader {
# assumes that $tableH is open and valid
print $tableH <<END_TABLEH;
{|
|valign=top|
{| width="100%" border="1" cellpadding="2" cellspacing="0" style="margin: 1em 1em 1em 0; background: #f9f9f9; border: 1px #aaa solid; border-collapse: collapse; font-size: 85%;"
|-
!width=20%|Article
!width=15%|Categories
!width=7%|Assessed
!width=7%|Status
!width=5%|Uses Infobox
!width=37%|Comments and Pending tasks
!width=8%|Assessed by
END_TABLEH
return 0;
}
#---------------------------------------------------------------------------#
# WriteTableSecBreak - create output table break between sections
#---------------------------------------------------------------------------#
sub WriteTableSecBreak {
my $headChar=$_[0];
print $tableH "|-\n|colspan=\"7\" align=\"left\" style=\"background:white; font-size: 200%;"
." font-weight:bold; border-bottom:4px solid grey; \"| \n"
."====".$headChar."====\n";
return 0;
} # end WriteTableSecBreak
#---------------------------------------------------------------------------#
# WriteTable - create output table
# - sort the data structure by the sort keys (which are the lexical
# (sometimes comma inverted) article names) ... these keys are inside the
# structure
# - using the sorted array of keys, iterate the hash in sort order
# - every time the first letter of the key changes, write out a SecBreak
#---------------------------------------------------------------------------#
sub WriteTable {
my $rc=0;
if ($debug>2) {
print" statting: ".$tableFileName."\n";
stat($tableFileName);
print "Exists\n" if -e _;
print "Readable\n" if -r _;
print "Writable\n" if -w _;
print "Executable\n" if -x _;
print "Setuid\n" if -u _;
print "Setgid\n" if -g _;
print "Sticky\n" if -k _;
print "Text\n" if -T _;
print "Binary\n" if -B _;
}
open ($tableH, '>', $tableFileName) or die "can't open ".$tableFileName." for writing \n";
$rc=&WriteTableHeader();
if ($rc) { die "error building table header\n"; }
# we want to create line pairs of the form
# (with the pipe in col 1)
# |-
# |[[Abbey Road (album)]]||[[:category:The Beatles albums|]]|| ||{{/Unknown}}||unknown|| ||
# |-
# |[[Anthology 1]]||[[:category:The Beatles albums|]]|| ||{{/Unknown}}||unknown|| ||
#
# in sorted order
# make an array of the keys to the hash
# (the article keys, which are not in any particular alpha)
my @keys = sort { $collect->{$a}->{sortKey} # custom sort spec, use the lexical key
cmp # (which is embedded in the rec)
$collect->{$b}->{sortKey} }
keys %{$collect};
my $firstLet=chr(00); # has to be lower than any other character val!
# iterate in sorted order
foreach my $artKey ( @keys ) {
$rec = $collect->{$artKey}; # get easy access to the record
$inCats= $rec->{inCats}; # and to the category array it carries
my $artLink=$rec->{artLink};
my $trialFirst=substr($rec->{sortKey},0,1); # get first char
if ($trialFirst ne $firstLet) {
$firstLet=$trialFirst;
if ($debug>1) {print "Switching to new first letter: $firstLet \n";}
&WriteTableSecBreak($firstLet);
} # end if new first letter in lexical order
my ($catStr,$catV,$catK);
$catStr="";
while (($catK, $catV) = each %{$inCats}) {
if ("" ne $catStr) {
$catStr.="<br>";
}
$catStr.="[[:category:".$catK."|]]";
} # loop through the categories we saw
if ($debug>0) { print "key: ".$artKey." rec key ".$rec->{key}." article Link text ".$artLink."\n"; }
print $tableH "|-\n||[[".$artLink."]]||".$catStr."||| ||"
."{{Wikipedia:WikiProject The Beatles/Article Classification/Unknown}}"
."||unknown|| || \n";
} # end of iteration through the hash in sorted order
# finish off table
print $tableH "\n|}";
return $rc;
}
#---------------------------------------------------------------------------#
# Main routine -
# process options
# read in categories desired
# build hash of articles by parsing SQL file
# write out table file using hash
#---------------------------------------------------------------------------#
# main
my $rc=0;
# print "prior to getopts\n";
getopts('hvd:q:c:o:', \%options) or &Usage; # debug also d
# print "post getopts, pre process\n";
&ProcessOptions();
if ($debug>1) { print "post process, pre read cat\n"; }
$rc=&ReadCatFile();
if ($rc) { die "error reading category list\n"; }
$rc=&ParseSQL();
if ($rc) { die "error reading SQL or building structure\n"; }
$rc=&WriteTable();
if ($rc) { die "error building table\n"; }
exit 0;

