User:Eloquence/Wikinfo import script
From Wikipedia, the free encyclopedia
Copy this from the source, not from here.
This is the first pre-release of the new import script. It runs as a local web server on port 8450 and does some cool stuff:
- Filters Special:Newpages for new articles that are not "from Wikipedia"
- Does not require a cookies.txt - uses its own account. Works with Wikinfo's new login requirement
- Auto-generates diffs of new articles that exist in both Wikinfo and Wikipedia
- Strips signatures from titles when importing
Note that to do all this it has to load quite a few pages, which takes some time for a full set of 500 new pages.
To do:
- Cache previous runs in wikinfo.db so we don't have to wait 60 seconds
- Use Special:Export
If you want to use this in some way or another, make sure you install all the used modules first using CPAN.
#!/usr/bin/perl
# Wikinfo -> Wikipedia import helper.  Runs a small HTTP daemon on
# localhost:8450; visiting "/" lists new Wikinfo pages, and "/<title>"
# imports that page into Wikipedia (see the request loop below).
use LWP::UserAgent;
use HTTP::Cookies;
use HTTP::Daemon;
use HTTP::Status;
use HTTP::Response;
use URI::Escape;
use Text::ParagraphDiff;
use GDBM_File ;
# Persistent on-disk hash.  In the visible code it is only tied here and
# untied in catch_zap -- presumably reserved for the planned run cache
# ("wikinfo.db" in the to-do list above); confirm before relying on it.
tie %storage, 'GDBM_File', "wikinfo.db", &GDBM_WRCREAT, 0640;
# Release the GDBM file cleanly on Ctrl-C.
$SIG{INT} = \&catch_zap; # best strategy
# Key prefixes, apparently intended for entries in %storage (unused in
# the visible code).
$WKPREFIX="WIKINFO_";
$WKSPREFIX="WIKINFOSIZE_";
$LCPREFIX="LASTCHECK_";
$WPPREFIX="WIKIPEDIA_";
$DIPREFIX="DIFF_";
# One shared user agent; the in-memory cookie jar keeps both wiki logins.
$browser=LWP::UserAgent->new();
$browser->cookie_jar( {} );
# Browser-like headers sent with every request.
@ns_headers = (
'User-Agent' => 'Mozilla/4.76 [en] (Win98; U)',
'Accept' => 'image/gif, image/x-xbitmap, image/jpeg,
image/pjpeg, image/png, */*',
'Accept-Charset' => 'iso-8859-1,*,utf-8',
'Accept-Language' => 'en-US',
);
# Log in to Wikinfo and Wikipedia; session cookies land in the jar above.
# NOTE(review): credentials are hard-coded placeholders -- replace them
# before running, and do not commit real ones.
$browser->post("http://www.wikinfo.org/wiki.phtml?title=Special:Userlogin&action=submit",@ns_headers,Content=>[wpName=>"Testuser",wpPassword=>"testpass",wpRemember=>"1",wpLoginAttempt=>"LOG IN"]);
$browser->post("http://en.wikipedia.org/w/wiki.phtml?title=Special:Userlogin&action=submit",@ns_headers,Content=>[wpName=>"Testuser",wpPassword=>"testpass",wpRemember=>"1",wpLoginAttempt=>"LOG IN"]);
# Local web server through which the user drives the import.
$d=new HTTP::Daemon(LocalHost=>'localhost', LocalPort => '8450', Reuse=>1);
print "Please contact me at: ".$d->url. "\n";
# NOTE(review): identical to the @ns_headers assignment above -- redundant.
@ns_headers = (
'User-Agent' => 'Mozilla/4.76 [en] (Win98; U)',
'Accept' => 'image/gif, image/x-xbitmap, image/jpeg,
image/pjpeg, image/png, */*',
'Accept-Charset' => 'iso-8859-1,*,utf-8',
'Accept-Language' => 'en-US',
);
#get_wikinfo_new();
#exit 0;
# Main accept loop: one request per connection.
#   GET /         -> overview table of importable new Wikinfo pages
#   GET /<title>  -> import <title> into Wikipedia
#   anything else -> 403 Forbidden
while ($c = $d->accept) {
$r = $c->get_request;
my $html;
# Common page header (inline CSS) prepended to every response body.
$html.= <<HTML ;
<html>
<head>
<style TYPE="text/css">
<!--
body { margin-left:2em;margin-right:2em;background:#eeeeee;}
a { text-decoration:none;color:blue;}
a.ext { color:green;cursor:help; }
-->
</STYLE>
<body>
<h1>Wikinfo Import Script</h1>
HTML
if ($r) {
if ($r->method eq 'GET' and $r->url->path eq "/") {
my $re=new HTTP::Response();
$re->header("content_type"=>"text/html");
$html.= <<HTML ;
<table border="1" width="100%">
<tr><td><b>Wikinfo page</B></td><td><b>Corresponding Wikipedia page</B></td><td><b>Import</B></td></tr>
HTML
# get_wikinfo_new() returns a flat list: 8 items per new page,
# consumed here 8 at a time in the same order they were pushed.
my @lines=get_wikinfo_new();
while(@lines) {
$linkopen=shift @lines;
$linktitle=shift @lines;
$linkclose=shift @lines;
$bytes=shift @lines;
$comment=shift @lines;
$wikipedia=shift @lines;
$diff=shift @lines;
$import=shift @lines;
$html.="<tr valign='top'><td>".$linkopen .$linktitle. $linkclose ." (".$bytes." bytes)";
if($comment) { $html.="<br><I>$comment</I>";}
$html.="</td><td>$wikipedia</td><td>$import</td></tr>";
# A second, full-width row carries the diff when one was produced.
if($diff ne "N/A") {
$html .= "<tr><td colspan=3 bgcolor=\"#dddddd\"><b>Diff:</B><P><font size=-1>".
"$diff</font></td></tr>";
}
}
$html.= <<HTML ;
</table>
</body>
</html>
HTML
$re->content($html);
$c->send_response($re);
} elsif($r->method eq 'GET' and $r->url->path ne "/") {
# Any other GET path is treated as a (URL-encoded) page title to import.
my $re=new HTTP::Response();
$re->header("content_type"=>"text/html");
$page=substr($r->url->path,1);
$html.=import_wikinfo($page);
$html.="</body></html>";
$re->content($html);
$c->send_response($re);
}
else {
$c->send_error(RC_FORBIDDEN)
}
}
$c = undef; # close connection
}
# Fetch Special:Newpages from Wikinfo and describe every new article that
# is NOT itself an import from Wikipedia.  For each such article exactly
# 8 items are pushed onto the returned flat list, consumed 8-at-a-time by
# the request loop:
#   link-open tag, link title, link-close tag, byte count, edit comment,
#   Wikipedia link (or "N/A"), diff HTML (or "N/A"), import link (or "N/A").
# Uses the file-scope globals $browser, @ns_headers and $d.
# Fixes vs. previous revision: removed the unreachable `if(1){...}else{...}`
# branch, lexicalized all temporaries, and guarded every regex capture so a
# failed match can no longer reuse a stale $1.
sub get_wikinfo_new {
    my $response = $browser->get(
        "http://www.wikinfo.org/wiki.phtml?title=Special:Newpages&limit=500&offset=0",
        @ns_headers);
    my @checklines;
    # The new-pages listing is rendered as an ordered list; if it is
    # missing (layout change, error page) return an empty result.
    return @checklines
        unless $response->content =~ m/<ol start=.*?>(.*?)<\/ol>/s;
    my @lines = split(/<LI>/i, $1);
    print $#lines;    # crude progress output on stdout
    foreach my $line (@lines) {
        # Each entry looks like: date <a href=...>Title</a> ... (NNN bytes)
        next unless $line =~ m/(.*?)(<a href.*?>)(.*?)(<\/a>).*?\((.*?) bytes\)/i;
        my ($date, $linkopen, $linktitle, $linkclose, $bytes) =
            ($1, $2, $3, $4, $5);
        # Optional edit comment, rendered as <em>(...)</em>.
        my $comment = $line =~ m/<em>\((.*)\)<\/em>/i ? $1 : "";
        # Skip pages that were themselves imported from Wikipedia.
        next if $comment =~ m/from wikipedia \(note changes here\)/i;
        my $wikititle  = to_url($linktitle);
        my $pediatitle = to_url(strip_sig($linktitle));
        push @checklines, $linkopen, $linktitle, $linkclose, $bytes, $comment;
        # Probe Wikipedia's edit form for an existing article of the
        # same (signature-stripped) title.
        my $tryurl = "http://en.wikipedia.org/w/wiki.phtml?title=" .
                     $pediatitle . "&action=edit";
        $response = $browser->get($tryurl, @ns_headers);
        my $pediasource =
            $response->content =~ m/<textarea.*?>(.*)<\/textarea>/is ? $1 : "";
        if ($pediasource =~ m/\w+/) {
            # Article already exists on Wikipedia: show a diff, no import.
            push @checklines, "<a href='$tryurl'>$pediatitle</a>";
            $tryurl = "http://www.wikinfo.org/wiki.phtml?title=" .
                      $wikititle . "&action=edit";
            $response = $browser->get($tryurl, @ns_headers);
            my $wikinfosource =
                $response->content =~ m/<textarea.*?>(.*)<\/textarea>/is ? $1 : "";
            my $diff = text_diff($pediasource, $wikinfosource,
                                 {string => 1, plain => 1, escape => 1});
            # Keep only the diff body and drop the oversized font markup.
            $diff = $1 if $diff =~ m/<p>(.*)<\/p>/si;
            $diff =~ s/ size="\+1">/>/gi;
            push @checklines, $diff;
            push @checklines, "N/A";    # exists, no import possible
        } else {
            push @checklines, "<a href='http://en.wikipedia.org/wiki/$pediatitle'>N/A</A>"; # no Wikipedia URL
            push @checklines, "N/A";    # no diff
            my $importurl  = $d->url . $wikititle;
            my $importlink = "<a href='$importurl'>Go!</a>";
            push @checklines, $importlink;
        }
    }
    return @checklines;
}
# Import the Wikinfo article $title (URL-encoded) into Wikipedia, unless
# an article with the signature-stripped title already exists there.
# Returns an HTML status fragment for the browser.
# Fix vs. previous revision: the four "unescape" substitutions were no-ops
# (the wiki page this was copied from had already rendered the entities,
# as the page's own note warns); they are restored to decode the
# HTML-escaped wikitext pulled out of the edit form's textarea.
sub import_wikinfo {
    my $title   = shift;
    my $editurl = "http://www.wikinfo.org/wiki.phtml?title=" . $title . "&action=edit";
    my $viewurl = "http://www.wikinfo.org/wiki.phtml?title=" . $title;
    my $response = $browser->get($editurl, @ns_headers);
    my $rv;
    my $pagetitle = to_wiki($title);
    my $pediaurl  = to_url(strip_sig($pagetitle));
    # Pull the raw wikitext out of the edit form; empty if not found.
    my $source = "";
    $source = $1 if $response->content =~ m/<textarea.*?>(.*)<\/textarea>/is;
    # The edit box HTML-escapes the wikitext; undo that.  &amp; must be
    # decoded LAST so already-decoded entities are not decoded twice.
    $source =~ s/&quot;/"/gi; # unescape
    $source =~ s/&gt;/>/gi;
    $source =~ s/&lt;/</gi;
    $source =~ s/&amp;/&/gi;
    if (!($source =~ m/\w+/)) {
        $rv .= "The page with the specified title was not found: <A HREF='$viewurl'>$viewurl</A> (<a href='$editurl'>edit</a>)";
        return $rv;
    }
    # Attribution footer (GFDL requires crediting the source article).
    $source .= "\n\n''Adapted from the [[Wikinfo]] article [$viewurl $pagetitle], licensed under the [[GNU Free Documentation License]].''";
    $rv .= "Checking for duplicate of <A HREF='$viewurl'>$viewurl</A>..<P>";
    my $tryurl = "http://en.wikipedia.org/wiki/" . $pediaurl;
    $response = $browser->get($tryurl, @ns_headers);
    # MediaWiki shows this phrase on nonexistent pages.
    if ($response->content =~ m/There is currently no text in this page/) {
        $rv .= "Posted new article to <a href='$tryurl'>$tryurl</A>!<P>";
        my $wpurl = "http://en.wikipedia.org/w/wiki.phtml?title=" . $pediaurl . "&action=submit";
        $browser->post($wpurl, @ns_headers, Content =>
            [
            wpTextbox1 => $source,
            wpSave     => "Save page",
            wpSummary  => "Imported from Wikinfo via [[User:Eloquence/Wikinfo import script]]"
            ]);
    } else {
        $rv .= "Page already exists on Wikipedia: <a href='$tryurl'>$tryurl</A>! You have to merge by hand. :-(";
    }
    return $rv;
}
# SIGINT handler: release the tied GDBM database cleanly, then abort with
# a message naming the signal that was received.
sub catch_zap {
    my ($signame) = @_;
    untie %storage;
    die "Program terminated: Received $signame";
}
# Remove a trailing " by <name>" signature from a page title for the
# known signers below, so the Wikipedia lookup uses the bare title.
# Titles without a recognized signature are returned unchanged.
# Fix vs. previous revision: @names/$name were package globals consumed by
# a destructive shift loop; now lexical with a non-destructive foreach,
# and the interpolated name is \Q-quoted so regex metacharacters in a
# future name cannot corrupt the pattern.
sub strip_sig {
    my $title = shift;
    # Known signature suffixes seen on Wikinfo new-page titles.
    my @names = ("Levan Urushadze", "Fred Bauder");
    foreach my $name (@names) {
        $title =~ s/(.*) by \Q$name\E$/$1/g;
    }
    return $title;
}
# Convert a page title into its URL form: spaces become underscores, the
# result is percent-escaped, and apostrophes are escaped by hand (they
# are left alone by uri_escape's default safe set).
sub to_url {
    my ($title) = @_;
    $title =~ tr/ /_/;
    $title = uri_escape($title);
    $title =~ s/'/%27/g;
    return $title;
}
# Inverse of to_url: percent-unescape the title, restore apostrophes from
# any remaining literal "%27", and turn underscores back into spaces.
sub to_wiki {
    my ($title) = @_;
    $title = uri_unescape($title);
    $title =~ s/%27/'/g;
    $title =~ tr/_/ /;
    return $title;
}

