Wikipedia:Shortpages/How to update
From Wikipedia, the free encyclopedia
After downloading a current-pages database dump for the English Wikipedia, I use the following commands:
- mkdir data; mkdir todo
- gunzip -c dl/20050909_pages_current.xml.gz | perl ../scripts/parse-entries2.pl >! data/entries.txt
- perl ../scripts/shortpages.pl
The two Perl scripts are shown below, respectively.
-- Beland 06:54, 5 October 2005 (UTC)
# parse-entries2.pl
use strict;
main();
sub main
{
my ($text, $title);
$/ = "</page>";
while (<>)
{
$_ =~ m%^(.*?)<revision>(.*?)</revision>.*$%s;
$title = $1;
$text = $2;
$title =~ s%^.*?<title>(.*?)</title>.*?$%$1%s;
$title =~ s/ /_/g;
$title =~ s/\&/\&/g;
$title =~ s/\>/>/g;
$title =~ s/\</</g;
if ($text =~ m%<text xml:space="preserve" />%)
{
$text = "";
}
else
{
$text =~ s%^.*<text xml:space="preserve">(.*?)</text>.*$%$1%s;
}
$text =~ s/\t/\\t/g;
$text =~ s/\n/\\n/g;
$text =~ s/\&/\&/g;
$text =~ s/\>/>/g;
$text =~ s/\</</g;
print $title."\t".$text."\n";
}
}
# shortpages.pl
use strict;
main();
sub main
{
my ($title, $text, $i);
open (ENTRIES, "<data/entries.txt");
open (SHORT, ">todo/shortpages.txt");
open (SHORTSTUB, ">todo/shortstubs.txt");
while (<ENTRIES>)
{
$_ =~ m/^(.*?)\t(.*)$/;
$title = $1;
$text = $2;
# Protect!
$text =~ s%</nowiki>%%g;
# Remove leading and trailing whitespace
$title =~ s/^\s*//;
$title =~ s/\s*$//;
# Uppercase title
$title = ucfirst($title);
# Underscores, please
$title =~ s/ /_/g;
# Exclude all namespaces except Article,
# Portal, Wikipedia, and Help
if (($title =~ m/^\w+_talk:/)
or ($title =~ m/^Media:/)
or ($title =~ m/^Special:/)
or ($title =~ m/^Talk:/)
or ($title =~ m/^User:/)
or ($title =~ m/^Image:/)
or ($title =~ m/^MediaWiki:/)
or ($title =~ m/^Template:/)
or ($title =~ m/^Category:/)
)
{
next;
}
if (length ($text) < 100)
{
if (($text =~ m/\{\{copyvio/)
or ($text =~ m/^\s*\#\s*redirect.*?\s*\[\[.*?\]\]/i)
or ($text =~ m/\{\{deletedpage\}\}/)
or ($text =~ m/\{\{Deletedpage\}\}/)
or ($text =~ m/\{\{deletedPage\}\}/)
or ($text =~ m/\{\{DeletedPage\}\}/)
or ($text =~ m/\{\{deletedarticle\}\}/)
or ($text =~ m/\{\{disambig\}\}/)
or ($text =~ m/\{\{rfd\}\}/)
)
{
next;
}
if ($text =~ m/\-*stub\}\}/)
{
print SHORTSTUB "<tr><td>".sprintf("%02d", length ($text))."</td><td>[[$title]]</td><td>$text</td></tr>\n";
#print "SHORTSTUB ".length ($text)." [[$title]] $text\n";
}
else
{
print SHORT "<tr><td>".sprintf("%02d", length ($text))."</td><td>[[$title]]</td><td>$text</td></tr>\n";
#print "SHORT ".length ($text)." [[$title]] $text\n";
}
}
if ($i++ % 10000 == 0)
{
print STDERR $i - 1 ."\r";
}
}
close (ENTRIES);
close (SHORT);
close (SHORTSTUB);
}
print `cat ./todo/shortpages.txt | sort -n > ./todo/shortpages-sorted.txt`;
print `cat ./todo/shortstubs.txt | sort -n > ./todo/shortstubs-sorted.txt`;
unlink ("./todo/shortpages.txt");
unlink ("./todo/shortstubs.txt");
The old method is to run the following SQL commands on a database dump. This does not remove stubs.
DROP TABLE IF EXISTS temp_sizesmall;
CREATE TABLE temp_sizesmall (UNIQUE KEY `s_id` (`s_id`))
SELECT
cur_title AS s_title,
cur_id AS s_id,
cur_text AS s_text,
length(cur_text) AS s_size,
cur_namespace AS s_namespace,
cur_is_redirect AS s_is_redirect
FROM cur
WHERE LENGTH(cur_text)<251
LIMIT 1000000;
DELETE
FROM temp_sizesmall
WHERE s_is_redirect=1;
DELETE
FROM temp_sizesmall
WHERE s_namespace<>0;
ALTER TABLE temp_sizesmall DROP COLUMN s_namespace;
ALTER TABLE temp_sizesmall DROP COLUMN s_is_redirect;
SELECT
CONCAT(
'|-\n|',
s_size,
'||[[',
REPLACE(s_title,'_',' '),
']]||',
LEFT((REPLACE(REPLACE(REPLACE(REPLACE(s_text,'\n',' '),'\r',' '),' ',' '),'&','&')),100),
'')
AS List
INTO OUTFILE 'wp_smallpages.txt' #change it to the drive/path you need
FROM temp_sizesmall
WHERE s_text NOT LIKE '%{{disambig}}%'
AND s_text NOT LIKE '%{{disambig}}%'
AND s_text NOT LIKE '%{{copyvio1}}%'
AND s_text NOT LIKE '%{{copyvio%'
AND s_size>0
AND s_text NOT LIKE '%{{List_of_people%'
ORDER BY s_size, Lower(s_title)
LIMIT 170

