User:OrphanBot/tagbot.pl
From Wikipedia, the free encyclopedia
Source code for OrphanBot's upload-tagging task. Requires libBot.pl and libPearle2.pl.
#!/usr/bin/perl
# Tagbot
#
# A bot to identify and tag recently-uploaded images that have no image description page, source information, or copyright tag.
# Known bugs:
# If an image is tagged with a sourcereq tag and a non-tag, but nothing else, it won't be tagged as "no info"
# An image with only a non-tag will be reported as a "untagged with tag"
use strict;
use warnings;
use Date::Calc qw(Month_to_Text Today);
use XML::Simple;
use utf8;
use Data::Dumper;
require "libBot.pl";
my $permit_interruptions = 0; # Allow talkpage messages to stop the bot?
my ($cur_y, $cur_m, $cur_d);
my %users_notified; # List of users notifed. 0, undef = no; 1 = notified once; 2 = notified and second notice
my %notifications; # List of user,image pairs, used to ensure that no user is ever notified about an image twice.
my %dont_notify = (); # List of users to never notify
my %banned_users = (); # List of users banned from uploading
my %exempt_users = (); # List of users exempt from inspection
my $last_upload = undef; # Last upload inspected
my %tags; # List of tags found that are not in either the "good" or "bad" list
my @sourcereq_tags; # List of tags that require a separate source
my $sourcereq_tags;
my @nosource_tags; # List of self-sourcing tags
my $nosource_tags;
my @deletion_tags; # Tags that will eventually lead to the deletion of the image
my $deletion_tags;
my @forbidden_tags; # List of tags that should never be seen
my $forbidden_tags;
my @deprecated_tags; # List of tags that shouldn't be used any more
my $deprecated_tags;
my @nontags; # List of tags that aren't copyright tags
my $nontags;
sub loadTagList
{
my $filename = shift;
my @list = ();
open INFILE, "<", $filename;
while(<INFILE>)
{
$_ =~ s/#.*//; # Remove comments
$_ =~ s/^\s*//; # Remove leading whitespace
$_ =~ s/\s*$//; # Remove trailing whitespace
push @list, $_ if($_ !~ /^\s*$/);
}
close INFILE;
return @list;
}
sub processTagList
{
my $tags = join "|", @_;
$tags =~ s/\(/\\\(/g;
$tags =~ s/\)/\\\)/g;
$tags =~ s/\./\\\./g;
$tags =~ s/\*/[^}|]*?/g;
$tags =~ s/ /[ _]/g;
return $tags;
}
@sourcereq_tags = loadTagList("sourcereq.tags");
$sourcereq_tags = processTagList(@sourcereq_tags);
#print "Sourcereq: $sourcereq_tags\n\n";
print "Sourcereq: Loaded\n\n";
@nosource_tags = loadTagList("nosource.tags");
$nosource_tags = processTagList(@nosource_tags);
#print "Nosource: $nosource_tags\n\n";
print "Nosource: Loaded\n\n";
@forbidden_tags = loadTagList("forbidden.tags");
$forbidden_tags = processTagList(@forbidden_tags);
#print "Forbid: $forbidden_tags\n\n";
print "Forbid: Loaded\n\n";
@deletion_tags = loadTagList("deletion.tags");
$deletion_tags = processTagList(@deletion_tags);
#print "Deletion: $deletion_tags\n\n";
print "Deletion: Loaded\n\n";
@deprecated_tags = loadTagList("deprecated.tags");
$deprecated_tags = processTagList(@deprecated_tags);
#print "Deprecated: $deprecated_tags\n\n";
print "Deprecated: Loaded\n\n";
@nontags = loadTagList("nontags.tags");
$nontags = processTagList(@nontags);
#print "Nontags: $nontags\n\n";
print "Nontags: Loaded\n\n";
# See if the specified category exists, and if not, create it
sub checkImageCategoryTag
{
my $cat;
my ($text, $editTime, $startTime, $token);
$cat = "Category:Images with no copyright tag as of $_[0] $_[1] $_[2]";
($text, $editTime, $startTime, $token) = Pearle::getPage($cat);
if($text !~ /\[\[[Cc]ategory:[Ii]mages with no copyright tag/)
{
$text .= "\n<span class='plainlinks'>[{{fullurl:{{FULLPAGENAME}}|action=purge}} Purge]</span>\n[[Category:Images with no copyright tag| ]]\n";
Pearle::postPage($cat, $editTime, $startTime, $token, $text, "Created category", "no");
}
}
sub tokenSubst
{
my $string = shift;
my $image = shift;
$string =~ s/<IMAGE>/$image/g;
$string =~ s/<DAY>/$cur_d/g;
$string =~ s/<MONTH>/$cur_m/g;
$string =~ s/<YEAR>/$cur_y/g;
return $string;
}
sub loadUserList
{
my $file = shift;
my %notelist;
my $i = 0;
notelog("File: $file\n");
open INFILE, "<", $file;
while(<INFILE>)
{
my ($user, $reason);
$_ =~ s/\s*#.*$//g;
chomp;
($user, $reason) = $_ =~ /([^\t]*)\t(.*)/;
next if(!defined($user) or !defined($reason));
$notelist{$user} = $reason;
$i++;
}
close INFILE;
notelog("$i notifications loaded\n");
return %notelist;
}
exit;
# Initialize
($cur_y, $cur_m, $cur_d) = Today(1); # Today in GMT
$cur_m = Month_to_Text($cur_m);
my $xml_parser = XML::Simple->new(ForceArray => ['page'], KeyAttr => [page => 'title']);
%notifications = loadNotificationList("./orphanbot.note");
%dont_notify = loadNotificationList("./orphanbot.whitelist");
%banned_users = loadUserList("./banneduser.list");
%exempt_users = loadUserList("./exemptuser.list");
Pearle::init(<< INSERT BOT NAME HERE >>, << INSERT PASSWORD HERE >>, "./tagbot.log","./cookies.pearle.txt");
Pearle::config(nullOK => 1, sanityCheck => 1);
config(username => << INSERT BOT NAME HERE >>);
if(!Pearle::login())
{
exit;
}
checkImageCategoryTag($cur_d, $cur_m, $cur_y);
# Get recent uploads
my @articles = Pearle::getLogArticles('upload', 300, 20);
notelog(scalar(@articles) . " found\n");
# Process in batches of 50
while(scalar(@articles) > 0)
{
my @batch = splice(@articles, 0, 50);
my @batch_articles;
my $pair;
foreach $pair (@batch)
{
push @batch_articles, $pair->[0];
}
notelog("Batch: " . scalar(@batch_articles) . "\n");
# For each batch, export the image description pages
my $articles_text = Pearle::Export(@batch_articles);
# Ditch the "siteinfo" block: the parser doesn't fold "<page>" elements into an array if it's present
$articles_text =~ s/<siteinfo>.*<\/siteinfo>//s;
my $parsed_xml = $xml_parser->XMLin($articles_text);
my %pages = %{$parsed_xml->{page}};
my $pageref;
# Verify that the export format hasn't changed
if($parsed_xml->{version} ne '0.3')
{
Pearle::myLog("ERROR: Export format has changed\n");
userwarnlog("ERROR: Export format has changed\n");
exit;
}
foreach $pageref (@batch)
{
my ($image_update_summary, $image_update_process, $image_update_template, $user_note_summary, $user_note_template, $skip_notice);
$skip_notice = 0;
my $image = $pageref->[0];
$image =~ s/&/&/g;
if(!exists($pages{$image}))
{
notelog("Page $image appears to have been deleted\n");
next;
}
my $page_text = $pages{$image}->{revision}->{text}->{content};
my $editor = $pages{$image}->{revision}->{contributor}->{username};
my $uploader = $pageref->[1];
$uploader =~ s/^User://;
my $no_source = 0;
my $no_license = 0;
my $untagged = 0;
my ($raw_image) = $image =~ /Image:(.*)/;
my $image_regex = "[Ii]mage[ _]*:[ _]*" . MakeWikiRegex($raw_image);
notelog("Processing image $image\n");
if($banned_users{$uploader})
{
userwarnlog("*Upload by blacklisted user [[Special:Contributions/$uploader|$uploader]] found.\n");
# # TODO: Add a deletion tag
# my ($text, $editTime, $startTime, $token) = Pearle::getPage($image);
# $text .= "\n{{db|$banned_users{$uploader}}}\n";
# Pearle::postPage($image, $editTime, $startTime, $token, $text, '{{delete}}', "no");
# next;
}
if($uploader =~ /statusquo|status quo|thequo|tehquo|the quo|teh quo/i)
{
userwarnlog("*Questionable uploader found ([[:$image]])\n");
}
if($exempt_users{$uploader})
{
notelog("Upload by exempt user $uploader found.\n");
next;
}
if($pageref->[2] =~ /optimi(z|s)ed using (optipng|PNGCrusher)/i)
{
notelog("Optimize upload found for image $image\n");
Pearle::myLog("Optimize upload found for image $image\n");
$skip_notice = 1;
}
if($pageref->[2] =~ /Reverted to earlier revision/)
{
notelog("Revert upload found for image $image\n");
Pearle::myLog("Revert upload found for image $image\n");
$skip_notice = 1;
}
if($pageref->[2] =~ /tweak|crop|scale|adjust|change|resize/i)
{
notelog("Tweak found for image $image\n");
Pearle::myLog("Tweak found for image $image\n");
$skip_notice = 1;
}
# Check for anything that might be considered a source
if(!defined($page_text))
{
# Checked
notelog("Empty IDP\n");
$image_update_summary = "Image has no source information";
$image_update_template = "\n{{no copyright holder|month=<MONTH>|day=<DAY>|year=<YEAR>}}\n";
$image_update_process = 'license';
$user_note_template = "{{subst:User:OrphanBot/nosource-new|<IMAGE>}}";
$user_note_summary = "You've uploaded an unsourced image";
}
else
{
my $stripped_page_text = $page_text;
my $no_template_text = $page_text;
# Remove anything that is neither a potential source nor a copyright tag
$stripped_page_text =~ s/^==.*?==//gm; # Remove section headers
$stripped_page_text =~ s/\n//g; # Remove newlines
$stripped_page_text =~ s/{{{[^}]+}}}//g; # Remove template parameters
$stripped_page_text =~ s/{{\s*(?:template:|)(?:$nontags)//i; # Remove non-tags
$no_template_text =~ s/{{[^}]*}}//g; # Remove templates
if($page_text =~ /(bread climp)/i) # Trolling meme
{
notelog("Red flag '$1' found\n");
Pearle::myLog("Red flag '$1'found on image [[:$image]]\n");
userwarnlog("*Red flag found on image [[:$image]]\n");
}
if($page_text =~ /({{(?:[Tt]emplate:|)(?:$deletion_tags)\s*(?:\||}}))/i)
{
# Checked
# Deletion or other bookkeeping tag
notelog("Deletion tag $1 found\n");
next;
}
if($page_text =~ /{{(?:[Tt]emplate:|)($forbidden_tags)\s*(?:\||}})/i)
{
# Checked
Pearle::myLog("Forbidden tag $1 found on image [[:$image]]\n");
userwarnlog("*Forbidden tag $1 found on image [[:$image]]\n");
notelog("*Forbidden tag $1 found on image [[:$image]]\n");
next;
}
if($page_text =~ /({{(?:template:|)(?:$nosource_tags)\s*(?:\||}}))/i)
{
# Checked
notelog("Self-sourcing tag $1 found.\n");
next;
}
if($page_text =~ /({{(?:template:|)(?:$sourcereq_tags)\s*(?:\||}}))/i)
{
# NOTE: A sourcereq fairuse tag includes a rationale tag, so there's no need to check for a rationale if it's double-tagged with a simple fairuse tag
if($stripped_page_text =~ /^\s*{{(?:template:|)(?:$sourcereq_tags)(?:\|[^}]*|)}}\s*$/i)
{
# Checked
notelog("Tag without source\n");
$image_update_summary = "Image has no source information";
$image_update_template = "\n{{no copyright holder|month=<MONTH>|day=<DAY>|year=<YEAR>}}\n";
$image_update_process = 'license';
$user_note_template = "{{subst:User:OrphanBot/nosource-new|<IMAGE>}}";
$user_note_summary = "You've uploaded an unsourced image";
goto process_image;
}
else
{
# Checked
notelog("Sourcereq tag $1 with text\n");
next;
}
}
if($page_text =~ /({{(?:template:|)(?:$deprecated_tags)\s*(?:\||}}))/i)
{
notelog("Page with deprecated tag $1 found.\n");
Pearle::myLog("Page with deprecated tag $1 found.\n");
$image_update_summary = "Obsolete or deprecated tag";
$image_update_template = "\n{{no copyright information|month=<MONTH>|day=<DAY>|year=<YEAR>}}\n";
$image_update_process = 'license';
$user_note_template = "{{subst:User:OrphanBot/deprecated|<IMAGE>}}";
$user_note_summary = "Image with obsolete or deprecated license";
goto process_image;
}
if($stripped_page_text !~ /{{.*}}/s)
{
# Checked
# Untagged page found
notelog("Untagged page\n");
$image_update_summary = "Image has no copyright tag";
$image_update_template = "\n{{untagged|month=<MONTH>|day=<DAY>|year=<YEAR>}}\n";
$image_update_process = 'tag';
$user_note_template = "{{subst:User:OrphanBot/untagged-new|<IMAGE>}}";
$user_note_summary = "You've uploaded an untagged image";
goto process_image;
}
# Default case. If we're here, none of the other filters fit
{
# Checked
# Templates found, but they don't fit any known group
notelog("Default case. Should have at least one unknown tag.\n");
my @new_templates;
my $tag;
@new_templates = $stripped_page_text =~ /{{([^}|]*)/g;
notelog("Num: ", scalar(@new_templates), "\n");
notelog("Text: $page_text\n");
foreach $tag (@new_templates)
{
if(!$tags{$tag})
{
Pearle::myLog("Page with only unknown tag {{$tag}} found ([[:$image]])\n");
$tags{$tag} = 1;
}
}
}
next;
}
process_image:
# Token-substitute messages
$image_update_template = tokenSubst($image_update_template, $image);
$user_note_template = tokenSubst($user_note_template, $image);
# Process
if($image_update_process eq 'license')
{
# Problem with image license
my ($text, $editTime, $startTime, $token) = Pearle::getPage($image);
if($text !~ /#REDIRECT/i)
{
$text .= $image_update_template;
Pearle::postPage($image, $editTime, $startTime, $token, $text, $image_update_summary, "no");
}
else
{
Pearle::myLog("*Redirect found for [[$image]]\n");
userwarnlog("*Redirect found for [[:$image]]\n");
}
# Notify uploader
my $is_notified = isNotified(undef, $uploader, $image_regex, $image, \%notifications, \%dont_notify);
if(1 != $is_notified and !$skip_notice)
{
if(!($users_notified{$uploader}))
{
Pearle::myLog("Warning user $uploader\n");
userwarnlog("$user_note_template --~~~~\n", $uploader, $user_note_summary, $is_notified);
$notifications{"$uploader,$image"} = 1;
$users_notified{$uploader} = 1;
}
else
{
Pearle::myLog("User $uploader has already been warned repeatedly\n");
$users_notified{$uploader} += 1;
}
}
Pearle::limit();
}
elsif($image_update_process eq 'tag')
{
# Lack of tag
my ($text, $editTime, $startTime, $token) = Pearle::getPage($image);
if($text =~ /{{[^}]*}}/)
{
Pearle::myLog("*Tag found on untagged image $image\n");
next;
}
elsif($text !~ /#REDIRECT/i)
{
$text .= $image_update_template;
Pearle::postPage($image, $editTime, $startTime, $token, $text, $image_update_summary, "no");
}
else
{
Pearle::myLog("*Redirect found for [[$image]]\n");
userwarnlog("*Redirect found for [[:$image]]\n");
}
# If no source, notify uploader
my $is_notified = isNotified(undef, $uploader, $image_regex, $image, \%notifications, \%dont_notify);
if(1 != $is_notified and !$skip_notice)
{
if(!($users_notified{$uploader}))
{
Pearle::myLog("Warning user $uploader\n");
userwarnlog("$user_note_template --~~~~\n", $uploader, $user_note_summary, $is_notified);
$notifications{"$uploader,$image"} = 1;
$users_notified{$uploader} = 1;
}
else
{
Pearle::myLog("User $uploader has already been warned repeatedly\n");
$users_notified{$uploader} += 1;
}
}
Pearle::limit();
}
else
{
notelog("ERROR:Unknown process $image_update_process\n");
Pearle::myLog("ERROR: Unknown process $image_update_process\n");
exit;
}
}
notelog("Saving notification list\n");
saveNotificationList("./orphanbot.note", %notifications);
}
notelog("Saving notification list\n");
saveNotificationList("./orphanbot.note", %notifications);
Pearle::myLog("Finished with day's uploads\n");

