User:OrphanBot/tagbot.pl

From Wikipedia, the free encyclopedia

Source code for OrphanBot's upload-tagging task. Requires libBot.pl and libPearle2.pl.


#!/usr/bin/perl

# Tagbot
#
# A bot to identify and tag recently-uploaded images that have no image description page, source information, or copyright tag.

# Known bugs: 
#       If an image is tagged with a sourcereq tag and a non-tag, but nothing else, it won't be tagged as "no info"
#       An image with only a non-tag will be reported as a "untagged with tag"

use strict;
use warnings;

use Date::Calc qw(Month_to_Text Today);
use XML::Simple;
use utf8;

use Data::Dumper;

require "libBot.pl";


my $permit_interruptions = 0;   # Allow talkpage messages to stop the bot?

my ($cur_y, $cur_m, $cur_d);

my %users_notified;                                                # List of users notifed.  0, undef = no; 1 = notified once; 2 = notified and second notice
my %notifications;                                         # List of user,image pairs, used to ensure that no user is ever notified about an image twice.
my %dont_notify = ();                                      # List of users to never notify

my %banned_users = ();                                     # List of users banned from uploading

my %exempt_users = ();                                     # List of users exempt from inspection

my $last_upload = undef;                                # Last upload inspected

my %tags;                                          # List of tags found that are not in either the "good" or "bad" list
my @sourcereq_tags;                             # List of tags that require a separate source
my $sourcereq_tags;
my @nosource_tags;                              # List of self-sourcing tags
my $nosource_tags;
my @deletion_tags;                              # Tags that will eventually lead to the deletion of the image
my $deletion_tags;
my @forbidden_tags;                             # List of tags that should never be seen
my $forbidden_tags;
my @deprecated_tags;                    # List of tags that shouldn't be used any more
my $deprecated_tags;    
my @nontags;                                    # List of tags that aren't copyright tags
my $nontags;

sub loadTagList
{
        my $filename = shift;
        my @list = ();
        open INFILE, "<", $filename;
        while(<INFILE>)
        {
                $_ =~ s/#.*//;          # Remove comments
                $_ =~ s/^\s*//;         # Remove leading whitespace
                $_ =~ s/\s*$//;         # Remove trailing whitespace
                push @list, $_ if($_ !~ /^\s*$/);
        }
        close INFILE;
        return @list;
}

sub processTagList
{
        my $tags = join "|", @_;
        $tags =~ s/\(/\\\(/g;
        $tags =~ s/\)/\\\)/g;
        $tags =~ s/\./\\\./g;
        $tags =~ s/\*/[^}|]*?/g;
        $tags =~ s/ /[ _]/g;
        return $tags;
}


@sourcereq_tags = loadTagList("sourcereq.tags");
$sourcereq_tags = processTagList(@sourcereq_tags);
#print "Sourcereq: $sourcereq_tags\n\n";
print "Sourcereq: Loaded\n\n";

@nosource_tags = loadTagList("nosource.tags");
$nosource_tags = processTagList(@nosource_tags);
#print "Nosource: $nosource_tags\n\n";
print "Nosource: Loaded\n\n";

@forbidden_tags = loadTagList("forbidden.tags");
$forbidden_tags = processTagList(@forbidden_tags);
#print "Forbid: $forbidden_tags\n\n";
print "Forbid: Loaded\n\n";

@deletion_tags = loadTagList("deletion.tags");
$deletion_tags = processTagList(@deletion_tags);
#print "Deletion: $deletion_tags\n\n";
print "Deletion: Loaded\n\n";

@deprecated_tags = loadTagList("deprecated.tags");
$deprecated_tags = processTagList(@deprecated_tags);
#print "Deprecated: $deprecated_tags\n\n";
print "Deprecated: Loaded\n\n";

@nontags = loadTagList("nontags.tags");
$nontags = processTagList(@nontags);
#print "Nontags: $nontags\n\n";
print "Nontags: Loaded\n\n";

# See if the specified category exists, and if not, create it
sub checkImageCategoryTag
{
        my $cat;
        my ($text, $editTime, $startTime, $token);
        $cat = "Category:Images with no copyright tag as of $_[0] $_[1] $_[2]";
        
        ($text, $editTime, $startTime, $token) = Pearle::getPage($cat);
        if($text !~ /\[\[[Cc]ategory:[Ii]mages with no copyright tag/)
        {
                $text .= "\n<span class='plainlinks'>[{{fullurl:{{FULLPAGENAME}}|action=purge}} Purge]</span>\n[[Category:Images with no copyright tag| ]]\n";
                Pearle::postPage($cat, $editTime, $startTime, $token, $text, "Created category", "no");
        }
}

sub tokenSubst
{
        my $string = shift;
        my $image = shift;

        $string =~ s/<IMAGE>/$image/g;
        $string =~ s/<DAY>/$cur_d/g;
        $string =~ s/<MONTH>/$cur_m/g;
        $string =~ s/<YEAR>/$cur_y/g;
        
        return $string;
}

sub loadUserList
{
        my $file = shift;
        my %notelist;
        my $i = 0;
        notelog("File: $file\n");
        open INFILE, "<", $file;
        while(<INFILE>)
        {
                my ($user, $reason);
                $_ =~ s/\s*#.*$//g;
                chomp;
                ($user, $reason) = $_ =~ /([^\t]*)\t(.*)/;
                next if(!defined($user) or !defined($reason));
                $notelist{$user} = $reason;
                $i++;
        }
        close INFILE;
        notelog("$i notifications loaded\n");
        return %notelist;

}

exit;




# Initialize
($cur_y, $cur_m, $cur_d) = Today(1);    # Today in GMT
$cur_m = Month_to_Text($cur_m);

my $xml_parser = XML::Simple->new(ForceArray => ['page'], KeyAttr => [page => 'title']);

%notifications = loadNotificationList("./orphanbot.note");
%dont_notify = loadNotificationList("./orphanbot.whitelist");
%banned_users = loadUserList("./banneduser.list");
%exempt_users = loadUserList("./exemptuser.list");

Pearle::init(<< INSERT BOT NAME HERE >>, << INSERT PASSWORD HERE >>, "./tagbot.log","./cookies.pearle.txt");
Pearle::config(nullOK => 1, sanityCheck => 1);
config(username => << INSERT BOT NAME HERE >>);

if(!Pearle::login())
{
        exit;
}

checkImageCategoryTag($cur_d, $cur_m, $cur_y);


# Get recent uploads
my @articles = Pearle::getLogArticles('upload', 300, 20);
notelog(scalar(@articles) . " found\n");




# Process in batches of 50
while(scalar(@articles) > 0)
{
        my @batch = splice(@articles, 0, 50);
        my @batch_articles;
        my $pair;
        
        foreach $pair (@batch)
        {
                push @batch_articles, $pair->[0];
        }
        
        notelog("Batch: " . scalar(@batch_articles) . "\n");
        # For each batch, export the image description pages
        my $articles_text = Pearle::Export(@batch_articles);
        
        # Ditch the "siteinfo" block: the parser doesn't fold "<page>" elements into an array if it's present
        $articles_text =~ s/<siteinfo>.*<\/siteinfo>//s;
        
        my $parsed_xml = $xml_parser->XMLin($articles_text);
        my %pages = %{$parsed_xml->{page}};
        my $pageref;

        # Verify that the export format hasn't changed
        if($parsed_xml->{version} ne '0.3')
        {
                Pearle::myLog("ERROR: Export format has changed\n");
                userwarnlog("ERROR: Export format has changed\n");
                exit;
        }
                
        foreach $pageref (@batch)
        {
                my ($image_update_summary, $image_update_process, $image_update_template, $user_note_summary, $user_note_template, $skip_notice);
                $skip_notice = 0;
                my $image = $pageref->[0];
                $image =~ s/&/&/g;
                if(!exists($pages{$image}))
                {
                        notelog("Page $image appears to have been deleted\n");
                        next;
                }
                
                my $page_text = $pages{$image}->{revision}->{text}->{content};
                my $editor = $pages{$image}->{revision}->{contributor}->{username};
                my $uploader = $pageref->[1];
                $uploader =~ s/^User://;
                
                my $no_source = 0;
                my $no_license = 0;
                my $untagged = 0;
                my ($raw_image) = $image =~ /Image:(.*)/;
                my $image_regex = "[Ii]mage[ _]*:[ _]*" . MakeWikiRegex($raw_image);
                
                notelog("Processing image $image\n");

                if($banned_users{$uploader})
                {
                        userwarnlog("*Upload by blacklisted user [[Special:Contributions/$uploader|$uploader]] found.\n");
#                       # TODO: Add a deletion tag
#                       my ($text, $editTime, $startTime, $token) = Pearle::getPage($image);
#                       $text .= "\n{{db|$banned_users{$uploader}}}\n";
#                       Pearle::postPage($image, $editTime, $startTime, $token, $text, '{{delete}}', "no");
#                       next;
                }
                
                if($uploader =~ /statusquo|status quo|thequo|tehquo|the quo|teh quo/i)
                {
                        userwarnlog("*Questionable uploader found ([[:$image]])\n");
                }

                if($exempt_users{$uploader})
                {
                        notelog("Upload by exempt user $uploader found.\n");
                        next;
                }

                if($pageref->[2] =~ /optimi(z|s)ed using (optipng|PNGCrusher)/i)
                {
                        notelog("Optimize upload found for image $image\n");
                        Pearle::myLog("Optimize upload found for image $image\n");
                        $skip_notice = 1;
                }
                if($pageref->[2] =~ /Reverted to earlier revision/)
                {
                        notelog("Revert upload found for image $image\n");
                        Pearle::myLog("Revert upload found for image $image\n");
                        $skip_notice = 1;
                }
                if($pageref->[2] =~ /tweak|crop|scale|adjust|change|resize/i)
                {
                        notelog("Tweak found for image $image\n");
                        Pearle::myLog("Tweak found for image $image\n");
                        $skip_notice = 1;
                }       
        
                # Check for anything that might be considered a source
                if(!defined($page_text))
                {
                 # Checked
                        notelog("Empty IDP\n");
                        $image_update_summary = "Image has no source information";
                        $image_update_template = "\n{{no copyright holder|month=<MONTH>|day=<DAY>|year=<YEAR>}}\n";
                        $image_update_process = 'license';
                        $user_note_template = "{{subst:User:OrphanBot/nosource-new|<IMAGE>}}";
                        $user_note_summary = "You've uploaded an unsourced image";
                }
                else
                {
                        my $stripped_page_text = $page_text;
                        my $no_template_text = $page_text;
                        # Remove anything that is neither a potential source nor a copyright tag
                        $stripped_page_text =~ s/^==.*?==//gm;  # Remove section headers
                        $stripped_page_text =~ s/\n//g;                 # Remove newlines
                        $stripped_page_text =~ s/{{{[^}]+}}}//g;        # Remove template parameters
                        $stripped_page_text =~ s/{{\s*(?:template:|)(?:$nontags)//i;    # Remove non-tags
                
                        $no_template_text =~ s/{{[^}]*}}//g;            # Remove templates

                        if($page_text =~ /(bread climp)/i) # Trolling meme
                        {
                                notelog("Red flag '$1' found\n");
                                Pearle::myLog("Red flag '$1'found on image [[:$image]]\n");
                                userwarnlog("*Red flag found on image [[:$image]]\n");
                        }
                        if($page_text =~ /({{(?:[Tt]emplate:|)(?:$deletion_tags)\s*(?:\||}}))/i)
                        {
                         # Checked
                                # Deletion or other bookkeeping tag
                                notelog("Deletion tag $1 found\n");
                                next;
                        }
                        if($page_text =~ /{{(?:[Tt]emplate:|)($forbidden_tags)\s*(?:\||}})/i)
                        {
                         # Checked
                                Pearle::myLog("Forbidden tag $1 found on image [[:$image]]\n");
                                userwarnlog("*Forbidden tag $1 found on image [[:$image]]\n");
                                notelog("*Forbidden tag $1 found on image [[:$image]]\n");
                                next;
                        }
                        if($page_text =~ /({{(?:template:|)(?:$nosource_tags)\s*(?:\||}}))/i)
                        {
                         # Checked
                                notelog("Self-sourcing tag $1 found.\n");
                                next;
                        }
                        if($page_text =~ /({{(?:template:|)(?:$sourcereq_tags)\s*(?:\||}}))/i)
                        {       
                                # NOTE: A sourcereq fairuse tag includes a rationale tag, so there's no need to check for a rationale if it's double-tagged with a simple fairuse tag
                                if($stripped_page_text =~ /^\s*{{(?:template:|)(?:$sourcereq_tags)(?:\|[^}]*|)}}\s*$/i)
                                {
                                 # Checked
                                        notelog("Tag without source\n");
                                        $image_update_summary = "Image has no source information";
                                        $image_update_template = "\n{{no copyright holder|month=<MONTH>|day=<DAY>|year=<YEAR>}}\n";
                                        $image_update_process = 'license';
                                        $user_note_template = "{{subst:User:OrphanBot/nosource-new|<IMAGE>}}";
                                        $user_note_summary = "You've uploaded an unsourced image";
                                        goto process_image;
                                }
                                else
                                {
                                 # Checked
                                        notelog("Sourcereq tag $1 with text\n");
                                        next;
                                }
                        }
                        if($page_text =~ /({{(?:template:|)(?:$deprecated_tags)\s*(?:\||}}))/i)
                        {
                                notelog("Page with deprecated tag $1 found.\n");
                                Pearle::myLog("Page with deprecated tag $1 found.\n");
                                $image_update_summary = "Obsolete or deprecated tag";
                                $image_update_template = "\n{{no copyright information|month=<MONTH>|day=<DAY>|year=<YEAR>}}\n";
                                $image_update_process = 'license';
                                $user_note_template = "{{subst:User:OrphanBot/deprecated|<IMAGE>}}";
                                $user_note_summary = "Image with obsolete or deprecated license";
                                goto process_image;
                        }
                        if($stripped_page_text !~ /{{.*}}/s)
                        {
                         # Checked
                                # Untagged page found
                                notelog("Untagged page\n");
                                $image_update_summary = "Image has no copyright tag";
                                $image_update_template = "\n{{untagged|month=<MONTH>|day=<DAY>|year=<YEAR>}}\n";
                                $image_update_process = 'tag';
                                $user_note_template = "{{subst:User:OrphanBot/untagged-new|<IMAGE>}}";
                                $user_note_summary = "You've uploaded an untagged image";
                                goto process_image;
                        }
                        
                        # Default case.  If we're here, none of the other filters fit
                        {
                         # Checked
                                # Templates found, but they don't fit any known group
                                notelog("Default case.  Should have at least one unknown tag.\n");

                                my @new_templates;
                                my $tag;
                                @new_templates = $stripped_page_text =~ /{{([^}|]*)/g;
                                notelog("Num: ", scalar(@new_templates), "\n");
                                notelog("Text: $page_text\n");
                                foreach $tag (@new_templates)
                                {
                                        if(!$tags{$tag})
                                        {
                                                Pearle::myLog("Page with only unknown tag {{$tag}} found ([[:$image]])\n");
                                                $tags{$tag} = 1;
                                        }
                                }
                        }
                        next;
                }

process_image:          
                # Token-substitute messages
                $image_update_template = tokenSubst($image_update_template, $image);
                $user_note_template = tokenSubst($user_note_template, $image);

                # Process
                if($image_update_process eq 'license')
                {
                        # Problem with image license
                        my ($text, $editTime, $startTime, $token) = Pearle::getPage($image);
                        if($text !~ /#REDIRECT/i)
                        {
                                $text .= $image_update_template;
                                Pearle::postPage($image, $editTime, $startTime, $token, $text, $image_update_summary, "no");
                        }
                        else
                        {
                                Pearle::myLog("*Redirect found for [[$image]]\n");
                                userwarnlog("*Redirect found for [[:$image]]\n");
                        }

                        # Notify uploader
                        my $is_notified = isNotified(undef, $uploader, $image_regex, $image, \%notifications, \%dont_notify);
                        if(1 != $is_notified and !$skip_notice)
                        {
                                if(!($users_notified{$uploader}))
                                {
                                        Pearle::myLog("Warning user $uploader\n");
                                        userwarnlog("$user_note_template --~~~~\n", $uploader, $user_note_summary, $is_notified);
                                        $notifications{"$uploader,$image"} = 1;
                                        $users_notified{$uploader} = 1;
                                }
                                else
                                {
                                        Pearle::myLog("User $uploader has already been warned repeatedly\n");
                                        $users_notified{$uploader} += 1;
                                }
                        }
                        Pearle::limit();
                }
                elsif($image_update_process eq 'tag')
                {
                        # Lack of tag
                        my ($text, $editTime, $startTime, $token) = Pearle::getPage($image);
                        if($text =~ /{{[^}]*}}/)
                        {
                                Pearle::myLog("*Tag found on untagged image $image\n");
                                next;
                        }
                        elsif($text !~ /#REDIRECT/i)
                        {
                                $text .= $image_update_template;
                                Pearle::postPage($image, $editTime, $startTime, $token, $text, $image_update_summary, "no");
                        }
                        else
                        {
                                Pearle::myLog("*Redirect found for [[$image]]\n");
                                userwarnlog("*Redirect found for [[:$image]]\n");
                        }

                        # If no source, notify uploader
                        my $is_notified = isNotified(undef, $uploader, $image_regex, $image, \%notifications, \%dont_notify);
                        if(1 != $is_notified and !$skip_notice)
                        {
                                if(!($users_notified{$uploader}))
                                {
                                        Pearle::myLog("Warning user $uploader\n");
                                        userwarnlog("$user_note_template --~~~~\n", $uploader, $user_note_summary, $is_notified);
                                        $notifications{"$uploader,$image"} = 1;
                                        $users_notified{$uploader} = 1;
                                }
                                else
                                {
                                        Pearle::myLog("User $uploader has already been warned repeatedly\n");
                                        $users_notified{$uploader} += 1;
                                }
                        }
                        Pearle::limit();
                }
                else
                {
                        notelog("ERROR:Unknown process $image_update_process\n");
                        Pearle::myLog("ERROR: Unknown process $image_update_process\n");
                        exit;
                }
        }
        notelog("Saving notification list\n");
        saveNotificationList("./orphanbot.note", %notifications);
}

notelog("Saving notification list\n");
saveNotificationList("./orphanbot.note", %notifications);

Pearle::myLog("Finished with day's uploads\n");