User:XLinkBot/Code/LinkParser.pl

From Wikipedia, the free encyclopedia

#!/usr/bin/perl

# Detach from the launcher: the parent (fork returned the child's pid) exits
# right away and the child carries on with the rest of the script.  If fork
# fails it returns undef, so the script simply keeps running in the original
# process.
exit if fork;

use POE qw (Component::Client::TCP);
use HTML::Entities;
use LWP::UserAgent;
use strict;

# State shared with the event handlers defined further down in this file.
my ( $heap, $kernel );      # assigned in connected()
my $number_of_edits = 0;    # count of EDIT lines handled so far

# HTTP client used to download the rendered diffs.
my $diffFetcher = LWP::UserAgent->new( agent => "LinkParser/2.0" );

# The port of the local feed server is the first command-line argument.
my $server_port = shift @ARGV;

# Pause before connecting (presumably so the server has time to start
# listening -- TODO confirm).
sleep 4;

# Line-based TCP client talking to the server on the loopback interface.
POE::Component::Client::TCP->new(
    RemoteAddress => '127.0.0.1',
    RemotePort    => $server_port,
    Connected     => \&connected,
    ServerInput   => \&server_input,
);

# Run the event loop; exit normally once it returns.
POE::Kernel->run();
exit 0;

# Handle one line of input received from the feed server.
#
# Recognised input:
#   EDIT [[<page>]] [[<lang>:User:<user>]] <http diff url> <size>
#       Fetch the rendered diff, work out which external links the edit
#       introduced and, if there are any, report them with a
#       "PARSED [[page]] url size [[lang:User:user]] |link link ...|" line.
#       Afterwards ask for the next edit with "REQUEST".
#   NOEDIT
#       Nothing to do right now: wait one second and send "REQUEST" again.
# Any other input is ignored.
#
# Once more than 50 edits have been handled the client shuts itself down
# instead of requesting more work.
sub server_input {
    # Take the kernel and heap from the event arguments rather than from
    # file-level variables that only another handler fills in.
    my ( $kernel, $heap, $input ) = @_[ KERNEL, HEAP, ARG0 ];
    my $server = $heap->{server};

    if ( $input =~ m{EDIT \[\[(.+)\]\] \[\[(.+):User:(.+?)\]\] (http://.+) (.+)} ) {
        my ( $pagename, $lang, $username, $diffurl, $size ) = ( $1, $2, $3, $4, $5 );
        $number_of_edits++;

        my $response = $diffFetcher->get( $diffurl . "&diffonly=1&action=render" );
        if ( $response->is_success ) {
            my @new_links = _links_only_added( $response->content );
            if (@new_links) {
                $server->put( "PARSED [[$pagename]] $diffurl $size [[$lang:User:$username]] |"
                      . join( " ", @new_links )
                      . "|" );
            }
        }
        else {
            # Do not try to parse an error page as if it were a diff.
            warn "LinkParser: cannot fetch $diffurl: " . $response->status_line . "\n";
        }

        if ( $number_of_edits > 50 ) {
            # Ask the TCP component to shut down gracefully instead of calling
            # exit here: exiting immediately would throw away output that
            # put() has only queued (such as the PARSED line above).  The
            # script exits once POE::Kernel->run() returns.  No REQUEST is
            # sent, because this client will not be around to handle the reply.
            $kernel->yield("shutdown");
            return;
        }

        $server->put("REQUEST");
    }
    elsif ( $input =~ m{NOEDIT} ) {
        sleep 1;
        $server->put("REQUEST");
    }

    return;
}

# Return the URLs that occur in the added lines of a rendered diff but in none
# of its removed lines, in order of appearance (duplicates are kept).
sub _links_only_added {
    my ($diff_html) = @_;

    my @added_cells =
      $diff_html =~ m/<td class="diff-addedline"><div>(.*?)<\/div><\/td>/sg;
    my @removed_cells =
      $diff_html =~ m/<td class="diff-deletedline"><div>(.*?)<\/div><\/td>/sg;

    my %was_removed = map { $_ => 1 } _urls_in(@removed_cells);
    return grep { !$was_removed{$_} } _urls_in(@added_cells);
}

# Return every http/https URL found in the given diff cell fragments, after
# stripping the diff highlighting spans and decoding HTML entities.
sub _urls_in {
    my $text = join ' ', @_;

    $text =~ s/<span class="diffchange">//g;
    $text =~ s/<\/span>//g;
    decode_entities($text);

    # The same pattern is used for added and removed text so that a URL is
    # cut off at the same place on both sides and the comparison in
    # _links_only_added() is meaningful (parentheses are part of the URL).
    return $text =~ m{(https?://[^\s\]\[\{\}\\\|^~`\'\"<>]+)}sgi;
}

# Called when the TCP connection to the server is established.  Stores the
# kernel and the session heap in the file-level variables and asks the server
# for the first edit.
sub connected {
    $kernel = $_[KERNEL];
    $heap   = $_[HEAP];

    my $server = $heap->{server};
    $server->put("REQUEST");
}

# Send a "REQUEST" line to the server through the connection kept in the
# file-level $heap.
sub request_edit {
    my $server = $heap->{server};
    return $server->put("REQUEST");
}