User:XLinkBot/Code/LinkParser.pl
From Wikipedia, the free encyclopedia
< User:XLinkBot | Code
#!/usr/bin/perl
# LinkParser worker: connects to a local TCP server, asks it for edits,
# downloads each edit's diff and reports the external links that the edit
# added.  Usage: LinkParser.pl <server_port>
use strict;
use warnings;
use POE qw (Component::Client::TCP);
use HTML::Entities;
use LWP::UserAgent;

# The port of the local server is the first command-line argument.  Check it
# before detaching so a missing argument is reported on the terminal instead
# of reaching the TCP component as an undefined RemotePort.
my $server_port = shift;
unless ( defined $server_port && length $server_port ) {
    die "usage: $0 <server_port>\n";
}

# Detach: the parent exits immediately and the child carries on.  A failed
# fork (undef) is reported rather than silently running in the foreground.
my $child_pid = fork;
die "fork failed: $!\n" unless defined $child_pid;
exit if $child_pid;

# HTTP client used by server_input to download diff pages.
my $diffFetcher = LWP::UserAgent->new;
$diffFetcher->agent("LinkParser/2.0");

# NOTE(review): presumably this gives the server time to start listening
# before we connect -- confirm against whatever launches this script.
sleep 4;

POE::Component::Client::TCP->new(
    RemoteAddress => '127.0.0.1',
    RemotePort    => $server_port,
    ServerInput   => \&server_input,
    Connected     => \&connected,
);

# Shared with the handlers below: connected() stores the POE heap and kernel
# here; $number_of_edits counts the EDIT lines handled by server_input.
my ( $heap, $kernel );
my $number_of_edits = 0;

POE::Kernel->run();
exit 0;
# ServerInput handler: processes one line received from the server.
#
#   "EDIT [[page]] [[lang:User:name]] http://diff-url size"
#       Download the diff, work out which external links the edit added
#       (links present in added lines but not in removed lines) and, if
#       there are any, send back
#       "PARSED [[page]] diff-url size [[lang:User:name]] |link link ...|".
#       Then ask for the next edit with "REQUEST".
#   "NOEDIT"
#       Nothing to do right now: wait a second and send "REQUEST" again.
#
# Any other input is ignored (no REQUEST is sent for it).
#
# After more than 50 edits the client stops requesting work and shuts the
# connection down; the script then ends when POE::Kernel->run() returns.
sub server_input {
    # Take kernel and heap from the handler arguments rather than from the
    # file-level variables that connected() fills in.
    my ( $kernel, $heap, $input ) = @_[ KERNEL, HEAP, ARG0 ];

    if ( $input =~ m{EDIT \[\[(.+)\]\] \[\[(.+):User:(.+?)\]\] (http://.+) (.+)} ) {
        my ( $pagename, $lang, $username, $diffurl, $size ) = ( $1, $2, $3, $4, $5 );
        $number_of_edits++;

        # Fetch only the rendered diff table.  A failed request is treated
        # as an empty diff instead of scanning the error page for links.
        my $response    = $diffFetcher->get( $diffurl . "&diffonly=1&action=render" );
        my $diffContent = $response->is_success ? $response->content : '';

        my $addedTotal   = _diff_side_text( $diffContent, 'diff-addedline' );
        my $removedTotal = _diff_side_text( $diffContent, 'diff-deletedline' );

        # Both sides are scanned with the SAME URL pattern.  Previously the
        # removed side also stopped at "(" and ")", so an untouched link that
        # contained parentheses never matched its counterpart and was
        # reported as newly added.
        my %was_removed        = map { $_ => 1 } _extract_links($removedTotal);
        my @really_added_links = grep { !$was_removed{$_} } _extract_links($addedTotal);

        if (@really_added_links) {
            my $message = "PARSED [[$pagename]] $diffurl $size [[$lang:User:$username]] |"
                . join( " ", @really_added_links ) . "|";
            $heap->{server}->put($message);
        }

        if ( $number_of_edits > 50 ) {
            # Ask the TCP component to shut down rather than calling exit
            # here: put() only queues output, so exiting immediately would
            # drop the PARSED message queued just above.  No further REQUEST
            # is sent because its answer could never be processed.
            $kernel->yield("shutdown");
            return;
        }

        $heap->{server}->put("REQUEST");
    }
    elsif ( $input =~ m{NOEDIT} ) {
        # Blocking sleep: the whole process pauses before polling again.
        sleep 1;
        $heap->{server}->put("REQUEST");
    }

    return;
}

# Returns the text of every diff cell with the given CSS class in $diff_html,
# joined by spaces, with the "diffchange" highlighting spans removed and HTML
# entities decoded.
sub _diff_side_text {
    my ( $diff_html, $cell_class ) = @_;

    my @cells = $diff_html =~ m{<td class="\Q$cell_class\E"><div>(.*?)</div></td>}sg;
    my $text  = join( ' ', @cells );

    $text =~ s/<span class="diffchange">//g;
    $text =~ s/<\/span>//g;
    decode_entities($text);    # decodes in place

    return $text;
}

# Returns every http/https URL found in $text, in order of appearance.  A URL
# ends at whitespace or at one of the characters ] [ { } \ | ^ ~ ` ' " < >.
sub _extract_links {
    my ($text) = @_;
    return $text =~ m{(https?://[^\s\]\[\{\}\\\|^~`\'\"<>]+)}sgi;
}
# Connected handler: stores the POE kernel and heap in the file-level
# $kernel / $heap variables, then sends the first "REQUEST" to the server.
sub connected {
    $kernel = $_[KERNEL];
    $heap   = $_[HEAP];

    my $server = $heap->{server};
    $server->put("REQUEST");
}
# Sends a "REQUEST" line to the server through the file-level $heap.
# NOTE(review): nothing in the visible code calls or registers this sub --
# confirm whether it is still needed.
sub request_edit {
    my $server = $heap->{server};
    return $server->put("REQUEST");
}

