User:WatchlistBot/source.java

From Wikipedia, the free encyclopedia

Contents

[edit] WatchlistBot.java

/**
 * Command-line entry point of the bot: logs in, regenerates the watchlist
 * page(s) of every configured WikiProject one after another, and logs out.
 */
class WatchlistBot {
    /** include list for projects that have no untagged pages to add */
    private static final String[] NO_INCLUDE_PAGES = new String[0];

    /**
     * Runs one full update pass over all configured projects.
     *
     * @param args ignored
     * @throws Exception if logging in, updating a project, or logging out fails
     */
    public static void main (String[] args) throws Exception {
        WikiSessionManager sessionMgr = new WikiSessionManager();
        boolean completed = false;
        try {
            sessionMgr.userLogin(Private.username, Private.password);

            // numismatics
            String[] currencyTemplates = {"Template:Currencies of Africa",
                                          "Template:Currencies of Asia",
                                          "Template:Currencies of Europe",
                                          "Template:Currencies of Oceania",
                                          "Template:Currencies of the Americas"};
            updateProject(sessionMgr, "Numismatics", "Numismaticnotice", "Articles",
                          currencyTemplates, true);

            // exonumia
            updateProject(sessionMgr, "Numismatics", "Exonumianotice", "Exonumia articles",
                          NO_INCLUDE_PAGES, true);

            // Hawaii
            updateProject(sessionMgr, "Hawaii", "WPHawaii", "Hawaii recent changes",
                          NO_INCLUDE_PAGES, true);

            // Texas
            updateProject(sessionMgr, "Texas", "WikiProject Texas", "Articles",
                          NO_INCLUDE_PAGES, true);

            // Ice Hockey
            updateProject(sessionMgr, "Ice Hockey", "Ice hockey", "Articles",
                          NO_INCLUDE_PAGES, true);

            // Louisville
            updateProject(sessionMgr, "Louisville", "WikiProject Louisville", "Watchall",
                          NO_INCLUDE_PAGES, true);

            // Kentucky
            updateProject(sessionMgr, "Kentucky", "WikiProject Kentucky", "Watchall",
                          NO_INCLUDE_PAGES, true);

            // Texas State Highways
            updateProject(sessionMgr, "Texas State Highways", "Texas State Highway WikiProject",
                          "Watchlist", NO_INCLUDE_PAGES, true);

            // Dallas
            updateProject(sessionMgr, "Dallas", "WikiProject Dallas", "Articles",
                          NO_INCLUDE_PAGES, true);

            // Comics
            updateProject(sessionMgr, "Comics", "comicsproj", "Articles",
                          NO_INCLUDE_PAGES, true);

            // Pittsburgh
            updateProject(sessionMgr, "Pittsburgh", "PittsburghWikiProject", "Articles",
                          NO_INCLUDE_PAGES, true);

            // Baseball
            updateProject(sessionMgr, "Baseball", "Baseball-WikiProject", "Articles",
                          NO_INCLUDE_PAGES, true);

            // Bell Systems
            updateProject(sessionMgr, "Bell Systems", "WikiProject Bell System", "Articles",
                          NO_INCLUDE_PAGES, true);

            // LGBT studies
            updateProject(sessionMgr, "LGBT studies", "LGBTProject", "Articles",
                          NO_INCLUDE_PAGES, true);

            // San Francisco Bay Area
            updateProject(sessionMgr, "San Francisco Bay Area", "SFBAProject", "Watchlist",
                          NO_INCLUDE_PAGES, true);

            // Africa
            updateProject(sessionMgr, "Africa", "AfricaProject", "Watchlist",
                          NO_INCLUDE_PAGES, true);

            // Electronics
            updateProject(sessionMgr, "Electronics", "Electron", "Articles",
                          NO_INCLUDE_PAGES, true);

            // Tennessee
            updateProject(sessionMgr, "Tennessee", "WikiProject Tennessee", "Articles",
                          NO_INCLUDE_PAGES, true);

            // Hong Kong
            updateProject(sessionMgr, "Hong Kong", "WikiProject Hong Kong", "Articles",
                          NO_INCLUDE_PAGES, true);

            // Films
            updateProject(sessionMgr, "Films", "Film", "Articles",
                          NO_INCLUDE_PAGES, true);

            // Automobiles (lists everything in the tagged categories)
            updateProject(sessionMgr, "Automobiles", "AutomobileWatch", "Articles",
                          NO_INCLUDE_PAGES, false);

            // Cricket (lists everything in the tagged categories)
            updateProject(sessionMgr, "Cricket", "CricketWatch", "Articles",
                          NO_INCLUDE_PAGES, false);

            completed = true;
            System.out.println("finished");
        } finally {
            if (completed) {
                sessionMgr.userLogout();
            } else {
                // a run that is already failing must still end the session, but a
                // logout problem must not hide the exception that caused the failure
                try {
                    sessionMgr.userLogout();
                } catch (Exception logoutFailure) {
                    System.out.println("logout failed: " + logoutFailure);
                }
            }
        }
    }

    /**
     * Builds the watchlist of a single project and writes it out.
     *
     * @param sessionMgr the session to use
     * @param projectName the project name (without "Wikipedia:WikiProject")
     * @param template the name of the project's talk page template (without namespace)
     * @param articlePage the project subpage that receives the list
     * @param includePages pages to list even though they are not tagged
     * @param useTaggedPages true to list tagged pages, false to list all pages
     *        in tagged categories
     * @throws Exception if the update fails
     */
    private static void updateProject (WikiSessionManager sessionMgr, String projectName,
                                       String template, String articlePage,
                                       String[] includePages, boolean useTaggedPages)
            throws Exception {
        Project project = new Project(sessionMgr, projectName, template, articlePage, includePages);
        project.updateWatchlist(useTaggedPages);
    }
}

[edit] WikiSessionManager.java

import java.io.IOException;
import java.io.OutputStreamWriter;
import java.util.Arrays;
import java.net.URL;
import java.net.URLEncoder;
import java.net.URLConnection;
 
    /**
     * WikiSessionManager is a utility class that logs into the English
     * Wikipedia and facilitates making HTTP requests with cookies.
     *
     * This program is free software; you can redistribute it and/or modify
     * it under the terms of the GNU General Public License as published by
     * the Free Software Foundation; either version 2 of the License, or
     * (at your option) any later version.
     *
     * This program is distributed in the hope that it will be useful,
     * but WITHOUT ANY WARRANTY; without even the implied warranty of
     * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
     * GNU General Public License for more details.
     *
     * You should have received a copy of the GNU General Public License
     * along with this program; if not, write to the Free Software
     * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
     * 
     * @author Gracenotes
     * @version 0.1
     **/
 
public class WikiSessionManager
{
    /** name of the response header that carries cookies; header names are matched ignoring case */
    private static final String SET_COOKIE = "Set-Cookie";

    private String cookie, sessionData, username;
    private boolean loggedIn;

    /**
     * Creates a manager that is not logged in and holds no cookies.
     */
    public WikiSessionManager()
    {
        this.loggedIn = false;
        this.sessionData = "";
        this.cookie = "";
    }

    /**
     * Posts the credentials to api.php, stores the cookies sent back, and then
     * requests the sandbox edit page to pick up the session cookie.
     * The password array is overwritten with blanks once the POST has been
     * attempted, whether or not it succeeded.
     *
     * @param username the account name; surrounding whitespace is trimmed
     * @param password the account password; surrounding whitespace is trimmed
     * @throws IllegalArgumentException if either argument is blank
     * @throws IOException if a request fails or no session cookie is returned
     */
    public void userLogin(String username, char[] password) throws IOException
    {
        username = username.trim();
        if (username.length() == 0 || password.length == 0) throw new IllegalArgumentException("Blank parameter");

        URL url = new URL("http://en.wikipedia.org/w/api.php");
        URLConnection connection = url.openConnection();

        connection.setDoOutput(true);
        connection.setUseCaches(false);
        connection.setRequestProperty("Content-Type", "application/x-www-form-urlencoded");
        connection.connect();

        try
        {
            OutputStreamWriter output = new OutputStreamWriter(connection.getOutputStream(), "UTF-8");
            try
            {
                output.write("action=login" +
                             "&lgname=" + URLEncoder.encode(username, "UTF-8") +
                             "&lgpassword=" + URLEncoder.encode(new String(password).trim(), "UTF-8"));
                output.flush();
            }
            finally
            {
                output.close();
            }
        }
        finally
        {
            // wipe the caller's array even when the request could not be sent
            Arrays.fill(password, ' ');
        }

        // gather the name=value part of every cookie into one Cookie header value;
        // index 0 is skipped as in findSessionData (its key may be null)
        StringBuilder receivedCookie = new StringBuilder();
        String headerName;
        for (int i = 1; (headerName = connection.getHeaderFieldKey(i)) != null; i++)
        {
            if (headerName.equalsIgnoreCase(SET_COOKIE))
            {
                if (receivedCookie.length() != 0)
                    receivedCookie.append("; ");
                receivedCookie.append(firstCookiePair(connection.getHeaderField(i)));
            }
        }
        this.cookie = receivedCookie.toString();
        this.loggedIn = this.cookie.indexOf("Token=") != -1;
        this.username = this.loggedIn ? username : null;

        // IB edit (get the session data)
        url = new URL("http://en.wikipedia.org/w/index.php?title=Wikipedia:Sandbox&action=edit");
        connection = url.openConnection();
        addCookies(connection);
        connection.connect();
        if (!findSessionData(connection)) {
            throw new IOException("Could not load session data");
        }
        // end IB edit
    }

    /**
     * Requests Special:Userlogout with the current cookies and forgets the
     * local session state. Does nothing if no user is logged in.
     *
     * @throws IOException if the logout request fails; the local state is
     *         cleared even in that case
     */
    public void userLogout() throws IOException
    {
        if (!this.loggedIn)
            return;
        URL url = new URL("http://en.wikipedia.org/w/index.php?title=Special:Userlogout");
        URLConnection connection = url.openConnection();
        this.addCookies(connection);
        try
        {
            connection.connect();
            // connect() only opens the link; asking for the response is what
            // makes the HTTP request go out
            connection.getInputStream().close();
        }
        finally
        {
            this.loggedIn = false;
            this.cookie = "";
            this.sessionData = "";
        }
    }

    /**
     * Indicates whether a user is logged in or not
     * 
     * @return A boolean showing whether a user is logged in or not
     */
    public boolean isLoggedIn()
    {
        return this.loggedIn;
    }

    /**
     * Adds the stored cookies and a User-Agent header (the user name) to a
     * request that has not been sent yet. Does nothing if no user is logged in.
     *
     * @param connection the connection to decorate
     */
    public void addCookies(URLConnection connection)
    {
        if (!this.loggedIn)
            return;
        StringBuilder header = new StringBuilder(this.cookie);
        // only join the session cookie when there is one, to avoid a dangling "; "
        if (this.sessionData != null && this.sessionData.length() != 0)
            header.append("; ").append(this.sessionData);
        connection.setRequestProperty("Cookie", header.toString());
        connection.setRequestProperty("User-Agent", this.username);
    }

    /**
     * Looks for a cookie whose header value contains "_session" in the response
     * and remembers its name=value part; any previously stored session cookie
     * is discarded first. If several cookies match, the last one wins.
     *
     * @param connection the connection whose response headers are inspected
     * @return true if a session cookie was found
     */
    public boolean findSessionData(URLConnection connection)
    {
        this.sessionData = "";
        String headerName;
        for (int i = 1; (headerName = connection.getHeaderFieldKey(i)) != null; i++)
        {
            if (!headerName.equalsIgnoreCase(SET_COOKIE))
                continue;
            String value = connection.getHeaderField(i);
            if (value != null && value.indexOf("_session") != -1)
                this.sessionData = firstCookiePair(value);
        }

        return this.sessionData.length() != 0;
    }

    /**
     * Returns the part of a Set-Cookie value before the first semicolon.
     */
    private static String firstCookiePair(String headerValue)
    {
        int semicolon = headerValue.indexOf(';');
        return semicolon < 0 ? headerValue : headerValue.substring(0, semicolon);
    }
}

[edit] Project.java

import java.io.*;
import java.net.*;
 
/**
 * One WikiProject whose watchlist page(s) the bot maintains. Owns the
 * Watchlist for the project and writes its pages either to Wikipedia or,
 * when DBG is set, to local text files.
 */
public class Project {
        /** are we debugging (sends output to file instead of wikipedia) **/
        final static boolean DBG = false;

        /** the watchlist **/
        private Watchlist watchlist;

        /** the name of the project (without Wikipedia:WikiProject) **/
        private String projectName;

        /** the session manager (controls logging in, communication w/ wikipedia) **/
        private WikiSessionManager sessionMgr;

        /** create the project and its (still empty) watchlist
         * @param sessionMgr the session manager
         * @param projectName the name of the project (without Wikipedia:WikiProject)
         * @param template the name of the project template (without namespace)
         * @param articlePage the name of the subpage where the article list goes
         * @param includePages pages to include even though they're not tagged
         **/
        Project (WikiSessionManager sessionMgr, String projectName, String template,
                         String articlePage, String[] includePages) {
                this.sessionMgr = sessionMgr;
                this.projectName = projectName;
                this.watchlist = new Watchlist(projectName, articlePage, template, 
                                                                           sessionMgr, includePages, this);
        }

        /** update the watchlist
         * @param useTaggedPages are we including tagged pages (true), or all pages in
         *        tagged categories (false)
         **/
        void updateWatchlist (boolean useTaggedPages) throws UnsupportedEncodingException,
                        IOException, MalformedURLException {
                watchlist.update(useTaggedPages);
                watchlist.write();
        }

        /** write a page in the project; a failure is reported on standard output
         * and otherwise ignored, so the remaining pages are still attempted
         * @param subPageName the name of the subpage
         * @param text the text to write
         */
        void writePage (String subPageName, String text) {
                try {
                        if (DBG) {
                                // slashes would be taken as directory separators
                                FileWriter file = new FileWriter(subPageName.replace("/", "_") + ".txt");
                                try {
                                        file.write(text);
                                } finally {
                                        // release the file handle even if the write fails
                                        file.close();
                                }
                        } else {
                                String pageName = "Wikipedia:WikiProject " + projectName + "/" + subPageName;
                                String comment = "full update by [[User:WatchlistBot|WatchlistBot]]";
                                Page page = new Page(sessionMgr, pageName);
                                page.put(text, comment, false);
                        }
                } catch (Exception e) {
                        System.out.println("could not write " + projectName + "/" + subPageName + ": " + e);
                }
        }
}

[edit] Watchlist.java

import java.util.*;
import java.io.*;
import java.net.*;
 
public class Watchlist {
        /** the project **/
        private Project project;
 
        /** the template name (without namespace) **/
        private String template;
 
        /** the session manager **/
        private WikiSessionManager sessionMgr;
 
        /** does this watchlist use tagged pages (as opposed to pages in a category list) **/
        private boolean taggedPages = true;
 
        /** pages which should be included in the project even though they're not tagged
         * (maybe because they share a talk page)
         **/
        private String[] includePages;
 
        /** the name of the project (without Wikipedia:WikiProject) **/
        private String projectName;
 
        /** the name of the page where the article list goes **/
        private String articlePage;
 
        /** the article pages **/
        private TreeSet<String> articles;
        /** the article talk pages **/
        private TreeSet<String> articlesTalk;
        /** the wikipedia pages **/
        private TreeSet<String> wikis;
        /** the wikipedia talk pages **/
        private TreeSet<String> wikisTalk;
        /** the template pages **/
        private TreeSet<String> templates;
        /** the template talk pages **/
        private TreeSet<String> templatesTalk;
        /** the category pages **/
        private TreeSet<String> categories;
        /** the category talk pages **/
        private TreeSet<String> categoriesTalk;
        /** the image pages **/
        private TreeSet<String> images;
        /** the image talk pages **/
        private TreeSet<String> imagesTalk;
        /** the portal pages **/
        private TreeSet<String> portals;
        /** the portal talk pages **/
        private TreeSet<String> portalsTalk;
 
        /** the maximum number of articles to put on one page **/
        private static final int MAX_ARTICLES = 9000;
 
        /** this one is for the top of all bot-created pages **/
        private static final String BOT_WARN =
                                "<div class=\"notice\" " +
                    "style=\"background:#ffe1a7; border:1px solid #AAA; " +
                    "padding:0.2em; margin:0.5em auto;\"> " +
                    "[[Image:Stop_hand.svg|left|20px]] This page is automatically " +
                    "recreated from time to time. Accordingly, any changes you " +
                    "make here will be overwitten. See below for details.</div>\n\n";
        /** this text is used to start the first page, if we're splitting (use SPLIT_INTRO for main page,
         * SPLIT_INTRO_NEXT for next pages)
         **/
        private static final String SPLIT_INTRO1 =
                                "There are too many articles (more than " + MAX_ARTICLES + ") in this project " +
                    "to list them all on one page. This page and the ones linked ";
        private static final String SPLIT_INTRO2 = "contain ";
        private static final String SPLIT_INTRO = SPLIT_INTRO1 + "below " + SPLIT_INTRO2;
        private static final String SPLIT_INTRO_NEXT = SPLIT_INTRO1 + "from the main page " + SPLIT_INTRO2;
        /** this text starts the first page, if we're not splitting **/
        private static final String ONE_PAGE_INTRO = "This page contains ";
        /** this text is the rest of the intro, in either case (use END_INTRO1 + tagText + END_INTRO2
         * + template + END_INTRO3 + pageName + END_INTRO4 + pageName + END_INTRO5)
         **/
        private static final String END_INTRO1 =
                                "links to all articles, categories, images, portal pages " +
                    "templates, and project pages ";
        private static final String END_INTRO2 = "with {{tl|";
        private static final String END_INTRO3 = "}} on their talk page. It was " +
                    "generated by [[User:WatchlistBot|" +
                    "WatchlistBot]]. Its purpose is to be able to track " +
                    "the project history using ''[[Special:Recentchangeslinked/" +
                    "Wikipedia:WikiProject ";
        private static final String END_INTRO4 =
                                "|related changes]]'' or ''[http://tools.wikimedia.de/~interiot/" +
                    "cgi-bin/offtoolserver/RC_firstonly?url=http%3A%2F%2Fen.wikipedia.org" +
                    "%2Fw%2Findex.php%3Ftitle%3DSpecial%3ARecentchangeslinked%26target" +
                    "%3DWikipedia:WikiProject_";
        private static final String END_INTRO5 =
                                "%26hideminor%3D0%26days%3D7%26limit%3D500 related watchlist]'' which " +
                    "only shows the last change for each article.\n\n";
 
        /** the text to be put on the main page **/
        private StringBuilder mainText;
        /** the text to be put on a sub page **/
        private StringBuilder subText;
        /** the number of articles on the main page **/
        private int count = 0;
        /** are we still putting articles on the main page **/
        private boolean onMainPage = true;
        /** special text to use if we're not using tagged pages **/
        private String tagText = "";
        /** the page number for the current subpage **/
        private int pageNo = 1;
        /** the output page name, for putting in messages **/
        private String outputName;
 
        /** create a watchlist; this only stores the arguments, the lists
         * themselves are built by update()
         * @param projectName the name of the project (without Wikipedia:WikiProject)
         * @param articlePage the name of the page where the article list goes
         * @param template the template name (without namespace)
         * @param sessionMgr the session manager
         * @param includePages pages to include even though they're not tagged
         * @param project the project this watchlist belongs to
         **/
        Watchlist (String projectName, String articlePage, String template,
                           WikiSessionManager sessionMgr, String[] includePages,
                           Project project) {
                // collaborators
                this.project = project;
                this.sessionMgr = sessionMgr;

                // where the list comes from
                this.template = template;
                this.includePages = includePages;

                // where the list goes
                this.projectName = projectName;
                this.articlePage = articlePage;
        }
 
        /** update the watchlist: rebuild all page lists from the pages that
         * transclude the project template
         * @param useTaggedPages are we including tagged pages (true), or all pages in
         *        tagged categories (false)
         * @throws IOException if a page list cannot be fetched
         **/
        void update (boolean useTaggedPages) throws UnsupportedEncodingException,
                        IOException, MalformedURLException {
                // remember the mode, write() words the page intro differently for
                // category-based lists
                this.taggedPages = useTaggedPages;

                // reinitialize lists
                initLists();
                // first find the pages which are linked
                Page page = new Page(sessionMgr, "Template:" + template);
                TreeSet<String> refs = page.getTransclusions();
                if (!useTaggedPages) {
                        final String talkPrefix = "Category talk:";
                        // the list of pages in tagged categories
                        TreeSet<String> pages = new TreeSet<String>();
                        for (String ref : refs) {
                                if (ref.startsWith(talkPrefix)) {
                                        System.out.println("getting pages in " + ref + " pages: " + pages.size());
                                        // swap only the namespace prefix; the category name
                                        // itself may contain " talk" and must stay intact
                                        Page cat = new Page(sessionMgr,
                                                        "Category:" + ref.substring(talkPrefix.length()));
                                        pages.addAll(cat.getMembers());
                                }
                        }
                        // move the pages list into refs (so the loop below files the
                        // category members instead of the tagged talk pages)
                        refs = pages;
                }
                for (String ref : refs) {
                        processPageName(ref);
                }
        }
 
        /** replace every page list with a fresh empty one, then file the
         * explicitly included pages into them
         **/
        void initLists () {
                // subject pages, one sorted set per namespace
                articles = new TreeSet<String>();
                wikis = new TreeSet<String>();
                templates = new TreeSet<String>();
                categories = new TreeSet<String>();
                images = new TreeSet<String>();
                portals = new TreeSet<String>();

                // the matching talk pages
                articlesTalk = new TreeSet<String>();
                wikisTalk = new TreeSet<String>();
                templatesTalk = new TreeSet<String>();
                categoriesTalk = new TreeSet<String>();
                imagesTalk = new TreeSet<String>();
                portalsTalk = new TreeSet<String>();

                // seed with the pages that are listed without being tagged
                for (int i = 0; i < includePages.length; i++) {
                        processPageName(includePages[i]);
                }
        }
 
 
        /** process a page name -- that is, add the article and its talk
         *  page to the appropriate lists.
         *  The name is divided at its first colon only, so the rest of a title
         *  that itself contains a colon is kept. Names that are empty, or that
         *  have nothing after the colon, are ignored; so are names whose prefix
         *  is not one of the namespaces handled below.
         *  @param pageName the page name, with namespace
         **/
        private void processPageName (String pageName) {
                if (pageName == null || pageName.length() == 0) {
                        // an empty title would break the first-letter grouping in write()
                        return;
                }
                int colon = pageName.indexOf(':');
                if (colon < 0) {
                        // no namespace at all: a plain article
                        articles.add(pageName);
                        articlesTalk.add("Talk:" + pageName);
                        return;
                }
                String namespace = pageName.substring(0, colon);
                String name = pageName.substring(colon + 1);
                if (name.length() == 0) {
                        return;
                }
                // startsWith also accepts the talk variant of each namespace
                if (namespace.equals("Talk")) {
                        articles.add(name);
                        articlesTalk.add("Talk:" + name);
                } else if (namespace.startsWith("Wikipedia")) {
                        wikis.add("Wikipedia:" + name);
                        wikisTalk.add("Wikipedia talk:" + name);
                } else if (namespace.startsWith("Template")) {
                        templates.add("Template:" + name);
                        templatesTalk.add("Template talk:" + name);
                } else if (namespace.startsWith("Category")) {
                        // leading colon makes a link instead of categorising the list page
                        categories.add(":Category:" + name);
                        categoriesTalk.add("Category talk:" + name);
                } else if (namespace.startsWith("Image")) {
                        // leading colon makes a link instead of displaying the image
                        images.add(":Image:" + name);
                        imagesTalk.add("Image talk:" + name);
                } else if (namespace.startsWith("Portal")) {
                        portals.add("Portal:" + name);
                        portalsTalk.add("Portal talk:" + name);
                }
                // NOTE(review): an untagged article whose own title contains a colon
                // (no "Talk:" prefix) still falls through here and is dropped, as before
        }
 
        /** prepare the output and write to wikipedia.
         * Builds the text of the main list page in mainText and, when the
         * project has more than MAX_ARTICLES pages (talk pages included),
         * spreads the lists over numbered subpages ("/Page1", "/Page2", ...)
         * which are written through project.writePage as they fill up. The
         * main page is written last.
         * NOTE(review): count, onMainPage and pageNo are only initialised at
         * their declarations and onMainPage/pageNo are never reset here, so
         * this method appears to assume one call per Watchlist instance --
         * confirm before reusing an instance.
         **/
        void write () {
                // if we're not using tagged pages, we need to update the output a bit
                if (!taggedPages) {
                tagText = "in categories ";
        }
                // the page name of the output
        outputName = projectName.replace(" ", "_") + "/" +
                articlePage.replace(" ", "_");
 
                mainText = new StringBuilder(BOT_WARN);
 
                // count the number of articles
                int numArticles = articles.size() + wikis.size() + templates.size() +
                        categories.size() + images.size() + portals.size();
 
                // figure out if we can fit everything on one page (double the
                // number of articles to count talk pages)
                boolean splitting = (numArticles*2 > MAX_ARTICLES);
                if (splitting) {
                        mainText.append(SPLIT_INTRO);
                } else {
                        mainText.append(ONE_PAGE_INTRO);
                }
                mainText.append(END_INTRO1 + tagText + END_INTRO2 + template + END_INTRO3 +
                        outputName + END_INTRO4 + outputName + END_INTRO5);
 
                mainText.append("==Regular content (count: " + numArticles + ")==\n");
 
                mainText.append("===Articles (count: " + articles.size() + ")===\n");
                // prevChar is the first letter of the previous article, firstChar the
                // first letter of the first article on the current page
                // NOTE(review): because prevChar starts as 'Z', a list whose very first
                // article begins with 'Z' gets no "====Z====" heading
                char prevChar = 'Z';
                char firstChar = prevChar; // initialize to something late in the alphabet
 
                // the text for this subpage (if we're not splitting, this will be put
                // onto the main page)
                subText = new StringBuilder();
 
                for (String s : articles) {
                        // start a new letter heading whenever the first letter changes
                        if (s.charAt(0) != prevChar) {
                                subText.append("====" + s.charAt(0) + "====\n");
                                prevChar = s.charAt(0);
                                // if this is the first article
                                if (count == 0) {
                                        firstChar = prevChar;
                                }
                        }
                        // put the article name
                        subText.append("*[[" + s + "]]\n");
                        count++;
                        // if we've put all the articles we can on this page
                        if (count > MAX_ARTICLES) {
                                count = 0;
                                if (onMainPage) {
                                        // the first full batch stays on the main page
                                        onMainPage = false;
                                        mainText.append(subText);
                                } else {
                                        // later batches become subpages: link the subpage from the
                                        // main page, fill in its letter range and write it
                                        mainText.append("====[[/Page" + pageNo + "|" +
                                                        firstChar + "-" + prevChar + "]]====\n");
                                        int index = subText.indexOf("<range>");
                                        subText.replace(index, index+7, firstChar + "-" + prevChar);
                                        project.writePage(articlePage + "/Page" + pageNo, subText.toString());
                                        pageNo++;
                                }
                                // the next batch continues under the current letter; "<range>"
                                // is a placeholder replaced once the batch is complete
                                firstChar = prevChar;
                                subText = new StringBuilder("===Articles <range>===\n" +
                                                  "====" + firstChar + "====\n");
                        }
                }
                // if we have too many articles, and we've already started the second
                // (or more) page
                if (splitting && !onMainPage) {
                        mainText.append("====[[/Page" + pageNo + "|" +
                                        firstChar + "-" + prevChar + "]]====\n");
                        int index = subText.indexOf("<range>");
                        subText.replace(index, index+7, firstChar + "-" + prevChar);
                        project.writePage(articlePage + "/Page" + pageNo, subText.toString());
                        pageNo++;
                } else { // we only have one page or this is the first batch
                        mainText.append(subText);
                }
 
                // the other namespaces; each call either appends its list to mainText
                // or starts a new subText and links to the next subpage
                // NOTE(review): each call that goes to a subpage replaces subText, so
                // only the text built by the last such call is still in subText when
                // it is written below -- verify that this is intended
                prepareArticleList("Wikipedia", wikis, true);
                prepareArticleList("Templates", templates, true);
                prepareArticleList("Portals", portals, true);
                prepareArticleList("Categories", categories, true);
                prepareArticleList("Images", images, true);
 
                mainText.append("==Talk pages==\n");
 
                mainText.append("===Articles===\n");
                prevChar = firstChar = 'Z';
                if (splitting && subText.length() != 0) {
                        // flush whatever subpage text is pending, then start the first
                        // talk subpage with the full intro
                        project.writePage(articlePage + "/Page" + pageNo, subText.toString());
                        pageNo++;
                        subText = new StringBuilder(BOT_WARN + SPLIT_INTRO_NEXT +
                                                                                END_INTRO1 + tagText + END_INTRO2 +
                                                                                template + END_INTRO3 + outputName +
                                                                                END_INTRO4 + outputName + END_INTRO5);
                        // NOTE(review): this heading opens with "===" but closes with "=="
                        subText.append("===Articles <range>==\n");
                } else {
                        subText = new StringBuilder();
                }
                count = 0;
                char endChar = 'Z';
                for (String s : articlesTalk) {
                        // entries are "Talk:" + name, so index 5 is the first letter of the name
                        if (count == 0) {
                                firstChar = s.charAt(5);
                        }
                        subText.append("*[[" + s + "]]\n");
                        count++;
                        if (count > MAX_ARTICLES) {
                                // this talk subpage is full: link it, fill in its range, write it
                                count = 0;
                                endChar = s.charAt(5);
                                mainText.append("*[[/Page" + pageNo + "|" +
                                                            firstChar + "-" + endChar + "]]\n");
                                int index = subText.indexOf("<range>");
                                subText.replace(index, index+7, firstChar + "-" + endChar);
                                project.writePage(articlePage + "/Page" + pageNo, subText.toString());
                                pageNo++;
                                firstChar = endChar;
                subText = new StringBuilder("===Articles <range>===\n");
                        }
                        endChar = s.charAt(5);
                }
                if (splitting) {
                        // write the last (possibly partial) talk subpage
                        mainText.append("*[[/Page" + pageNo + "|" +
                                        firstChar + "-" + endChar + "]]\n");
            int index = subText.indexOf("<range>");
            if (index != -1) {
                subText = subText.replace(index, index+7, firstChar + "-" + endChar);
            }
            project.writePage(articlePage + "/Page" + pageNo, subText.toString());
            pageNo++;
                } else {
                        mainText.append(subText);
                }
 
                // talk pages of the other namespaces (headings without counts)
                // NOTE(review): no writePage call for subText follows these calls, so
                // text they place in subText does not appear to be written anywhere
                prepareArticleList("Wikipedia", wikisTalk, false);
                prepareArticleList("Templates", templatesTalk, false);
                prepareArticleList("Portals", portalsTalk, false);
                prepareArticleList("Categories", categoriesTalk, false);
                prepareArticleList("Images", imagesTalk, false);          
 
                // finally write the main page itself
                project.writePage(articlePage, mainText.toString());
        }
 
        /** add one namespace's list of pages to the output.
         * The section heading always goes on the main page. The list itself goes
         * on the main page too if it still fits there; otherwise the main page
         * only gets a link to the section on the next subpage, and subText is
         * replaced by the text for that subpage (intro, heading and list).
         * @param title the section title
         * @param pages the pages to list
         * @param includeCount whether to show the number of pages in the heading
         **/
        private void prepareArticleList (String title, TreeSet<String> pages,
                                         boolean includeCount) {
                String countText = includeCount ? " (count: " + pages.size() + ")" : "";
                mainText.append("===" + title + countText + "===\n");

                // if we need to put these articles on the next page (because we've
                // already started the second page, or we can't fit all these pages
                // on the main page)
                boolean pagesOnNext = !onMainPage || count + pages.size() > MAX_ARTICLES;
                if (pagesOnNext) {
                        // both tracking links in the intro must name the subpage itself,
                        // which is written as ".../Page<n>"
                        String subPageName = outputName + "/Page" + pageNo;
                        subText = new StringBuilder(BOT_WARN + SPLIT_INTRO_NEXT +
                                        END_INTRO1 + tagText + END_INTRO2 + template + END_INTRO3 +
                                        subPageName + END_INTRO4 + subPageName +
                                        END_INTRO5 +
                                        "===" + title + "===\n");
                        mainText.append("*[[/Page" + pageNo + "#" + title + "|" + title +"]]\n");
                        onMainPage = false;
                        // NOTE(review): any text already pending in subText is discarded
                        // here, and writing subText out is left to the caller
                } else {
                        subText = new StringBuilder();
                        count += pages.size();
                }

                for (String s : pages) {
                        subText.append("*[[" + s + "]]\n");
                }

                // if these pages are going on the main page, put them there
                if (!pagesOnNext) {
                        mainText.append(subText);
                        subText = new StringBuilder();
                }
        }
}

[edit] Page.java

import java.io.*;
import java.net.*;
import java.util.*;
import java.util.regex.*;
import org.apache.commons.lang.StringEscapeUtils;
 
/** A single wiki page, addressed by title.
 *  <p>
 *  Provides reading the raw text ({@link #get()}), writing new text through
 *  the edit form ({@link #put(String, String, boolean)}), and listing related
 *  pages through api.php ({@link #getTransclusions()}, {@link #getMembers()}).
 *  All requests carry the cookies supplied by the session manager.
 *  <p>
 *  The write throttle and the max-lag back-off are kept in static fields, so
 *  they are shared by all Page objects. The class is not written for use from
 *  several threads.
 */
public class Page {
        /** the title of the page (with namespace), URL-encoded after construction **/
        private String title;
        /** the title of the page (without namespace), URL-encoded after construction **/
        private String titleWithoutNamespace = null;

        /** the index.php URL (as a String) **/
        private final String strIndexURL = "http://en.wikipedia.org/w/index.php";

        /** the api.php URL (as a String) **/
        private final String strAPIURL = "http://en.wikipedia.org/w/api.php";

        /** the session manager (manages logging in, cookies, etc) **/
        private WikiSessionManager sessionMgr;

        /** the sleep time to start with (and to return to after a success) **/
        private final static int INITIAL_SLEEP_TIME = 5000;
        /** how long to sleep if maxlag is > 5 -- doubled after every failed try **/
        private static int sleepTime = INITIAL_SLEEP_TIME;
        /** the maximum time to sleep (after this much time, we quit) **/
        private final static int MAX_SLEEP_TIME = 160000;
        /** the last write time, so we can keep the bot slow **/
        private static long lastWriteTime = -1;
        /** the minimum delay between writes **/
        private final static int MIN_WRITE_DELAY = 10000;

        /** the form of the hidden inputs on the edit page: group 1 is the value,
         *  group 2 is the name
         **/
        private final static Pattern HIDDEN_INPUT =
                Pattern.compile("<input type='hidden' value=\"(.*?)\" name=\"(.*?)\" />");

        /** the list of articles that we're building, for example,
         *  in @see getTransclusions()
         **/
        private TreeSet<String> articles;

        /** create the Page object and store its title (with namespace)
         * @param sessionMgr the session manager (controls logging in and other
         *        interaction with wikipedia)
         * @param title the title of the page (with namespace)
         * @throws UnsupportedEncodingException if there's a problem with the URL
         */
        Page (WikiSessionManager sessionMgr, String title)
                        throws UnsupportedEncodingException {
                this.sessionMgr = sessionMgr;
                this.title = title;
                // the bare title must be derived before either field is encoded,
                // because encoding replaces the ':' that separates the namespace
                this.titleWithoutNamespace = URLEncoder.encode(titleWithoutNamespace(), "UTF-8");
                this.title = URLEncoder.encode(title, "UTF-8");
        }

        /** get the title of this page without namespace
         *  <p>
         *  Everything after the first ':' is returned, so a title that itself
         *  contains a colon ("Category:Foo: Bar") keeps its full name ("Foo: Bar").
         *  A title without any ':' is returned unchanged.
         * @return the title without namespace (URL-encoded once the constructor
         *         has finished)
         **/
        String titleWithoutNamespace () {
                // if we've already gotten it once, don't do it again (because of encoding)
                if (titleWithoutNamespace != null) {
                        return titleWithoutNamespace;
                }
                // we haven't called this yet -- means we're in the constructor
                int colon = title.indexOf(':');
                if (colon == -1) {
                        return title;
                }
                return title.substring(colon + 1);
        }

        /** get the contents of the page
         * @return the page contents
         * @throws IOException if something goes wrong (like the page doesn't exist)
         **/
        public String get () throws IOException {
                return urlRequest(strIndexURL + "?title=" + title + "&action=raw");
        }

        /** write the specified text to the page
         *  <p>
         *  The edit form is loaded first to obtain wpStarttime, wpEdittime and
         *  wpEditToken, then the new text is posted with those values. Writes are
         *  spaced at least MIN_WRITE_DELAY milliseconds apart.
         *  <p>
         *  Nothing is thrown: any failure is printed to standard output and the
         *  write is abandoned. If the server keeps answering 503 until the
         *  back-off exceeds MAX_SLEEP_TIME, the program exits.
         * @param text the text to put on the page
         * @param summary the edit summary
         * @param minor is this a minor edit
         **/
        void put (String text, String summary, boolean minor) {
                try {
                        // process the existing page text to find:
                        // wpStarttime, wpEdittime, and wpEditToken
                        String startTime = "", editTime = "", editToken = "";
                        BufferedReader reader = openEditForm();
                        try {
                                String line = reader.readLine();
                                while (line != null) {
                                        Matcher matcher = HIDDEN_INPUT.matcher(line);
                                        // lines that don't hold a hidden input in the
                                        // expected form are simply skipped
                                        if (matcher.find()) {
                                                String name = matcher.group(2);
                                                String value = matcher.group(1);
                                                if (name.equals("wpStarttime")) {
                                                        startTime = value;
                                                } else if (name.equals("wpEdittime")) {
                                                        editTime = value;
                                                } else if (name.equals("wpEditToken")) {
                                                        editToken = value;
                                                        break; // we don't need anything else
                                                }
                                        }
                                        line = reader.readLine();
                                }
                        } finally {
                                reader.close();
                        }
                        if (editToken.length() == 0) {
                                // without a token the submit cannot be accepted, so
                                // don't bother sending it
                                throw new IOException("No edit token found for " + titleWithoutNamespace);
                        }

                        // send the data
                        URL url = new URL(strIndexURL + "?title=" + title + "&action=submit");
                        URLConnection connection = url.openConnection();
                        connection.setDoInput(true);
                        connection.setDoOutput(true);
                        connection.setUseCaches(false);
                        connection.setRequestProperty("Content-Type",
                                        "application/x-www-form-urlencoded");
                        sessionMgr.addCookies(connection);

                        // keep the bot slow: wait until MIN_WRITE_DELAY has passed
                        // since the previous write
                        long writeDelay = System.currentTimeMillis() - lastWriteTime;
                        if (lastWriteTime != -1 && writeDelay < MIN_WRITE_DELAY) {
                                System.out.println("Waiting " + (MIN_WRITE_DELAY-writeDelay)/1000 + " seconds");
                                Thread.sleep(MIN_WRITE_DELAY-writeDelay);
                        }
                        System.out.println("Writing " + titleWithoutNamespace);

                        // write the data to the output stream
                        OutputStreamWriter output = new OutputStreamWriter(
                                        connection.getOutputStream(), "UTF-8");
                        try {
                                output.write("wpStarttime=" + startTime);
                                output.write("&wpEdittime=" + editTime);
                                output.write("&wpEditToken=" + URLEncoder.encode(editToken, "UTF-8"));
                                output.write("&wpTextbox1=" + URLEncoder.encode(text, "UTF-8"));
                                output.write("&wpSummary=" + URLEncoder.encode(summary, "UTF-8"));
                                if (minor) {
                                        output.write("&wpMinorEdit=1");
                                }
                                output.flush();
                        } finally {
                                output.close();
                        }
                        lastWriteTime = System.currentTimeMillis();

                        // asking for the response is what makes URLConnection complete
                        // the request, so this read must stay even though the reply
                        // is not inspected (it could be used to check for errors)
                        BufferedReader input = new BufferedReader(new InputStreamReader(
                                        connection.getInputStream(), "UTF-8"));
                        try {
                                input.readLine();
                        } finally {
                                input.close();
                        }
                } catch (InterruptedException e) {
                        // keep the interrupt visible to whoever is running us
                        Thread.currentThread().interrupt();
                        System.out.println(e);
                } catch (Exception e) {
                        System.out.println(e);
                }
        }

        /** open the edit form of this page for reading, backing off on max lag
         *  <p>
         *  Every try uses a fresh connection. When the server answers 503 the
         *  method sleeps for sleepTime, doubles it and tries again; once sleepTime
         *  exceeds MAX_SLEEP_TIME the program exits. Any other failure is passed
         *  on to the caller.
         * @return a reader on the HTML of the edit form; the caller closes it
         * @state sleepTime is reset on success and doubled after every 503
         * @throws IOException if the form can't be read for a reason other than max lag
         * @throws InterruptedException if the back-off sleep is interrupted
         */
        private BufferedReader openEditForm () throws IOException, InterruptedException {
                URL url = new URL(strIndexURL + "?title=" + title + "&action=edit&maxlag=5");
                while (true) {
                        // a connection that has failed keeps failing, so each try
                        // needs a new one
                        URLConnection connection = url.openConnection();
                        sessionMgr.addCookies(connection);
                        try {
                                BufferedReader reader = new BufferedReader(
                                                new InputStreamReader(connection.getInputStream(), "UTF-8"));
                                sleepTime = INITIAL_SLEEP_TIME;
                                return reader;
                        } catch (IOException e) {
                                if (!isMaxLag(connection, e)) {
                                        throw e;
                                }
                                System.out.println("Max lag -- sleeping for " + sleepTime/1000 + " seconds");
                                Thread.sleep(sleepTime);
                                sleepTime *= 2;
                                if (sleepTime > MAX_SLEEP_TIME) {
                                        System.out.println("Giving up");
                                        System.exit(-1);
                                }
                        }
                }
        }

        /** decide whether a failed request was refused with HTTP status 503
         * @param connection the connection that failed
         * @param e the exception that reading from it produced
         * @return true if the status was 503 (or, when no status is available,
         *         if the exception text mentions 503)
         */
        private static boolean isMaxLag (URLConnection connection, IOException e) {
                if (connection instanceof HttpURLConnection) {
                        try {
                                int status = ((HttpURLConnection) connection).getResponseCode();
                                if (status != -1) {
                                        return status == HttpURLConnection.HTTP_UNAVAILABLE;
                                }
                        } catch (IOException ignored) {
                                // no status line was received; fall back to the text check
                        }
                }
                return e.toString().contains("503");
        }

        /** get the transclusions for this page
         * @return the list of all articles which transclude this page
         * @state articles is used to build the list, but it is initialized
         *        and then returned
         * @throws MalformedURLException
         * @throws IOException
         */
        TreeSet<String> getTransclusions ()
                        throws MalformedURLException, IOException {
                // the parameters to use in the URL
                final String urlParams = "action=query&list=embeddedin&eilimit=5000&format=xml";

                return collectTitles(strAPIURL + "?titles=" + title + "&" + urlParams,
                                strAPIURL + "?" + urlParams, "ei");
        }

        /** get the articles in the category
         * @return the list of all articles in this category
         * @state articles is used to build the list, but it is initialized
         *        and then returned
         * @throws MalformedURLException
         * @throws IOException
         */
        TreeSet<String> getMembers ()
                        throws MalformedURLException, IOException {
                // the parameters to use in the URL
                final String urlParams = "?cmcategory=" + titleWithoutNamespace +
                                "&action=query&list=categorymembers&cmlimit=5000&format=xml";

                return collectTitles(strAPIURL + urlParams, strAPIURL + urlParams, "cm");
        }

        /** run a list query and follow its continuation until the list is complete
         * @param firstURL the URL of the first request
         * @param continueURL the URL of the follow-up requests, without the
         *        "continue" parameter (it is appended here)
         * @param id the prefix of the list ("ei" for embedded in, "cm" for
         *        category members)
         * @return all the titles found
         * @state articles is initialized, filled and then returned
         * @throws MalformedURLException
         * @throws IOException
         */
        private TreeSet<String> collectTitles (String firstURL, String continueURL, String id)
                        throws MalformedURLException, IOException {
                // the article list
                articles = new TreeSet<String>();
                // the continuation marker, e.g. eicontinue="..."
                final Pattern continuePattern = Pattern.compile(id + "continue=\"(.*?)\"");

                String result = urlRequest(firstURL);
                while (true) {
                        processResult(result, id);
                        Matcher matcher = continuePattern.matcher(result);
                        if (!matcher.find()) {
                                break;
                        }
                        // the value comes out of an XML attribute, so it has to be
                        // unescaped before it is put into a URL
                        String continueText = StringEscapeUtils.unescapeXml(matcher.group(1));
                        result = urlRequest(continueURL + "&" + id + "continue=" +
                                        URLEncoder.encode(continueText, "UTF-8"));
                }

                return articles;
        }

        /** process the result -- this is a list of articles in XML format
         * @param result the raw text
         * @param id the id to use in the pattern (e.g., "ei" for embedded in, "cm"
         *        for category members, etc.)
         * @state articles new article titles are added to articles
         */
        private void processResult (String result, String id) {
                Pattern pattern =
                        Pattern.compile("<" + id + " pageid=\"(.*?)\" ns=\"(.*?)\" title=\"(.*?)\" />");
                Matcher matcher = pattern.matcher(result);

                while (matcher.find()) {
                        // group 3 is the title attribute, still XML-escaped
                        articles.add(StringEscapeUtils.unescapeXml(matcher.group(3)));
                }
        }

        /** open a URL and read the page
         * @param http the full URL "http://whatever"
         * @return the text of the page, every line ended with "\n"
         * @throws MalformedURLException
         * @throws IOException
         */
        private String urlRequest (String http) throws MalformedURLException,
                        IOException {
                // get the URL & connection
                URL url = new URL(http);
                URLConnection connection = url.openConnection();
                sessionMgr.addCookies(connection);

                // convert the connection stream into a String
                StringBuilder sbResult = new StringBuilder();
                BufferedReader reader = new BufferedReader(new InputStreamReader(
                                connection.getInputStream(), "UTF-8"));
                try {
                        String line = reader.readLine();
                        while (line != null) {
                                sbResult.append(line).append("\n");
                                line = reader.readLine();
                        }
                } finally {
                        // close the stream even when reading fails half way
                        reader.close();
                }

                return sbResult.toString();
        }
}