User:Nicolas1981/ASE Source code
From Wikipedia, the free encyclopedia
Generating ASE lists is easy (but takes a long time) with these scripts. If you have any questions, just ask on the project's talk page; I am willing to help :-)
Right now I use a web-crawling method, which is bad. I hope to find a way to do the same with database dumps, but so far I have not found which dump contains the information shown on history pages. Anyone familiar with the dumps could be of great help :-)
[edit] asespotter.groovy
Here is the ASE spotter, which I use to generate a list of ASE articles. You will have to replace occurrences of ".programs" with the directory where you installed this script.
/**
* AseSpotter.
*
* Definitions:
* an ase: an article that has been written by a single editor (See WP:ASE).
* an asena: a list of links to ases.
* to spot: to find an ase and add it to an asena.
*
* Strategy:
* For each article linked in the "Allpages" pages,
* if its history contains only one editor,
* spot it.
*
* TODO:
* Port to other WPs
* Optimize for CPU and network usage.
*/
/**
 * Global parser (XmlParser), shared by every function of this script.
 * It is backed by the NekoHTML SAX parser so that web pages which are
 * not valid XML can still be parsed. Namespace processing is turned off.
 */
def lenientSaxParser = new org.cyberneko.html.parsers.SAXParser()
lenientSaxParser.setFeature('http://xml.org/sax/features/namespaces', false)
parser = new XmlParser(lenientSaxParser)
/**
 * Output.
 * The debug log (when enabled) and the asena file are both emptied at startup.
 */
debugToFile = true // Set to false to send debug messages to standard output.
if (debugToFile) {
    debugFile = new File('.programs/asespotter/debug.log')
    debugFile.text = ''
}
asenaFile = new File('.programs/asespotter/asena.txt')
asenaFile.text = ''
// First letter of the last spotted article; used to decide when a new header is needed.
currentFirstLetter = null
/**
 * Constants for the English language.
 */
//INITIAL_INDEX_URL = 'http://en.wikipedia.org/w/index.php?title=Special:Allpages&from=%21' // First article is believed to be %21.
// NOTE(review): the active value starts the crawl in the middle of the index,
// presumably to resume an interrupted run - confirm before a full crawl.
INITIAL_INDEX_URL = 'http://en.wikipedia.org/w/index.php?title=Special:Allpages&from=Astrocaryum_ferrugineum'
// Label of the anchor that leads to the following index page.
NEXT_INDEX_PAGE_ANCHOR_LABEL = 'Next page'
// Prepended to the relative hrefs found in the pages.
URL_BASE = 'http://en.wikipedia.org'
// A history URL is HISTORY_URL_PREFIX + encoded article name + HISTORY_URL_SUFFIX.
HISTORY_URL_PREFIX = 'http://en.wikipedia.org/w/index.php?title='
// Ask for up to 500 revisions per history page. Without an explicit limit only
// the default first page of revisions is returned, so an article with a long
// history whose most recent edits all come from one editor would be wrongly
// reported as ase.
HISTORY_URL_SUFFIX = '&limit=500&action=history'
// Path prefix of the "contributions" link, which is followed by the editor name.
CONTRIBS_PATH = '/wiki/Special:Contributions/'
/**
* Unit tests.
*/
// Local tests.
//assert ! isHistoryAse('unittests/SomeNonAsePage.html');
//assert isHistoryAse('unittests/SomeAsePage.html');
//getArticles('unittests/SomeSmallIndexPage.html')
//getArticles('unittests/SomeBigIndexPage.html')
//spotAseArticles('unittests/SomeSmallIndexPage.html')
//spotAseArticles('unittests/SomeBigIndexPage.html')
//spotAseHistory('unittests/SomeHistoryPage.html')
// Remote tests.
//assert isHistoryAse('http://en.wikipedia.org/w/index.php?title=Burlap_%28disambiguation%29&limit=500&action=history');
//assert ! isHistoryAse('http://en.wikipedia.org/w/index.php?title=Spring_Framework&limit=500&action=history');
//assert ! isArticleAse('http://en.wikipedia.org/wiki/Aaron_Dunn')
//spotAseArticles('http://en.wikipedia.org/w/index.php?title=Special:Allpages&from=Aaron_Dunn')
/**
 * Main program.
 * Starts the crawl at INITIAL_INDEX_URL and keeps going, index page after
 * index page, until an index page has no "next page" link.
 */
spotAllAseArticles()
/**
 * Walk through every index page, starting at INITIAL_INDEX_URL, and spot the
 * ase articles listed on each of them.
 *
 * After the articles of an index page have been processed, the page is parsed
 * here to find the link to the following index page. The walk ends when no
 * such link exists.
 */
def spotAllAseArticles() {
    def indexUrl = INITIAL_INDEX_URL
    while (true) {
        debug(indexUrl)

        // Process every article listed on the current index page.
        spotAseArticles(indexUrl)

        // The first table is the navigation part; it holds the link to the next page.
        def indexPage = parser.parse(indexUrl)
        def navigationTable = indexPage.depthFirst().TABLE.findAll{ it }[0]
        def nextHref = null
        for (anchor in navigationTable.depthFirst().A.findAll{ it }) {
            if (anchor.value()[0].contains(NEXT_INDEX_PAGE_ANCHOR_LABEL)) {
                nextHref = anchor.attribute('href')
            }
        }

        if (nextHref == null) {
            debug('No next page, it was the last index page.')
            return
        }

        // The href is relative to the site root, so make it absolute.
        indexUrl = URL_BASE + nextHref
    }
}
/**
 * Spot all ase articles seen on an index page.
 * Each article is checked in turn, with a one second pause after each check.
 * @param indexUrl URL of the index page. (String)
 */
def spotAseArticles(indexUrl) {
    // Declared with "def" so the list stays local instead of becoming a
    // script-wide (binding) variable.
    def articles = getArticles(indexUrl)
    debug(articles)
    articles.each { articleUrl ->
        debug(articleUrl)
        // Spot this article if it is ase.
        spotAseArticle(articleUrl)
        // Wait a bit to reduce load on the local host and on the server.
        Thread.sleep(1000)
    }
}
/**
 * Get a list of articles from an index page.
 * Only table cells whose first child is an anchor are taken into account.
 * @param indexUrl URL to the index page. (String)
 * @return list of URLs, each URL points to an article; empty when the page
 *         has no article table. (String[])
 */
def getArticles(indexUrl) {
    def page = parser.parse(indexUrl)
    def tableElements = page.depthFirst().TABLE.findAll{ it }
    def articles = []
    // The first table is the navigation part, the second one lists the articles.
    if (tableElements.size() < 2) {
        return articles
    }
    def tableElement = tableElements[1]
    def tdElements = tableElement.depthFirst().TD.findAll{ it }
    tdElements.each { tdElement ->
        def tdChildren = tdElement.children()
        // An empty cell has no first child, and a cell may start with plain
        // text; neither of them is an article link.
        def tdChild = tdChildren ? tdChildren[0] : null
        if (tdChild instanceof Node && tdChild.name() == 'A') {
            articles << URL_BASE + tdChild.attribute('href')
        }
    }
    return articles
}
/**
 * Spot an article as ase or not.
 * The encoded article name is extracted from the article URL and used to
 * build the URL of the history page, which is then examined.
 * @param articleUrl URL for the article; expected to be URL_BASE followed by
 *        '/wiki/' and the encoded article name. (String)
 */
def spotAseArticle(articleUrl) {
    // Length of the part that precedes the article name. Computed from
    // URL_BASE (29 for 'http://en.wikipedia.org') so that it stays correct
    // when URL_BASE is changed to another wiki.
    def articlePrefixLength = (URL_BASE + '/wiki/').length()
    def encodedArticleName = articleUrl.substring(articlePrefixLength)
    debug(encodedArticleName)
    def historyUrl = HISTORY_URL_PREFIX + encodedArticleName + HISTORY_URL_SUFFIX
    spotAseHistory(historyUrl)
}
/**
 * Spot an article as ase or not, by looking at its history page.
 * Edits made by bots (see isBot) are ignored. If all remaining edits were
 * made by the same editor, the article is passed to spotted().
 * @param historyUrl URL for the history page of the article. (String)
 * @return false when two different editors are found or when the page has no
 *         history list; otherwise the result of spotted().
 */
def spotAseHistory(historyUrl) {
    debug('historyUrl=' + historyUrl)
    def page = parser.parse(historyUrl)
    def ulElement = page.depthFirst().UL.findAll{ it.'@id' == 'pagehistory' }[0]
    if (ulElement == null) {
        // Not a regular history page; nothing can be concluded from it.
        debug('No history list found, article skipped.')
        return false
    }
    def liElements = ulElement.depthFirst().LI.findAll{ it }
    // size() must be called as a method: on a list, ".size" is a property
    // lookup and does not give the number of elements.
    debug(liElements.size() + ' edits')
    // Loop through the edits.
    def previousEditor = null // Stays null until a non-bot editor has been seen.
    for ( liElement in liElements ) {
        def spanElements = liElement.depthFirst().SPAN.findAll{ it.'@class' == 'history-user' }
        def spanElement = spanElements[0] // There should be exactly one.
        // "?." skips anchors that have no href attribute.
        def aElements = spanElement.depthFirst().A.findAll{ it.'@href'?.contains(CONTRIBS_PATH) }
        def aElement = aElements[0] // There should be exactly one.
        def contribs = aElement.'@href'
        def editor = contribs.substring(CONTRIBS_PATH.length()) // Get rid of the path before the editor name
        if ( isBot(editor) ) {
            debug('editor: ' + editor + ' ignored.')
            continue
        }
        debug('editor: ' + editor)
        if ( previousEditor != null && editor != previousEditor ) {
            // Two different editors.
            return false
        }
        previousEditor = editor
    }
    // No different editors were found, so it is ase.
    // Find article name.
    def h1Elements = page.depthFirst().H1.findAll{ it.'@class' == 'firstHeading' }
    def articleName = h1Elements[0].value()[0]
    // This article has been spotted as ase.
    spotted(articleName)
}
/**
 * Record an article that has been spotted as ase: append a link to it to the
 * asena file, preceded by a section header whenever its first letter differs
 * from the first letter of the previously spotted article.
 * @param articleName Name of the ase article. (String)
 */
def spotted(articleName) {
    debug('Spotted: ' + articleName)

    def initial = articleName.substring(0, 1)
    def startsNewSection = (initial != currentFirstLetter)
    if (startsNewSection) {
        def header = '\n== ' + initial + ' ==\n'
        asenaFile.append(header)
        currentFirstLetter = initial
    }

    def wikiLink = '[[' + articleName + ']]\n'
    asenaFile.append(wikiLink)
}
/**
 * Tells whether an editor is considered a bot.
 * An editor counts as a bot when its name contains "bot", whatever the case.
 * @param editor name of the editor. (String)
 * @return bot or not. (boolean)
 */
def isBot(editor) {
    def lowerCasedName = editor.toLowerCase()
    return lowerCasedName.indexOf('bot') >= 0
}
/**
 * Print a debugging message, either to the debug file or to standard output
 * depending on debugToFile.
 * @param message message to be written as debug. (any object, typically a String)
 */
def debug(message) {
    // Turn the message into a String first: callers also pass lists, and
    // "list + '\n'" would add an element to the list instead of ending the line.
    def line = '' + message
    if (debugToFile) {
        debugFile.append(line + '\n')
    } else {
        println line
    }
}
[edit] asespotter.sh
If you run a UNIX system such as Linux, you may find this script useful to run the ASE spotter in the background, but it is optional.
# Point Groovy at the Java installation.
export JAVA_HOME=~/.programs/java
# Run the spotter; standard output and standard error both go to stdout.log.
~/.programs/groovy/bin/groovy ~/.programs/asespotter/asespotter.groovy > ~/.programs/asespotter/stdout.log 2>&1
[edit] chunks.groovy
This script splits a big list of ASEs into chunks of CHUNK_SIZE entries each. Replace "a.txt" with the name of your list file.
// Number of entries printed under each chunk header.
CHUNK_SIZE = 30
asenaFile = new File('a.txt')
ases = asenaFile.readLines()

// Every CHUNK_SIZE-th line opens a new chunk: two blank lines, a numbered
// header, then the entry itself. All other lines are appended to the current
// chunk, separated by commas.
ases.eachWithIndex { ase, position ->
    if (position % CHUNK_SIZE != 0) {
        print ", " + ase
        return
    }
    println ""
    println ""
    println "== CHUNK " + position.intdiv(CHUNK_SIZE) + " =="
    print ase
}

