User:Jediarchives11/Wikification
From Wikipedia, the free encyclopedia
The code below is the Automatic Wikification Extension. When an article is saved, the extension searches its text for words or phrases that match the title of an existing article and links each one to that article.
<?php
/*
Wikification Extension
Gregory Szorc <gregory.szorc@case.edu>
Requested by and edited by Nicholas Anderson <jediarchives11@gmail.com>
This extension is a hook for MediaWiki that examines an article before it is
committed to the database and looks for possible wiki topics in the article
that are not marked as links and converts them.
Changelog
2005-07-25: Work started
2006-01-06: Fixed Bug: When adding links, spaces would be removed
2006-01-07: $excludelist array added
To Do
*Fix bug: Commas and periods aren't removed when finding things to link
*Fix bug: Last word in an article never links
*/
// Upper bound on the number of words in a candidate phrase
// (for example "History of Greece").
// Raising this value makes the extension slower.
$wikifiPhraseWordLimit = 4;

// Smallest number of characters a single-word term may have.
// Phrases ignore this setting.
$wikifiMinWordLength = 3;

// Namespaces that are searched for matching pages.
// The performance impact should be insignificant.
$wikifiSearchNamespaces = array( NS_MAIN );

// Set to true to search only for words that are capitalized.
$wikifiOnlyCheckProper = false;

// Queue the setup function on MediaWiki's extension function list.
$wgExtensionFunctions[] = 'Wikification_Wikify';
// Setup function: appends Wikification_Save to the list of handlers
// registered under the 'ArticleSave' key of the global $wgHooks array.
function Wikification_Wikify() {
    global $wgHooks;

    $hookName = 'ArticleSave';
    $handler = 'Wikification_Save';
    $wgHooks[$hookName][] = $handler;
}
// ArticleSave hook handler: looks for words and phrases in the text being
// saved that match the title of an existing page and turns them into
// [[Title|original text]] links.
//
// $article, $user - supplied by the hook; not used here.
// $text           - the wikitext being saved. It is taken BY REFERENCE so
//                   that the links added here actually reach the caller.
//
// Always returns true so that the save is allowed to continue.
//
// NOTE: titles are linked without a namespace prefix, so links are only
// correct for pages in the main namespace (the default configuration).
function Wikification_Save($article, $user, &$text) {
    global $wikifiPhraseWordLimit, $wikifiMinWordLength, $wikifiSearchNamespaces;
    global $wikifiOnlyCheckProper;

    if (!is_string($text) || $text === '') {
        return true;
    }

    $candidates = Wikification_CandidateTitles(
        $text,
        $wikifiPhraseWordLimit,
        $wikifiMinWordLength,
        $wikifiOnlyCheckProper
    );

    // nothing to look up; an empty IN () list would be invalid SQL
    if (empty($candidates) || empty($wikifiSearchNamespaces)) {
        return true;
    }

    //grab the database reference
    $db = wfGetDB(DB_MASTER);

    // let the database layer add the table prefix and quote every value
    // instead of assembling the SQL string by hand
    $result = $db->select(
        'page',
        array('page_namespace', 'page_title'),
        array(
            'page_namespace' => $wikifiSearchNamespaces,
            'page_title' => $candidates,
        ),
        'Wikification_Save'
    );

    $titles = array();
    while ($row = $db->fetchRow($result)) {
        $titles[] = $row['page_title'];
    }
    $titles = array_values(array_unique($titles));

    // link the longest titles first so that "History of Greece" wins over
    // a plain "Greece" occurring inside it
    $lengths = array_map('strlen', $titles);
    array_multisort($lengths, SORT_DESC, $titles);

    foreach ($titles as $title) {
        $text = Wikification_LinkTitle($text, $title);
    }

    return true;
}

// Builds the list of page titles (database form: first letter upper case,
// underscores instead of spaces) that the words and phrases of $text could
// refer to.
//
// $text            - wikitext to analyse
// $phraseWordLimit - maximum number of words in a phrase
// $minWordLength   - minimum length of a single-word term
// $onlyProper      - when true, single words must start with an upper-case letter
//
// Returns a de-duplicated list of strings.
function Wikification_CandidateTitles($text, $phraseWordLimit, $minWordLength, $onlyProper) {
    //first we need to strip out things that should never be links
    //strip out existing wiki links [[*]] [*]
    $s = preg_replace("/\\[\\[.*?\\]\\]/", '', $text);
    $s = preg_replace("/\\[.*?\\]/", '', $s);
    //strip out section headers
    $s = preg_replace("/={1,5}.*?={1,5}/", '', $s);
    //strip out other junk
    $s = preg_replace("/[.,]/", "", $s);

    // drop excluded terms, but only as whole words: a plain substring
    // replacement would turn "testing" into "ing". Matching ignores case
    // because "About" at the start of a sentence maps to the same title.
    // Alternation order is irrelevant here since no term is a prefix of another.
    $excludelist = array("about", "test", "spam blacklist test");
    $quoted = array();
    foreach ($excludelist as $term) {
        $quoted[] = preg_quote($term, '/');
    }
    $s = preg_replace('/\\b(?:' . implode('|', $quoted) . ')\\b/i', '', $s);

    //separate the text into words; splitting on any whitespace also
    //separates words that are only divided by a line break
    $words = preg_split('/\\s+/', (string)$s, -1, PREG_SPLIT_NO_EMPTY);
    if ($words === false) {
        $words = array();
    }
    $count = count($words);

    $search = array();
    foreach ($words as $k => $v) {
        //add an individual word if it is long enough
        if (strlen($v) >= $minWordLength && (!$onlyProper || ctype_upper($v[0]))) {
            $search[] = $v;
        }
        //add every phrase of 2..$phraseWordLimit words starting at this
        //word, including the ones that end on the very last word
        for ($j = 1; $j < $phraseWordLimit && ($k + $j) < $count; $j++) {
            $search[] = implode(' ', array_slice($words, $k, $j + 1));
        }
    }

    //convert the terms to titles. Both capitalisations are tried: ucfirst
    //matches "History_of_Greece", ucwords matches "United_States" when the
    //text says "united states".
    $titles = array();
    foreach ($search as $term) {
        $titles[] = str_replace(' ', '_', ucfirst($term));
        $titles[] = str_replace(' ', '_', ucwords($term));
    }

    return array_values(array_unique($titles));
}

// Wraps every free-standing, case-insensitive occurrence of $title in $text
// in a [[Title|matched text]] link and returns the new text.
//
// Existing links ([[..]] and [..]) and section headers are left untouched,
// and the characters around a match are preserved.
function Wikification_LinkTitle($text, $title) {
    $find = preg_quote(str_replace('_', ' ', $title), '/');

    // the lookarounds reject matches inside a longer word or directly
    // inside link markup, without consuming the neighbouring characters
    $pattern = '/(?<![\\w\\[|])' . $find . '(?![\\w\\]|])/i';

    // backslashes and dollar signs are special in a replacement string
    $replacement = '[[' . addcslashes($title, '\\$') . '|$0]]';

    // with DELIM_CAPTURE the even entries are free text and the odd
    // entries are the protected links / headers
    $parts = preg_split(
        '/(\\[\\[.*?\\]\\]|\\[.*?\\]|={1,5}.*?={1,5})/',
        $text,
        -1,
        PREG_SPLIT_DELIM_CAPTURE
    );
    if ($parts === false) {
        return $text;
    }

    foreach ($parts as $i => $part) {
        if ($i % 2 === 0 && $part !== '') {
            $linked = preg_replace($pattern, $replacement, $part);
            // keep the original piece if the regex engine reported an error
            if ($linked !== null) {
                $parts[$i] = $linked;
            }
        }
    }

    return implode('', $parts);
}
?>

