The goal of Wiki Words is to provide a series of statistics about MediaWiki articles.

I wanted to find out how many words were in each wiki article. After a day of searching, it became sadly apparent that, even though MediaWiki has been around since 2002, there is no readily accessible way to obtain a wiki's word counts without substantially modifying its code.

The first statistic is word counts. See if you can write code that counts the characters and lines of each article as well. Post your solution here.

Demo URL: http://repo.phpexperts.pro/wiki_words/wiki_words.php

== Version 1 ==

<?php
/*
Wiki Words
Copyright 2010 Theodore R. Smith theodore@phpexperts.pro

The following code is licensed under a modified BSD License. All of the terms and conditions of the BSD License apply with one exception:

  • Everyone who has not been a registered student of the “PHPExperts From Beginner To Pro” course (http://www.phpexperts.pro/) is forbidden from modifying this code or using it in another project, either as a derivative work or stand-alone.

    BSD License: http://www.opensource.org/licenses/bsd-license.php */

// Database access layer; provides the MyDB class used by WikiArticleStats::fetchArticles().
require './lib/MyDB.inc.php';

/**
 * Exception whose message can be built printf-style and is sanitized
 * before being handed to the base Exception class.
 */
class PrettyException extends Exception
{
    /**
     * Overloads Exception::__construct so format parameters can be passed in.
     *
     * @param string     $message A plain message, or a vsprintf() format string
     *                            when $params is supplied.
     * @param array|null $params  Values substituted into $message via vsprintf().
     *                            Ignored when null or empty.
     */
    public function __construct($message, array $params = null)
    {
        // 1. Build the pretty exception message.
        if (!empty($params))
        {
            $message = vsprintf($message, $params);
        }

        // 2. Don't trust it. Remove XSS and spam.
        // NOTE(review): FILTER_SANITIZE_STRIPPED is deprecated as of PHP 8.1.
        // It is kept here so the stored message stays byte-identical; plan a
        // migration (e.g. strip_tags() + htmlspecialchars()) before upgrading.
        $message = filter_var($message, FILTER_SANITIZE_STRIPPED, FILTER_FLAG_STRIP_LOW | FILTER_FLAG_ENCODE_HIGH);

        // The exception code is always 0; only the message carries information.
        parent::__construct($message, 0);
    }
}

/**
 * Exception type for web page retrieval problems.
 *
 * The constants are message templates; those containing %s expect the
 * offending URL to be supplied as a format parameter to the constructor.
 */
class WebPageException extends PrettyException
{
    const MISSING_CURL = "The cURL PHP extension must be enabled in order to use WebPageStats.";
    const MISSING_URL = "A URL has not been supplied or set.";
    const INVALID_URL = "The URL '%s' is not valid.";
    const BLANK_URL = "The URL '%s' contained no data. It is probably invalid.";
    const CONNECTION_TIMED_OUT = "The connection timed out.";
    const FILE_NOT_FOUND = "404: '%s' could not be found.";
    const PERMISSION_DENIED = "Permission denied.";
}

/**
 * Web page datatype that holds all the various parts
 * and info about a web page.
 */
class WebPage
{
    /** @var string The URL this page is fetched from. */
    public $url;

    /** @var array Raw response header lines captured during the last fetchURL(). */
    public $headers = array();

    /** @var string|null The response body as returned by cURL; null until fetched. */
    public $body;

    /** @var string|null The body with tags removed by strip_tags(); null until fetched. */
    public $text;

    /**
     * @param string $url The URL of the page.
     * @throws WebPageException If the cURL extension is missing or the URL is invalid.
     */
    public function __construct($url)
    {
        // 1. Bail out now if the CURL extension is not loaded.
        //    Thrown as a WebPageException (still an Exception) so that every
        //    failure of this class can be caught by one type.
        if (!extension_loaded('curl'))
        {
            throw new WebPageException(WebPageException::MISSING_CURL);
        }

        // 2. Make sure the URL is valid.
        self::ensureValidURL($url);

        // 3. Store the URL.
        $this->url = $url;
    }

    /**
     * Determine if a URL is valid.
     *
     * @param string $url
     * @return bool True if the URL is a string and is a valid URL. False, otherwise.
     */
    public static function isURLValid($url)
    {
        return is_string($url)
            && filter_var($url, FILTER_VALIDATE_URL) !== false;
    }

    /**
     * Guard variant of isURLValid().
     *
     * @param string $url
     * @throws WebPageException With the INVALID_URL message when the URL is not valid.
     */
    public static function ensureValidURL($url)
    {
        if (!self::isURLValid($url))
        {
            throw new WebPageException(WebPageException::INVALID_URL, array($url));
        }
    }

    // captureHeader() donated by bendavis78@gmail.com,
    // via http://us.php.net/curl_setopt_array
    /**
     * cURL header callback: stores one raw header line.
     *
     * @param resource $ch     The cURL handle (unused).
     * @param string   $header One header line.
     * @return int The length of $header, returned to cURL by the callback.
     */
    private function captureHeader($ch, $header)
    {
        $this->headers[] = $header;
        return strlen($header);
    }

    /**
     * Downloads the page and fills in $headers, $body and $text.
     *
     * @throws WebPageException CONNECTION_TIMED_OUT when cURL reports a timeout,
     *                          BLANK_URL when nothing was returned.
     */
    public function fetchURL()
    {
        // Start from a clean slate so a second fetch does not append its
        // headers to those of the previous one.
        $this->headers = array();

        $ch = curl_init();
        curl_setopt_array($ch, array(
            CURLOPT_URL            => $this->url,
            CURLOPT_RETURNTRANSFER => 1,
            CURLOPT_HEADERFUNCTION => array($this, 'captureHeader'),
            CURLOPT_TIMEOUT        => 5,
        ));

        $data = curl_exec($ch);
        // Read the error number before the handle is closed.
        $errno = curl_errno($ch);
        curl_close($ch);

        if ($errno === CURLE_OPERATION_TIMEOUTED)
        {
            throw new WebPageException(WebPageException::CONNECTION_TIMED_OUT);
        }

        if ($data === false || $data === null || $data === '')
        {
            throw new WebPageException(WebPageException::BLANK_URL, array($this->url));
        }

        // TODO: Need to handle HTTP error messages, such as 404 and 502.
        $this->body = $data;
        $this->text = strip_tags($data);
    }
}

/**
 * Calculates statistics about a single web page.
 */
class WebPageStats
{
    /**
     * @var WebPage
     */
    private $webpage;

    /**
     * Creates the WebPage for $url and downloads it immediately.
     *
     * @param string $url
     */
    public function __construct($url)
    {
        $this->changeURL($url);
    }

    /**
     * Points this object at a different URL and downloads it.
     *
     * The new page is fetched right away, exactly as the constructor does;
     * otherwise calculateWordCount() would run against a page whose text
     * has never been loaded.
     *
     * @param string $url
     */
    public function changeURL($url)
    {
        // 1. Build and fetch the replacement first, so the old page stays in
        //    place if anything throws.
        $webpage = new WebPage($url);
        $webpage->fetchURL();

        // 2. Swap it in.
        $this->webpage = $webpage;
    }

    /**
     * @return int The number of words in the page's tag-stripped text,
     *             as counted by str_word_count().
     */
    public function calculateWordCount()
    {
        return str_word_count($this->webpage->text);
    }
}

/**
 * Exception type for WikiArticleStats problems.
 *
 * The constants are message templates; INVALID_ARTICLE_NAME expects the
 * rejected article name as its single format parameter.
 */
class WikiStatsException extends PrettyException
{
    const INVALID_ARTICLE_NAME = '"%s" is an invalid Wiki article name.';
    const INVALID_LIMIT = 'Limits and offsets must be integers.';
    const OFFSET_WITHOUT_LIMIT = "A limit must be set in order to use an offset.";
    const MISSING_WIKIDB_CREDS = "The wiki database credentials are missing.";
}

/**
 * Gathers word-count statistics for wiki articles: article titles come from
 * the wiki's page table, and each article is downloaded over HTTP to be counted.
 */
class WikiArticleStats
{
    /** Option key: name of the article fetched to obtain the baseline word count. */
    const OPT_BLANK_ARTICLE = 1;

    // FIXME: I simply must get these from mediawiki somehow! -Ted 2010-08-11
    /** @var array Title prefixes, keyed by page_namespace number. */
    private static $namespacePrefixes = array(
        2   => 'User:',
        100 => 'PHPBook:',
    );

    private $baseURL;
    private $options;

    /**
     * @param string     $baseURL URL that article names are appended to.
     *                            A trailing / is added when missing.
     * @param array|null $options Options keyed by the OPT_* constants.
     * @throws WebPageException If the base URL is not valid.
     */
    public function __construct($baseURL, array $options = null)
    {
        // 1. Make sure the last character is /.
        if (substr($baseURL, -1) != '/')
        {
            $baseURL .= '/';
        }

        // 2. Make sure the base URL is valid.
        WebPage::ensureValidURL($baseURL);

        // 3. Store the URL.
        $this->baseURL = $baseURL;
        $this->options = $options;
    }

    /**
     * Turns a human-readable article name into its wiki form:
     * spaces become underscores and the first letter is capitalized.
     *
     * @param string $articleName
     * @return string
     * @throws WikiStatsException If the name is not a non-empty string.
     */
    private function convertToWikiName($articleName)
    {
        // 1. Sanity checks. An empty string is rejected too: it has no first
        //    character to capitalize and would only address the base URL.
        if (!is_string($articleName) || $articleName === '')
        {
            // Arrays and objects cannot be formatted with %s; show their type instead.
            $shown = is_scalar($articleName) ? $articleName : gettype($articleName);
            throw new WikiStatsException(WikiStatsException::INVALID_ARTICLE_NAME, array($shown));
        }

        // 2. Convert spaces to _ and capitalize the first letter.
        return ucfirst(str_replace(' ', '_', $articleName));
    }

    /**
     * Downloads one article and counts its words.
     *
     * @param string $articleName Article name; spaces are allowed.
     * @return int The word count of the whole downloaded page.
     * @throws WikiStatsException If the article name is invalid.
     * @throws WebPageException   If the page cannot be downloaded.
     */
    public function getArticleWordCount($articleName)
    {
        // 1. Convert to wiki name.
        $articleName = $this->convertToWikiName($articleName);

        // 2. Percent-encode the title so characters such as ?, # or non-ASCII
        //    letters neither change the meaning of the URL nor fail URL
        //    validation. "/" (subpages) and ":" (namespaces) stay readable.
        $urlName = str_replace(array('%2F', '%3A'), array('/', ':'), rawurlencode($articleName));

        // 3. Get the word count.
        $stats = new WebPageStats($this->baseURL . $urlName);

        return $stats->calculateWordCount();
    }

    /**
     * Reads article rows from the phppro_page table, skipping redirects and
     * pages whose title ends in ".css".
     *
     * @param array    $fields Column names to select. They are inserted into
     *                         the SQL verbatim, so they must come from trusted code.
     * @param int|null $limit  Maximum number of rows, or null for no limit.
     * @param int|null $offset Number of rows to skip; requires $limit.
     * @return array Row objects. When a row has both page_namespace and page_title,
     *               page_title is prefixed for the known namespaces.
     * @throws WikiStatsException If $limit/$offset are not integers, or an offset
     *                            is given without a limit.
     */
    public function fetchArticles(array $fields, $limit = null, $offset = null)
    {
        // 1. Sanity checks.
        if ((!is_null($limit) && !is_integer($limit)) || (!is_null($offset) && !is_integer($offset)))
        {
            throw new WikiStatsException(WikiStatsException::INVALID_LIMIT);
        }

        if (is_numeric($offset) && is_null($limit))
        {
            throw new WikiStatsException(WikiStatsException::OFFSET_WITHOUT_LIMIT);
        }

        // 2. Convert fields to a comma-delimited string.
        $fields = join(', ', $fields);

        // 3. Find the wiki articles.
        $DB = MyDB::loadDB();

        // 3a. Select the pages that are neither redirects nor end with .css.
        //     The dot is written as [.] so it only matches a literal dot.
        $sql = "SELECT $fields FROM phppro_page WHERE page_is_redirect != 1 AND page_title NOT REGEXP('[.]css$')";
        if (!is_null($limit))
        {
            // $limit and $offset were verified to be integers above.
            $sql .= is_null($offset) ? " LIMIT $limit" : " LIMIT $offset, $limit";
        }

        $DB->query($sql);

        // 4. Grab the articles.
        $articles = array();
        while (($page = $DB->fetchObject()))
        {
            // Only prefix when both columns were actually selected.
            if (isset($page->page_namespace, $page->page_title)
                && isset(self::$namespacePrefixes[$page->page_namespace]))
            {
                $page->page_title = self::$namespacePrefixes[$page->page_namespace] . $page->page_title;
            }

            $articles[] = $page;
        }

        return $articles;
    }

    /**
     * Calculates the word count of every article, net of a baseline article.
     *
     * The baseline article's count is subtracted from each article's count.
     * Its name comes from OPT_BLANK_ARTICLE and defaults to 'Blank'.
     *
     * @param int|null $limit  Maximum number of articles, or null for all.
     * @param int|null $offset Number of articles to skip; requires $limit.
     * @return array Word counts keyed by article title.
     */
    public function calculateArticlesWordCounts($limit = null, $offset = null)
    {
        // 1. Fetch the article names.
        $articles = $this->fetchArticles(array('page_namespace', 'page_title'), $limit, $offset);

        // 2. Fetch a blank article to get a baseline word count.
        $baselineArticle = isset($this->options[self::OPT_BLANK_ARTICLE])
            ? $this->options[self::OPT_BLANK_ARTICLE]
            : 'Blank';
        $baseline = $this->getArticleWordCount($baselineArticle);

        // 3. Get the word counts for all the other articles.
        $wordcounts = array();
        foreach ($articles as $article)
        {
            $wordcounts[$article->page_title] = $this->getArticleWordCount($article->page_title) - $baseline;
        }

        return $wordcounts;
    }
}

// Demo: print the baseline-adjusted word count of every article on the wiki.
$stats = new WikiArticleStats(
    'http://wiki.phpexperts.pro/',
    array(WikiArticleStats::OPT_BLANK_ARTICLE => 'PFBP_Chapter_1')
);

// Detect if we're running on a web browser. The SAPI name is checked because
// $_SERVER['DOCUMENT_ROOT'] is also set (to an empty string) on the command line.
$isWebRequest = (PHP_SAPI !== 'cli');

if ($isWebRequest)
{
    echo '<pre>';
}

print_r($stats->calculateArticlesWordCounts());

if ($isWebRequest)
{
    echo '</pre>';
}

[[Category:PHP_From_Beginner_to_Pro]]