/**
 * Fetches the raw wikitext of one remote page and creates it locally.
 *
 * The title is HTML-entity-decoded, its spaces are turned into underscores,
 * and it is then percent-encoded before being appended to $this->mPageUrl
 * together with "&action=raw". When the fetch yields a falsy result the
 * failure is reported via $wgOut and recorded in $this->mFailedPages and
 * $this->mFailedByGet; otherwise the parsed text is handed to createPage().
 *
 * @param string $page Page title, possibly containing HTML entities.
 */
private function handlePageImport($page) {
    global $wgOut;

    $page = html_entity_decode($page);

    // Percent-encode the title: characters such as "&", "+", "#" or "?" would
    // otherwise be interpreted as URL syntax and request the wrong page.
    // NOTE(review): assumes mPageUrl ends in a query-string parameter that
    // takes the title (the appended "&action=raw" implies this) — confirm.
    $titleParam = rawurlencode(str_replace(' ', '_', $page));

    $wikiText = ArticleImportUtils::getCurlResponse($this->mPageUrl . $titleParam . '&action=raw');

    if (!$wikiText) {
        $wgOut->addWikiText("* '''Failed''' to get {$page}");
        $this->mFailedPages[] = $page;
        $this->mFailedByGet++;
        return;
    }

    $this->createPage($page, $this->parseWikiText($wikiText));
}
/**
 * Crawls a configured category tree through a MediaWiki API and stores the
 * titles of the pages found in the AI_TABLE table.
 *
 * $superCategory is trimmed, lower-cased and looked up in $egAICategories;
 * the mapped value is the first category to crawl. Every category member in
 * namespace 14 is queued as a subcategory, every other member is stored
 * (HTML-escaped) under the super category's id unless it is already present.
 * Progress and a final summary are written to $wgOut.
 *
 * The script terminates via die() when no category, or an unknown category,
 * is supplied.
 *
 * Limitation: only the first 500 members of each category are fetched.
 *
 * @param string $superCategory Key of the category set in $egAICategories.
 */
private function getArticles($superCategory) {
    global $wgRequest, $wgOut, $wgDBprefix, $wgLang, $egAICategories;

    ini_set('max_execution_time', 7200);

    // The categories that need to be crawled.
    $categories = array();

    if (!$superCategory) {
        die('No category provided. Supported categories: ' . $wgLang->listToText(array_keys($egAICategories)));
    }

    $superCategory = strtolower(trim($superCategory));

    if (!array_key_exists($superCategory, $egAICategories)) {
        die('Invalid category provided. Supported categories: ' . $wgLang->listToText(array_keys($egAICategories)));
    }

    $categories[] = $egAICategories[$superCategory];

    // The url of the MW's api.
    // NOTE(review): 'api' is taken straight from the request, so the server can
    // be made to fetch an arbitrary URL — consider restricting allowed hosts.
    $apiUrl = $wgRequest->getVal('api', 'http://en.wikipedia.org/w/api.php');

    $dbw = wfGetDB(DB_MASTER);
    $dbr = wfGetDB(DB_SLAVE);

    // Ensure the tables exist.
    $fullPageTable = $wgDBprefix . AI_TABLE;
    $dbw->query(<<<EOT
CREATE TABLE IF NOT EXISTS `{$fullPageTable}` (
  `category_id` smallint(6) NOT NULL,
  `article_name` varchar(100) character set utf8 NOT NULL,
  KEY `category_id` (`category_id`),
  KEY `article_name` (`article_name`)
) ENGINE=MyISAM DEFAULT CHARSET=latin1;\t\t\t
EOT
    );

    $fullCatsTable = $wgDBprefix . AI_CATS_TABLE;
    $dbw->query(<<<EOT
CREATE TABLE IF NOT EXISTS `{$fullCatsTable}` (
  `category_id` smallint(6) NOT NULL auto_increment,
  `category_name` varchar(255) NOT NULL,
  PRIMARY KEY (`category_id`),
  UNIQUE KEY `category_name` (`category_name`)
) ENGINE=MyISAM DEFAULT CHARSET=latin1;\t
EOT
    );

    // Truncate the table when requested.
    if ($wgRequest->getCheck('truncate')) {
        $dbw->query("TRUNCATE TABLE `{$fullPageTable}`");
    }

    // The categories that already have been crawled, in crawl order.
    $usedCats = array();
    // Same categories as keys, for a constant-time, exact "already seen" check.
    $visited = array();
    $pageAmount = 0;

    $wgOut->addWikiText('== Page collection started ==');

    // Look up the id of the super category, registering it on first use.
    $categoryResult = $dbr->selectRow(AI_CATS_TABLE, 'category_id', array('category_name' => $superCategory));

    if ($categoryResult) {
        $catId = $categoryResult->category_id;
    } else {
        $dbw->insert(AI_CATS_TABLE, array('category_name' => $superCategory));
        $catId = $dbw->insertId();
    }

    // Loop through all the categories in the $categories array, get their pages and subcategories.
    while ($category = array_pop($categories)) {
        // Ignore already crawled categories.
        if (isset($visited[$category])) {
            continue;
        }

        $visited[$category] = true;
        $usedCats[] = $category;

        $wgOut->addWikiText("'''Starting work on category {$category}'''");

        // Categories with more than 500 pages won't be completely imported!
        // An extra loop should be added if this is required.
        // The title is percent-encoded so that characters like "&" or "+" in a
        // category name cannot corrupt the query string.
        // NOTE(review): assumes the names in $egAICategories are raw titles,
        // not already URL-encoded — confirm against the configuration.
        $cmTitle = rawurlencode($category);
        $apiRes = ArticleImportUtils::getCurlResponse(
            "{$apiUrl}?action=query&list=categorymembers&cmtitle={$cmTitle}&format=xml&cmlimit=500"
        );

        if (!$apiRes) {
            $wgOut->addWikiText("* '''Failed''' to get the members of {$category}");
            continue;
        }

        // SimpleXMLElement throws on malformed XML; one bad response should
        // not abort the whole crawl.
        try {
            $xml = new SimpleXMLElement($apiRes);
        } catch (Exception $e) {
            $wgOut->addWikiText("* '''Failed''' to parse the API response for {$category}");
            continue;
        }

        // Nothing to do for an empty category or a response without a member list.
        if (!isset($xml->query->categorymembers->cm)) {
            continue;
        }

        foreach ($xml->query->categorymembers->cm as $line) {
            $title = (string) $line['title'];

            if ((string) $line['ns'] == '14') {
                // Is a category: queue it for crawling.
                $categories[] = str_replace(' ', '_', $title);
                continue;
            }

            // Is a page: store it unless it is already known for this super category.
            $pageName = htmlspecialchars($title);
            $pageKey = array('category_id' => $catId, 'article_name' => $pageName);

            if ($dbr->selectRow(AI_TABLE, 'article_name', $pageKey)) {
                $wgOut->addWikiText("* Skipped {$pageName}");
            } else {
                $pageAmount++;
                $dbw->insert(AI_TABLE, $pageKey);
                $wgOut->addWikiText("* Added {$pageName}");
            }
        }
    }

    $wgOut->addWikiText('== Page collection finished ==');
    $wgOut->addWikiText("Imported {$pageAmount} pages from " . count($usedCats) . ' categories.');
    $wgOut->addWikiText("=== Imported categories ===");

    foreach ($usedCats as $cat) {
        $wgOut->addWikiText("* {$cat}");
    }
}