コード例 #1
0
 /**
  * Fetches the raw wikitext of a single remote page and creates it locally.
  *
  * The title is HTML-entity-decoded first, spaces are turned into
  * underscores, and the result is appended to $this->mPageUrl followed by
  * "&action=raw". On failure the page is reported in the output, recorded
  * in $this->mFailedPages, and $this->mFailedByGet is incremented.
  *
  * @param string $page Page title, possibly containing HTML entities.
  */
 private function handlePageImport($page)
 {
     global $wgOut;
     $page = html_entity_decode($page);
     // The title becomes part of a query string ("&action=raw" follows it), so
     // it must be URL-encoded: titles containing '&', '#', '?', '+' or '%'
     // would otherwise be truncated or misread by the remote server.
     $encodedTitle = urlencode(str_replace(' ', '_', $page));
     $wikiText = ArticleImportUtils::getCurlResponse($this->mPageUrl . $encodedTitle . "&action=raw");
     if ($wikiText) {
         $this->createPage($page, $this->parseWikiText($wikiText));
     } else {
         $wgOut->addWikiText("* '''Failed''' to get {$page}");
         $this->mFailedPages[] = $page;
         $this->mFailedByGet++;
     }
 }
コード例 #2
0
    /**
     * Collects the page titles of a supported super category and all of its
     * subcategories from a remote MediaWiki API and stores them locally.
     *
     * The super category key is looked up in $egAICategories; the script
     * die()s with the list of supported keys when it is missing or unknown.
     * The API endpoint is taken from the 'api' request parameter (defaulting
     * to the English Wikipedia), and the page table is emptied first when the
     * 'truncate' request parameter is present. Progress and a final summary
     * are written to $wgOut.
     *
     * Only the first 500 members of each category are fetched; result
     * continuation is not implemented.
     *
     * @param string $superCategory Key into $egAICategories (trimmed and lower-cased before lookup).
     */
    private function getArticles($superCategory)
    {
        global $wgRequest, $wgOut, $wgDBprefix, $wgLang, $egAICategories;
        // A full crawl issues one HTTP request per category; allow up to two hours.
        ini_set('max_execution_time', 7200);
        // The categories that need to be crawled.
        $categories = array();
        if ($superCategory) {
            $superCategory = strtolower(trim($superCategory));
            if (array_key_exists($superCategory, $egAICategories)) {
                $categories[] = $egAICategories[$superCategory];
            } else {
                die('Invalid category provided. Supported categories: ' . $wgLang->listToText(array_keys($egAICategories)));
            }
        } else {
            die('No category provided. Supported categories: ' . $wgLang->listToText(array_keys($egAICategories)));
        }
        // The url of the MW's api.
        $apiUrl = $wgRequest->getVal('api', 'http://en.wikipedia.org/w/api.php');
        $dbw = wfGetDB(DB_MASTER);
        $dbr = wfGetDB(DB_SLAVE);
        $fullPageTable = $wgDBprefix . AI_TABLE;
        // Ensure the tables exist.
        $dbw->query(<<<EOT
CREATE TABLE IF NOT EXISTS `{$fullPageTable}` (
  `category_id` smallint(6) NOT NULL,
  `article_name` varchar(100) character set utf8 NOT NULL,
  KEY `category_id` (`category_id`),
  KEY `article_name` (`article_name`)
) ENGINE=MyISAM DEFAULT CHARSET=latin1;\t\t\t
EOT
);
        $fullCatsTable = $wgDBprefix . AI_CATS_TABLE;
        $dbw->query(<<<EOT
CREATE TABLE IF NOT EXISTS `{$fullCatsTable}` (
  `category_id` smallint(6) NOT NULL auto_increment,
  `category_name` varchar(255) NOT NULL,
  PRIMARY KEY  (`category_id`),
  UNIQUE KEY `category_name` (`category_name`)
) ENGINE=MyISAM  DEFAULT CHARSET=latin1;\t
EOT
);
        // Truncate the table when requested.
        if ($wgRequest->getCheck('truncate')) {
            $dbw->query("TRUNCATE TABLE `{$fullPageTable}`");
        }
        // The categories that already have been crawled.
        $usedCats = array();
        $pageAmount = 0;
        $wgOut->addWikiText('== Page collection started ==');
        // Reuse the stored id of the super category, or register it on first use.
        // NOTE(review): the existence checks here and below read from the replica
        // connection right after writes to the master; under replication lag they
        // could miss fresh rows and produce duplicates — confirm for this deployment.
        $categoryResult = $dbr->selectRow(AI_CATS_TABLE, 'category_id', array('category_name' => $superCategory));
        if ($categoryResult) {
            $catId = $categoryResult->category_id;
        } else {
            $dbw->insert(AI_CATS_TABLE, array('category_name' => $superCategory));
            $catId = $dbw->insertId();
        }
        // Loop through all the categories in the $categories array, get their pages and subcategories.
        while ($category = array_pop($categories)) {
            // Ignore already crawled categories (also protects against category cycles).
            if (in_array($category, $usedCats, true)) {
                continue;
            }
            $usedCats[] = $category;
            $wgOut->addWikiText("'''Starting work on category {$category}'''");
            // Categories with more than 500 pages won't be completely imported!
            // An extra loop should be added if this is required.
            // The title is URL-encoded so that characters such as '&', '#' or '+'
            // in a category name cannot corrupt the query string.
            $encodedCategory = urlencode($category);
            $apiRes = ArticleImportUtils::getCurlResponse("{$apiUrl}?action=query&list=categorymembers&cmtitle={$encodedCategory}&format=xml&cmlimit=500");
            if (!$apiRes) {
                $wgOut->addWikiText("* '''Failed''' to get the members of {$category}");
                continue;
            }
            // A malformed response must not abort the whole crawl: report it and
            // carry on with the next category.
            try {
                $xml = new SimpleXMLElement($apiRes);
            } catch (Exception $e) {
                $wgOut->addWikiText("* '''Failed''' to parse the API response for {$category}");
                continue;
            }
            // API error responses and empty categories carry no <cm> elements.
            if (!isset($xml->query->categorymembers->cm)) {
                continue;
            }
            foreach ($xml->query->categorymembers->cm as $line) {
                if ((string) $line['ns'] === '14') {
                    // Namespace 14: a subcategory, queued for crawling.
                    $categories[] = str_replace(' ', '_', (string) $line['title']);
                    continue;
                }
                // Anything else is treated as a page; its title is stored HTML-escaped.
                $pageName = htmlspecialchars((string) $line['title']);
                $page = $dbr->selectRow(AI_TABLE, 'article_name', array('category_id' => $catId, 'article_name' => $pageName));
                if (!$page) {
                    $pageAmount++;
                    $dbw->insert(AI_TABLE, array('category_id' => $catId, 'article_name' => $pageName));
                    $wgOut->addWikiText("* Added {$pageName}");
                } else {
                    $wgOut->addWikiText("* Skipped {$pageName}");
                }
            }
        }
        $wgOut->addWikiText('== Page collection finished ==');
        $wgOut->addWikiText("Imported {$pageAmount} pages from " . count($usedCats) . ' categories.');
        $wgOut->addWikiText("=== Imported categories ===");
        foreach ($usedCats as $cat) {
            $wgOut->addWikiText("* {$cat}");
        }
    }