/**
 * Returns the simplified JSON representation of an article.
 *
 * The result is looked up in memcache first, under a key built from the
 * page id and self::SIMPLE_JSON_SCHEMA_VERSION. On a cache miss the article
 * is converted to the JSON format tree, simplified, and stored in memcache
 * for self::SIMPLE_JSON_CACHE_EXPIRATION.
 *
 * @param \Article $article article to convert
 * @return mixed the cached value, or the fresh result of JsonFormatSimplifier::simplify()
 */
public function getSimpleFormatForArticle(\Article $article) {
    $measurement = \Wikia\Measurements\Time::start([__CLASS__, __METHOD__]);

    $cacheKey = wfMemcKey("SimpleJson", $article->getPage()->getId(), self::SIMPLE_JSON_SCHEMA_VERSION);
    $memc = $this->app->wg->memc;

    $cachedJson = $memc->get($cacheKey);
    if ($cachedJson !== false) {
        // Cache hit: nothing to parse.
        $measurement->stop();
        return $cachedJson;
    }

    $titleText = $article->getTitle()->getText();

    /**
     * Prevention from circular references, when parsing articles with tabs.
     *
     * E.g. when page contains tab, which is actually link to itself,
     * or if any tab contains tab, which referenced to given page.
     *
     * @see DivContainingHeadersVisitor::parseTabview
     */
    \Wikia\JsonFormat\HtmlParser::markAsVisited($titleText);
    $rootNode = $this->getJsonFormatForArticle($article);
    // Parsing of the article is finished, so the list of visited articles can be reset.
    \Wikia\JsonFormat\HtmlParser::clearVisited();

    $jsonSimplifier = new Wikia\JsonFormat\JsonFormatSimplifier();
    $freshJson = $jsonSimplifier->simplify($rootNode, $titleText);

    $memc->set($cacheKey, $freshJson, self::SIMPLE_JSON_CACHE_EXPIRATION);

    $measurement->stop();
    return $freshJson;
}
/**
 * Parses an HTML fragment into a JSON format tree.
 *
 * Runs of whitespace in the fragment are collapsed to a single space, the
 * fragment is wrapped in a minimal UTF-8 HTML document and loaded into a
 * DOMDocument, and the resulting <body> element is walked by the visitor
 * returned from createVisitor(). The tree root is taken from the
 * JsonFormatBuilder that the visitor was created with.
 *
 * libxml errors raised while loading the (possibly malformed) fragment are
 * collected internally and discarded; the previous libxml error-handling
 * mode is restored afterwards so this method does not change global state
 * for other XML/DOM consumers in the same request.
 *
 * @param string $html
 * @return \JsonFormatNode
 */
public function parse($html) {
    $time = Time::start([__CLASS__, __METHOD__]);

    $doc = new \DOMDocument();

    // Remember the caller's libxml error mode so it can be restored below.
    $previousLibxmlErrorMode = libxml_use_internal_errors(true);

    $html = preg_replace("/\\s+/", " ", $html);
    $doc->loadHTML("<?xml encoding=\"UTF-8\">\n<html><body>" . $html . "</body></html>");

    // Drop whatever parse errors were buffered, then hand libxml back as we found it.
    libxml_clear_errors();
    libxml_use_internal_errors($previousLibxmlErrorMode);

    $body = $doc->getElementsByTagName('body')->item(0);

    $jsonFormatTraversingState = new \JsonFormatBuilder();
    $visitor = $this->createVisitor($jsonFormatTraversingState);
    $visitor->visit($body);
    $root = $jsonFormatTraversingState->getJsonRoot();

    $time->stop();
    return $root;
}
/**
 * Returns the processed top articles of a wiki, ordered as the data mart ranks them.
 *
 * The whole computation is wrapped in \WikiaDataAccess::cache() under a
 * shared memcache key built from the wiki id and language, with
 * self::TOP_ARTICLES_CACHE_TIME passed as the cache time.
 *
 * Steps performed on a cache miss:
 *  1. fetch the top page ids by pageview for NS_MAIN (one more than
 *     self::TOP_ARTICLES_PER_WIKI is requested);
 *  2. run a direct Lucene query restricted to this wiki, those page ids and
 *     is_main_page:false;
 *  3. pass each hit through processArticle(), up to self::TOP_ARTICLES_PER_WIKI;
 *  4. emit the processed articles in the order of the pageview map.
 *
 * If the data mart returns no articles, an empty list is returned without
 * querying search: the page id clause would otherwise be an empty group,
 * and the final ordering step could only produce an empty list anyway.
 *
 * @param int|string $wikiId
 * @param string $lang
 * @return array list of articles as returned by processArticle()
 */
protected function getTopArticles($wikiId, $lang) {
    return \WikiaDataAccess::cache(
        wfSharedMemcKey("CombinedSearchService", $wikiId, $lang),
        self::TOP_ARTICLES_CACHE_TIME,
        function () use ($wikiId, $lang) {
            $timer = Time::start(["CombinedSearchService", "getTopArticles"]);

            $requestedFields = [
                "title", "url", "id", "score", "pageid", "lang", "wid", "article_quality_i",
                Utilities::field('html', $lang)
            ];

            $topArticlesMap = \DataMartService::getTopArticlesByPageview(
                $wikiId,
                null,
                [NS_MAIN],
                false,
                self::TOP_ARTICLES_PER_WIKI + 1
            );

            // Nothing to look up: avoid sending a query with an empty "+( )" group.
            if (empty($topArticlesMap)) {
                $timer->stop();
                return [];
            }

            $pageIdClauses = array_map(function ($x) {
                return Utilities::valueForField("pageid", $x);
            }, array_keys($topArticlesMap));

            $query = " +(" . Utilities::valueForField("wid", $wikiId) . ") ";
            $query .= " +( " . implode(" OR ", $pageIdClauses) . ") ";
            $query .= " +(is_main_page:false) ";

            $searchConfig = new Config();
            $searchConfig
                ->setLimit(self::TOP_ARTICLES_PER_WIKI)
                ->setQuery($query)
                ->setPage(1)
                ->setRequestedFields($requestedFields)
                ->setDirectLuceneQuery(true)
                ->setWikiId($wikiId);

            $resultSet = (new Factory())->getFromConfig($searchConfig)->search();
            $currentResults = $resultSet->toArray($requestedFields);

            // Index processed articles by page id so they can be re-ordered below.
            $articles = [];
            foreach ($currentResults as $article) {
                $articles[$article['pageid']] = $this->processArticle($article);
                if (sizeof($articles) >= self::TOP_ARTICLES_PER_WIKI) {
                    break;
                }
            }

            // Search results come back in search order; restore the pageview ranking.
            $result = [];
            foreach ($topArticlesMap as $id => $a) {
                if (isset($articles[$id])) {
                    $result[] = $articles[$id];
                }
            }

            $timer->stop();
            return $result;
        }
    );
}
/**
 * Flattens a JSON format tree into a list of sections.
 *
 * Sections found by findSections() are visited from last to first. For each
 * one its paragraphs (after clearParagraphs()) and images are collected.
 * A section with neither paragraphs nor images is dropped unless its level
 * is 1, or the section kept immediately before it in this backwards walk
 * (i.e. the next kept section in document order) has a greater level.
 *
 * Nodes whose type is "section" contribute their own title and level; any
 * other node is reported with the article title and level 1.
 *
 * @param \JsonFormatRootNode $rootNode root of the tree to simplify
 * @param string $articleTitle title used for non-"section" nodes
 * @return array ["sections" => list of ["title", "level", "content", "images"]] in document order
 */
public function simplify(\JsonFormatRootNode $rootNode, $articleTitle) {
    $timer = Time::start([__CLASS__, __METHOD__]);

    /** @var \JsonFormatSectionNode[]|\JsonFormatRootNode[] $sections */
    $sections = [];
    $this->findSections($rootNode, $sections);

    // Collected back-to-front; reversed once at the end.
    $kept = [];

    $index = count($sections);
    while (--$index >= 0) {
        /** @var \JsonFormatSectionNode $node */
        $node = $sections[$index];

        $paragraphs = [];
        $pictures = [];
        $this->getParagraphs($node, $paragraphs);
        $this->clearParagraphs($paragraphs);
        $this->getImages($node, $pictures);

        if (count($paragraphs) == 0 && count($pictures) == 0) {
            $keptCount = count($kept);
            // True when no kept section with a greater level follows this one.
            $noDeeperFollower = $keptCount == 0
                || $node->getLevel() >= $kept[$keptCount - 1]["level"];
            if ($noDeeperFollower && $node->getLevel() != 1) {
                continue;
            }
        }

        if ($node->getType() == "section") {
            $title = $node->getTitle();
            $level = $node->getLevel();
        } else {
            $title = $articleTitle;
            $level = 1;
        }

        $kept[] = [
            "title" => $title,
            "level" => $level,
            "content" => $paragraphs,
            "images" => $pictures
        ];
    }

    $ordered = array_reverse($kept);

    $timer->stop();
    return ["sections" => $ordered];
}
/**
 * Get a list of suggested titles
 *
 * The query is taken from the 'query' request parameter. Queries shorter
 * than three characters yield the empty response for the requested format.
 * Results are cached in memcache for one hour under a key derived from the
 * query, the 'format' and 'nospecial' parameters (plus a 'WikiaMobile'
 * component for the wikiamobile skin).
 *
 * Suggestions are gathered from two sources: first the 'Mostlinked'
 * querycache rows, then (if the limit is not yet reached) the page table,
 * excluding pages listed in querycache as 'BrokenRedirects'. Special pages
 * are appended when the only searched namespace is -1 and 'nospecial' is
 * not 1.
 *
 * Output depends on the 'format' parameter: 'json' gives a JSON string,
 * 'array' gives the raw results array, anything else gives a
 * newline-separated string.
 *
 * NOTE(review): the cache key does not include the 'ns' / 'nsfilter'
 * parameters, so requests differing only in those share a cache entry -
 * confirm whether that is intended.
 *
 * @param WebRequest $request
 * @return bool|Object|string|array
 *
 * @author Inez Korczyński <*****@*****.**>
 * @author Robert Elwell <*****@*****.**>
 */
static function getLinkSuggest(WebRequest $request) {
    global $wgContLang, $wgContentNamespaces, $wgMemc, $wgLinkSuggestLimit;

    $measurement = T::start(__FUNCTION__);
    wfProfileIn(__METHOD__);

    $isMobile = F::app()->checkSkin('wikiamobile');

    // trim passed query and replace spaces by underscores
    // - this is how MediaWiki store article titles in database
    $query = urldecode(trim($request->getText('query')));
    $query = str_replace(' ', '_', $query);

    if ($isMobile) {
        $key = wfMemcKey(__METHOD__, md5($query . '_' . $request->getText('format') . $request->getText('nospecial', '')), 'WikiaMobile');
    } else {
        $key = wfMemcKey(__METHOD__, md5($query . '_' . $request->getText('format') . $request->getText('nospecial', '')));
    }

    if (strlen($query) < 3) {
        // enforce minimum character limit on server side
        $out = self::getEmptyResponse($request->getText('format'));
    } else {
        if ($cached = $wgMemc->get($key)) {
            $out = $cached;
        }
    }

    if (isset($out)) {
        $measurement->stop();
        wfProfileOut(__METHOD__);
        return $out;
    }

    // Allow the calling-code to specify a namespace to search in (which at the moment, could be overridden by having prefixed text in the input field).
    // NOTE: This extension does parse titles to try to find things in other namespaces, but that actually doesn't work in practice because jQuery
    // Autocomplete will stop making requests after it finds 0 results. So if you start to type "Category" and there is no page beginning
    // with "Cate", it will not even make the call to LinkSuggest.
    $namespace = $request->getVal('ns');

    // explode passed query by ':' to get namespace and article title
    $queryParts = explode(':', $query, 2);
    if (count($queryParts) == 2) {
        $query = $queryParts[1];
        $namespaceName = $queryParts[0];

        // try to get the index by canonical name first
        $namespace = MWNamespace::getCanonicalIndex(strtolower($namespaceName));
        if ($namespace == null) {
            // if we failed, try looking through localized namespace names
            $namespace = array_search(ucfirst($namespaceName), $wgContLang->getNamespaces());
            if (empty($namespace)) {
                // getting here means our "namespace" is not real and can only be part of the title
                $query = $namespaceName . ':' . $query;
            }
        }

        if ($namespace !== null && $query === '') {
            $out = self::getEmptyResponse($request->getText('format'));
            $measurement->stop();
            wfProfileOut(__METHOD__);
            return $out;
        }
    }

    // which namespaces to search in?
    if (empty($namespace)) {
        // search only within content namespaces (BugId:4625) - default behaviour
        $namespaces = $wgContentNamespaces;
    } else {
        // search only within a given namespace
        $namespaces = array($namespace);
    }

    //limit the result only to this namespace
    $namespaceFilter = $request->getVal('nsfilter');
    if (strlen($namespaceFilter) > 0) {
        $namespaces = array($namespaceFilter);
    }

    if (!empty($namespaceFilter) && $namespace != $namespaceFilter) {
        $out = self::getEmptyResponse($request->getText('format'));
        $measurement->stop();
        wfProfileOut(__METHOD__);
        return $out;
    }

    // NOTE(review): addslashes() escapes quotes but not the LIKE wildcards
    // '%' and '_'; the value is interpolated into both SQL statements below.
    $query = addslashes($query);

    $db = wfGetDB(DB_SLAVE, 'search');

    $redirects = array();
    $results = array();
    $exactMatchRow = null;

    $queryLower = strtolower($query);

    // First pass: most-linked pages from the querycache table.
    $sql1Measurement = T::start([__FUNCTION__, "sql-1"]);
    $res = $db->select(
        array('querycache', 'page'),
        array('page_namespace', 'page_title', 'page_is_redirect'),
        array(
            'qc_title = page_title',
            'qc_namespace = page_namespace',
            'page_is_redirect = 0',
            'qc_type' => 'Mostlinked',
            "(qc_title LIKE '{$query}%' or LOWER(qc_title) LIKE '{$queryLower}%')",
            'qc_namespace' => $namespaces
        ),
        __METHOD__,
        array('ORDER BY' => 'qc_value DESC', 'LIMIT' => $wgLinkSuggestLimit)
    );

    self::formatResults($db, $res, $query, $redirects, $results, $exactMatchRow);
    $sql1Measurement->stop();

    // Namespace ids may originate from the 'ns' / 'nsfilter' request
    // parameters and are placed into hand-written SQL below, so force them
    // to integers. $namespaces itself is left untouched because it is
    // compared against array('-1') further down.
    $pageNamespaceClause = '';
    if (count($namespaces) > 0) {
        $commaJoinedNamespaces = implode(', ', array_map('intval', $namespaces));
        $pageNamespaceClause = 'page_namespace IN (' . $commaJoinedNamespaces . ') AND ';
    }

    if (count($results) < $wgLinkSuggestLimit) {
        /**
         * @var string $pageTitlePrefilter this condition is able to use name_title index. It's added only for performance reasons.
         * It uses fact that page titles can't start with lowercase letter.
         */
        $pageTitlePrefilter = "";
        if (strlen($queryLower) >= 2) {
            $pageTitlePrefilter = "(\n\t\t\t\t\t\t\t( page_title "
                . $db->buildLike(strtoupper($queryLower[0]) . strtolower($queryLower[1]), $db->anyString())
                . " ) OR\n\t\t\t\t\t\t\t( page_title "
                . $db->buildLike(strtoupper($queryLower[0]) . strtoupper($queryLower[1]), $db->anyString())
                . " ) ) AND ";
        } else {
            if (strlen($queryLower) >= 1) {
                $pageTitlePrefilter = "( page_title " . $db->buildLike(strtoupper($queryLower[0]), $db->anyString()) . " ) AND ";
            }
        }

        // TODO: use $db->select helper method
        // we fetch 3 times more results to leave out redirects to the same page
        $sql = "SELECT page_len, page_id, page_title, rd_title, page_namespace, rd_namespace, page_is_redirect\n\t\t\t\t\t\tFROM page\n\t\t\t\t\t\tLEFT JOIN redirect ON page_is_redirect = 1 AND page_id = rd_from\n\t\t\t\t\t\tLEFT JOIN querycache ON qc_title = page_title AND qc_type = 'BrokenRedirects'\n\t\t\t\t\t\tWHERE {$pageTitlePrefilter} {$pageNamespaceClause} (LOWER(page_title) LIKE '{$queryLower}%')\n\t\t\t\t\t\t\tAND qc_type IS NULL\n\t\t\t\t\t\tLIMIT "
            . ($wgLinkSuggestLimit * 3);

        $sql2Measurement = T::start([__FUNCTION__, "sql-2"]);
        $res = $db->query($sql, __METHOD__);
        self::formatResults($db, $res, $query, $redirects, $results, $exactMatchRow);
        $sql2Measurement->stop();
    }

    // An exact match (if formatResults() reported one) is moved to the front of the list.
    if ($exactMatchRow !== null) {
        /* @var StdClass $exactMatchRow */
        $row = $exactMatchRow;
        $titleFormatted = self::formatTitle($row->page_namespace, $row->page_title);

        if ($row->page_is_redirect == 0) {
            // remove any instances of original array's value
            $resultsFlipped = array_flip($results);
            unset($resultsFlipped[$titleFormatted]);
            $results = array_flip($resultsFlipped);
            array_unshift($results, $titleFormatted);

            $flippedRedirs = array_flip($redirects);
            if (isset($flippedRedirs[$titleFormatted])) {
                unset($redirects[$flippedRedirs[$titleFormatted]]);
            }
        } else {
            $redirTitleFormatted = self::formatTitle($row->page_namespace, $row->rd_title);

            // remove any instances of original array's value
            $resultsFlipped = array_flip($results);
            unset($resultsFlipped[$redirTitleFormatted]);
            $results = array_flip($resultsFlipped);
            array_unshift($results, $redirTitleFormatted);

            $redirects[$redirTitleFormatted] = $titleFormatted;
        }
    }

    $db->freeResult($res);

    if ($request->getText('nospecial', 0) != 1) {
        // bugid 29988: include special pages
        // (registered in SpecialPage::$mList, not in the DB like a normal page)
        if ($namespaces == array('-1') && strlen($query) > 0) {
            $specialPagesByAlpha = SpecialPageFactory::getList();
            $specialPagesByAlpha = get_object_vars($specialPagesByAlpha);
            ksort($specialPagesByAlpha, SORT_STRING);
            array_walk($specialPagesByAlpha, function ($val, $key) use (&$results, $query) {
                if (strtolower(substr($key, 0, strlen($query))) === strtolower($query)) {
                    $results[] = self::formatTitle('-1', $key);
                }
            });
        }
    }

    // Overwrite canonical title with redirect title for all formats
    self::replaceResultIfRedirected($results, $redirects);

    $format = $request->getText('format');

    if ($format == 'json') {
        $result_values = array_values($results);

        if ($isMobile) {
            $out = json_encode(array(array_splice($result_values, 0, 10), array_splice($redirects, -1, 1)));
        } else {
            $out = json_encode(array('query' => $request->getText('query'), 'suggestions' => $result_values, 'redirects' => $redirects));
        }
    } elseif ($format == 'array') {
        $out = $results;
    } else {
        // legacy: LinkSuggest.js uses plain text
        $out = implode("\n", $results);
    }

    // 15 minutes times four (one hour, but easier to slice and dice)
    $wgMemc->set($key, $out, 4 * 900);

    $measurement->stop();
    wfProfileOut(__METHOD__);
    return $out;
}