/**
 * Verify the basic state and accessors of the test document.
 *
 * Checks that no page is selected initially, that a page can be selected,
 * that the document ID accessor returns the expected ID, and that the
 * document/page IDs survive a base title encode/decode round trip.
 */
public function testDocumentIsValid()
{
    // Assert a page has not been set yet.
    $this->assertNull(
        $this->_testDocument->getPageId(),
        'The document page ID was prematurely set'
    );

    // Assert a page can be set (in this case, the first page).
    $this->_testDocument->setPage(null);
    $this->assertNotNull(
        $this->_testDocument->getPageId(),
        'The document page ID was not set'
    );

    // Assert accessor methods return expected values.
    $this->assertIdentical(TEST_DOCUMENT_ID, $this->_testDocument->getId());

    // Assert that encoding and then decoding the base title gives back the
    // original document ID and page ID.
    $baseTitle = Scripto_Document::encodeBaseTitle(
        $this->_testDocument->getId(),
        $this->_testDocument->getPageId()
    );
    $decodedBaseTitle = Scripto_Document::decodeBaseTitle($baseTitle);
    $this->assertEqual(
        $decodedBaseTitle[0],
        TEST_DOCUMENT_ID,
        'Something went wrong during base title encoding/decoding. Document ID does not match'
    );
    $this->assertEqual(
        $decodedBaseTitle[1],
        $this->_testDocument->getPageId(),
        'Something went wrong during base title encoding/decoding. Page ID does not match'
    );
}
/**
 * Build the encoded base title for a document page.
 *
 * The base title is the base MediaWiki page title that corresponds to a
 * document page. It lets a document ID / page ID pair travel safely between
 * the external system, Scripto, and MediaWiki. IDs are encoded so that any
 * Unicode character may appear in them, including characters that URL
 * syntax or MediaWiki naming conventions would otherwise reject. Because
 * the encoding is Base64, the title can be decoded again.
 *
 * The resulting title is the concatenation of four parts, in this order:
 * <ol>
 *     <li>A title prefix that keeps MediaWiki from capitalizing the first
 *     character</li>
 *     <li>The URL-safe Base64 encoded document ID</li>
 *     <li>A delimiter separating the encoded document ID from the encoded
 *     page ID</li>
 *     <li>The URL-safe Base64 encoded page ID</li>
 * </ol>
 *
 * @link http://en.wikipedia.org/wiki/Base64#URL_applications
 * @link http://en.wikipedia.org/wiki/Wikipedia:Naming_conventions_%28technical_restrictions%29
 * @param string|int $documentId The document ID
 * @param string|int $pageId The page ID
 * @return string The encoded base title
 */
public static function encodeBaseTitle($documentId, $pageId)
{
    // Encode the document ID first, then the page ID.
    $encodedDocumentId = Scripto_Document::base64UrlEncode($documentId);
    $encodedPageId = Scripto_Document::base64UrlEncode($pageId);

    $baseTitle = self::BASE_TITLE_PREFIX . $encodedDocumentId;
    $baseTitle .= self::BASE_TITLE_DELIMITER . $encodedPageId;

    return $baseTitle;
}
/**
 * Get all documents from MediaWiki that have at least one page with text.
 *
 * Pages are requested in batches of up to 500, restricted to titles that
 * start with the base title prefix and have a minimum size of 1. Each title
 * is decoded into a document ID and a page ID. A document is included only
 * if the adapter reports that it exists; its title is fetched from the
 * adapter once, the first time one of its pages is encountered.
 *
 * @uses Scripto_Service_MediaWiki::getAllPages()
 * @return array An array following this format:
 * <code>
 * array(
 *     {document ID} => array(
 *         ['mediawiki_titles'] => array(
 *             {page ID} => {mediawiki title},
 *             {...}
 *         ),
 *         ['document_title'] => {document title}
 *     ),
 *     {...}
 * )
 * </code>
 */
public function getAllDocuments()
{
    $from = null;
    $allDocuments = array();

    do {
        $response = $this->_mediawiki->getAllPages(array(
            'aplimit'   => 500,
            'apminsize' => 1,
            'apprefix'  => Scripto_Document::BASE_TITLE_PREFIX,
            'apfrom'    => $from,
        ));

        // Treat a response without a page list as an empty batch instead of
        // iterating over an undefined index.
        $pages = isset($response['query']['allpages'])
               ? $response['query']['allpages']
               : array();

        foreach ($pages as $page) {
            // Set the document ID and page ID.
            $documentIds = Scripto_Document::decodeBaseTitle($page['title']);
            $documentId = $documentIds[0];
            $pageId = $documentIds[1];

            // First page seen for this document: filter out pages that do
            // not belong to a valid document, then register the document
            // with its title.
            if (!array_key_exists($documentId, $allDocuments)) {
                if (!$this->_adapter->documentExists($documentId)) {
                    continue;
                }
                $allDocuments[$documentId] = array(
                    'mediawiki_titles' => array(),
                    'document_title'   => $this->_adapter->getDocumentTitle($documentId),
                );
            }

            // Record the page under its document.
            $allDocuments[$documentId]['mediawiki_titles'][$pageId] = $page['title'];
        }

        // Set the query continue, if any. The continuation value may be
        // reported under "apfrom" or "apcontinue"; accept either so that
        // paging does not stop early and truncate the result.
        $from = null;
        if (isset($response['query-continue']['allpages'])) {
            $continue = $response['query-continue']['allpages'];
            if (isset($continue['apfrom'])) {
                $from = $continue['apfrom'];
            } elseif (isset($continue['apcontinue'])) {
                $from = $continue['apcontinue'];
            }
        }
    } while ($from);

    return $allDocuments;
}