/**
 * Produces a table with an overview of the URLs to be crawled for each page.
 *
 * Reads the request parameters "_crawl", "_download", "tstamp" and
 * "configurationSelection", asks the crawler library for the page tree and
 * its URLs, and renders either the configuration form plus URL table or,
 * when "_crawl" was submitted, a short confirmation with continue buttons.
 *
 * When "_download" was submitted, the collected URLs are sent as a file
 * download (CrawlerUrls.txt, one URL per CRLF-terminated line) and the
 * script exits instead of returning.
 *
 * @return string HTML output
 */
function drawURLs()
{
    global $BE_USER;

    $lang = $GLOBALS['LANG'];
    $ll = 'LLL:EXT:crawler/modfunc1/locallang.xml:labels.';

    // Init:
    $this->duplicateTrack = array();
    $this->submitCrawlUrls = t3lib_div::_GP('_crawl');
    $this->downloadCrawlUrls = t3lib_div::_GP('_download');
    $this->makeCrawlerProcessableChecks();

    // mktime(0, 0, 0) is 00:00:00 of the current day, so "midnight" and
    // "04:00" refer to today's date.
    switch ((string) t3lib_div::_GP('tstamp')) {
        case 'midnight':
            $this->scheduledTime = mktime(0, 0, 0);
            break;
        case '04:00':
            $this->scheduledTime = mktime(0, 0, 0) + 4 * 3600;
            break;
        case 'now':
        default:
            $this->scheduledTime = time();
            break;
    }

    // The "perminute" request parameter is currently not evaluated; the
    // rate is fixed.
    // TODO: check relevance
    $this->reqMinute = 1000;

    $this->incomingConfigurationSelection = t3lib_div::_GP('configurationSelection');
    if (!is_array($this->incomingConfigurationSelection)) {
        $this->incomingConfigurationSelection = array('');
    }

    $this->crawlerObj = t3lib_div::makeInstance('tx_crawler_lib');
    $this->crawlerObj->setAccessMode('gui');
    $this->crawlerObj->setID = t3lib_div::md5int(microtime());

    // "Nothing selected" means: no entries at all, or exactly one empty entry.
    $nothingSelected = empty($this->incomingConfigurationSelection)
        || (
            count($this->incomingConfigurationSelection) == 1
            && empty($this->incomingConfigurationSelection[0])
        );

    if ($nothingSelected) {
        $code = ' <tr> <td colspan="7"><b>' . $lang->sL($ll . 'noConfigSelected') . '</b></td> </tr>';
    } else {
        if ($this->submitCrawlUrls) {
            $reason = new tx_crawler_domain_reason();
            $reason->setReason(tx_crawler_domain_reason::REASON_GUI_SUBMIT);

            // Default to an empty name so the detail text never reads an
            // undefined variable when no backend user object is available.
            $username = '';
            if ($BE_USER instanceof t3lib_beUserAuth) {
                $username = $BE_USER->user['username'];
            }
            $reason->setDetailText('The user ' . $username . ' added pages to the crawler queue manually ');

            tx_crawler_domain_events_dispatcher::getInstance()->post(
                'invokeQueueChange',
                $this->findCrawler()->setID,
                array('reason' => $reason)
            );
        }

        $code = $this->crawlerObj->getPageTreeAndUrls(
            $this->pObj->id,
            $this->pObj->MOD_SETTINGS['depth'],
            $this->scheduledTime,
            $this->reqMinute,
            $this->submitCrawlUrls,
            $this->downloadCrawlUrls,
            array(),
            $this->incomingConfigurationSelection
        );
    }

    $this->downloadUrls = $this->crawlerObj->downloadUrls;
    $this->duplicateTrack = $this->crawlerObj->duplicateTrack;

    $output = '';
    if ($code) {
        $urlCount = count($this->duplicateTrack);

        $output .= '<h3>' . $lang->sL($ll . 'configuration') . ':</h3>';
        $output .= '<input type="hidden" name="id" value="' . intval($this->pObj->id) . '" />';

        if (!$this->submitCrawlUrls) {
            // Preview mode: configuration selectors, action buttons and the URL table.
            $output .= $this->drawURLs_cfgSelectors() . '<br />';
            $output .= '<input type="submit" name="_update" value="' . $lang->sL($ll . 'triggerUpdate') . '" /> ';
            $output .= '<input type="submit" name="_crawl" value="' . $lang->sL($ll . 'triggerCrawl') . '" /> ';
            $output .= '<input type="submit" name="_download" value="' . $lang->sL($ll . 'triggerDownload') . '" /><br /><br />';
            $output .= $lang->sL($ll . 'count') . ': ' . $urlCount . '<br />';
            $output .= $lang->sL($ll . 'curtime') . ': ' . date('H:i:s', time()) . '<br />';
            $output .= '<br /> <table class="lrPadding c-list url-table">'
                . $this->drawURLs_printTableHeader()
                . $code
                . '</table>';
        } else {
            // URLs were submitted to the queue: show the count and the continue buttons.
            $output .= $urlCount . ' ' . $lang->sL($ll . 'submitted') . '. <br /><br />';
            $output .= '<input type="submit" name="_" value="' . $lang->sL($ll . 'continue') . '" />';
            $output .= '<input type="submit" onclick="this.form.elements[\'SET[crawlaction]\'].value=\'log\';" value="'
                . $lang->sL($ll . 'continueinlog') . '" />';
        }
    }

    // Download Urls to crawl:
    if ($this->downloadCrawlUrls) {
        // Creating output header:
        $mimeType = 'application/octet-stream';
        header('Content-Type: ' . $mimeType);
        header('Content-Disposition: attachment; filename=CrawlerUrls.txt');

        // Printing the URLs, one per line (CRLF separated):
        echo implode(chr(13) . chr(10), $this->downloadUrls);

        // Exits:
        exit;
    }

    // Return output:
    return $output;
}
/**
 * Function executed by crawler_im.php cli script.
 *
 * Resolves the page id and configuration keys from the CLI arguments,
 * collects the URLs of the page tree and then, depending on the "-o"
 * argument:
 *  - "url":   prints the collected download URLs,
 *  - "exec":  registers the queue entries internally only and requests
 *             every entry right away, printing the result per URL,
 *  - "queue": prints the URLs that were put into the queue,
 *  - other:   prints the URLs found together with a usage hint.
 *
 * Prints the help text and exits when no default argument is given.
 *
 * @return void
 */
function CLI_main_im()
{
    $this->setAccessMode('cli_im');

    $cliObj = \TYPO3\CMS\Core\Utility\GeneralUtility::makeInstance('tx_crawler_cli_im');

    // Force user to admin state and set workspace to "Live":
    $this->backendUser->user['admin'] = 1;
    $this->backendUser->setWorkspace(0);

    // Print help
    if (!isset($cliObj->cli_args['_DEFAULT'][1])) {
        $cliObj->cli_validateArgs();
        $cliObj->cli_help();
        exit;
    }

    $cliObj->cli_validateArgs();

    // The output mode is consulted several times below; read it once.
    $outputMode = $cliObj->cli_argValue('-o');
    $addsToQueue = ($outputMode === 'queue' || $outputMode === 'exec');

    if ($outputMode === 'exec') {
        $this->registerQueueEntriesInternallyOnly = TRUE;
    }

    if (isset($cliObj->cli_args['_DEFAULT'][2])) {
        // Crawler is called over TYPO3 BE
        $pageId = tx_crawler_api::forceIntegerInRange($cliObj->cli_args['_DEFAULT'][2], 0);
    } else {
        // Crawler is called over cli
        $pageId = tx_crawler_api::forceIntegerInRange($cliObj->cli_args['_DEFAULT'][1], 0);
    }

    // Fall back to all configurations found for the page when none were
    // resolved from the CLI arguments.
    $configurationKeys = $this->getConfigurationKeys($cliObj);
    if (!is_array($configurationKeys)) {
        $configurations = $this->getUrlsForPageId($pageId);
        $configurationKeys = is_array($configurations) ? array_keys($configurations) : array();
    }

    // Generate the set id for this run before announcing the queue change,
    // so the event carries the id of this run and not a stale value.
    $this->setID = \TYPO3\CMS\Core\Utility\GeneralUtility::md5int(microtime());

    if ($addsToQueue) {
        $reason = new tx_crawler_domain_reason();
        // NOTE(review): a GUI reason is reported for a CLI-triggered change;
        // confirm whether a CLI-specific reason constant exists.
        $reason->setReason(tx_crawler_domain_reason::REASON_GUI_SUBMIT);
        $reason->setDetailText('The cli script of the crawler added to the queue');
        tx_crawler_domain_events_dispatcher::getInstance()->post(
            'invokeQueueChange',
            $this->setID,
            array('reason' => $reason)
        );
    }

    if ($this->extensionSettings['cleanUpOldQueueEntries']) {
        $this->cleanUpOldQueueEntries();
    }

    // "-n" defaults to 30 when the argument is not given.
    $numberArgument = $cliObj->cli_isArg('-n') ? $cliObj->cli_argValue('-n') : 30;

    $this->getPageTreeAndUrls(
        $pageId,
        tx_crawler_api::forceIntegerInRange($cliObj->cli_argValue('-d'), 0, 99),
        $this->getCurrentTime(),
        tx_crawler_api::forceIntegerInRange($numberArgument, 1, 1000),
        $addsToQueue,
        $outputMode === 'url',
        \TYPO3\CMS\Core\Utility\GeneralUtility::trimExplode(',', $cliObj->cli_argValue('-proc'), 1),
        $configurationKeys
    );

    if ($outputMode === 'url') {
        $cliObj->cli_echo(implode(chr(10), $this->downloadUrls) . chr(10), 1);
    } elseif ($outputMode === 'exec') {
        $cliObj->cli_echo("Executing " . count($this->urlList) . " requests right away:\n\n");
        $cliObj->cli_echo(implode(chr(10), $this->urlList) . chr(10));
        $cliObj->cli_echo("\nProcessing:\n");

        foreach ($this->queueEntries as $queueRec) {
            // Tolerate queue records whose parameters do not unserialize to
            // the expected array shape instead of failing inside implode().
            $parameters = unserialize($queueRec['parameters']);
            $url = '';
            $procInstructions = '';
            if (is_array($parameters)) {
                if (isset($parameters['url'])) {
                    $url = $parameters['url'];
                }
                if (isset($parameters['procInstructions']) && is_array($parameters['procInstructions'])) {
                    $procInstructions = implode(',', $parameters['procInstructions']);
                }
            }
            $cliObj->cli_echo($url . ' (' . $procInstructions . ') => ');

            $result = $this->readUrlFromArray($queueRec);

            // NOTE(review): unserialize() runs on the content returned by
            // readUrlFromArray(); if that content can come from an untrusted
            // source this is an object-injection risk and should be reviewed.
            $requestResult = unserialize($result['content']);
            if (is_array($requestResult)) {
                $logIndent = chr(10) . chr(9) . chr(9);
                $resLog = (isset($requestResult['log']) && is_array($requestResult['log']))
                    ? $logIndent . implode($logIndent, $requestResult['log'])
                    : '';
                $cliObj->cli_echo('OK: ' . $resLog . chr(10));
            } else {
                // Not a serialized array: print a whitespace-collapsed,
                // tag-stripped excerpt of the raw content.
                $excerpt = substr(preg_replace('/\\s+/', ' ', strip_tags($result['content'])), 0, 30000);
                $cliObj->cli_echo('Error checking Crawler Result: ' . $excerpt . '...' . chr(10));
            }
        }
    } elseif ($outputMode === 'queue') {
        $cliObj->cli_echo("Putting " . count($this->urlList) . " entries in queue:\n\n");
        $cliObj->cli_echo(implode(chr(10), $this->urlList) . chr(10));
    } else {
        $cliObj->cli_echo(count($this->urlList) . " entries found for processing. (Use -o to decide action):\n\n", 1);
        $cliObj->cli_echo(implode(chr(10), $this->urlList) . chr(10), 1);
    }
}