/**
 * Renders the "advanced" rows of the source edit form.
 *
 * Rows produced, in order: crawl max depth, maximum number of simultaneous
 * pages, recrawl period, schedules, crawl rules by url, metadata, comment
 * and source contact. Stored values are read with $this->getValue() and
 * global settings with $this->config.
 *
 * Stored free-text values are passed through fi() before being written
 * into the markup. fi() is defined outside this method; it is assumed to
 * HTML-escape its argument, as it is already used that way for the
 * textarea contents below.
 *
 * @return string HTML fragment made of <tr> elements (no enclosing table).
 */
public function displayPageAdvanced()
{
    // Start from an empty buffer: appending to an undefined variable raises a warning.
    $res = "";

    // --- Crawl max depth -------------------------------------------------
    $res .= "<tr>";
    $res .= "<td class='head'>Crawl max depth</td>";
    $res .= "<td>";
    $param = $this->config->getDefault("source.max_depth", "");
    if ($param != "" && !empty($param) || $param == "0") {
        // A globally configured depth is imposed: only a hidden field is emitted.
        $res .= "<input type='hidden' name='source_crawl_max_depth' id='source_crawl_max_depth' value='" . $this->config->get("source.max_depth") . "'> Default";
    } else {
        $res .= "<input class='editInputTextSmall' type='text' name='source_crawl_max_depth' id='source_crawl_max_depth' value='" . fi($this->getValue('crawl_maxdepth', '')) . "'> (0 for default)";
    }
    $res .= "</td>";
    $res .= "</tr>";

    // --- Maximum number of simultaneous pages ------------------------------
    $res .= "<tr>";
    $res .= "<td class='head'>Maximum number of simultaneous pages crawled</td>";
    $res .= "<td>";
    $param = $this->config->getDefault("source.max_simultaneous_item_per_source", "");
    if ($param != "" && !empty($param) || $param == "0") {
        // Emit the concurrency setting itself (the max depth setting was emitted here by mistake).
        $res .= "<input type='hidden' id='source_crawl_url_concurrency' name='source_crawl_url_concurrency' value='" . $this->config->get("source.max_simultaneous_item_per_source") . "'> Default";
    } else {
        // NOTE(review): style='editInputSelect' looks like it was meant to be class=...;
        // left untouched because switching would start applying that CSS class.
        $res .= "<select id='source_crawl_url_concurrency' name='source_crawl_url_concurrency' style='editInputSelect'>";
        $res .= $this->renderAdvancedSelectOptions(
            [
                0 => 'Default',
                1 => '1',
                2 => '2',
                4 => '4',
                6 => '6',
                8 => '8',
                12 => '12',
                16 => '16',
                20 => '20',
                24 => '24',
                32 => '32',
                36 => '36',
                40 => '40',
            ],
            $this->getValue('crawl_url_concurrency', '')
        );
        $res .= "</select>";
    }
    $res .= "</td>";
    $res .= "</tr>";

    // --- Recrawl period ----------------------------------------------------
    // A period (in hours) is only offered when it is not shorter than the
    // configured "crawler.period" value.
    $param = intval($this->config->getDefault("crawler.period", "6"));
    $periodOptions = [0 => 'default', 999999 => 'on demand'];
    $periodChoices = [
        1 => '1 hour',
        3 => '3 hours',
        6 => '6 hours',
        12 => '12 hours',
        24 => '1 day',
        48 => '2 days',
        72 => '3 days',
        96 => '4 days',
        120 => '5 days',
        144 => '6 days',
        168 => '1 week',
        336 => '2 weeks',
        504 => '3 weeks',
        // NOTE(review): 4 weeks is 672 hours; 772 is kept because it is the
        // value this form has always submitted - confirm before changing.
        772 => '4 weeks',
    ];
    foreach ($periodChoices as $hours => $label) {
        if ($param <= $hours) {
            $periodOptions[$hours] = $label;
        }
    }
    $res .= "<tr>";
    $res .= "<td class='head'>Recrawl period</td>";
    $res .= "<td>";
    $res .= "<select id='crawl_minimal_period' name='crawl_minimal_period' style='editInputSelect'>";
    $res .= $this->renderAdvancedSelectOptions($periodOptions, $this->getValue('crawl_minimal_period', ''));
    $res .= "</select>";
    $res .= "<br><span class='help'><strong>Minimal</strong> period between to crawl";
    $res .= "<br>when the source crawl ends, the next crawl date is set based on this parameter</span>";
    $res .= "</td>";
    $res .= "</tr>";

    // --- Schedules ---------------------------------------------------------
    $schedule = $this->getValue('crawl_schedule', '');
    $res .= "<tr>";
    $res .= "<td class='head'>Schedules</td>";
    $res .= "<td>";
    $res .= "<div id='schedule'>";
    $res .= "</div>";
    // Both values contain quotes/markup, so they are escaped before going into the attribute.
    $res .= "<input type='hidden' name='source_schedule' id='source_schedule' value='" . fi($this->buildAdvancedScheduleJson($schedule)) . "'>";
    $res .= "<input type='hidden' name='source_schedule_xml' id='source_schedule_xml' value='" . fi($schedule) . "'>";
    $res .= "<a href='javascript:addSchedule();'><img src='images/plus_12.png'> Add schedule</a>";
    $res .= "<br /><span class='help'>This parameter requieres MongoDB to be configured</span>";
    $res .= "</td>";
    $res .= "</tr>";

    // --- Crawl rules by url ------------------------------------------------
    $rules = $this->getValue('crawl_filtering_rules', '');
    $res .= "<tr>";
    $res .= "<td class='head'>Crawl rules by url</td>";
    $res .= "<td>";
    $res .= "<div id='rule'>";
    $res .= "</div>";
    $res .= "<input type ='hidden' name='source_crawl_filtering_rules' id='source_crawl_filtering_rules' value='" . fi($this->buildAdvancedFilteringRulesJson($rules)) . "'>";
    $res .= "<input type ='hidden' name='source_crawl_filtering_rules_xml' id='source_crawl_filtering_rules_xml' value='" . fi($rules) . "'>";
    $res .= "<a href='javascript:addRule();'><img src='images/plus_12.png'> Add rule</a>";
    $res .= "<br /><br />Test this URL :<br />";
    $res .= "<input class='editInputTextMedium2' type='text' name='test_url' id='test_url' value=''>";
    $res .= "<input type='button' value='Test' onClick='testFilteringRules();'><div id='filtering_rules_test_result'></div>";
    $res .= "</td>";
    $res .= "</tr>";

    // --- Metadata / Comment / Source contact -------------------------------
    // Three identical textarea rows: [label, field name and id, stored value key].
    $textareas = [
        ['Metadata', 'source_metadata', 'metadata'],
        ['Comment', 'source_comment', 'comment'],
        ['Source contact', 'source_contact', 'contact'],
    ];
    foreach ($textareas as $row) {
        $res .= "<tr>";
        $res .= "<td class='head'>" . $row[0] . "</td>";
        $res .= "<td><textarea name='" . $row[1] . "' id='" . $row[1] . "' rows='6' cols='70' class='editInputTextarea'>" . fi($this->getValue($row[2], '')) . "</textarea></td>";
        $res .= "</tr>";
    }

    return $res;
}

/**
 * Renders the <option> elements of a select box.
 *
 * @param array $options  Option value => visible label, in display order.
 * @param mixed $selected Stored value; an option is marked selected when it
 *                        is loosely equal to the option value as a string.
 *
 * @return string Concatenated <option> elements.
 */
private function renderAdvancedSelectOptions(array $options, $selected)
{
    $html = "";
    foreach ($options as $value => $label) {
        // Numeric array keys are integers; compare as strings so that an empty
        // stored value never matches option 0.
        $value = (string) $value;
        $html .= "<option value='" . $value . "'";
        if ($selected == $value) {
            $html .= " selected";
        }
        $html .= ">" . $label . "</option>";
    }

    return $html;
}

/**
 * Converts the stored schedule XML (/schedules/schedule nodes) to the JSON
 * text placed in the "source_schedule" hidden field.
 *
 * @param mixed $schedule Stored XML; null or "" yields an empty list.
 *
 * @return string JSON text of the form { "schedules": [...] }.
 */
private function buildAdvancedScheduleJson($schedule)
{
    $items = [];
    if (isset($schedule) && $schedule != "") {
        $xml = simplexml_load_string($schedule);
        // Unparsable XML yields false; treat it as "no schedule" instead of failing on ->xpath().
        $nodes = ($xml !== false) ? $xml->xpath('/schedules/schedule') : false;
        if (is_array($nodes)) {
            foreach ($nodes as $node) {
                $items[] = '{ "day": "' . (string) $node->day . '", "start": "' . (string) $node->start . '", "stop": "' . (string) $node->stop . '", "enabled": "' . (string) $node->enabled . '" }';
            }
        }
    }

    return '{ "schedules": [' . implode(",", $items) . '] }';
}

/**
 * Converts the stored filtering rules to the JSON text placed in the
 * "source_crawl_filtering_rules" hidden field.
 *
 * Two stored formats are handled: XML (value starts with "<", rules under
 * /rules/rule) and a line based format with ":" separated fields where
 * fields 0, 1 and 3 are used as ope, mode and pat.
 *
 * @param mixed $rules Stored rules.
 *
 * @return string JSON text of the form { "rules": [...] }.
 */
private function buildAdvancedFilteringRulesJson($rules)
{
    $rules = (string) $rules;
    $items = [];

    if (substr($rules, 0, 1) == "<") {
        $xml = simplexml_load_string($rules);
        $nodes = ($xml !== false) ? $xml->xpath('/rules/rule') : false;
        if (is_array($nodes)) {
            foreach ($nodes as $node) {
                $items[] = '{ "ope": "' . (string) $node->ope
                    . '", "mode": "' . (string) $node->mode
                    . '", "pat": "' . $this->escapeAdvancedJsonString((string) $node->pat)
                    . '", "meta": "' . $this->escapeAdvancedJsonString((string) $node->meta)
                    . '", "metap": "' . (string) $node->metap . '" }';
            }
        }
    } else {
        foreach (explode("\n", $rules) as $line) {
            if ($line === "") {
                continue;
            }
            // NOTE(review): a pattern that itself contains ":" is cut at the first
            // one by this split - confirm the legacy format before changing it.
            $fields = explode(":", $line);
            $mode = isset($fields[1]) ? $fields[1] : "";
            $pattern = isset($fields[3]) ? $fields[3] : "";
            $items[] = '{ "ope": "' . $fields[0]
                . '", "mode": "' . $mode
                . '", "pat": "' . $this->escapeAdvancedJsonString($pattern)
                . '", "meta": "", "metap": "" }';
        }
    }

    return '{ "rules": [' . implode(",", $items) . '] }';
}

/**
 * Escapes backslashes and double quotes so a value can sit inside a JSON
 * string literal.
 *
 * @param string $value Raw value.
 *
 * @return string Escaped value (without surrounding quotes).
 */
private function escapeAdvancedJsonString($value)
{
    // Backslashes first, otherwise the ones added for the quotes would be doubled.
    return str_replace(['\\', '"'], ['\\\\', '\\"'], $value);
}
$count = $stmt->execute(); if ($count == 0) { print $s; exit; } $cursor = $stmt->getCursor(); $rs = $cursor->getNext(); $res .= "<form name='account_edit' id='account_edit' action=''><center><table border='0' cellspacing='0' cellpadding='0'>"; $res .= "<tbody>"; $res .= "<tr>"; $res .= "<td class='head'>Id</td>"; $res .= "<td>" . $rs["id"] . "</td>"; $res .= "</tr>"; $res .= "<tr>"; $res .= "<td class='head'>Name</td>"; $res .= "<td><input class='editInputText' type='text' name='account_name' id='account_name' value='" . fi($rs["name"]) . "'></td>"; $res .= "</tr>"; $engine = $rs["id_engine"]; $aEngines = getAvailableEngines($config); if ($aEngines != null) { $res .= "<tr>"; $res .= "<td class='head'>Engine</td>"; $res .= "<td>"; $res .= "<select id='id_engine' name='id_engine' style='editInputSelect'>"; foreach ($aEngines as $key => $value) { $res .= "<option value='" . $key . "'"; if ($engine == strtolower(trim($key))) { $res .= " selected"; } $res .= ">" . $value . "</option>"; }
$res .= "<select id='target_type' name='target_type' style='editInputSelect'>"; $res .= "<option value='solr' " . ($rs["target_type"] == 'solr' ? 'selected' : '') . ">Solr</option>"; //$res .= "<option value='es' " . ($rs["target_type"] == 'es' ? 'selected' : '') . ">elasticsearch</option>"; $res .= "</select>"; $res .= "</td></tr>"; $res .= "<tr>"; $res .= "<td class='head'>Target paramater</td>"; $res .= "<td><input id='target_parameters' name ='target_parameters' class='editInputText' value='" . fi($rs["target_parameters"]) . "'>"; $res .= "<span class='help'>Optionnal.<br>"; $res .= "<u>Solr:</u><br>provide Solr core url (http://localhost:8080/solr/crawler/).<br>"; //$res .= "<u>elasticsearch:</u><br>provide cluster url including index name (http://localhost:9200/crawler/)."; $res .= "</span></td>"; $res .= "</tr>"; $res .= "<tr>"; $res .= "<td class='head'>Ouput queue directory</td>"; $res .= "<td><input id='queue_dir' name ='queue_dir' class='editInputText' value='" . fi($rs["queue_dir"]) . "'>"; $res .= "<span class='help'>Optional. Use absolute or relative path. Relative path is relative to crawler installation directory.</span>"; $res .= "</td>"; $res .= "</tr>"; $res .= "<tr>"; $res .= "<td class='head'>Available for account</td>"; $res .= "<td>"; if ($id == '1') { $res .= "All <input type='hidden' id='id_account' name='id_account' value='" . $rs["id_account"] . "'>"; } else { $account = $rs["id_account"]; $aAccounts = getAvailableAccounts($config); if ($aAccounts != null) { $res .= "<select id='id_account' name='id_account' style='editInputSelect'>"; foreach ($aAccounts as $key => $value) { $res .= "<option value='" . $key . "'";
/**
 * Renders the "advanced" rows of the web source edit form.
 *
 * Rows produced, in order: language advanced rules (emitted inside an HTML
 * comment), protocol strategy, check deleted, crawl max depth, crawl child
 * pages only, maximum simultaneous pages, maximum pages per minute, user
 * agent, ignored url fields (two rows), automatic HTML cleaning, recrawl
 * period, schedules, crawl rules by url, metadata, authentication (mode,
 * login, password, param), comment and source contact. Stored values are
 * read with $this->getValue() and global settings with $this->config.
 *
 * Stored free-text values are passed through fi() before being written
 * into the markup. fi() is defined outside this method; it is assumed to
 * HTML-escape its argument, as it is already used that way for the text
 * inputs and textareas of this form.
 *
 * @return string HTML fragment made of <tr> elements (no enclosing table).
 */
public function displayPageAdvanced()
{
    // Start from an empty buffer: appending to an undefined variable raises a warning.
    $res = "";

    // --- Language advanced rules (disabled: wrapped in an HTML comment) ----
    $res .= "<!--tr>";
    $res .= "<td class='head'>Language advanced rules</td>";
    $res .= "<td><textarea name='source_language_advanced' id='source_language_advanced' rows='6' cols='70' class='editInputTextarea'>" . fi($this->getValue('language_advanced', '')) . "</textarea></td>";
    $res .= "</tr-->";

    // --- Protocol strategy -------------------------------------------------
    // NOTE(review): style='editInputSelect' (here and on the other selects) looks
    // like it was meant to be class=...; left untouched because switching would
    // start applying that CSS class.
    $res .= "<tr>";
    $res .= "<td class='head'>Protocol strategy</td>";
    $res .= "<td>";
    $res .= "If the same web page is discovered with both http and https protocol ";
    $res .= "<select id='source_crawl_protocol_strategy' name='source_crawl_protocol_strategy' style='editInputSelect'>";
    $res .= $this->renderAdvancedSelectOptions(
        [
            1 => 'Keep only http page',
            0 => 'Consider http and https as different pages',
            2 => 'Keep only https page',
        ],
        $this->getValue('protocol_strategy', '')
    );
    $res .= "</select>";
    $res .= "</td>";
    $res .= "</tr>";

    // --- Check deleted -----------------------------------------------------
    $res .= "<tr>";
    $res .= "<td class='head'>Check deleted</td>";
    $res .= "<td>";
    $res .= "<select id='source_crawl_checkdelete_strategy' name='source_crawl_checkdelete_strategy' style='editInputSelect'>";
    $res .= $this->renderAdvancedSelectOptions(
        [
            0 => 'Default (as defined in global configuration file)',
            1 => 'After each crawl of this web site',
        ],
        $this->getValue('checkdeleted_strategy', '')
    );
    $res .= "</select>";
    $res .= "<br /><span class='help'>The crawler can check if previously crawled pages still exist on the web site</span>";
    $res .= "</td>";
    $res .= "</tr>";

    // --- Crawl max depth ---------------------------------------------------
    $res .= "<tr>";
    $res .= "<td class='head'>Crawl max depth</td>";
    $res .= "<td>";
    $param = $this->config->getDefault("source.max_depth", "");
    if ($param != "" && !empty($param) || $param == "0") {
        // A globally configured depth is imposed: only a hidden field is emitted.
        $res .= "<input type='hidden' name='source_crawl_max_depth' id='source_crawl_max_depth' value='" . $this->config->get("source.max_depth") . "'> Default";
    } else {
        $res .= "<input class='editInputTextSmall' type='text' name='source_crawl_max_depth' id='source_crawl_max_depth' value='" . fi($this->getValue('crawl_maxdepth', '')) . "'> (0 for default)";
    }
    $res .= "</td>";
    $res .= "</tr>";

    // --- Crawl child pages only --------------------------------------------
    $res .= "<tr>";
    $res .= "<td class='head'>Crawl child pages only</td>";
    $res .= "<td>";
    $res .= "<select id='source_crawl_child_only' name='source_crawl_child_only' style='editInputSelect'>";
    $res .= $this->renderAdvancedSelectOptions(
        [2 => 'Default', 1 => 'Yes', 0 => 'No'],
        $this->getValue('crawl_childonly', '')
    );
    $res .= "</select>";
    $res .= "</td>";
    $res .= "</tr>";

    // --- Maximum number of simultaneous pages ------------------------------
    $res .= "<tr>";
    $res .= "<td class='head'>Maximum number of simultaneous pages crawled</td>";
    $res .= "<td>";
    $param = $this->config->getDefault("source.max_simultaneous_item_per_source", "");
    if ($param != "" && !empty($param) || $param == "0") {
        $res .= "<input type='hidden' id='source_crawl_url_concurrency' name='source_crawl_url_concurrency' value='" . $this->config->get("source.max_simultaneous_item_per_source") . "'> Default";
    } else {
        $concurrencies = [1, 2, 4, 6, 8, 12, 16, 20, 24, 28, 32, 36, 40];
        $res .= "<select id='source_crawl_url_concurrency' name='source_crawl_url_concurrency' style='editInputSelect'>";
        $res .= $this->renderAdvancedSelectOptions(
            [0 => 'Default'] + array_combine($concurrencies, $concurrencies),
            $this->getValue('crawl_url_concurrency', '')
        );
        $res .= "</select>";
    }
    $res .= "</td>";
    $res .= "</tr>";

    // --- Maximum number of pages read per minute ---------------------------
    $rates = [4, 6, 8, 10, 12, 16, 20, 24, 28, 32, 36, 40, 44, 48, 52, 56, 60];
    $res .= "<tr>";
    $res .= "<td class='head'>Maximum number of pages read per minutes</td>";
    $res .= "<td>";
    $res .= "<select id='source_crawl_url_per_minute' name='source_crawl_url_per_minute' style='editInputSelect'>";
    $res .= $this->renderAdvancedSelectOptions(
        [0 => 'No limit'] + array_combine($rates, $rates),
        $this->getValue('crawl_url_per_minute', '')
    );
    $res .= "</select>";
    $res .= "</td>";
    $res .= "</tr>";

    // --- User agent / ignored url fields -----------------------------------
    $res .= "<tr>";
    $res .= "<td class='head'>User agent (optional)</td>";
    $res .= "<td><input class='editInputText' type='text' name='source_user_agent' id='source_user_agent' value='" . fi($this->getValue('user_agent', '')) . "'></td>";
    $res .= "</tr>";

    $res .= "<tr>";
    $res .= "<td class='head'>Ignored session id fields in url</td>";
    $res .= "<td><input class='editInputText' type='text' name='source_url_ignore_fields' id='source_url_ignore_fields' value='" . fi($this->getValue('url_ignore_fields', '')) . "'></td>";
    $res .= "</tr>";

    $res .= "<tr>";
    $res .= "<td class='head'>Ignored fields in url (others than session id)</td>";
    $res .= "<td><input class='editInputText' type='text' name='source_url_ignore_fields_no_session_id' id='source_url_ignore_fields_no_session_id' value='" . fi($this->getValue('url_ignore_fields_no_session_id', '')) . "'>";
    $res .= "<br><span class='help'>Use <strong>*</strong> in order to remove all parameters in urls. \nRegular expressions are allowed (for instance \"<strong>utm_.*</strong>\")</span>";
    $res .= "</td>";
    $res .= "</tr>";

    // --- Automatic HTML page cleaning --------------------------------------
    $cleaning = $this->getValue('automatic_cleaning', '');
    $cleaners = [
        0 => 'No',
        4 => 'Snacktory extractor',
        1 => 'Boilerpipe article extractor',
        2 => 'Boilerpipe default extractor',
        3 => 'Boilerpipe canola extractor',
    ];
    $res .= "<tr>";
    $res .= "<td class='head'>Automatic HTML page cleaning</td>";
    $res .= "<td>";
    $res .= "<select id='source_automatic_cleaning' name='source_automatic_cleaning' style='editInputSelect'>";
    foreach ($cleaners as $value => $label) {
        $value = (string) $value;
        $res .= "<option value='" . $value . "'";
        // Option 4 is also preselected when no value is stored yet.
        if ($cleaning == $value || ($value === '4' && $cleaning == "")) {
            $res .= " selected";
        }
        $res .= ">" . $label . "</option>";
    }
    $res .= "</select>";
    $res .= "<br /><br />Test cleaning this page:<br />";
    $res .= "<input class='editInputTextMedium2' type='text' name='test_url_cleaning' id='test_url_cleaning' value=''>";
    // The cell is closed once (a second, stray </td> used to be emitted here).
    $res .= "<input type='button' value='Test' onClick='testUrlCleaning();'>";
    $res .= "</td>";
    $res .= "</tr>";

    // --- Recrawl period ----------------------------------------------------
    // A period (in hours) is only offered when it is not shorter than the
    // configured "crawler.period" value.
    $param = intval($this->config->getDefault("crawler.period", "6"));
    $periodOptions = [0 => 'default', 999999 => 'on demand'];
    $periodChoices = [
        1 => '1 hour',
        3 => '3 hours',
        6 => '6 hours',
        12 => '12 hours',
        24 => '1 day',
        48 => '2 days',
        72 => '3 days',
        96 => '4 days',
        120 => '5 days',
        144 => '6 days',
        168 => '1 week',
        336 => '2 weeks',
        504 => '3 weeks',
        // NOTE(review): 4 weeks is 672 hours; 772 is kept because it is the
        // value this form has always submitted - confirm before changing.
        772 => '4 weeks',
    ];
    foreach ($periodChoices as $hours => $label) {
        if ($param <= $hours) {
            $periodOptions[$hours] = $label;
        }
    }
    $res .= "<tr>";
    $res .= "<td class='head'>Recrawl period</td>";
    $res .= "<td>";
    $res .= "<select id='crawl_minimal_period' name='crawl_minimal_period' style='editInputSelect'>";
    $res .= $this->renderAdvancedSelectOptions($periodOptions, $this->getValue('crawl_minimal_period', ''));
    $res .= "</select>";
    $res .= "<br><span class='help'><strong>Minimal</strong> period between to crawl";
    $res .= "<br>when the source crawl ends, the next crawl date is set based on this parameter</span>";
    $res .= "</td>";
    $res .= "</tr>";

    // --- Schedules ---------------------------------------------------------
    $schedule = $this->getValue('crawl_schedule', '');
    $res .= "<tr>";
    $res .= "<td class='head'>Schedules</td>";
    $res .= "<td>";
    $res .= "<div id='schedule'>";
    $res .= "</div>";
    // Escaped like the filtering rules inputs below: both values contain quotes/markup.
    $res .= "<input type='hidden' name='source_schedule' id='source_schedule' value='" . fi($this->buildAdvancedScheduleJson($schedule)) . "'>";
    $res .= "<input type='hidden' name='source_schedule_xml' id='source_schedule_xml' value='" . fi($schedule) . "'>";
    $res .= "<a href='javascript:addSchedule();'><img src='images/plus_12.png'> Add schedule</a>";
    $res .= "<br /><span class='help'>Define allowed and disallowed crawl time. \nSource crawl will pause and restart according to these schedules";
    $res .= "<br />This parameter requires MongoDB to be configured</span>";
    $res .= "</td>";
    $res .= "</tr>";

    // --- Crawl rules by url ------------------------------------------------
    $rules = $this->getValue('crawl_filtering_rules', '');
    $res .= "<tr>";
    $res .= "<td class='head'>Crawl rules by url</td>";
    $res .= "<td>";
    $res .= "<div id='rule'>";
    $res .= "</div>";
    $res .= "<input type ='hidden' name='source_crawl_filtering_rules' id='source_crawl_filtering_rules' value='" . fi($this->buildAdvancedFilteringRulesJson($rules)) . "'>";
    $res .= "<input type ='hidden' name='source_crawl_filtering_rules_xml' id='source_crawl_filtering_rules_xml' value='" . fi($rules) . "'>";
    $res .= "<a href='javascript:addRule();'><img src='images/plus_12.png'> Add rule</a>";
    $res .= "<br /><br />Test this URL :<br />";
    $res .= "<input class='editInputTextMedium2' type='text' name='test_url' id='test_url' value=''>";
    $res .= "<input type='button' value='Test' onClick='testFilteringRules();'><div id='filtering_rules_test_result'></div>";
    $res .= "</td>";
    $res .= "</tr>";

    // --- Metadata ----------------------------------------------------------
    $res .= "<tr>";
    $res .= "<td class='head'>Metadata</td>";
    $res .= "<td><textarea name='source_metadata' id='source_metadata' rows='6' cols='70' class='editInputTextarea'>" . fi($this->getValue('metadata', '')) . "</textarea>";
    $res .= "<br><span class='help'>These metadatas will be added into the output xml files</span>";
    $res .= "<br><span class='help'>Syntax:</span>";
    $res .= "<br><span class='help'>meta_name1:value1</span>";
    $res .= "<br><span class='help'>meta_name1:value2</span>";
    $res .= "<br><span class='help'>meta_name2:value3</span>";
    $res .= "</td>";
    $res .= "</tr>";

    // --- Authentication ----------------------------------------------------
    $res .= "<tr>";
    $res .= "<td class='head'>Authentication Mode</td>";
    $res .= "<td>";
    $res .= "<select id='auth_mode' name='auth_mode' style='editInputSelect'>";
    $res .= $this->renderAdvancedSelectOptions(
        [
            0 => 'None',
            1 => 'POST (form)',
            2 => 'GET (form)',
            3 => 'Basic (web server)',
        ],
        $this->getValue('auth_mode', '')
    );
    $res .= "</select>";
    $res .= "</td>";
    $res .= "</tr>";

    $res .= "<tr>";
    $res .= "<td class='head'>Authentication login</td>";
    $res .= "<td><input class='editInputTextMedium' type='text' name='auth_login' id='auth_login' value='" . fi($this->getValue('auth_login', '')) . "'>";
    $res .= "<input type='button' value='Test' onClick='testAuthentication();'>";
    $res .= "</td></tr>";

    $res .= "<tr>";
    $res .= "<td class='head'>Authentication password</td>";
    $res .= "<td><input class='editInputTextMedium' type='text' name='auth_passwd' id='auth_passwd' value='" . fi($this->getValue('auth_passwd', '')) . "'></td>";
    $res .= "</tr>";

    $res .= "<tr>";
    $res .= "<td class='head'>Authentication param</td>";
    $res .= "<td><input class='editInputText' type='text' name='auth_param' id='auth_param' value='" . fi($this->getValue('auth_param', '')) . "'>";
    // The dollar signs are escaped: unescaped, PHP interpolated $auth_login and
    // $auth_passwd (both undefined) and the placeholders vanished from the example.
    $res .= "<br><span class='help'>Example : http://www.server.com/login.asp|login=\$\$auth_login\$\$|password=\$\$auth_passwd\$\$</span></td>";
    $res .= "</tr>";

    // --- Comment / Source contact ------------------------------------------
    $res .= "<tr>";
    $res .= "<td class='head'>Comment</td>";
    $res .= "<td><textarea name='source_comment' id='source_comment' rows='6' cols='70' class='editInputTextarea'>" . fi($this->getValue('comment', '')) . "</textarea></td>";
    $res .= "</tr>";

    $res .= "<tr>";
    $res .= "<td class='head'>Source contact</td>";
    $res .= "<td><textarea name='source_contact' id='source_contact' rows='6' cols='70' class='editInputTextarea'>" . fi($this->getValue('contact', '')) . "</textarea></td>";
    $res .= "</tr>";

    return $res;
}

/**
 * Renders the <option> elements of a select box.
 *
 * @param array $options  Option value => visible label, in display order.
 * @param mixed $selected Stored value; an option is marked selected when it
 *                        is loosely equal to the option value as a string.
 *
 * @return string Concatenated <option> elements.
 */
private function renderAdvancedSelectOptions(array $options, $selected)
{
    $html = "";
    foreach ($options as $value => $label) {
        // Numeric array keys are integers; compare as strings so that an empty
        // stored value never matches option 0.
        $value = (string) $value;
        $html .= "<option value='" . $value . "'";
        if ($selected == $value) {
            $html .= " selected";
        }
        $html .= ">" . $label . "</option>";
    }

    return $html;
}

/**
 * Converts the stored schedule XML (/schedules/schedule nodes) to the JSON
 * text placed in the "source_schedule" hidden field.
 *
 * @param mixed $schedule Stored XML; a blank value yields an empty list.
 *
 * @return string JSON text of the form { "schedules": [...] }.
 */
private function buildAdvancedScheduleJson($schedule)
{
    $items = [];
    $trimmed = trim((string) $schedule);
    if (!empty($trimmed)) {
        $xml = simplexml_load_string($schedule);
        // Unparsable XML yields false; treat it as "no schedule" instead of failing on ->xpath().
        $nodes = ($xml !== false) ? $xml->xpath('/schedules/schedule') : false;
        if (is_array($nodes)) {
            foreach ($nodes as $node) {
                $items[] = '{ "day": "' . (string) $node->day . '", "start": "' . (string) $node->start . '", "stop": "' . (string) $node->stop . '", "enabled": "' . (string) $node->enabled . '" }';
            }
        }
    }

    return '{ "schedules": [' . implode(",", $items) . '] }';
}

/**
 * Converts the stored filtering rules to the JSON text placed in the
 * "source_crawl_filtering_rules" hidden field.
 *
 * Two stored formats are handled: XML (value starts with "<", rules under
 * /rules/rule) and a line based format with ":" separated fields where
 * fields 0, 1 and 3 are used as ope, mode and pat.
 *
 * @param mixed $rules Stored rules; a blank value yields an empty list.
 *
 * @return string JSON text of the form { "rules": [...] }.
 */
private function buildAdvancedFilteringRulesJson($rules)
{
    $rules = (string) $rules;
    $items = [];
    $trimmed = trim($rules);

    if (!empty($trimmed)) {
        if (substr($rules, 0, 1) == "<") {
            $xml = simplexml_load_string($rules);
            $nodes = ($xml !== false) ? $xml->xpath('/rules/rule') : false;
            if (is_array($nodes)) {
                foreach ($nodes as $node) {
                    $items[] = '{ "ope": "' . (string) $node->ope
                        . '", "mode": "' . (string) $node->mode
                        . '", "pat": "' . $this->escapeAdvancedJsonString((string) $node->pat)
                        . '", "meta": "' . $this->escapeAdvancedJsonString((string) $node->meta)
                        . '", "metap": "' . (string) $node->metap
                        . '", "ignoreparam": "' . $this->escapeAdvancedJsonString((string) $node->ignoreparam) . '" }';
                }
            }
        } else {
            foreach (explode("\n", $rules) as $line) {
                if ($line === "") {
                    continue;
                }
                // NOTE(review): a pattern that itself contains ":" is cut at the first
                // one by this split - confirm the legacy format before changing it.
                $fields = explode(":", $line);
                $mode = isset($fields[1]) ? $fields[1] : "";
                $pattern = isset($fields[3]) ? $fields[3] : "";
                // Same escaping as the XML branch. The previous preg_replace('/\\/', ...)
                // had no ending delimiter, returned NULL and blanked every pattern.
                $items[] = '{ "ope": "' . $fields[0]
                    . '", "mode": "' . $mode
                    . '", "pat": "' . $this->escapeAdvancedJsonString($pattern)
                    . '", "meta": "", "metap": "", "ignoreparam": "" }';
            }
        }
    }

    return '{ "rules": [' . implode(",", $items) . '] }';
}

/**
 * Trims a value and escapes its backslashes and double quotes so it can sit
 * inside a JSON string literal.
 *
 * @param string $value Raw value.
 *
 * @return string Trimmed, escaped value (without surrounding quotes).
 */
private function escapeAdvancedJsonString($value)
{
    // Backslashes first, otherwise the ones added for the quotes would be doubled.
    return trim(str_replace(['\\', '"'], ['\\\\', '\\"'], $value));
}