/**
 * Load the metadata of a page and submit it as a topic to the data service.
 *
 * Steps, in order:
 *  1. Fetch the page metas with curl_loadLink(), forwarding the default values.
 *  2. Stop if no metas came back (the global $error is echoed) or if the title
 *     contains one of the disallowed fragments listed below.
 *  3. Submit the topic through the global $ds, then save the concatenation of
 *     body, description, title and site name with saveBody().
 *  4. Attach the new thread to the global $categoryid and, when the metas have
 *     an author and the first attach succeeded, to $metas['author_categoryid'].
 *
 * Progress is written to the global $logService. Exceptions thrown while
 * submitting are logged and dumped; they are not rethrown.
 *
 * @param string $url                 Page to crawl; also passed to submitTopic() as its last argument.
 * @param mixed  $default_date        Forwarded unchanged to curl_loadLink().
 * @param mixed  $default_description Forwarded unchanged to curl_loadLink().
 * @param mixed  $default_language    Forwarded unchanged to curl_loadLink().
 * @param mixed  $default_title       Forwarded unchanged to curl_loadLink().
 * @param mixed  $default_site_name   Forwarded unchanged to curl_loadLink().
 * @param mixed  $nocrawl             Not used by this function; kept so existing callers keep working.
 *
 * @return void
 */
function crawl($url, $default_date, $default_description, $default_language, $default_title, $default_site_name, $nocrawl)
{
    global $ds;
    global $categoryid;
    global $logService;
    global $error;

    echo "<br>crawl " . $url;

    $metas = curl_loadLink($url, $default_date, $default_description, $default_language, $default_title, $default_site_name);
    $logService->log('DEBUG D4CRAWLER', 'returned 1 with metas ', $url, 'crawler');

    if (!$metas) {
        $logService->log('DEBUG D4Crawler', 'No metas return', '', 'd4crawler');
        echo $error;
        return;
    }

    // Titles containing any of these fragments (case-insensitive) are not submitted.
    $disallowedTitleFragments = array(
        "Archive",
        "Access Denied",
        "Print",
        "Page not found",
        "Articles: Breitbart",
        "Articles - Breitbart",
        "Hearst Magazines",
    );

    // A missing title is treated as an empty one, so it never matches a fragment.
    $title = isset($metas['title']) ? $metas['title'] : '';
    foreach ($disallowedTitleFragments as $fragment) {
        if (stripos($title, $fragment) !== false) {
            $logService->log('DEBUG D4Crawler', 'Disallowd metas', var_log($metas, '$metas'), 'd4crawler');
            return;
        }
    }

    $logService->log('DEBUG D4CRAWLER', 'returned with metas ', $url, 'crawler');

    $identity = "00e98e251067b27107189bd7c8316ba2"; // Hudson Wilde
    $approved = 1;
    $text = '';
    $threadid = '';

    try {
        $threadid = $ds->submitTopic(
            $metas['link'],
            $metas['title'],
            $metas['image'],
            $metas['site_name'],
            $metas['description'],
            $text,
            $metas['locale'],
            $identity,
            $metas['published_time'],
            $metas['updated_time'],
            $metas['author'],
            1,
            0,
            $url
        );
        $logService->log('DEBUG', 'Returned from submit topic', var_log($metas, '$metas'), 'crawler');

        // Only store a body when the submit actually produced a thread id.
        if ($threadid) {
            $ds->saveBody($threadid, $metas['body'] . $metas['description'] . $metas['title'] . $metas['site_name']);
        }
    } catch (Exception $x) {
        $logService->log('DEBUG', 'Exception', var_log($x), 'd4crawler');
        var_dump($x);
    }

    // Without a thread id there is nothing to categorise.
    if (!$threadid) {
        return;
    }

    try {
        $logService->log('DEBUG', 'SubmitCategory threadid=' . $threadid . ',categoryid=' . $categoryid, $url, 'crawler');
        $success = $ds->submitTopicCategory($categoryid, $approved, $threadid, $identity, 0);

        if ($metas['author'] && $success) {
            sleep(1); // to change the shared_time and avoid dups in views
            $ds->submitTopicCategory($metas['author_categoryid'], $approved, $threadid, $identity, 0);
        }
    } catch (Exception $x) {
        var_dump($x);
        $logService->log('DEBUG Exception', 'Submitting topic category', var_log($x), 'd4crawler');
    }
}
/**
 * Fetch the metas for a link and echo them as a JSON document.
 *
 * The JSON object always has "success" set to true. When curl_loadLink()
 * yields a truthy result, "total" is the element count of that result and
 * "metas" is the result itself; otherwise "total" is the string "0" and
 * "metas" is an empty array.
 *
 * @param mixed $link Passed as the only argument to curl_loadLink().
 *
 * @return void
 */
function loadLink($link)
{
    // Remove the execution time limit for the fetch, then set it to 60 seconds.
    set_time_limit(0);
    $fetched = curl_loadLink($link);
    set_time_limit(60);

    if ($fetched) {
        $payload = array(
            "success" => true,
            "total" => count($fetched),
            "metas" => $fetched,
        );
    } else {
        $payload = array(
            "success" => true,
            "total" => "0",
            "metas" => array(),
        );
    }

    echo json_encode($payload);
}