Exemplo n.º 1
0
/**
 * Load the metadata for a single URL and, unless its title is blocklisted,
 * submit it as a topic and attach it to the current category.
 *
 * Steps, as visible in this function:
 *  1. Fetch metadata via curl_loadLink(); on a falsy result log it, echo the
 *     global $error and stop.
 *  2. Skip pages whose title contains one of the blocked fragments
 *     (case-insensitive substring match).
 *  3. Call $ds->submitTopic(); when it yields a thread id, store a body made of
 *     body + description + title + site_name via $ds->saveBody().
 *  4. Attach the thread to the global $categoryid and, when the page has an
 *     author with a category id, to that author's category as well.
 *
 * Exceptions thrown by the $ds calls are logged and dumped, never rethrown.
 * The function returns nothing.
 *
 * Relies on the globals $ds, $logService, $categoryid and $error.
 * NOTE(review): their concrete types are not visible from here — presumably a
 * data-service object and a logger; confirm against the bootstrap code.
 *
 * @param string $url                 Page to load.
 * @param mixed  $default_date        Forwarded unchanged to curl_loadLink().
 * @param mixed  $default_description Forwarded unchanged to curl_loadLink().
 * @param mixed  $default_language    Forwarded unchanged to curl_loadLink().
 * @param mixed  $default_title       Forwarded unchanged to curl_loadLink().
 * @param mixed  $default_site_name   Forwarded unchanged to curl_loadLink().
 * @param mixed  $nocrawl             Not used by this function; kept so existing callers keep working.
 */
function crawl($url, $default_date, $default_description, $default_language, $default_title, $default_site_name, $nocrawl)
{
    global $ds;
    global $categoryid;
    global $logService;
    global $error;

    echo "<br>crawl " . $url;

    $metas = curl_loadLink($url, $default_date, $default_description, $default_language, $default_title, $default_site_name);
    $logService->log('DEBUG D4CRAWLER', 'returned 1 with metas ', $url, 'crawler');

    if (!$metas) {
        $logService->log('DEBUG D4Crawler', 'No metas return', '', 'd4crawler');
        echo $error;
        return;
    }

    // Titles containing any of these fragments are never submitted.
    $blockedTitleFragments = array(
        "Archive",
        "Access Denied",
        "Print",
        "Page not found",
        "Articles: Breitbart",
        "Articles - Breitbart",
        "Hearst Magazines",
    );
    // A missing title is treated as an empty string so stripos() never receives null.
    $title = isset($metas['title']) ? (string) $metas['title'] : '';
    foreach ($blockedTitleFragments as $fragment) {
        if (stripos($title, $fragment) !== false) {
            $logService->log('DEBUG D4Crawler', 'Disallowd metas', var_log($metas, '$metas'), 'd4crawler');
            return;
        }
    }

    $logService->log('DEBUG D4CRAWLER', 'returned with metas ', $url, 'crawler');

    $text = '';
    // Identity the topic is submitted under (original annotation: "Hudson Wilde").
    $identity = "00e98e251067b27107189bd7c8316ba2";
    $approved = 1;
    $threadid = '';

    try {
        $threadid = $ds->submitTopic($metas['link'], $metas['title'], $metas['image'], $metas['site_name'], $metas['description'], $text, $metas['locale'], $identity, $metas['published_time'], $metas['updated_time'], $metas['author'], 1, 0, $url);
        $logService->log('DEBUG', 'Returned from submit topic', var_log($metas, '$metas'), 'crawler');
        // Only store a body when a thread id was actually returned; without one
        // there is nothing to attach the body to.
        if ($threadid) {
            $ds->saveBody($threadid, $metas['body'] . $metas['description'] . $metas['title'] . $metas['site_name']);
        }
    } catch (Exception $x) {
        $logService->log('DEBUG', 'Exception', var_log($x), 'd4crawler');
        // var_dump() prints by itself and returns nothing, so no echo is needed.
        var_dump($x);
    }

    if (!$threadid) {
        return;
    }

    try {
        $logService->log('DEBUG', 'SubmitCategory threadid=' . $threadid . ',categoryid=' . $categoryid, $url, 'crawler');
        $success = $ds->submitTopicCategory($categoryid, $approved, $threadid, $identity, 0);
        // The author category is optional: skip it when the metadata carries no
        // author or no author category id instead of submitting an empty category.
        if ($success && !empty($metas['author']) && !empty($metas['author_categoryid'])) {
            sleep(1);
            // to change the shared_time and avoid dups in views
            $ds->submitTopicCategory($metas['author_categoryid'], $approved, $threadid, $identity, 0);
        }
    } catch (Exception $x) {
        var_dump($x);
        $logService->log('DEBUG Exception', 'Submitting topic category', var_log($x), 'd4crawler');
    }
}
Exemplo n.º 2
0
/**
 * Load the metadata for a link and echo it as a JSON document.
 *
 * The response always has the keys "success" (always true), "total" and
 * "metas". When curl_loadLink() returns a value that loosely equals false
 * (including an empty array), "total" is the string "0" and "metas" is an
 * empty array; otherwise "total" is the element count and "metas" is the
 * returned value.
 *
 * NOTE(review): curl_loadLink() is called with a single argument here but with
 * six elsewhere in this file — presumably the remaining parameters have
 * defaults; confirm against its definition.
 *
 * @param string $link Link to load.
 */
function loadLink($link)
{
    // Remove the execution time limit while fetching, then set a 60-second limit.
    set_time_limit(0);
    $metas = curl_loadLink($link);
    set_time_limit(60);

    // Start from the "nothing found" response and fill it in on a usable result.
    $response = array("success" => true, "total" => "0", "metas" => array());
    if ($metas != false) {
        $response["total"] = count($metas);
        $response["metas"] = $metas;
    }

    echo json_encode($response);
}