Example #1
0
 /**
  * Extract the list of laws and the number of listing pages from a downloaded listing page.
  *
  * @param $html    Markup of the listing page.
  * @param $status  Not used by this method.
  * @param $options Not used by this method.
  *
  * @return array ['laws' => [id => ['id' => ..., 'date' => ...], ...], 'page_count' => ...]
  */
 protected function process($html, $status, $options)
 {
     $listing = crawler($html);

     // The last pager link ends with the number of the final page; a page without a pager counts as one page.
     $pager_tail = $listing->filterXPath('//*[@id="page"]/div[2]/table/tbody/tr[1]/td[3]/div/div[2]/span/a[last()]');
     if ($pager_tail->count()) {
         $page_count = preg_replace('/(.*?)([0-9]+)$/u', '$2', $pager_tail->attr('href'));
     } else {
         $page_count = 1;
     }

     // One <li> per law: the link gives the id, the coloured <font> gives the raw date.
     $laws = [];
     $rows = $listing->filterXPath('//*[@id="page"]/div[2]/table/tbody/tr[1]/td[3]/div/dl/dd/ol/li');
     $rows->each(function (Crawler $row) use(&$laws) {
         $href = $row->filterXPath('//a')->attr('href');
         $law_id = preg_replace('|/laws/show/|u', '', shortURL($href));
         $date_text = $row->filterXPath('//font[@color="#004499"]')->text();
         $laws[$law_id] = [
             'id' => $law_id,
             'date' => $this->parseDate($date_text, "Date has not been found in #{$law_id} at text: " . $row->text()),
         ];
     });

     return ['laws' => $laws, 'page_count' => $page_count];
 }
<?php

require_once "../admin_conn.php";
require_once "collect_fun.php";
require_once "MovieType.php";
require_once "NBaidu.php";
require_once "tools/ContentManager.php";
require_once "collect_vod.php";
require_once "../score/AutoDouBanParseScore.php";
// Host name of the current request; it is not referenced again in the visible part of this script.
$host = $_SERVER['HTTP_HOST'];
// Crontab selector obtained through be() (defined elsewhere) - presumably read from the request; TODO confirm.
$crontab = be("all", "crontab");
// Run the crawl pass for that selector.
crawler($crontab);
// Stop the script here with exit status -1.
exit(-1);
/**
 * Run one crawl pass for the given crontab selector, then hand over to collect().
 *
 * Selects the rows of mac_cj_zhuiju with status=0 whose crontab_desc contains
 * $crontab (grouped and ordered by m_urltest), passes the result to
 * parseVodPad() and logs every step to "crawler_collect.sql".
 *
 * NOTE(review): $crontab is concatenated into the SQL text unescaped; if it can
 * come from a request this is an injection risk - confirm against the $db API.
 *
 * @param $crontab Text matched against crontab_desc with LIKE '%...%'.
 */
function crawler($crontab)
{
    global $db;
    $log_file = "crawler_collect.sql";

    writetofile($log_file, "crawler start: crontab: {$crontab}");

    $query = "SELECT * FROM mac_cj_zhuiju  where status=0 and crontab_desc like'%{$crontab}%' GROUP BY m_urltest order by m_urltest ";
    writetofile($log_file, "crawler start: sql: {$query}");

    $rows = $db->query($query);
    parseVodPad($rows);
    // Drop the result handle before the collect phase starts.
    unset($rows);

    writetofile($log_file, 'crawler stop.');
    collect($crontab);
}
function collect($crontab)
{
    writetofile("crawler_collect.sql", 'collect start.');
    global $db;
    $time = date("Y-m-d");
    $count = $db->getOne("SELECT count(*) FROM mac_cj_zhuiju  where status=0 and crontab_desc like'%" . $crontab . "%'  ");
Example #3
0
                if (!in_array($product->getAttribute('href'), $productLinksArray)) {
                    array_push($productLinksArray, $product->getAttribute('href'));
                }
            }
        }
    }
    //var_dump($productLinksArray);
} catch (Exception $e) {
    echo $e->getMessage();
}
/***** Get each product details *****/
try {
    $catalog = new Catalog("Ikea");
    for ($i = 0; $i < sizeof($productLinksArray); $i++) {
        $seen = array();
        $htmlProductDetails = crawler("http://www.ikea.com" . $productLinksArray[$i]);
        $dom = new DOMDocument();
        @$dom->loadHTML($htmlProductDetails);
        $map = array();
        foreach ($dom->getElementsByTagName('div') as $div) {
            if ($div->getAttribute('class') == "productName") {
                $name = preg_replace("/&#?[a-z0-9]{2,8};/i", "", trim($div->nodeValue));
                if (!in_array($name, $seen)) {
                    array_push($seen, $name);
                    $map["name"] = $name;
                    echo $name . "\n";
                }
            }
            if ($div->getAttribute('id') == "custMaterials") {
                $description = preg_replace("/&#?[a-z0-9]{2,8};/i", "", trim($div->nodeValue));
                if (!in_array($description, $seen)) {
Example #4
0
 /**
  * Parse the issuers, law types and states lists from their listing
  * ( http://zakon.rada.gov.ua/laws/stru/a ) and store them through
  * setIssuers(), setTypes() and setStates().
  *
  * @param bool $re_download Passed to download() as the 're_download' option.
  */
 public function parse($re_download)
 {
     $response = download('/laws/stru/a', ['re_download' => $re_download]);
     $listing = crawler($response['html']);

     // Every list lives in the same place on the page; only the index of the outer table differs.
     $rows_xpath = function ($table_index) {
         return '//*[@id="page"]/div[2]/table/tbody/tr[1]/td[3]/div/div[2]/table[' . $table_index . ']/tbody/tr/td/table/tbody/tr';
     };

     // Tables 1 and 2 hold the domestic and the international issuers.
     $issuers = [];
     foreach ([1, 2] as $table_index) {
         $group = null;
         $listing->filterXPath($rows_xpath($table_index))->each(function ($row) use(&$issuers, &$group, $table_index) {
             $cells = $row->filterXPath('//td');
             $cell_count = $cells->count();

             // A single-cell row is a group heading that applies to the issuer rows after it.
             if ($cell_count == 1) {
                 $heading = better_trim($cells->text());
                 if ($heading) {
                     $group = $heading;
                 }
                 return;
             }
             if ($cell_count != 4) {
                 return;
             }

             $link = $row->filterXPath('//td[2]/a');
             $issuer = new \stdClass();
             $issuer->url = $link->attr('href');
             $issuer->id = str_replace('/laws/main/', '', $issuer->url);
             $issuer->group_name = $group;
             $issuer->name = better_trim($link->filterXPath('//b')->text());
             $issuer->full_name = preg_replace('|<b>.*?</b> *|u', '', $link->filterXPath('//font')->html());
             if ($issuer->full_name) {
                 // Strip the surrounding parentheses, then swap: the text outside <b> becomes the
                 // name and the <b> text becomes the full name.
                 $unwrapped = preg_replace('|^\\((.*?)\\)$|u', '$1', $issuer->full_name);
                 $issuer->full_name = $issuer->name;
                 $issuer->name = $unwrapped;
             }
             if ($link->count() == 2) {
                 $issuer->website = $link->last()->attr('href');
             } else {
                 $issuer->website = null;
             }
             $issuer->international = $table_index - 1;
             $issuers[$issuer->name] = $issuer;
         });
     }
     $this->setIssuers($issuers);

     // Law types and states share one row layout: four cells with the link in the second one.
     $collect_links = function ($table_index) use($listing, $rows_xpath) {
         $items = [];
         $listing->filterXPath($rows_xpath($table_index))->each(function ($row) use(&$items) {
             if ($row->filterXPath('//td')->count() != 4) {
                 return;
             }
             $link = $row->filterXPath('//td[2]/a');
             $item = new \stdClass();
             $item->url = $link->attr('href');
             $item->id = str_replace('/laws/main/', '', $item->url);
             $item->name = better_trim($link->text());
             $items[$item->name] = $item;
         });
         return $items;
     };

     $this->setTypes($collect_links(3));
     $this->setStates($collect_links(5));
 }
/**
 * Crawl starting from http://www.baidu.com, using "url.txt" as the URL queue.
 *
 * Every URL returned by crawler() is appended to the file through one handle,
 * while a second handle reads the file line by line to obtain the next URL to
 * visit. The loop ends when the reader reaches the end of the file.
 *
 * @throws \RuntimeException When "url.txt" cannot be opened.
 */
function main()
{
    $queue_file = "url.txt";

    // Start from an empty queue; skipping the unlink when the file is absent avoids a warning on a first run.
    if (file_exists($queue_file)) {
        unlink($queue_file);
    }

    $current_url = 'http://www.baidu.com';

    $fp_puts = fopen($queue_file, "a");
    if ($fp_puts === false) {
        throw new \RuntimeException("Cannot open {$queue_file} for appending.");
    }
    $fp_gets = fopen($queue_file, "r");
    if ($fp_gets === false) {
        fclose($fp_puts);
        throw new \RuntimeException("Cannot open {$queue_file} for reading.");
    }

    try {
        $id = 1;
        do {
            // Lines read back from the queue still carry their "\r\n" terminator.
            $current_url = trim($current_url);
            $result_url_arr = crawler($id, $current_url);
            if ($result_url_arr) {
                foreach ($result_url_arr as $url) {
                    fputs($fp_puts, $url . "\r\n");
                }
            }
            $id++;
            // Compare against false so that only end-of-file stops the loop, not a falsy line such as "0".
        } while (($current_url = fgets($fp_gets, 1024)) !== false);
    } finally {
        // Release both handles even when crawler() throws.
        fclose($fp_gets);
        fclose($fp_puts);
    }
}
 /**
  * Determine the revision date of a downloaded law text.
  *
  * When the markup contains 'txt txt-old' the given default date is returned
  * unchanged. Otherwise the date is taken from the " від dd.mm.yyyy" fragment
  * of the title: the first <h3> when the markup contains <div id="article">,
  * the element with id "pan_title" otherwise.
  *
  * @param $html         Markup of the downloaded document.
  * @param $default_date Value returned as is for 'txt txt-old' documents.
  * @param $url          Only used in the exception message.
  *
  * @return bool|string $default_date, the literal '??.??.????', or the result of parseDate().
  * @throws Exceptions\WrongDateException When no date is found or any step of the extraction fails.
  */
 public function getRevisionDate($html, $default_date, $url)
 {
     if (strpos($html, 'txt txt-old') !== false) {
         return $default_date;
     }

     try {
         // OpenData downloaded document. Compare against false explicitly: strpos() returns 0
         // (falsy) when the marker is at the very beginning of the markup.
         if (strpos($html, '<div id="article">') !== false) {
             $title_text = crawler($html)->filterXPath('//h3[1]')->text();
         } else {
             $title_text = crawler($html)->filterXPath('//div[@id="pan_title"]')->text();
         }

         if (!preg_match('| від ([0-9\\?]{2}\\.[0-9\\?]{2}\\.[0-9\\?]{4})|u', $title_text, $matches)) {
             throw new Exceptions\WrongDateException("Revision date has not been found in text of {$url}");
         }

         $raw_date = $matches[1];
         // A fully unknown date is passed through verbatim instead of being parsed.
         if ($raw_date == '??.??.????') {
             $revision_date = $raw_date;
         } else {
             $revision_date = $this->parseDate($raw_date);
         }
     } catch (\Exception $e) {
         // Every failure (missing node, unparsable date, no match) is reported the same way.
         throw new Exceptions\WrongDateException("Revision date has not been found in text of {$url}");
     }

     return $revision_date;
 }
Example #7
0
    $pid = pcntl_fork();
    set_time_limit(0);

    if ($pid == -1) {
         die("could not fork\n");
    }
    elseif ($pid) {
         //echo "parent pid is " . posix_getpid() . "\n";
    }
    else {
         //echo "child pid is " . posix_getpid() . "\n";
         sleep(1);
         crawler();
    }
}*/
// Run the crawler once in the current process (the block that ends just above is commented out).
crawler();
function crawler()
{
    $proxyObj = new proxy();
    $mysqli = new mysqli('10.168.45.191', 'admin', 'txg19831210', 'crawler');
    $mysqli->query('SET NAMES gbk');
    //for (;;) {
    $hour = date('G');
    $current = time();
    //$sql = "SELECT * FROM keyword WHERE status = 'active' AND clicked_times < times AND ((last_click_time + click_interval) < {$current}) AND ((path1_page < 5 AND path1_page > 0) OR (path2_page < 5 AND path2_page > 0) OR (path3_page < 5 AND path3_page > 0)) ORDER BY last_click_time ASC LIMIT 1";
    $sql = "SELECT * FROM keyword WHERE id = 13 LIMIT 1";
    $result = $mysqli->query($sql);
    $data = array();
    if ($result) {
        $obj = $result->fetch_object();
        $result->close();
Example #8
0
/**
 * Main routine used for testing.
 *
 * Crawls starting from http://hao123.com, using "url.txt" as the URL queue:
 * every URL returned by crawler() is appended to the file through one handle,
 * while a second handle reads the file line by line to obtain the next URL.
 * The loop ends when the reader reaches the end of the file.
 *
 * @throws \RuntimeException When "url.txt" cannot be opened.
 */
function main()
{
    // Initial URL.
    $current_url = "http://hao123.com";

    // Handle used to append newly found URLs to the list.
    $fp_puts = fopen("url.txt", "ab");
    if ($fp_puts === false) {
        throw new \RuntimeException("Cannot open url.txt for appending.");
    }
    // Handle used to read the URL list back.
    $fp_gets = fopen("url.txt", "r");
    if ($fp_gets === false) {
        fclose($fp_puts);
        throw new \RuntimeException("Cannot open url.txt for reading.");
    }

    try {
        do {
            // Lines read back from the list still carry their "\r\n" terminator; strip it before crawling.
            $current_url = trim($current_url);
            $result_url_arr = crawler($current_url);
            if ($result_url_arr) {
                foreach ($result_url_arr as $url) {
                    fputs($fp_puts, $url . "\r\n");
                }
            }
            // Keep fetching URLs until end-of-file; compare against false so a falsy line does not stop the loop.
        } while (($current_url = fgets($fp_gets, 1024)) !== false);
    } finally {
        // Release both handles even when crawler() throws.
        fclose($fp_gets);
        fclose($fp_puts);
    }
}
Example #9
0
 /**
  * Extract data from the downloaded law card.
  *
  * Fills the result with: 'card' (markup of the .txt element), 'meta' (fields of
  * the "Картка документа" list), 'date', 'title', 'has_text', 'revisions',
  * 'active_revision' and, when $options['check_related'] is truthy and the card
  * links to changed documents, 'changes_laws'.
  *
  * @param $html    Markup of the card page.
  * @param $status  Not used by this method.
  * @param $options Must contain 'law_id'; may contain 'check_related'.
  *
  * @return array
  * @throws Exceptions\DocumentHasErrors When the law date is missing, or the card has text but no
  *                                      active revision can be determined.
  */
 protected function process($html, $status, $options)
 {
     $law_id = $options['law_id'];
     $data = [];
     $crawler = crawler($html)->filter('.txt');
     $data['card'] = $crawler->html();

     // Card fields: each <dt> opens a field, the <dd> elements after it are the field's values.
     $data['meta'] = [];
     $last_field = null;
     $crawler->filterXPath('//h2[text()="Картка документа"]/following-sibling::dl[1]')->children()->each(function (Crawler $node) use(&$data, &$last_field, $law_id) {
         if ($node->getNode(0)->tagName == 'dt') {
             $last_field = rtrim($node->text(), ':');
             $data['meta'][$last_field] = [];
         } elseif ($node->getNode(0)->tagName == 'dd') {
             if ($last_field == 'Дати') {
                 $data['date'] = $this->parseDate($node->filterXPath('//font')->text(), "Law date is not valid in card of '{$law_id}'");
             }
             $data['meta'][$last_field][] = $node->text();
         }
     });
     if (!isset($data['date'])) {
         throw new Exceptions\DocumentHasErrors("Law date is missing in '{$law_id}'");
     }

     $data['title'] = $crawler->filterXPath('//h1')->html();
     $data['title'] = str_replace(' <img src="http://zakonst.rada.gov.ua/images/fav1.gif" title="Популярний">', '', $data['title']);
     $data['has_text'] = strpos($html, 'Текст відсутній') === false && strpos($html, 'Текст документа') !== false;

     // Revision history: each <dt> starts a revision, the following <dd> elements are its comments.
     $data['revisions'] = [];
     $last_revision = null;
     $data['active_revision'] = null;
     $crawler->filterXPath('//h2[contains(text(), "Історія документа")]/following-sibling::dl[1]')->children()->each(function (Crawler $node) use(&$data, &$last_revision, $law_id) {
         if ($node->getNode(0)->tagName == 'dt') {
             $raw_date = $node->filterXPath('//span[@style="color: #004499" or @style="color: #006600"]')->text();
             $date = $this->parseDate($raw_date, "Revision date '{$raw_date}' is not valid in card of '{$law_id}'");
             $last_revision = count($data['revisions']);
             $data['revisions'][] = ['law_id' => $law_id, 'date' => $date, 'comment' => []];
             if (!$node->filter('a')->count()) {
                 $data['revisions'][$last_revision]['no_text'] = true;
             }
             if (str_contains($node->text(), 'поточна редакція')) {
                 $data['active_revision'] = $data['revisions'][$last_revision]['date'];
             }
         } elseif ($node->getNode(0)->tagName == 'dd') {
             $comment = $node->html();
             if (strpos($comment, '<a name="Current"></a>') !== false) {
                 $data['active_revision'] = $data['revisions'][$last_revision]['date'];
             }
             $comment = str_replace('<a name="Current"></a>', '', $comment);
             $comment = preg_replace('|<u>(.*?)</u>|u', '$1', $comment);
             $data['revisions'][$last_revision]['comment'][] = $comment;
         }
     });

     // Collapse each revision's list of comments into a single string.
     foreach ($data['revisions'] as &$revision) {
         $revision['comment'] = implode("\n", $revision['comment']);
     }
     // Break the reference so the last revision is not left aliased to $revision.
     unset($revision);

     // No active revision in the card although the law has text: take the date from the text page.
     if (!$data['active_revision'] && $data['has_text']) {
         $sub_url = '/laws/show/' . $law_id;
         // NOTE(review): the original built a copy of $options with 'url' set to $sub_url but still
         // passed the unmodified $options; that call is kept as it was - confirm which was intended.
         // $data is captured by reference so the callback's result reaches the returned array, and
         // $law_id is captured so the error message can name the law.
         $this->download($sub_url, $options, function ($html, $status, $options) use(&$data, $law_id) {
             $d = app()->make('lawgrabber.revision_downloader');
             try {
                 $data['active_revision'] = $d->getRevisionDate($html, '', '');
             } catch (\Exception $e) {
                 throw new Exceptions\DocumentHasErrors("Card has text, but no revisions in '{$law_id}'");
             }
         });
     }

     // Optionally follow the "changes document..." link and merge the laws from all of its pages.
     if (isset($options['check_related']) && $options['check_related']) {
         $changes_link = $crawler->filterXPath('//h2[contains(text(), "Пов\'язані документи")]/following-sibling::dl[1]/*/a/font[text()="Змінює документ..."]/..');
         if ($changes_link->count()) {
             $list = $this->downloadList($changes_link->attr('href'));
             $data['changes_laws'] = $list['laws'];
             for ($i = 2; $i <= $list['page_count']; $i++) {
                 $list = $this->downloadList($changes_link->attr('href') . '/page' . $i);
                 $data['changes_laws'] += $list['laws'];
             }
         }
     }

     return $data;
 }
Example #10
0
 /**
  * Reorganise the downloaded law files under DOWNLOADS_PATH . 'zakon.rada.gov.ua/laws/'.
  *
  * 1. Removes every show/* and show/*\/* directory that has neither sub-directories nor a card.html.
  * 2. For every remaining card.html: when the card names a current revision, merges the law's
  *    text.html or page.html/pageN.html files into show/<law_id>/ed<Ymd>/page.html; then moves
  *    the card to card/<law_id>.html and removes directories that have become empty.
  *
  * Declares the helper functions rrmdir() and is_dir_empty() on first use.
  *
  * @throws \Exception When a card's current revision is not a valid dd.mm.yyyy date.
  */
 function move_files()
 {
     // The helpers are declared conditionally so that calling move_files() more than once does not
     // fail with a "cannot redeclare" fatal error.
     if (!function_exists(__NAMESPACE__ . '\\rrmdir')) {
         /** Recursively delete a directory with everything in it; does nothing for a non-directory. */
         function rrmdir($dir)
         {
             if (is_dir($dir)) {
                 $objects = scandir($dir);
                 foreach ($objects as $object) {
                     if ($object != "." && $object != "..") {
                         if (filetype($dir . "/" . $object) == "dir") {
                             rrmdir($dir . "/" . $object);
                         } else {
                             unlink($dir . "/" . $object);
                         }
                     }
                 }
                 rmdir($dir);
             }
         }
     }
     if (!function_exists(__NAMESPACE__ . '\\is_dir_empty')) {
         /** True when the directory holds only "." and ".."; NULL when it is not readable. */
         function is_dir_empty($dir)
         {
             if (!is_readable($dir)) {
                 return NULL;
             }
             return count(scandir($dir)) == 2;
         }
     }

     $base_laws = DOWNLOADS_PATH . 'zakon.rada.gov.ua/laws/';

     // glob() may return false instead of an empty array; normalise before merging.
     $dirs = array_merge(
         glob($base_laws . 'show/*/*', GLOB_ONLYDIR | GLOB_MARK) ?: [],
         glob($base_laws . 'show/*', GLOB_ONLYDIR | GLOB_MARK) ?: []
     );
     foreach ($dirs as $dir) {
         // Keep directories that contain sub-directories or a card (GLOB_MARK left a trailing slash on $dir).
         if (glob($dir . '*', GLOB_ONLYDIR)) {
             continue;
         }
         if (glob($dir . 'card.html')) {
             continue;
         }
         rrmdir($dir);
     }

     $files = array_merge(
         glob($base_laws . 'show/*/*/card.html') ?: [],
         glob($base_laws . 'show/*/card.html') ?: []
     );
     // The base path is quoted so that characters such as "." or "|" in it are matched literally.
     $law_id_pattern = '|' . preg_quote($base_laws . 'show/', '|') . '(.*?)/card\\.html|';
     foreach ($files as $file) {
         $law_id = preg_replace($law_id_pattern, '$1', $file);
         $new_name_card = $base_laws . 'card/' . $law_id . '.html';

         // The current revision is the date of the <dt> directly followed by the "Current" anchor.
         preg_match('|<span style="color: #.*?">(.*?)</span>(?:</b></a>\\n<img src="http://zakonst.rada.gov.ua/images/docs.gif" title="Документ"> <span class="num" style="color:#999999">поточна редакція, .*?, <a href=".*">перейти »</a></span>)?</dt>\\n<dd><a name="Current">|', file_get_contents($file), $matches);
         $revision = isset($matches[1]) ? $matches[1] : null;
         if ($revision) {
             if (!preg_match('|[0-9]{2}\\.[0-9]{2}\\.[0-9]{4}|', $revision) || !date_create_from_format('d.m.Y', $revision)) {
                 throw new \Exception("Revision has not been found in #{$file}.");
             }
             $date = date_format(date_create_from_format('d.m.Y', $revision), 'Ymd');
             $new_name_text = $base_laws . 'show/' . $law_id . '/ed' . $date . '/page.html';
             $old_files = [];
             $text = '';
             if (file_exists($base_laws . 'show/' . $law_id . '/text.html')) {
                 $text_file_name = $base_laws . 'show/' . $law_id . '/text.html';
                 $old_files[] = $text_file_name;
                 $text = crawler(file_get_contents($text_file_name))->filter('.txt')->html();
             } elseif (file_exists($base_laws . 'show/' . $law_id . '/page.html')) {
                 $text_file_name = $base_laws . 'show/' . $law_id . '/page.html';
                 $old_files[] = $text_file_name;
                 $page = crawler(file_get_contents($text_file_name));
                 $text = $page->filter('.txt')->html();
                 // The pager of the first page tells how many pageN.html files belong to the text.
                 $pager = $page->filterXPath('(//span[@class="nums"])[1]/br/preceding-sibling::a[1]');
                 $page_count = $pager->count() ? $pager->text() : 1;
                 for ($i = 2; $i <= $page_count; $i++) {
                     $text_file_name = $base_laws . 'show/' . $law_id . '/page' . $i . '.html';
                     if (!file_exists($text_file_name)) {
                         // A page is missing: discard the incomplete text and the pages read so far.
                         $text = '';
                         foreach ($old_files as $file_name) {
                             unlink($file_name);
                         }
                         // Forget the removed files so they are not unlinked a second time below.
                         $old_files = [];
                         break;
                     }
                     $old_files[] = $text_file_name;
                     $page = crawler(file_get_contents($text_file_name));
                     $text .= $page->filter('.txt')->html();
                 }
             }
             if (file_exists($new_name_text)) {
                 // The revision file already exists: the old files are redundant.
                 foreach ($old_files as $file_name) {
                     unlink($file_name);
                 }
             } elseif ($text) {
                 $new_dir = dirname($new_name_text);
                 if (!is_dir($new_dir)) {
                     mkdir($new_dir, 0777, true);
                 }
                 file_put_contents($new_name_text, '<html><body><div class="txt txt-old">' . $text . '</div></body></html>');
                 // Carry over the modification time of the first source file.
                 touch($new_name_text, filemtime($old_files[0]));
                 foreach ($old_files as $file_name) {
                     unlink($file_name);
                 }
             }
         }

         // Move the card to its new location, or drop it when that location is already taken.
         if (file_exists($new_name_card)) {
             unlink($file);
         } else {
             $new_dir = dirname($new_name_card);
             if (!is_dir($new_dir)) {
                 mkdir($new_dir, 0777, true);
             }
             rename($file, $new_name_card);
         }

         // Clean up the law directory and its parent once they are empty.
         if (is_dir_empty($base_laws . 'show/' . $law_id)) {
             rrmdir($base_laws . 'show/' . $law_id);
             $parent = dirname($base_laws . 'show/' . $law_id);
             if (is_dir_empty($parent)) {
                 rrmdir($parent);
             }
         }
     }
 }