/**
 * Extract the law entries and the pager size from a downloaded listing page.
 *
 * @param string $html    Raw HTML of the listing page.
 * @param mixed  $status  Not used by this method.
 * @param mixed  $options Not used by this method.
 *
 * @return array 'laws' maps each law id to ['id' => ..., 'date' => ...];
 *               'page_count' is the number taken from the last pager link, or 1.
 */
protected function process($html, $status, $options)
{
    $data = [];
    $data['laws'] = [];
    $page = crawler($html);

    // The last pager link ends with the number of the last page. Without a pager
    // (or when the link carries no trailing number) the listing is a single page.
    $data['page_count'] = 1;
    $last_pager_link = $page->filterXPath('//*[@id="page"]/div[2]/table/tbody/tr[1]/td[3]/div/div[2]/span/a[last()]');
    if ($last_pager_link->count()
        && preg_match('/([0-9]+)$/u', (string) $last_pager_link->attr('href'), $matches)
    ) {
        // Always an integer, so callers never receive a raw href or a numeric string.
        $data['page_count'] = (int) $matches[1];
    }

    $page->filterXPath('//*[@id="page"]/div[2]/table/tbody/tr[1]/td[3]/div/dl/dd/ol/li')->each(function (Crawler $node) use (&$data) {
        $url = $node->filterXPath('//a')->attr('href');
        // The law id is what is left of the short URL once the "/laws/show/" part is removed.
        $id = preg_replace('|/laws/show/|u', '', shortURL($url));
        $raw_date = $node->filterXPath('//font[@color="#004499"]')->text();
        $date = $this->parseDate($raw_date, "Date has not been found in #{$id} at text: " . $node->text());
        $data['laws'][$id] = ['id' => $id, 'date' => $date];
    });

    return $data;
}
<?php require_once "../admin_conn.php"; require_once "collect_fun.php"; require_once "MovieType.php"; require_once "NBaidu.php"; require_once "tools/ContentManager.php"; require_once "collect_vod.php"; require_once "../score/AutoDouBanParseScore.php"; $host = $_SERVER['HTTP_HOST']; $crontab = be("all", "crontab"); crawler($crontab); exit(-1); function crawler($crontab) { global $db; writetofile("crawler_collect.sql", 'crawler start: crontab: ' . $crontab); $sql = "SELECT * FROM mac_cj_zhuiju where status=0 and crontab_desc like'%" . $crontab . "%' GROUP BY m_urltest order by m_urltest "; writetofile("crawler_collect.sql", 'crawler start: sql: ' . $sql); $rs = $db->query($sql); parseVodPad($rs); unset($rs); writetofile("crawler_collect.sql", 'crawler stop.'); collect($crontab); } function collect($crontab) { writetofile("crawler_collect.sql", 'collect start.'); global $db; $time = date("Y-m-d"); $count = $db->getOne("SELECT count(*) FROM mac_cj_zhuiju where status=0 and crontab_desc like'%" . $crontab . "%' ");
if (!in_array($product->getAttribute('href'), $productLinksArray)) { array_push($productLinksArray, $product->getAttribute('href')); } } } } //var_dump($productLinksArray); } catch (Exception $e) { echo $e->getMessage(); } /***** Get each product details *****/ try { $catalog = new Catalog("Ikea"); for ($i = 0; $i < sizeof($productLinksArray); $i++) { $seen = array(); $htmlProductDetails = crawler("http://www.ikea.com" . $productLinksArray[$i]); $dom = new DOMDocument(); @$dom->loadHTML($htmlProductDetails); $map = array(); foreach ($dom->getElementsByTagName('div') as $div) { if ($div->getAttribute('class') == "productName") { $name = preg_replace("/&#?[a-z0-9]{2,8};/i", "", trim($div->nodeValue)); if (!in_array($name, $seen)) { array_push($seen, $name); $map["name"] = $name; echo $name . "\n"; } } if ($div->getAttribute('id') == "custMaterials") { $description = preg_replace("/&#?[a-z0-9]{2,8};/i", "", trim($div->nodeValue)); if (!in_array($description, $seen)) {
/**
 * Read the issuer, law type and state catalogues from the structure listing
 * ( http://zakon.rada.gov.ua/laws/stru/a ) and hand each one to its setter.
 *
 * Tables 1 and 2 of the listing hold the issuers (the second one is flagged
 * as international), table 3 the law types and table 5 the states.
 *
 * @param bool $re_download Passed through to download() as the 're_download' option.
 */
public function parse($re_download)
{
    $data = download('/laws/stru/a', ['re_download' => $re_download]);
    $list = crawler($data['html']);

    // Builds the XPath selecting the rows of the n-th catalogue table.
    $rows_xpath = static function ($table_index) {
        return '//*[@id="page"]/div[2]/table/tbody/tr[1]/td[3]/div/div[2]/table[' . $table_index . ']/tbody/tr/td/table/tbody/tr';
    };

    // Types and states use the same layout: four-cell rows with the link in the second cell.
    $collect_links = static function ($table_index) use ($list, $rows_xpath) {
        $items = [];
        $list->filterXPath($rows_xpath($table_index))->each(function ($row) use (&$items) {
            if ($row->filterXPath('//td')->count() != 4) {
                return;
            }
            $link = $row->filterXPath('//td[2]/a');
            $item = new \stdClass();
            $item->url = $link->attr('href');
            $item->id = str_replace('/laws/main/', '', $item->url);
            $item->name = better_trim($link->text());
            $items[$item->name] = $item;
        });

        return $items;
    };

    // Issuers: first the domestic table, then the international one.
    $issuers = [];
    foreach ([1, 2] as $table_index) {
        // Single-cell rows are group headings; they apply to the issuer rows that follow.
        $group = null;
        $list->filterXPath($rows_xpath($table_index))->each(function ($row) use (&$issuers, &$group, $table_index) {
            $cells = $row->filterXPath('//td');

            if ($cells->count() == 1) {
                $heading = better_trim($cells->text());
                if ($heading) {
                    $group = $heading;
                }
                return;
            }
            if ($cells->count() != 4) {
                return;
            }

            $anchor = $row->filterXPath('//td[2]/a');
            $issuer = new \stdClass();
            $issuer->url = $anchor->attr('href');
            $issuer->id = str_replace('/laws/main/', '', $issuer->url);
            $issuer->group_name = $group;
            $issuer->name = better_trim($anchor->filterXPath('//b')->text());
            $issuer->full_name = preg_replace('|<b>.*?</b> *|u', '', $anchor->filterXPath('//font')->html());
            if ($issuer->full_name) {
                // Strip the surrounding parentheses, then exchange the two names.
                $unwrapped = preg_replace('|^\\((.*?)\\)$|u', '$1', $issuer->full_name);
                $issuer->full_name = $issuer->name;
                $issuer->name = $unwrapped;
            }
            if ($anchor->count() == 2) {
                $issuer->website = $anchor->last()->attr('href');
            } else {
                $issuer->website = null;
            }
            $issuer->international = $table_index - 1;
            $issuers[$issuer->name] = $issuer;
        });
    }
    $this->setIssuers($issuers);

    $this->setTypes($collect_links(3));

    $this->setStates($collect_links(5));
}
/**
 * Crawl starting from a seed URL, using url.txt as an append-only work queue.
 *
 * Every URL returned by crawler() is appended to url.txt, and the next URL to
 * visit is read from that same file through a second handle, until no more
 * lines can be read.
 *
 * @throws \RuntimeException When url.txt cannot be opened.
 */
function main()
{
    // Start from an empty queue; only delete the file when it is actually there,
    // otherwise unlink() raises a warning on the first run.
    if (file_exists("url.txt")) {
        unlink("url.txt");
    }

    $current_url = 'http://www.baidu.com';

    $fp_puts = fopen("url.txt", "a");
    if ($fp_puts === false) {
        throw new \RuntimeException('Cannot open url.txt for appending');
    }
    $fp_gets = fopen("url.txt", "r");
    if ($fp_gets === false) {
        fclose($fp_puts);
        throw new \RuntimeException('Cannot open url.txt for reading');
    }

    $id = 1;
    try {
        do {
            // Lines read back from the queue still carry their "\r\n" terminator.
            $current_url = trim($current_url);
            $result_url_arr = crawler($id, $current_url);
            if ($result_url_arr) {
                foreach ($result_url_arr as $url) {
                    fputs($fp_puts, $url . "\r\n");
                }
            }
            $id++;
            // Compare against false explicitly so a falsy line cannot end the crawl early.
        } while (($current_url = fgets($fp_gets, 1024)) !== false);
    } finally {
        // Release both handles even when crawler() throws.
        fclose($fp_gets);
        fclose($fp_puts);
    }
}
/**
 * Determine the revision date of a downloaded law text.
 *
 * When the HTML contains the 'txt txt-old' marker, the supplied default date
 * is returned unchanged. Otherwise the date is taken from the " від DD.MM.YYYY"
 * part of the document title.
 *
 * @param string $html         Downloaded page HTML.
 * @param mixed  $default_date Value returned as is for 'txt txt-old' documents.
 * @param string $url          Only used in the exception message.
 *
 * @return bool|string The result of parseDate(), the literal '??.??.????' when the
 *                     title carries that placeholder, or $default_date.
 * @throws Exceptions\WrongDateException When no date can be extracted from the title.
 */
public function getRevisionDate($html, $default_date, $url)
{
    if (strpos($html, 'txt txt-old') !== false) {
        return $default_date;
    }

    try {
        // OpenData downloaded document.
        // Compare with false explicitly: strpos() returns 0 (falsy) when the
        // marker sits at the very start of the document.
        if (strpos($html, '<div id="article">') !== false) {
            $title_text = crawler($html)->filterXPath('//h3[1]')->text();
        } else {
            $title_text = crawler($html)->filterXPath('//div[@id="pan_title"]')->text();
        }

        if (!preg_match('| від ([0-9\\?]{2}\\.[0-9\\?]{2}\\.[0-9\\?]{4})|u', $title_text, $matches)) {
            throw new Exceptions\WrongDateException("Revision date has not been found in text of {$url}");
        }

        $raw_date = $matches[1];
        // A fully unknown date is passed through verbatim instead of being parsed.
        if ($raw_date === '??.??.????') {
            $revision_date = $raw_date;
        } else {
            $revision_date = $this->parseDate($raw_date);
        }
    } catch (\Exception $e) {
        // Any failure above (missing node, unparsable date) is reported the same way.
        throw new Exceptions\WrongDateException("Revision date has not been found in text of {$url}");
    }

    return $revision_date;
}
$pid = pcntl_fork(); set_time_limit(0); if ($pid == -1) { die("could not fork\n"); } elseif ($pid) { //echo "parent pid is " . posix_getpid() . "\n"; } else { //echo "child pid is " . posix_getpid() . "\n"; sleep(1); crawler(); } }*/ crawler(); function crawler() { $proxyObj = new proxy(); $mysqli = new mysqli('10.168.45.191', 'admin', 'txg19831210', 'crawler'); $mysqli->query('SET NAMES gbk'); //for (;;) { $hour = date('G'); $current = time(); //$sql = "SELECT * FROM keyword WHERE status = 'active' AND clicked_times < times AND ((last_click_time + click_interval) < {$current}) AND ((path1_page < 5 AND path1_page > 0) OR (path2_page < 5 AND path2_page > 0) OR (path3_page < 5 AND path3_page > 0)) ORDER BY last_click_time ASC LIMIT 1"; $sql = "SELECT * FROM keyword WHERE id = 13 LIMIT 1"; $result = $mysqli->query($sql); $data = array(); if ($result) { $obj = $result->fetch_object(); $result->close();
/**
 * Main program used for testing.
 *
 * Starts from a seed URL, appends every URL returned by crawler() to url.txt
 * and keeps reading the next URL to visit from that same file.
 *
 * @throws \RuntimeException When url.txt cannot be opened.
 */
function main()
{
    $current_url = "http://hao123.com"; // initial URL

    $fp_puts = fopen("url.txt", "ab"); // handle used to record the URL list
    if ($fp_puts === false) {
        throw new \RuntimeException('Cannot open url.txt for appending');
    }
    $fp_gets = fopen("url.txt", "r"); // handle used to read the saved URL list back
    if ($fp_gets === false) {
        fclose($fp_puts);
        throw new \RuntimeException('Cannot open url.txt for reading');
    }

    try {
        do {
            // URLs read back from the file end with "\r\n"; strip it before crawling.
            $current_url = trim($current_url);
            $result_url_arr = crawler($current_url);
            if ($result_url_arr) {
                foreach ($result_url_arr as $url) {
                    fputs($fp_puts, $url . "\r\n");
                }
            }
            // Keep fetching URLs until the file yields no more lines.
        } while (($current_url = fgets($fp_gets, 1024)) !== false);
    } finally {
        // Release both handles even when crawler() throws.
        fclose($fp_gets);
        fclose($fp_puts);
    }
}
/**
 * Extract card data (metadata, title, revisions, related laws) from a downloaded law card.
 *
 * @param string $html    Raw HTML of the card page.
 * @param mixed  $status  Not used by this method.
 * @param array  $options Must contain 'law_id'; 'check_related' enables collecting
 *                        the laws listed behind the "Змінює документ..." link.
 *
 * @return array Keys: 'card', 'meta', 'date', 'title', 'has_text', 'revisions',
 *               'active_revision' and, when related laws were collected, 'changes_laws'.
 * @throws Exceptions\DocumentHasErrors When the card has no law date, or has text but
 *                                      no revision date can be determined.
 */
protected function process($html, $status, $options)
{
    $law_id = $options['law_id'];
    $data = [];
    $crawler = crawler($html)->filter('.txt');
    $data['card'] = $crawler->html();

    // Card metadata: each <dt> names a field, the <dd> elements after it are its values.
    $data['meta'] = [];
    $last_field = null;
    $crawler->filterXPath('//h2[text()="Картка документа"]/following-sibling::dl[1]')->children()->each(function (Crawler $node) use (&$data, &$last_field, $law_id) {
        $tag = $node->getNode(0)->tagName;
        if ($tag == 'dt') {
            $last_field = rtrim($node->text(), ':');
            $data['meta'][$last_field] = [];
        } elseif ($tag == 'dd') {
            if ($last_field == 'Дати') {
                $data['date'] = $this->parseDate($node->filterXPath('//font')->text(), "Law date is not valid in card of '{$law_id}'");
            }
            $data['meta'][$last_field][] = $node->text();
        }
    });
    if (!isset($data['date'])) {
        throw new Exceptions\DocumentHasErrors("Law date is missing in '{$law_id}'");
    }

    $data['title'] = $crawler->filterXPath('//h1')->html();
    $data['title'] = str_replace(' <img src="http://zakonst.rada.gov.ua/images/fav1.gif" title="Популярний">', '', $data['title']);
    $data['has_text'] = strpos($html, 'Текст відсутній') === false && strpos($html, 'Текст документа') !== false;

    // Revision history: each <dt> opens a revision, the following <dd> elements are its comments.
    $data['revisions'] = [];
    $last_revision = null;
    $data['active_revision'] = null;
    $crawler->filterXPath('//h2[contains(text(), "Історія документа")]/following-sibling::dl[1]')->children()->each(function (Crawler $node) use (&$data, &$last_revision, $law_id) {
        $tag = $node->getNode(0)->tagName;
        if ($tag == 'dt') {
            $raw_date = $node->filterXPath('//span[@style="color: #004499" or @style="color: #006600"]')->text();
            $date = $this->parseDate($raw_date, "Revision date '{$raw_date}' is not valid in card of '{$law_id}'");
            $last_revision = count($data['revisions']);
            $data['revisions'][] = ['law_id' => $law_id, 'date' => $date, 'comment' => []];
            // A revision heading without a link is recorded as having no text.
            if (!$node->filter('a')->count()) {
                $data['revisions'][$last_revision]['no_text'] = true;
            }
            if (str_contains($node->text(), 'поточна редакція')) {
                $data['active_revision'] = $data['revisions'][$last_revision]['date'];
            }
        } elseif ($tag == 'dd') {
            $comment = $node->html();
            // The "Current" anchor marks the active revision; it is removed from the stored comment.
            if (strpos($comment, '<a name="Current"></a>') !== false) {
                $data['active_revision'] = $data['revisions'][$last_revision]['date'];
            }
            $comment = str_replace('<a name="Current"></a>', '', $comment);
            $comment = preg_replace('|<u>(.*?)</u>|u', '$1', $comment);
            $data['revisions'][$last_revision]['comment'][] = $comment;
        }
    });

    foreach ($data['revisions'] as &$revision) {
        $revision['comment'] = implode("\n", $revision['comment']);
    }
    // Break the reference so later writes to $revision cannot alter the last element.
    unset($revision);

    if (!$data['active_revision'] && $data['has_text']) {
        $sub_options = $options;
        $sub_url = '/laws/show/' . $law_id;
        $sub_options['url'] = $sub_url;
        // NOTE(review): $sub_options is prepared but $options is what gets passed on,
        // exactly as before - confirm which one download() is meant to receive.
        //
        // $data is captured by reference so the date found by the callback reaches the
        // returned array, and $law_id is captured because the error message uses it.
        $this->download($sub_url, $options, function ($html, $status, $options) use (&$data, $law_id) {
            $d = app()->make('lawgrabber.revision_downloader');
            try {
                $data['active_revision'] = $d->getRevisionDate($html, '', '');
            } catch (\Exception $e) {
                throw new Exceptions\DocumentHasErrors("Card has text, but no revisions in '{$law_id}'");
            }
        });
    }

    if (isset($options['check_related']) && $options['check_related']) {
        $changes_link = $crawler->filterXPath('//h2[contains(text(), "Пов\'язані документи")]/following-sibling::dl[1]/*/a/font[text()="Змінює документ..."]/..');
        if ($changes_link->count()) {
            $list = $this->downloadList($changes_link->attr('href'));
            $data['changes_laws'] = $list['laws'];
            // The first page reports how many pages exist; fetch the rest and merge by law id.
            for ($i = 2; $i <= $list['page_count']; $i++) {
                $list = $this->downloadList($changes_link->attr('href') . '/page' . $i);
                $data['changes_laws'] += $list['laws'];
            }
        }
    }

    return $data;
}
/**
 * Reorganise previously downloaded law files under DOWNLOADS_PATH.
 *
 * For every show/<law_id>/card.html:
 *  - when the card reveals the current revision date, the law text (text.html, or
 *    page.html plus page2.html, page3.html, ...) is merged into
 *    show/<law_id>/ed<Ymd>/page.html and the old text files are removed;
 *  - the card itself is moved to card/<law_id>.html (or deleted if that already exists);
 *  - directories left empty are removed.
 * Directories under show/ that contain neither sub-directories nor a card.html are
 * deleted beforehand.
 *
 * @throws \Exception When the text captured as the revision date is not a valid d.m.Y date.
 */
function move_files()
{
    // Named functions declared inside a function body are registered globally the first
    // time that body runs. Without these guards a second call to move_files() would stop
    // with "Cannot redeclare rrmdir()".
    if (!function_exists('rrmdir')) {
        // Recursively delete a directory and everything in it.
        function rrmdir($dir)
        {
            if (is_dir($dir)) {
                $objects = scandir($dir);
                foreach ($objects as $object) {
                    if ($object != "." && $object != "..") {
                        if (filetype($dir . "/" . $object) == "dir") {
                            rrmdir($dir . "/" . $object);
                        } else {
                            unlink($dir . "/" . $object);
                        }
                    }
                }
                rmdir($dir);
            }
        }
    }
    if (!function_exists('is_dir_empty')) {
        // True when the directory holds only "." and ".."; NULL when it is not readable.
        function is_dir_empty($dir)
        {
            if (!is_readable($dir)) {
                return NULL;
            }
            return count(scandir($dir)) == 2;
        }
    }

    $base_laws = DOWNLOADS_PATH . 'zakon.rada.gov.ua/laws/';

    // Pass 1: drop leaf directories that do not contain a card.
    $dirs = array_merge(
        glob($base_laws . 'show/*/*', GLOB_ONLYDIR | GLOB_MARK),
        glob($base_laws . 'show/*', GLOB_ONLYDIR | GLOB_MARK)
    );
    foreach ($dirs as $dir) {
        if (glob($dir . '*', GLOB_ONLYDIR)) {
            continue;
        }
        if (glob($dir . 'card.html')) {
            continue;
        }
        rrmdir($dir);
    }

    // Pass 2: relocate every card and the text that belongs to it.
    $files = array_merge(
        glob($base_laws . 'show/*/*/card.html'),
        glob($base_laws . 'show/*/card.html')
    );
    foreach ($files as $file) {
        // The base path is literal text, so quote it before splicing it into the pattern.
        $law_id = preg_replace('|' . preg_quote($base_laws, '|') . 'show/(.*?)/card.html|', '$1', $file);
        $law_dir = $base_laws . 'show/' . $law_id;
        $new_name_card = $base_laws . 'card/' . $law_id . '.html';

        // The current revision date is the <span> text of the history entry followed by the "Current" anchor.
        preg_match('|<span style="color: #.*?">(.*?)</span>(?:</b></a>\\n<img src="http://zakonst.rada.gov.ua/images/docs.gif" title="Документ"> <span class="num" style="color:#999999">поточна редакція, .*?, <a href=".*">перейти »</a></span>)?</dt>\\n<dd><a name="Current">|', file_get_contents($file), $matches);
        $revision = isset($matches[1]) ? $matches[1] : null;

        if ($revision) {
            if (!preg_match('|[0-9]{2}\\.[0-9]{2}\\.[0-9]{4}|', $revision) || !date_create_from_format('d.m.Y', $revision)) {
                throw new \Exception("Revision has not been found in #{$file}.");
            }
            $date = date_format(date_create_from_format('d.m.Y', $revision), 'Ymd');
            $new_name_text = $law_dir . '/ed' . $date . '/page.html';

            $old_files = [];
            $text = '';
            if (file_exists($law_dir . '/text.html')) {
                // Single-file text.
                $text_file_name = $law_dir . '/text.html';
                $old_files[] = $text_file_name;
                $text = crawler(file_get_contents($text_file_name))->filter('.txt')->html();
            } elseif (file_exists($law_dir . '/page.html')) {
                // Paged text: the first page carries the pager that tells how many pages follow.
                $text_file_name = $law_dir . '/page.html';
                $old_files[] = $text_file_name;
                $page = crawler(file_get_contents($text_file_name));
                $text = $page->filter('.txt')->html();
                $pager = $page->filterXPath('(//span[@class="nums"])[1]/br/preceding-sibling::a[1]');
                $page_count = $pager->count() ? $pager->text() : 1;
                for ($i = 2; $i <= $page_count; $i++) {
                    $text_file_name = $law_dir . '/page' . $i . '.html';
                    if (!file_exists($text_file_name)) {
                        // Incomplete download: discard the pages collected so far.
                        $text = '';
                        foreach ($old_files as $file_name) {
                            unlink($file_name);
                        }
                        // Forget the deleted files so they are not unlinked a second time below.
                        $old_files = [];
                        break;
                    }
                    $old_files[] = $text_file_name;
                    $page = crawler(file_get_contents($text_file_name));
                    $text .= $page->filter('.txt')->html();
                }
            }

            if (file_exists($new_name_text)) {
                // The target already exists; the old copies are redundant.
                foreach ($old_files as $file_name) {
                    unlink($file_name);
                }
            } elseif ($text) {
                $new_dir = dirname($new_name_text);
                if (!is_dir($new_dir)) {
                    mkdir($new_dir, 0777, true);
                }
                file_put_contents($new_name_text, '<html><body><div class="txt txt-old">' . $text . '</div></body></html>');
                // Keep the modification time of the original first text file.
                touch($new_name_text, filemtime($old_files[0]));
                foreach ($old_files as $file_name) {
                    unlink($file_name);
                }
            }
        }

        if (file_exists($new_name_card)) {
            unlink($file);
        } else {
            $new_dir = dirname($new_name_card);
            if (!is_dir($new_dir)) {
                mkdir($new_dir, 0777, true);
            }
            rename($file, $new_name_card);
        }

        // Clean up the law directory, and its parent, once nothing is left inside.
        if (is_dir_empty($law_dir)) {
            rrmdir($law_dir);
            $parent = dirname($law_dir);
            if (is_dir_empty($parent)) {
                rrmdir($parent);
            }
        }
    }
}