/**
 * Crawl a URL and, recursively, every link on it that mentions the same domain.
 *
 * Successfully fetched pages are stored in self::$urls (keyed by URL), failures are
 * appended to self::$errors, and self::$depth tracks the largest slash count seen.
 *
 * @param string $url      URL to fetch
 * @param array  $elements Passed through to the Spider constructor
 * @param string $parent   URL of the page on which $url was found
 * @param string $start    Start timestamp of the crawl (seconds), or null for no time limit
 * @param string $time     Time budget in seconds, or null for no time limit
 * @return void
 */
public static function crawl($url, $elements = null, $parent = null, $start = null, $time = null)
{
    // Percent-encode the URL, then put back the characters that give a URL its structure.
    $structural = array(
        '%3A' => ':',
        '%2F' => '/',
        '%23' => '#',
        '%3F' => '?',
        '%3D' => '=',
        '%25' => '%',
        '%2B' => '+',
    );
    $url = str_replace(array_keys($structural), array_values($structural), rawurlencode($url));

    // Slash count minus two is used as the depth of this URL.
    $level = substr_count($url, '/') - 2;
    if ($level > self::$depth) {
        self::$depth = $level;
    }

    // Already crawled (exact or lower-cased key): nothing to do.
    if (array_key_exists($url, self::$urls) || array_key_exists(strtolower($url), self::$urls)) {
        return;
    }

    $page = new Spider($url, $elements);
    echo '-> (' . $page->getCode() . ') ' . $url . PHP_EOL;

    if ($page->isError()) {
        self::$errors[] = array('code' => $page->getCode(), 'url' => $url, 'parent' => $parent);
        return;
    }

    self::$urls[$url] = $page;

    // Host part of the base URL: strip the schema, then anything after the first slash.
    $domain = str_replace($page->getSchema(), '', $page->getBase());
    $slashAt = strpos($domain, '/');
    if ($slashAt !== false) {
        $domain = substr($domain, 0, $slashAt);
    }

    $anchors = $page->getElements('a');
    if (null === $anchors) {
        return;
    }

    $hasBudget = (null !== $start && null !== $time);
    foreach ($anchors as $anchor) {
        // Once the time budget is spent, no further links are followed.
        if ($hasBudget && (time() - $start > $time)) {
            continue;
        }

        $href = $anchor['href'];
        if (null === $href || $href == '') {
            continue;
        }

        // Skip in-page fragments and query-only links.
        $firstChar = substr($href, 0, 1);
        if ($firstChar == '#' || $firstChar == '?') {
            continue;
        }

        // Only follow links that mention the current domain.
        if (stripos($href, $domain) === false) {
            continue;
        }

        self::crawl($href, $elements, $url, $start, $time);
    }
}
/**
 * Fetch a post page and append its title, date and body to this object's lists.
 *
 * The values are read from fixed child positions under the first <article> element,
 * so the method depends on the exact markup of the target site.
 *
 * @param string $url URL of the post page
 * @return void
 */
public function process_post($url)
{
    $document = parent::get_html($url);
    $post = $document->getElementsByTagName("article")->item(0);
    $header = $post->childNodes->item(0);

    $this->title[] = $header->childNodes->item(0)->nodeValue;
    $this->post_date[] = $header->childNodes->item(2)
        ->childNodes->item(0)
        ->childNodes->item(2)
        ->childNodes->item(1)
        ->nodeValue;

    // Canonical XML of the second child of the article; the body is cut out of it by offsets.
    $markup = $post->childNodes->item(1)->C14N();
    $from = strpos($markup, '</script></span>');
    // NOTE(review): 880 is an unexplained fixed offset before the '<plusone' marker - confirm.
    $length = strpos($markup, '<plusone') - $from - 880;

    $this->body[] = substr($markup, $from, $length);
}
/**
 * Download a URI and return its body.
 *
 * Recognised $options keys:
 *  - 'curl_options'    array of CURLOPT_* => value, passed to Spider::getURIInfo()
 *  - 'user_agent'      string copied into CURLOPT_USERAGENT
 *  - 'crawl_404_pages' when set and falsy, responses with HTTP code 0 or 404 are rejected
 *
 * @param string $uri
 * @param array  $options
 * @return string the response body
 * @throws Exception when no content was returned, or on a rejected 0/404 response
 */
public function download($uri, $options = array())
{
    // Make sure that the curl_options exists.
    if (!isset($options['curl_options'])) {
        $options['curl_options'] = array();
    }

    if (isset($options['user_agent'])) {
        $options['curl_options'][CURLOPT_USERAGENT] = $options['user_agent'];
    }

    // Make sure that the content is returned.
    $options['curl_options'][CURLOPT_RETURNTRANSFER] = true;
    $options['curl_options'][CURLOPT_NOBODY] = false;

    $info = Spider::getURIInfo($uri, $options['curl_options']);
    $content = isset($info['content']) ? $info['content'] : false;
    $httpCode = isset($info['http_code']) ? (int) $info['http_code'] : 0;

    // Only a missing or empty body is a failure; a body consisting of "0" is valid content.
    if ($content === false || $content === '') {
        // Report the HTTP status: the content is empty here, so it carries no information.
        throw new Exception('Error downloading ' . $uri . ' (HTTP code ' . $httpCode . ')');
    }

    $reject404 = isset($options['crawl_404_pages']) && !$options['crawl_404_pages'];
    if ($reject404 && in_array($httpCode, array(0, 404), true)) {
        throw new Exception('404 page ' . $uri . ' ' . $httpCode);
    }

    return $content;
}
// Create the tables used by the (currently disabled) database loggers.
$schema = array(
    'create table SpiderPage ( id serial, uri varchar(255), primary key(id) );',
    'create table SpiderJavaScript ( id serial, uri varchar(255), script varchar(255), primary key(id) );',
    'create table SpiderStyleSheet ( id serial, uri varchar(255), style varchar(255), primary key(id) );',
);
foreach ($schema as $ddl) {
    $db->exec($ddl);
}

// Database-backed loggers, currently disabled:
//$pageLogger = new Spider_PageLogger($db);
//$javaScriptLogger = new Spider_JavaScriptLogger($db);
//$styleSheetLogger = new Spider_StyleSheetLogger($db);

// Wire up the spider: one generic logger, a downloader and a parser.
$logger = new Spider_Logger();
$downloader = new Spider_Downloader();
$parser = new Spider_Parser();

$spider = new Spider($downloader, $parser);
$spider->addLogger($logger);

// URI filters are registered by class name.
foreach (array('Spider_AnchorFilter', 'Spider_MailtoFilter') as $filterClass) {
    $spider->addUriFilter($filterClass);
}

//$spider->addLogger($pageLogger);
//$spider->addLogger($styleSheetLogger);
//$spider->addLogger($javaScriptLogger);

// Start crawling from the site root.
$spider->spider('http://www.unl.edu/fwc/');
/**
 * Render the backlink report for one website over a date range.
 *
 * Recognised $searchInfo keys: 'from_time' and 'to_time' (date strings understood by
 * strtotime) and 'website_id'. Missing or unparsable dates fall back to the default
 * window, which is the last 30 days up to now.
 *
 * @param array|string $searchInfo search form values (the default '' means "no filters")
 * @return void
 */
function showReports($searchInfo = '')
{
    $userId = isLoggedIn();

    // Default window start: midnight, 30 days ago.
    $fromTime = mktime(0, 0, 0, (int) date('m'), (int) date('d') - 30, (int) date('Y'));
    if (!empty($searchInfo['from_time'])) {
        $parsedFrom = strtotime($searchInfo['from_time'] . ' 00:00:00');
        // strtotime() returns false for unparsable input; keep the default then, because
        // false would interpolate as an empty string and break the SQL below.
        if ($parsedFrom !== false) {
            $fromTime = $parsedFrom;
        }
    }

    // Default window end: now. (mktime() without arguments is an error on PHP 8.)
    $toTime = time();
    if (!empty($searchInfo['to_time'])) {
        $parsedTo = strtotime($searchInfo['to_time'] . ' 23:59:59');
        if ($parsedTo !== false) {
            $toTime = $parsedTo;
        }
    }

    $this->set('fromTime', date('Y-m-d', $fromTime));
    $this->set('toTime', date('Y-m-d', $toTime));

    $websiteController = new WebsiteController();
    $websiteList = $websiteController->__getAllWebsites($userId, true);
    $this->set('websiteList', $websiteList);

    // Fall back to the first website of the user; null when the user has none.
    if (empty($searchInfo['website_id'])) {
        $websiteId = isset($websiteList[0]['id']) ? $websiteList[0]['id'] : null;
    } else {
        $websiteId = intval($searchInfo['website_id']);
    }
    $this->set('websiteId', $websiteId);

    $conditions = empty($websiteId) ? "" : " and s.website_id={$websiteId}";
    $sql = "select s.* ,w.name\r\n\t\t\t\t\t\t\t\tfrom backlinkresults s,websites w \r\n\t\t\t\t\t\t\t\twhere s.website_id=w.id \r\n\t\t\t\t\t\t\t\tand result_time>= {$fromTime} and result_time<={$toTime} {$conditions} \r\n\t\t\t\t\t\t\t\torder by result_time";
    $reportList = $this->db->select($sql);
    if (!is_array($reportList)) {
        $reportList = array();
    }

    $colList = $this->colList;
    $prevRank = array();
    foreach ($colList as $col => $dbCol) {
        $prevRank[$col] = 0;
    }

    # loop through the rows and compute the change against the previous row
    $isFirstRow = true;
    foreach ($reportList as $key => $repInfo) {
        foreach ($colList as $col => $dbCol) {
            $rankDiff = '';
            if (!$isFirstRow) {
                $rankDiff = ($prevRank[$col] - $repInfo[$dbCol]) * -1;
                if ($rankDiff > 0) {
                    $rankDiff = "<font class='green'>({$rankDiff})</font>";
                } elseif ($rankDiff < 0) {
                    $rankDiff = "<font class='red'>({$rankDiff})</font>";
                }
            }

            // A zero difference is shown as an empty cell.
            $reportList[$key]['rank_diff_' . $col] = empty($rankDiff) ? '' : $rankDiff;
            $prevRank[$col] = $repInfo[$dbCol];
        }
        $isFirstRow = false;
    }

    $websiteInfo = $websiteController->__getWebsiteInfo($websiteId);
    $rawUrl = isset($websiteInfo['url']) ? $websiteInfo['url'] : '';
    // The original suppressed diagnostics on this static call; kept as-is.
    $websiteUrl = @Spider::removeTrailingSlash(formatUrl($rawUrl));
    $websiteUrl = urldecode($websiteUrl);

    $this->set('directLinkList', array(
        'google' => $this->backUrlList['google'] . $websiteUrl,
        'msn' => $this->backUrlList['msn'] . $websiteUrl,
        'alexa' => $this->backUrlList['alexa'] . $websiteUrl,
    ));

    // Newest rows first, keeping the original keys.
    $this->set('list', array_reverse($reportList, true));
    $this->render('backlink/backlinkreport');
}
/**
 * Run the installation: validate the settings, import the SQL files, write the config
 * file, ping the project site, store the admin email and print the success page.
 *
 * On any failed check the method re-displays the relevant installer step and returns.
 *
 * @param array $info installer form values; reads 'db_host', 'db_user', 'db_pass',
 *                    'db_name' and 'email', and sets 'web_path'
 * @return void
 */
function proceedInstallation($info)
{
    $db = new DB();

    # checking db settings
    $errMsg = $db->connectDatabase($info['db_host'], $info['db_user'], $info['db_pass'], $info['db_name']);
    if ($db->error) {
        $this->startInstallation($info, $errMsg);
        return;
    }

    # checking config file settings
    if (!is_writable(SP_INSTALL_CONFIG_FILE)) {
        $this->checkRequirements(true);
        return;
    }

    # checking seo panel web path
    $info['web_path'] = $this->getWebPath();
    if (empty($info['web_path'])) {
        $errMsg = "Error occured while parsing installation url. Please <a href='http://www.seopanel.in/contact/' target='_blank'>contact</a> Seo Panel team.<br> or <br> Try manual installation by steps specified in <a href='http://www.seopanel.in/install/manual/' target='_blank'>http://www.seopanel.in/install/manual/</a>";
        $this->startInstallation($info, $errMsg);
        return;
    }

    # importing data to db, then the language texts
    foreach (array(SP_INSTALL_DB_FILE, SP_INSTALL_DB_LANG_FILE) as $sqlFile) {
        $errMsg = $db->importDatabaseFile($sqlFile);
        if ($db->error) {
            $errMsg = "Error occured while importing data: " . $errMsg;
            $this->startInstallation($info, $errMsg);
            return;
        }
    }

    # write to config file
    $this->writeConfigFile($info);

    # create API Key if not exists
    $this->createSeoPanelAPIKey($db);

    # notify the project site about the installation, only when its host resolves
    if (gethostbynamel('seopanel.in')) {
        include_once SP_INSTALL_DIR . '/../libs/spider.class.php';
        include_once SP_INSTALL_CONFIG_FILE;

        // SERVER_ADDR may be absent (e.g. CLI) and must be encoded like the other values.
        $serverAddr = isset($_SERVER['SERVER_ADDR']) ? $_SERVER['SERVER_ADDR'] : '';
        $installUpdateUrl = "http://www.seopanel.in/installupdate.php?url=" . urlencode($info['web_path'])
            . "&ip=" . urlencode($serverAddr)
            . "&email=" . urlencode($info['email']);
        $installUpdateUrl .= "&version=" . SP_INSTALLED;

        $spider = new Spider();
        $spider->getContent($installUpdateUrl, false);
    }

    // A fresh connection is opened here, as in the original flow.
    $db = new DB();
    $db->connectDatabase($info['db_host'], $info['db_user'], $info['db_pass'], $info['db_name']);

    // update email for admin
    // NOTE(review): addslashes() is not charset-aware; switch to the DB layer's own
    // escaping if it offers one (none is visible from this block).
    $sql = "update users set email='" . addslashes($info['email']) . "' where id=1";
    $db->query($sql);

    // select languages list
    $sql = "select * from languages where translated=1";
    $langList = $db->select($sql);
    if (!is_array($langList)) {
        $langList = array();
    }

    // web_path is derived from the request URL, so escape it for the attribute context.
    $loginUrl = htmlspecialchars($info['web_path'] . "/login.php", ENT_QUOTES, 'UTF-8');
    ?>
    <form method="post" action="<?php echo $loginUrl; ?>">
    <h1 class="BlockHeader">Seo Panel Installation Success</h1>
    <table width="100%" cellspacing="8px" cellpadding="0px" class="formtab">
        <tr><th colspan="2" class="headersuccess">Seo Panel installed successfully!</th></tr>
        <tr>
            <td class="warning" colspan="2">Warning!</td>
        </tr>
        <tr>
            <td style="border: none;" colspan="2">
                <ul class="list">
                    <li> Please change permission of config file <b><?php echo SP_CONFIG_FILE; ?></b> to avoid security issues.</li>
                    <li>Please remove installation directory <b>install</b> to avoid security issues.</li>
                </ul>
            </td>
        </tr>
        <tr>
            <td class="warning" style="color:black;" colspan="2">Admin Login</td>
        </tr>
        <tr>
            <td style="border-left: none;">Default Language:</td>
            <td>
                <select name="lang_code">
                    <?php
                    foreach ($langList as $langInfo) {
                        $selected = $langInfo['lang_code'] == 'en' ? "selected" : "";
                        ?>
                        <option value="<?php echo $langInfo['lang_code']; ?>" <?php echo $selected; ?>><?php echo $langInfo['lang_name']; ?></option>
                        <?php
                    }
                    ?>
                </select>
            </td>
        </tr>
        <tr>
            <td style="border: none;font-weight: normal;font-size: 13px;" colspan="2">
                <b>Username:</b> <?php echo SP_ADMIN_USER; ?>
                <br>
                <b>Password:</b> <?php echo SP_ADMIN_PASS; ?>
                <br><br>
                <b>Note:</b> Please change password of admin after first login.
            </td>
        </tr>
    </table>
    <input type="hidden" name="sec" value="login">
    <input type="hidden" name="userName" value="spadmin">
    <input type="hidden" name="password" value="spadmin">
    <input type="submit" value="Proceed to admin login >>" name="submit" class="button">
    </form>
    <?php
}
<?php
// Demo: apply the IMDB template to a single title page and dump the extracted data.
include 'globals.inc';
include 'template.IMDB.php';

$titleUrl = 'http://www.imdb.com/title/tt0433362/';

$s = new Spider();
$imdb = new IMDBTemplate();

$movie = $s->applyTemplate($imdb, $titleUrl);
echo print_r($movie, true);
/**
 * Destructor: only forwards to the parent class's destructor.
 */
public function __destruct()
{
    parent::__destruct();
}
/**
 * Extract title, meta description and meta keywords from a page.
 *
 * The page is fetched from $websiteUrl unless $pageContent is supplied. The values found
 * are returned as an array when $returVal is true; otherwise each value is pushed to the
 * form through WebsiteController::addInputValue().
 *
 * @param string $websiteUrl  page URL; "http://" is prepended when no http(s) scheme is present
 * @param string $keyInput    when non-empty, only the keywords are extracted
 * @param string $pageContent HTML to parse instead of fetching $websiteUrl
 * @param bool   $returVal    true to return the values instead of filling form inputs
 * @return array values keyed by 'page_title', 'page_description', 'page_keywords'
 *               (only the ones found); always an empty array when $returVal is false
 */
public function crawlMetaData($websiteUrl, $keyInput = '', $pageContent = '', $returVal = false)
{
    $metaInfo = array();

    if (empty($pageContent)) {
        if (!preg_match('/\\w+/', $websiteUrl)) {
            return $metaInfo;
        }

        // Add a scheme only when none is present; https:// URLs must be left untouched.
        if (!preg_match('#^https?://#i', $websiteUrl)) {
            $websiteUrl = "http://" . $websiteUrl;
        }

        $spider = new Spider();
        $ret = $spider->getContent($websiteUrl);
        $page = empty($ret['page']) ? '' : $ret['page'];
    } else {
        $page = $pageContent;
    }

    if ($page === '') {
        return $metaInfo;
    }

    // Result key => id of the form input that receives the value.
    $inputIds = array(
        'page_title' => 'webtitle',
        'page_description' => 'webdescription',
        'page_keywords' => 'webkeywords',
    );

    $found = array();
    if (empty($keyInput)) {
        # meta title
        if (preg_match('/<TITLE>(.*?)<\\/TITLE>/si', $page, $matches) && !empty($matches[1])) {
            $found['page_title'] = $matches[1];
        }

        # meta description
        $description = $this->extractMetaTagContent($page, 'description');
        if (!empty($description)) {
            $found['page_description'] = $description;
        }
    }

    # meta keywords
    $keywords = $this->extractMetaTagContent($page, 'keywords');
    if (!empty($keywords)) {
        $found['page_keywords'] = $keywords;
    }

    if ($returVal) {
        return $found;
    }

    foreach ($found as $key => $value) {
        WebsiteController::addInputValue($value, $inputIds[$key]);
    }

    return $metaInfo;
}

/**
 * Return the content attribute of the named <meta> tag, or '' when none is found.
 * Tries double-quoted, single-quoted and content-before-name attribute layouts in turn.
 *
 * @param string $page HTML source
 * @param string $name meta tag name (a fixed literal such as 'description')
 * @return string
 */
private function extractMetaTagContent($page, $name)
{
    $patterns = array(
        '/<META.*?name="' . $name . '".*?content="(.*?)"/si',
        "/<META.*?name='" . $name . "'.*?content='(.*?)'/si",
        '/<META content="(.*?)" name="' . $name . '"/si',
    );

    foreach ($patterns as $pattern) {
        if (preg_match($pattern, $page, $matches) && !empty($matches[1])) {
            return $matches[1];
        }
    }

    return '';
}
/**
 * Check whether a directory's submit page is reachable and whether it uses a captcha,
 * store the result in the `directories` table, and print the outcome.
 *
 * @param int $dirId   id of the directory to check
 * @param int $nodebug when truthy, emit a script that updates the captcha cell plus the
 *                     status link; otherwise print a plain "saved" note
 * @return void
 */
function checkDirectoryStatus($dirId, $nodebug = 0)
{
    // The id is interpolated into SQL and JavaScript below, so force it to an integer.
    $dirId = intval($dirId);
    $dirInfo = $this->getDirectoryInfo($dirId);

    $active = 0;
    $captcha = 0;

    $spider = new Spider();
    $ret = $spider->getContent(addHttpToUrl($dirInfo['submit_url']));

    if (empty($ret['error']) && !empty($ret['page'])) {
        $page = $ret['page'];

        $matches = $this->isCategoryExists($page, $dirInfo['category_col']);
        $active = empty($matches[0]) ? 0 : 1;

        // An empty needle must not count as a hit (it warns on PHP 7 and matches
        // everything on PHP 8).
        $hasCaptcha = !empty($dirInfo['captcha_script'])
            && stripos($page, $dirInfo['captcha_script']) !== false;
        $captcha = $hasCaptcha ? 1 : 0;
    }

    $sql = "update directories set working={$active},is_captcha={$captcha} where id={$dirId}";
    $this->db->query($sql);

    if ($nodebug) {
        $captchaLabel = $captcha ? "Yes" : "No";
        ?>
        <script type="text/javascript">
            document.getElementById('captcha_<?php echo $dirId; ?>').innerHTML = '<?php echo $captchaLabel; ?>';
        </script>
        <?php
        echo $this->getStatusLink($dirId, $active);
    } else {
        echo "<p class='note notesuccess'>Saved status of directory <b>{$dirInfo['domain']}</b>.....</p>";
    }
}
<?php
// Filter out the core information:
// first extract the main body of the page.
// Example: http://localhost/github/php_web_spider/application/reader.php?url=http://www.phbs.pku.edu.cn/content-419-2333-1.html
// $for = $_GET['for'];

// $_GET values are already URL-decoded by PHP; decoding a second time would corrupt
// any literal '%' or '+' in the target URL.
$url = (isset($_GET['url']) && is_string($_GET['url'])) ? trim($_GET['url']) : '';

header("Content-type:text/html;charset=utf-8");

// The URL comes straight from the request: accept only absolute http(s) URLs.
if (!preg_match('#^https?://#i', $url)) {
    header('HTTP/1.1 400 Bad Request');
    exit('Missing or invalid "url" parameter.');
}

define('SPIDER_PATH', '../core/');
require_once SPIDER_PATH . 'php_web_spider.php';
require_once SPIDER_PATH . 'simple_html_dom.php';

// Extract the core content
$sp = new Spider();
$article = $sp->fetch_main_content($url);
$info = $sp->fetch_info();
// echo $article; exit(0);

// UI rendering
require_once 'third_party/php_simple_ui/php_simple_ui.php';
$page = new ui_JMPage('新闻详情', $article);
$page->header->appendText('<a href="javascript:history.go(-1);" data-role="button" data-icon="home">返回</a>');
$ui = new ui_jQueryMobile($page);
echo $ui;
<?php
header("Content-type:text/html;charset=utf-8");

define('SPIDER_PATH', '../core/');
require_once SPIDER_PATH . 'php_web_spider.php';
require_once SPIDER_PATH . 'simple_html_dom.php';

// If the data is not refreshed, static data is shown.
$sp = new Spider();
//echo $sp->fetch($url);
//print_r($sp->fetch_results($url));

// IEEE Xplore author search URL, split around the author's name.
$searchUrlHead = 'http://ieeexplore.ieee.org/search/searchresult.jsp?searchWithin%3Dp_Authors%3A.QT.';
$searchUrlTail = '.QT.%26refinements%3D4274688882%2C4268599920%2C4268757412%2C4274050053%2C4269644358%2C4269643024%2C4262616522&removeRefinement=4274688882&pageNumber=1&resultAction=REFINE';

$papers = array();

// Display name => name as it is placed in the search URL.
$authors = array(
    '王振宇' => 'Zhenyu+Wang',
    '辛柏成' => 'Baicheng+Xin',
    '蔡砚刚' => 'Yangang+Cai',
    '崔同兵' => 'Tongbing+Cui',
    '文浩丞' => 'Haocheng+Wen',
    '邢培银' => 'Peiyin+Xing',
    '韩冰杰' => 'Bingjie+Han',
    '焦剑波' => 'Jianbo+Jiao',
    '高璇' => 'Xuan+Gao',
    '李旭峰' => 'Xufeng+Li',
    '赵龙' => 'Long+Zhao',
    '万杰' => 'Jie+Wan',
    '吕浩' => 'Hao+Lv',
    '唐骋洲' => 'Chengzhou+Tang',
    '王磊' => 'Lei+Wang',
    '杨明辉' => 'Minghui+Yang',
    '杨爽' => 'Shuang+Yang',
    '张雷' => 'Lei+Zhang',
    '刘中欣' => 'Zhongxin+Liu',
    '彭祎' => 'Yi+Peng',
    '汤传新' => 'Chuanxin+Tang',
    '向国庆' => 'Guoqing+Xiang',
    '张艺' => 'Yi+Zhang',
    '杜实现' => 'Shixian+Du',
    '郭梦婷' => 'Mengting+Guo',
    '黄颖' => 'Ying+Huang',
    '魏莹荔' => 'Yingli+Wei',
    '张申' => 'Shen+Zhang',
    '张欣欣' => 'Xinxin+Zhang',
    '张杨' => 'Yang+Zhang',
    '张若楠' => 'Ruonan+Zhang',
    '黄泽湖' => 'Zehu+Huang',
    '罗佳佳' => 'Jiajia+Luo',
    '赵洋' => 'Yang+Zhao',
);

// Pagination is not handled: only the first result page is fetched per author.
set_time_limit(0);
foreach ($authors as $key => $author) {
    $papers[$key] = $sp->fetch_results($searchUrlHead . $author . $searchUrlTail);
    // print_r($papers[$key]);
}
//exit();

/*
 * View layer -----------------------------------------------
 */
require_once 'third_party/php_simple_ui/php_simple_ui.php';

$list = new ui_JMListView($papers);
$list->addFilter('Search');

$page = new ui_JMPage('IEEE Xplore Papers', array($list));
$ui = new ui_jQueryMobile($page);

/**
 * Dump ui ------------------------------------------------
 */
/**
 * Download an image into the local img/ directory.
 *
 * The file is always given a generated name ending in ".jpg", regardless of the
 * actual image type.
 *
 * @param string $img_url URL of the image
 * @return string the generated file name (relative to img/), or '' when the download
 *                or the write failed
 */
public function download_img($img_url)
{
    // 24-hour clock ('H') so that names generated in the morning and afternoon differ.
    $filename = date('YmdHis') . '_' . uniqid() . '.jpg';

    $curl = curl_init($img_url);
    if ($curl === false) {
        return '';
    }
    curl_setopt($curl, CURLOPT_RETURNTRANSFER, 1);
    $imageData = curl_exec($curl);
    curl_close($curl);

    // A failed transfer must not leave an empty file behind.
    if ($imageData === false || $imageData === '') {
        return '';
    }

    // Check the target directory up front: the download stays best-effort (no warning,
    // no fatal error) when img/ is missing or read-only.
    if (!is_dir('img') || !is_writable('img')) {
        return '';
    }
    if (file_put_contents('img/' . $filename, $imageData) === false) {
        return '';
    }

    return $filename;
}
}

$spider = new Spider();
$spider->main();
//save_spot('http://www.lvmama.com/lvyou/d-chengdu279.html');
//$spider->save_view('http://www.lvmama.com/lvyou/poi/sight-151780.html', 1);
//$spider->save_spot('http://www.lvmama.com/lvyou/d-chengdu279.html');
/*
 * Sample inputs:
 *   Province 1:          d-sichuan278.html
 *   Scenic-area page 1:  http://www.lvmama.com/lvyou/scenery/d-sichuan278.html
 *   Scenic-area URL 1:   http://www.lvmama.com/lvyou/d-chengdu279.html
 *   Attraction URL 1:    http://www.lvmama.com/lvyou/poi/sight-151780.html
 *                        http://www.lvmama.com/lvyou/poi/sight-151780.html
 */
// NOTE(review): the lines down to the two closing braces are the tail of a method (and of
// its class) that begins outside this view; $uid and $res are presumably set earlier.
        //$uid = $matches[1][0];
        // Vote URL for the user id; built here but not used in the visible part.
        $urlVote = 'http://fotostrana.ru/meeting/index/click/?ajax=true&uId=' . $uid . '&val=3&rate=5&fake=0&uid=' . $uid;
        // Request the "meeting" page with an age filter of 21-25.
        $setAgeUrl = 'http://fotostrana.ru/meeting/?change=1&gender=&age=21&ageTo=25';
        $x = $this->spider($setAgeUrl);
        //var_dump($matches[1][0]);
        var_dump($res);
    }
}

// Search request whose URL is handed to the Spider constructor below.
$searchUrl = "http://fotostrana.ru/search/?cityId=8&otherCity=&gender=w&age=22&ageTo=28&height%5Bmetric%5D=0&height%5Benglish%5D=0&weight%5Bmetric%5D=0&weight%5Benglish%5D=0&newOnly=0&online=1&ajax=true&change=1";

// Endpoints tried earlier, kept for reference:
//$url = "http://fotostrana.ru/profile/ajax/freeVote?value=1&ftoken-all=63655255ba&userId=30911742";
// $url = "http://fotostrana.ru/profile/ajax/freeVote/?userId=30911742&value=1&ftoken-all=63655255ba";
//$url = "http://fotostrana.ru/contest/vote/votedata/?userId=36306794&dir=up&nominationId=0&ftoken-f-contestSendVote-36306794=wfzzieiefc&ajax=1&sendFreeVote=1";
//$url = "http://fotostrana.ru/contest/new/votePopup/_ajax=1&ajax=1&userId=29504825&from=profile";
//$url = "http://fotostrana.ru/contest/new/votePopup/_ajax=1&ajax=1&userId=206878&from=profile";
//$url = "http://fotostrana.ru/contest/new/votePopup/_ajax=1&ajax=1&userId=36429493&from=profile";
//$url = "http://fotostrana.ru/contest/new/votePopup/?_ajax=1&ajax=1&userId=39607735&from=profile";

// $guessurl is assigned twice; only the second value (without send_message=on) survives.
$guessurl = "http://fotostrana.ru/guess/ajax/saveGuess?guess_user=39402139&answer%5B1%5D=16&uanswer%5B1%5D=&answer%5B2%5D=41&uanswer%5B2%5D=&answer%5B3%5D=17992&uanswer%5B3%5D=&send_message=on&ajax=1&isPopup=0";
$guessurl = "http://fotostrana.ru/guess/ajax/saveGuess?guess_user=39402139&answer%5B1%5D=16&uanswer%5B1%5D=&answer%5B2%5D=41&uanswer%5B2%5D=&answer%5B3%5D=17992&uanswer%5B3%5D=&ajax=1&isPopup=0";

// Entry point: build the spider for the search URL and run preconstruct().
$sp = new Spider($searchUrl);
$sp->preconstruct();

// Other entry points, currently disabled:
//$sp->meeting();
//$sp->guess($argv[1],$argv[2]);
//$x = $sp->visit($argv[1],$argv[2]);
//$x = $sp->spider($url);
//$y = json_decode($x);
//echo $y->html;
//var_dump($y);
//~ $sp->visit($argv[1],$argv[2]);
//~ $j = file_get_contents($url);
//~ $x = json_decode($j);
//~ var_dump($j);
/**
 * Build the sitemap for a site-auditor project from its crawled report pages.
 *
 * Reads 'project_id', 'sm_type', 'exclude_url', 'freq' and 'priority' from $sitemapInfo.
 *
 * @param array $sitemapInfo sitemap form values
 * @return void
 */
function generateSitemapFile($sitemapInfo)
{
    $sitemapInfo['project_id'] = intval($sitemapInfo['project_id']);

    // Without a project there is nothing to build.
    if (empty($sitemapInfo['project_id'])) {
        hideDiv('message');
        showErrorMsg("No Website Found!");
        return;
    }

    # check whether the sitemap directory is writable
    $sitemapPath = SP_TMPPATH . "/" . $this->sitemapDir;
    if (!is_writable($sitemapPath)) {
        hideDiv('message');
        showErrorMsg("Directory '<b>" . $sitemapPath . "</b>' is not <b>writable</b>. Please change its <b>permission</b> !");
    }

    $saCtrler = $this->createController('SiteAuditor');
    $projectInfo = $saCtrler->__getProjectInfo($sitemapInfo['project_id']);

    $this->section = formatFileName($projectInfo['name']);
    $this->smType = $sitemapInfo['sm_type'];
    $this->excludeUrl = $sitemapInfo['exclude_url'];

    // Frequency and priority keep their current values unless the form supplies them.
    if (!empty($sitemapInfo['freq'])) {
        $this->changefreq = $sitemapInfo['freq'];
    }
    if (!empty($sitemapInfo['priority'])) {
        $this->priority = $sitemapInfo['priority'];
    }

    $auditorComp = $this->createComponent('AuditorComponent');
    $pageList = $auditorComp->getAllreportPages(" and project_id=" . $sitemapInfo['project_id']);
    $excludeList = trim($sitemapInfo['exclude_url']);

    // Collect every page URL that is not excluded.
    $urlList = array();
    foreach ($pageList as $pageInfo) {
        $pageUrl = Spider::addTrailingSlash($pageInfo['page_url']);
        if (!$auditorComp->isExcludeLink($pageUrl, $excludeList)) {
            $urlList[] = $pageUrl;
        }
    }

    $this->createSitemap($this->smType, $urlList);
}
/**
 * Crawl one page of a site-auditor project, update its report row, and queue the
 * internal links found on it as new report pages.
 *
 * Does nothing when no report row exists for $reportUrl in the project.
 *
 * @param string $reportUrl   URL of the page to audit
 * @param array  $projectInfo project row; reads 'id', 'url', 'max_links', 'exclude_links',
 *                            'store_links_in_page' and the 'check_*' flags
 * @param int    $totalLinks  number of links already saved for the project
 * @return void
 */
function runReport($reportUrl, $projectInfo, $totalLinks)
{
    $spider = new Spider();
    $pageInfo = $spider->getPageInfo($reportUrl, $projectInfo['url'], true);

    // URLs are escaped before going into the condition, as importLinks() does.
    $rInfo = $this->getReportInfo(" and project_id={$projectInfo['id']} and page_url='" . addslashes($reportUrl) . "'");
    if (!$rInfo) {
        return;
    }

    $reportInfo = array();
    $reportInfo['id'] = $rInfo['id'];
    $reportInfo['page_title'] = addslashes($pageInfo['page_title']);
    $reportInfo['page_description'] = addslashes($pageInfo['page_description']);
    $reportInfo['page_keywords'] = addslashes($pageInfo['page_keywords']);
    $reportInfo['total_links'] = $pageInfo['total_links'];
    $reportInfo['external_links'] = $pageInfo['external'];
    $reportInfo['crawled'] = 1;

    // google pagerank check
    if ($projectInfo['check_pr']) {
        $rankCtrler = $this->createController('Rank');
        $reportInfo['pagerank'] = $rankCtrler->__getGooglePageRank(Spider::addTrailingSlash($reportUrl));
    }

    // backlinks page check
    if ($projectInfo['check_backlinks']) {
        $backlinkCtrler = $this->createController('Backlink');
        $backlinkCtrler->url = Spider::addTrailingSlash($reportUrl);
        $reportInfo['bing_backlinks'] = $backlinkCtrler->__getBacklinks('msn');
        $reportInfo['google_backlinks'] = $backlinkCtrler->__getBacklinks('google');
    }

    // indexed page check
    if ($projectInfo['check_indexed']) {
        $saturationCtrler = $this->createController('SaturationChecker');
        $saturationCtrler->url = Spider::addTrailingSlash($reportUrl);
        $reportInfo['bing_indexed'] = $saturationCtrler->__getSaturationRank('msn');
        $reportInfo['google_indexed'] = $saturationCtrler->__getSaturationRank('google');
    }

    // broken link check: the audited page itself is tested ($linkInfo does not exist yet
    // at this point, so the original call received an undefined value)
    if ($projectInfo['check_brocken']) {
        $reportInfo['brocken'] = Spider::isLInkBrocken($reportUrl);
    }

    $this->saveReportInfo($reportInfo, 'update');

    // Counts the links stored for this page; the first stored link (internal or external)
    // is passed with $delete = true.
    $stored = 0;

    // to store sitelinks in page and links reports
    if (!empty($pageInfo['site_links']) && is_array($pageInfo['site_links'])) {
        foreach ($pageInfo['site_links'] as $linkInfo) {
            // if store links
            if ($projectInfo['store_links_in_page']) {
                $delete = ($stored === 0);
                $stored++;
                $linkInfo['report_id'] = $rInfo['id'];
                $this->storePagelLinks($linkInfo, $delete);
            }

            // only queue new pages while the project is below its link limit
            if ($totalLinks >= $projectInfo['max_links']) {
                continue;
            }

            // skip links that do not serve html
            if (preg_match('/\\.zip$|\\.gz$|\\.tar$|\\.png$|\\.jpg$|\\.jpeg$|\\.gif$|\\.mp3$|\\.flv$|\\.pdf$|\\.m4a$|#$/i', $linkInfo['link_url'])) {
                continue;
            }

            // skip links that are blank after formatting
            $linkUrl = Spider::formatUrl($linkInfo['link_url']);
            if (!preg_match('/\\S+/', $linkUrl)) {
                continue;
            }

            // check whether url needs to be excluded
            if ($this->isExcludeLink($linkUrl, $projectInfo['exclude_links'])) {
                continue;
            }

            // save links for the project report, unless the page is already known
            if (!$this->getReportInfo(" and project_id={$projectInfo['id']} and page_url='" . addslashes($linkUrl) . "'")) {
                $repInfo = array(
                    'page_url' => $linkUrl,
                    'project_id' => $projectInfo['id'],
                );
                $this->saveReportInfo($repInfo);
                $totalLinks++;
            }
        }
    }

    // to store external links in page
    if ($projectInfo['store_links_in_page']
        && !empty($pageInfo['external_links'])
        && is_array($pageInfo['external_links'])
    ) {
        foreach ($pageInfo['external_links'] as $linkInfo) {
            $delete = ($stored === 0);
            $stored++;
            $linkInfo['report_id'] = $rInfo['id'];
            // 'extrenal' (sic) is the key passed on to storePagelLinks(); left unchanged.
            $linkInfo['extrenal'] = 1;
            $this->storePagelLinks($linkInfo, $delete);
        }
    }

    // calculate score of the page and update it
    $this->updateReportPageScore($rInfo['id']);

    // calculate score of the project and update it
    $this->updateProjectPageScore($projectInfo['id']);
}
/**
 * Return the effective URL reported by Spider::getURIInfo() for the parent iterator's
 * current element.
 *
 * @return string
 */
public function current()
{
    $uri = parent::current();
    $info = Spider::getURIInfo($uri, $this->options);

    return $info['effective_url'];
}
/**
 * Run the installation: validate the settings, import the database file, write the
 * config file, ping the project site and print the success page.
 *
 * On any failed check the method re-displays the relevant installer step and returns.
 *
 * @param array $info installer form values; reads 'db_host', 'db_user', 'db_pass',
 *                    'db_name' and 'email', and sets 'web_path'
 * @return void
 */
function proceedInstallation($info)
{
    $db = new DB();

    # checking db settings
    $errMsg = $db->connectDatabase($info['db_host'], $info['db_user'], $info['db_pass'], $info['db_name']);
    if ($db->error) {
        $this->startInstallation($info, $errMsg);
        return;
    }

    # checking config file settings
    if (!is_writable(SP_INSTALL_CONFIG_FILE)) {
        $this->checkRequirements(true);
        return;
    }

    # checking seo panel web path
    $info['web_path'] = $this->getWebPath();
    if (empty($info['web_path'])) {
        $errMsg = "Error occured while parsing installation url. Please <a href='http://www.seopanel.in/contact/' target='_blank'>contact</a> Seo Panel team.";
        $this->startInstallation($info, $errMsg);
        return;
    }

    # importing data to db
    $errMsg = $db->importDatabaseFile(SP_INSTALL_DB_FILE);
    if ($db->error) {
        $errMsg = "Error occured while importing data: " . $errMsg;
        $this->startInstallation($info, $errMsg);
        return;
    }

    # write to config file
    $this->writeConfigFile($info);

    # notify the project site about the installation, only when its host resolves
    if (gethostbynamel('seopanel.in')) {
        include_once SP_INSTALL_DIR . '/../libs/spider.class.php';

        // SERVER_ADDR may be absent (e.g. CLI) and must be encoded like the other values.
        $serverAddr = isset($_SERVER['SERVER_ADDR']) ? $_SERVER['SERVER_ADDR'] : '';
        $installUpdateUrl = "http://www.seopanel.in/installupdate.php?url=" . urlencode($info['web_path'])
            . "&ip=" . urlencode($serverAddr)
            . "&email=" . urlencode($info['email']);

        $spider = new Spider();
        $spider->getContent($installUpdateUrl);
    }

    // web_path is derived from the request URL, so escape it for the attribute context.
    $loginUrl = htmlspecialchars($info['web_path'] . "/login.php", ENT_QUOTES, 'UTF-8');
    ?>
    <form method="post" action="<?php echo $loginUrl; ?>">
    <h1 class="BlockHeader">Seo Panel Installation Success</h1>
    <table width="100%" cellspacing="8px" cellpadding="0px" class="formtab">
        <tr><th colspan="2" class="headersuccess">Seo Panel installed successfully!</th></tr>
        <tr>
            <td class="warning">Warning!</td>
        </tr>
        <tr>
            <td style="border: none;">
                <ul class="list">
                    <li> Please change permission of config file <b><?php echo SP_CONFIG_FILE; ?></b> to avoid security issues.</li>
                    <li>Please remove installation directory <b>install</b> to avoid security issues.</li>
                </ul>
            </td>
        </tr>
        <tr>
            <td class="warning" style="color:black;">Admin Login</td>
        </tr>
        <tr>
            <td style="border: none;font-weight: normal;font-size: 13px;">
                <b>Username:</b> <?php echo SP_ADMIN_USER; ?>
                <br>
                <b>Password:</b> <?php echo SP_ADMIN_PASS; ?>
                <br><br>
                <b>Note:</b> Please change password of admin after first login.
            </td>
        </tr>
    </table>
    <input type="submit" value="Proceed to admin login >>" name="submit" class="button">
    </form>
    <?php
}
/**
 * Import a comma-separated list of page links into a site-auditor project.
 *
 * On success the project list is shown and the script exits; otherwise the import form
 * is shown again with an error message.
 *
 * @param array $listInfo form values; reads 'project_id' and 'links'
 * @return void
 */
function importLinks($listInfo)
{
    $userId = isLoggedIn();
    $listInfo['project_id'] = intval($listInfo['project_id']);
    $this->set('post', $listInfo);

    $errMsg = array();
    $errMsg['links'] = formatErrorMsg($this->validate->checkBlank($listInfo['links']));

    if (!$this->validate->flagErr) {
        $totalLinks = $this->getCountcrawledLinks($listInfo['project_id']);
        $projectInfo = $this->__getProjectInfo($listInfo['project_id']);

        if ($totalLinks >= $projectInfo['max_links']) {
            // the project has already reached its link limit
            $errMsg['links'] = formatErrorMsg($this->spTextSA['totallinksgreaterallowed'] . " - {$projectInfo['max_links']}");
        } else {
            // check whether links are pages of website
            $linkInfo = $this->checkExcludeLinks($listInfo['links'], $projectInfo['url'], false);

            if (!empty($linkInfo['err_msg'])) {
                $errMsg['links'] = formatErrorMsg($linkInfo['err_msg']);
            } else {
                $auditorComp = $this->createComponent('AuditorComponent');
                $error = false;
                $linkList = array();

                foreach (explode(",", $listInfo['links']) as $link) {
                    $link = Spider::formatUrl(trim($link));
                    if (empty($link)) {
                        continue;
                    }

                    // A link repeated in the input is saved once, so it must only be
                    // counted once against the project's link limit.
                    if (isset($linkList[$link])) {
                        continue;
                    }

                    if ($auditorComp->isExcludeLink($link, $projectInfo['exclude_links'])) {
                        continue;
                    }

                    // check whether url exists or not
                    if ($auditorComp->getReportInfo(" and project_id={$projectInfo['id']} and page_url='" . addslashes($link) . "'")) {
                        $errMsg['links'] = formatErrorMsg($this->spTextSA['Page Link'] . " '<b>{$link}</b>' " . $_SESSION['text']['label']['already exist']);
                        $error = true;
                        break;
                    }

                    // if total links greater than max links of a project
                    $totalLinks++;
                    if ($totalLinks > $projectInfo['max_links']) {
                        $error = true;
                        $errMsg['links'] = formatErrorMsg($this->spTextSA['totallinksgreaterallowed'] . " - {$projectInfo['max_links']}");
                        break;
                    }

                    $linkList[$link] = 1;
                }

                // to save the pages if no error occurs
                if (!$error) {
                    foreach ($linkList as $link => $val) {
                        $reportInfo = array(
                            'page_url' => $link,
                            'project_id' => $projectInfo['id'],
                        );
                        $auditorComp->saveReportInfo($reportInfo);
                    }

                    $this->showAuditorProjects();
                    exit;
                }
            }
        }
    }

    $this->set('errMsg', $errMsg);
    $this->showImportProjectLinks();
}
/**
 * Check whether a directory's submit page is reachable and whether it uses a captcha,
 * optionally look up its Google PageRank, store the result in the `directories` table,
 * and print the outcome.
 *
 * @param int $dirId   id of the directory to check
 * @param int $nodebug when truthy, emit scripts that update the captcha (and PageRank)
 *                     cells plus the status link; otherwise print a plain "saved" note
 * @return void
 */
function checkDirectoryStatus($dirId, $nodebug = 0)
{
    $dirId = intval($dirId);
    $dirInfo = $this->getDirectoryInfo($dirId);

    $active = 0;
    $captcha = 0;
    // Defined up front: the PageRank script below runs even when the page fetch failed.
    $pagerank = '';
    $prUpdate = '';

    $spider = new Spider();
    $ret = $spider->getContent(addHttpToUrl($dirInfo['submit_url']));

    if (empty($ret['error']) && !empty($ret['page'])) {
        $page = $ret['page'];

        $matches = $this->isCategoryExists($page, $dirInfo['category_col']);
        $active = empty($matches[0]) ? 0 : 1;

        // An empty needle must not count as a hit (it warns on PHP 7 and matches
        // everything on PHP 8).
        $hasCaptcha = !empty($dirInfo['captcha_script'])
            && stripos($page, $dirInfo['captcha_script']) !== false;
        $captcha = $hasCaptcha ? 1 : 0;

        if ($this->checkPR) {
            include_once SP_CTRLPATH . "/rank.ctrl.php";
            $rankCtrler = new RankController();
            $pagerank = $rankCtrler->__getGooglePageRank($dirInfo['domain']);
            // Cast for the SQL so a non-numeric result cannot break the statement.
            $prUpdate = ",google_pagerank=" . intval($pagerank);
        }
    }

    $sql = "update directories set working={$active},is_captcha={$captcha},checked=1 {$prUpdate} where id={$dirId}";
    $this->db->query($sql);

    if ($nodebug) {
        $captchaLabel = $captcha ? $_SESSION['text']['common']['Yes'] : $_SESSION['text']['common']['No'];
        ?>
        <script type="text/javascript">
            document.getElementById('captcha_<?php echo $dirId; ?>').innerHTML = '<?php echo $captchaLabel; ?>';
        </script>
        <?php
        if ($this->checkPR) {
            ?>
            <script type="text/javascript">
                document.getElementById('pr_<?php echo $dirId; ?>').innerHTML = '<?php echo $pagerank; ?>';
            </script>
            <?php
        }
        echo $this->getStatusLink($dirId, $active);
    } else {
        echo "<p class='note notesuccess'>Saved status of directory <b>{$dirInfo['domain']}</b>.....</p>";
    }
}
<?php
// include Spider class file
require_once 'spider.class.php';

// create new Spider object
$spider = new Spider('http://onestopshopmarket.com');

// allow files with extension *.txt being spidered
$spider->allowType('txt');

// and disable files with that extension
$spider->restrictType('txt');

// set it to true if you want to see what is happening on the screen
$spider->setVerbose(true);

// start spidering website
$spider->startSpider();

// all found and fetched links are in that variable
$links = $spider->all_links;

// print it out (print_r, not the non-existent print_f)
print_r($links);
/**
 * Render HTML as a PDF with the application's screen stylesheet, send it inline to the
 * browser and end the script.
 *
 * @param string $content  HTML body of the report
 * @param string $fileName file name passed to mPDF's Output()
 * @return void this function does not return; it exits after output
 */
function exportToPdf($content, $fileName = "reports.pdf")
{
    include_once SP_LIBPATH . "/mpdf/mpdf.php";

    $pdf = new mPDF();
    $pdf->useAdobeCJK = true;
    $pdf->SetAutoFont(AUTOFONT_ALL);

    // Fetch the screen stylesheet and point its relative image paths at SP_IMGPATH.
    $cssFetcher = new Spider();
    $cssResponse = $cssFetcher->getContent(SP_CSSPATH . "/screen.css");
    $css = str_replace("../../../images", SP_IMGPATH, $cssResponse['page']);

    // Second argument of WriteHTML: 1 for the stylesheet, 2 for the body.
    $pdf->WriteHTML($css, 1);
    $pdf->SetDisplayMode('fullpage');
    $pdf->WriteHTML($content, 2);

    $pdf->Output($fileName, "I");
    exit;
}
// NOTE(review): the two closing braces below end definitions that begin
// outside this excerpt (presumably a method and its class) — confirm.
//$like_user_id =
//$post = "action=user.like&params[like]=5&params[like_user_id]={$like_user_id}";
}
}

// ---- Driver script: everything below runs at file scope. ----

// Search URL; note that it is overwritten with an empty string further down.
$searchUrl = "http://fotostrana.ru/search/?cityId=8&otherCity=&gender=w&age=22&ageTo=28&height%5Bmetric%5D=0&height%5Benglish%5D=0&weight%5Bmetric%5D=0&weight%5Benglish%5D=0&newOnly=0&online=1&ajax=true&change=1";

// Alternative endpoints kept by the author for reference (all disabled):
//$url = "http://fotostrana.ru/profile/ajax/freeVote?value=1&ftoken-all=63655255ba&userId=30911742";
// $url = "http://fotostrana.ru/profile/ajax/freeVote/?userId=30911742&value=1&ftoken-all=63655255ba";
//$url = "http://fotostrana.ru/contest/vote/votedata/?userId=36306794&dir=up&nominationId=0&ftoken-f-contestSendVote-36306794=wfzzieiefc&ajax=1&sendFreeVote=1";
//$url = "http://fotostrana.ru/contest/new/votePopup/_ajax=1&ajax=1&userId=29504825&from=profile";
//$url = "http://fotostrana.ru/contest/new/votePopup/_ajax=1&ajax=1&userId=206878&from=profile";
//$url = "http://fotostrana.ru/contest/new/votePopup/_ajax=1&ajax=1&userId=36429493&from=profile";
//$url = "http://fotostrana.ru/contest/new/votePopup/?_ajax=1&ajax=1&userId=39607735&from=profile";

// $guessurl is assigned twice; the second value (without send_message=on)
// is the one that remains.
$guessurl = "http://fotostrana.ru/guess/ajax/saveGuess?guess_user=39402139&answer%5B1%5D=16&uanswer%5B1%5D=&answer%5B2%5D=41&uanswer%5B2%5D=&answer%5B3%5D=17992&uanswer%5B3%5D=&send_message=on&ajax=1&isPopup=0";
$guessurl = "http://fotostrana.ru/guess/ajax/saveGuess?guess_user=39402139&answer%5B1%5D=16&uanswer%5B1%5D=&answer%5B2%5D=41&uanswer%5B2%5D=&answer%5B3%5D=17992&uanswer%5B3%5D=&ajax=1&isPopup=0";

// Discards the search URL set above.
$searchUrl = '';

$sp = new Spider();

// NOTE(review): this URL appears to embed a user id and a login hash in
// source — confirm whether that credential should live in code.
$url = "http://fotostrana.ru/user/autologin/?u=11935779&h=10aaaec8776a62a";
$x = $sp->spider($url);

// Reassigned to the profile URL, but the fetch that would use it is disabled.
$url = "http://fotostrana.ru/user/11935779";
//$x = $sp->spider($url);
//$sp->collectIds(1,2);
//$sp->visit($argv[1],$argv[2]);

// The only action currently enabled after the autologin request.
$sp->meeting();

//$sp->guess(10,20);
//$sp->guess(21,30);
//$sp->guess($argv[1],$argv[2]);
//$sp->guess($argv[1],$argv[2]);
//$x = $sp->visit($argv[1],$argv[2]);
//$y = json_decode($x);
//echo $y->html;
//var_dump($y);
<?php
/** Print the Daybreakers ratings scraped from three review sites. **/
include 'globals.inc';

echo '<h2>Daybreakers ratings</h2>';

$scraper = new Spider();

// Label => (XPath of the rating node, page to query), in output order.
$sources = array(
    'IMDB' => array(
        ".//*[@id='tn15rating']/div[1]/div/div[2]/b",
        'http://www.imdb.com/title/tt0433362/',
    ),
    'Metacritic' => array(
        ".//*[@id='metascore']",
        'http://www.metacritic.com/film/titles/daybreakers',
    ),
    'Rotten Tomatoes' => array(
        ".//*[@id='tomatometer_score']/span[1]",
        'http://www.rottentomatoes.com/m/daybreakers/',
    ),
);

foreach ($sources as $label => $source) {
    list($xpath, $pageUrl) = $source;
    // qf() is queried once per site; the result's "inner" property is printed.
    echo $label . ': ' . $scraper->qf($xpath, $pageUrl)->inner . '<br/>';
}
<?php
// Manual smoke test for the Spider, Controller and Analyse classes.
include 'Controller.class.php';

// --- Spider ---
$spider = new Spider('深圳娃娃鱼事件');
echo $spider->get(1);

// --- Controller: three work(5) rounds in total, then one task ---
$controller = new Controller();
foreach (array(1, 2, 3) as $round) {
    $controller->work(5);
}
$controller->task('深圳多名官员吃娃娃鱼');

// --- Analyse: print the total, then dump the results ---
$analyse = new Analyse('深圳多名官员吃娃娃鱼', 1);
echo $analyse->total();
var_dump($analyse->results());
/**
 * Re-checks a directory's submit page and stores the outcome.
 *
 * Fetches the directory's submit URL and derives from the returned page:
 * the "working" and "captcha" flags, whether a search field is present,
 * and (for script type 1) a rewritten `extra_val`. Optionally looks up the
 * Google PageRank. The results are written to the `directories` row, and
 * either inline <script> updates plus the status link or a plain
 * confirmation note is echoed.
 *
 * @param int $dirId   Id of the directory row; coerced with intval().
 * @param int $nodebug Truthy: echo the <script> updates and the status link.
 *                     Falsy: echo the "Saved status" note instead.
 *
 * @return void All output is echoed directly.
 */
function checkDirectoryStatus($dirId, $nodebug = 0)
{
    $dirId = intval($dirId);
    $dirInfo = $this->getDirectoryInfo($dirId);
    $active = 0;
    $captcha = 0;
    $prUpdate = '';
    $searchUpdate = '';
    $extraValUpdate = '';
    // Stays null unless a rank was really fetched, so the output below never
    // reads an undefined variable.
    $pagerank = null;

    $spider = new Spider();
    $ret = $spider->getContent(addHttpToUrl($dirInfo['submit_url']));

    if (empty($ret['error']) && !empty($ret['page'])) {
        $page = $ret['page'];

        // The directory counts as working when the category lookup yields a
        // non-empty first match.
        $matches = $this->isCategoryExists($page, $dirInfo['category_col']);
        $active = empty($matches[0]) ? 0 : 1;

        // An empty needle matches every page on PHP 8 (and warns on older
        // versions), so only search when a captcha marker is configured.
        $captchaScript = isset($dirInfo['captcha_script']) ? (string) $dirInfo['captcha_script'] : '';
        if ($captchaScript !== '' && stripos($page, $captchaScript) !== false) {
            $captcha = 1;
        }

        // Record the search script when the page contains a "search" field.
        if (stripos($page, 'name="search"') !== false) {
            $searchUpdate = ",search_script='index.php?search=[--keyword--]'";
        }

        // For script type 1 (phpLD per the original comment): when the form
        // uses numeric LINK_TYPE values, translate the named ones.
        if ($dirInfo['script_type_id'] == 1 && preg_match('/name="LINK_TYPE" value="(\\d)"/s', $page)) {
            $subject = array('LINK_TYPE=reciprocal', 'LINK_TYPE=normal', 'LINK_TYPE=free');
            $replace = array('reciprocal=1&LINK_TYPE=1', 'LINK_TYPE=2', 'LINK_TYPE=3');
            $dirInfo['extra_val'] = str_replace($subject, $replace, $dirInfo['extra_val']);
            // Escape before embedding in the quoted SQL literal; a quote in the
            // stored value would otherwise break (or alter) the statement.
            $extraValUpdate = ",extra_val='" . addslashes($dirInfo['extra_val']) . "'";
        }

        if ($this->checkPR) {
            include_once SP_CTRLPATH . "/rank.ctrl.php";
            $rankCtrler = new RankController();
            $pagerank = $rankCtrler->__getGooglePageRank($dirInfo['domain']);
            // Force an integer so an empty or non-numeric rank cannot break the SQL.
            $prUpdate = ",google_pagerank=" . intval($pagerank);
        }
    }

    $sql = "update directories set working={$active},is_captcha={$captcha},checked=1 {$prUpdate} {$searchUpdate} {$extraValUpdate} where id={$dirId}";
    $this->db->query($sql);

    if ($nodebug) {
        $captchaLabel = $captcha ? $_SESSION['text']['common']['Yes'] : $_SESSION['text']['common']['No'];
        // The element ids must not carry trailing whitespace, otherwise
        // getElementById() cannot find the target cells.
        ?>
        <script type="text/javascript">
            document.getElementById('captcha_<?php echo $dirId; ?>').innerHTML = '<?php echo $captchaLabel; ?> ';
        </script>
        <?php
        // Only touch the PR cell when a rank was fetched; on a failed fetch the
        // stored rank is unchanged, so the displayed one is left alone too.
        if ($this->checkPR && $pagerank !== null) {
            ?>
            <script type="text/javascript">
                document.getElementById('pr_<?php echo $dirId; ?>').innerHTML = '<?php echo $pagerank; ?> ';
            </script>
            <?php
        }
        echo $this->getStatusLink($dirId, $active);
    } else {
        echo "<p class='note notesuccess'>Saved status of directory <b>{$dirInfo['domain']}</b>.....</p>";
    }
}
/**
 * Reports whether a URL answers with a "404 Not Found" header.
 *
 * @param string $url Address handed to Spider::getHeader().
 *
 * @return bool True when the header text contains "404 Not Found"
 *              (case-insensitive), false otherwise.
 */
function isLInkBrocken($url)
{
    $responseHeader = Spider::getHeader($url);

    // stristr() yields the matched tail of the header or false; the cast
    // collapses that to the same true/false the caller expects.
    return (bool) stristr($responseHeader, '404 Not Found');
}
/**
 * Constructs the instance by delegating directly to the parent constructor.
 *
 * @param mixed $url Passed through unchanged to parent::__construct().
 */
function __construct($url) {
    // Original author's note (translated): if a configuration for this URL
    // can be found in the database it is loaded; otherwise a new record is
    // created and loaded.
    // NOTE(review): that lookup is not visible in this block — presumably it
    // happens inside the parent constructor; confirm.
    parent::__construct($url);
}
<?php
// Demo: query Google search results and a Metacritic score through Spider.

// Init
include 'globals.inc';
$s = new Spider();

// Get an array of search results from Google.
$a = $s->qa('.//*[@id="res"]/div/ol/li/h3/a', 'http://www.google.co.nz/search?q=daybreakers');

// Print the headers and the array. print_r() must return its dump (second
// argument true); otherwise it prints by itself and echo then emits its
// boolean return value as a stray "1" before the closing tag.
echo '<pre>', print_r($s->getHead(), true), '</pre>';
echo '<pre>', print_r($a, true), '</pre>';

// Get a full list object of the search results; no URL is passed this time.
$list = $s->qq('.//*[@id="res"]/div/ol/li/h3/a');

// Echo the first node's inner text.
echo $list(0)->inner;

// Print a list of the search results from Google.
// NOTE(review): href and inner come from a remote page and are echoed
// unescaped — confirm whether they need htmlspecialchars() here.
echo '<ul>';
foreach ($list() as $link) {
    echo '<li><a href="' . $link->href . '">' . $link->inner . '</a></li>';
}
echo '</ul>';

// Get the score from Metacritic.
$score = $s->qf('.//*[@id="metascore"]', 'http://www.metacritic.com/film/titles/daybreakers')->inner;
echo "<p>Score for Daybreakers: {$score}</p>";
// Planning notes from the original author (translated):
//  - user / community features
//  - widget: filter by time through AJAX
//  - configure search options by submitting a form, not AJAX
//  - appendText is not recommended
//  - collapsible: collapsible lists do not support count bubbles
//  - pagination / "show more"
//  - auto-fill common filter words: news, notices, academic lectures

// Read the configuration: the time filter defaults to "week".
$filter = "week";
if (isset($_GET['filter'])) {
    $filter = $_GET['filter'];
}

header("Content-type:text/html;charset=utf-8");

// Fetch the data according to the configuration.
define('SPIDER_PATH', '../core/');
require_once SPIDER_PATH . 'php_web_spider.php';
require_once SPIDER_PATH . 'simple_html_dom.php';

// Author's note: the spider will analyse the submitted UI configuration.
$sp = new Spider();
// $tmp = $sp-> fetch_news('http://www.phbs.pku.edu.cn/index.php?m=content&c=index&a=lists&catid=419');
// print_r($tmp);exit(0);

// News list pages, keyed by school name. Author's note: these URLs should
// ideally come from a database; pages of one school share the root address
// and differ only in path. One of the sites provides links in a special form.
$newsSources = array(
    '信息工程学院' => 'http://www.ece.pku.edu.cn/index.php?m=content&c=index&a=lists&catid=502',
    '汇丰商学院' => 'http://www.phbs.pku.edu.cn/index.php?m=content&c=index&a=lists&catid=419',
    // Disabled sources (these were fetched without the filter argument):
    // '化学生物学与生物技术学院' => 'http://www.scbb.pkusz.edu.cn/index.php?m=content&c=index&a=lists&catid=862',
    // '环境与能源学院' => 'http://see.pkusz.edu.cn/news_cn.aspx',
    // '城市规划与设计学院' => 'http://see.pkusz.edu.cn/news_cn.aspx',
    // '城市规划与设计学院' => 'http://sam.pkusz.edu.cn/index.php?m=content&c=index&a=lists&catid=395',
);
foreach ($newsSources as $school => $listUrl) {
    $news[$school] = $sp->fetch_news($listUrl, $filter);
}

// Lecture list pages, keyed by school name.
$lectureSources = array(
    '信息工程学院' => 'http://www.ece.pku.edu.cn/index.php?m=content&c=index&a=lists&catid=503',
    '汇丰商学院' => 'http://www.phbs.pku.edu.cn/list-812-1.html',
    // Disabled source (was fetched without the filter argument):
    // '新材料学院' => 'http://sam.pkusz.edu.cn/index.php?m=content&c=index&a=lists&catid=809',
);
foreach ($lectureSources as $school => $listUrl) {
    $lecture[$school] = $sp->fetch_news($listUrl, $filter);
}

// Error noted by the original author:
// Undefined variable: find_link in D:\Program Files\xampp\htdocs\GitHub\php_web_spider\core\php_web_spider.php on line 259