Example #1
0
 /**
  * Recursively crawl a URL and the matching links found on it
  *
  * The URL is percent-encoded (its structural characters are restored
  * afterwards), the deepest slash count seen so far is kept in self::$depth,
  * and the page is fetched unless it is already a key of self::$urls.
  * Failed fetches are appended to self::$errors; successful ones are stored
  * in self::$urls and each of their anchors whose href contains the page's
  * domain is crawled in turn, as long as the optional time budget allows.
  *
  * @param  string $url      URL to fetch
  * @param  array  $elements passed through to the Spider constructor
  * @param  string $parent   URL of the page that linked here (recorded with errors)
  * @param  string $start    timestamp at which the crawl started
  * @param  string $time     maximum crawl duration, in seconds
  * @return void
  */
 public static function crawl($url, $elements = null, $parent = null, $start = null, $time = null)
 {
     // Percent-encode everything, then put the structural characters back
     $encoded = array('%3A', '%2F', '%23', '%3F', '%3D', '%25', '%2B');
     $literal = array(':', '/', '#', '?', '=', '%', '+');
     $url = str_replace($encoded, $literal, rawurlencode($url));

     // Remember the deepest level seen (slash count minus two, presumably for the "//" of the scheme)
     $level = substr_count($url, '/') - 2;
     if ($level > self::$depth) {
         self::$depth = $level;
     }

     // Nothing to do for a URL that has already been stored
     if (array_key_exists($url, self::$urls) || array_key_exists(strtolower($url), self::$urls)) {
         return;
     }

     $page = new Spider($url, $elements);
     echo '-> (' . $page->getCode() . ') ' . $url . PHP_EOL;

     if ($page->isError()) {
         self::$errors[] = array('code' => $page->getCode(), 'url' => $url, 'parent' => $parent);
         return;
     }

     self::$urls[$url] = $page;

     // Strip the schema from the base URL and keep only what precedes the first slash
     $domain = str_replace($page->getSchema(), '', $page->getBase());
     $slashAt = strpos($domain, '/');
     if ($slashAt !== false) {
         $domain = substr($domain, 0, $slashAt);
     }

     $anchors = $page->getElements('a');
     if (null === $anchors) {
         return;
     }

     $hasBudget = (null !== $start && null !== $time);
     foreach ($anchors as $anchor) {
         // Once the time budget is spent, the remaining links are skipped
         if ($hasBudget && (time() - $start > $time)) {
             continue;
         }

         $href = $anchor['href'];
         if (null === $href || $href == '') {
             continue;
         }

         // Ignore pure fragment and pure query-string links
         $firstChar = substr($href, 0, 1);
         if ($firstChar == '#' || $firstChar == '?') {
             continue;
         }

         // Only follow links that mention the current domain
         if (stripos($href, $domain) === false) {
             continue;
         }

         self::crawl($href, $elements, $url, $start, $time);
     }
 }
Example #2
0
 /**
  * Scrape a single post page and append its title, date and body
  *
  * The values are read from fixed positions below the first <article>
  * element of the document returned by parent::get_html() and pushed onto
  * $this->title, $this->post_date and $this->body.
  *
  * @param string $url address of the post page
  * @return void
  */
 public function process_post($url)
 {
     $document = parent::get_html($url);
     $articles = $document->getElementsByTagName("article");

     // First child of the first <article>: holds both the title and the date
     $header = $articles->item(0)->childNodes->item(0);
     $this->title[] = $header->childNodes->item(0)->nodeValue;

     $dateNode = $header->childNodes->item(2)->childNodes->item(0)->childNodes->item(2)->childNodes->item(1);
     $this->post_date[] = $dateNode->nodeValue;

     // Second child of the first <article>, serialised so the body can be cut out by string position
     $markup = $articles->item(0)->childNodes->item(1)->C14N();
     $bodyStart = strpos($markup, '</script></span>');
     // NOTE(review): the 880 offset is not explained anywhere in this block
     $bodyLength = strpos($markup, '<plusone') - $bodyStart - 880;
     $this->body[] = substr($markup, $bodyStart, $bodyLength);
 }
Example #3
0
 /**
  * Download the content of a URI
  *
  * @param string $uri     the URI to fetch
  * @param array  $options supported keys:
  *                        'curl_options'    array of CURLOPT_* => value pairs,
  *                        'user_agent'      string sent as the User-Agent,
  *                        'crawl_404_pages' when set and false, a 0/404 status is an error
  * @return string the downloaded content
  * @throws Exception when no content was returned, or on a 0/404 status
  *                   while 'crawl_404_pages' is disabled
  */
 public function download($uri, $options = array())
 {
     //Make sure that the curl_options exists.
     if (!isset($options['curl_options'])) {
         $options['curl_options'] = array();
     }
     if (isset($options['user_agent'])) {
         $options['curl_options'][CURLOPT_USERAGENT] = $options['user_agent'];
     }
     //Make sure that the content is returned.
     $options['curl_options'][CURLOPT_RETURNTRANSFER] = true;
     $options['curl_options'][CURLOPT_NOBODY] = false;
     $info = Spider::getURIInfo($uri, $options['curl_options']);

     // Only a missing/failed/empty body is an error; a body such as "0" is valid content
     $content = isset($info['content']) ? $info['content'] : false;
     if ($content === false || $content === '') {
         throw new Exception('Error downloading ' . $uri);
     }

     // Normalise the status to an integer so the membership test can be strict
     $httpCode = isset($info['http_code']) ? (int) $info['http_code'] : 0;
     if (in_array($httpCode, array(0, 404), true) && isset($options['crawl_404_pages']) && !$options['crawl_404_pages']) {
         throw new Exception('404 page ' . $uri . ' ' . $httpCode);
     }
     return $content;
 }
Example #4
0
File: basic.php Project: unl/Spider
// Create one table per record type. Nothing in this script writes to them:
// the loggers that take $db are commented out below.
// NOTE(review): $db is not created in this part of the file — confirm where it is set up.
$db->exec('create table SpiderPage (
    id serial,
    uri varchar(255),
    primary key(id)
);');
$db->exec('create table SpiderJavaScript (
    id serial,
    uri varchar(255),
    script varchar(255),
    primary key(id)
);');
$db->exec('create table SpiderStyleSheet (
    id serial,
    uri varchar(255),
    style varchar(255),
    primary key(id)
);');
//$pageLogger       = new Spider_PageLogger($db);
//$javaScriptLogger = new Spider_JavaScriptLogger($db);
//$styleSheetLogger = new Spider_StyleSheetLogger($db);
// Assemble the spider from a downloader and a parser, attach the plain logger
// and register the anchor and mailto URI filters.
$logger = new Spider_Logger();
$downloader = new Spider_Downloader();
$parser = new Spider_Parser();
$spider = new Spider($downloader, $parser);
$spider->addLogger($logger);
$spider->addUriFilter('Spider_AnchorFilter');
$spider->addUriFilter('Spider_MailtoFilter');
//$spider->addLogger($pageLogger);
//$spider->addLogger($styleSheetLogger);
//$spider->addLogger($javaScriptLogger);
// Start spidering at the given base URI.
$spider->spider('http://www.unl.edu/fwc/');
Example #5
0
 /**
  * Render the backlink report page for one website over a date range.
  *
  * Rows from `backlinkresults` inside the range are loaded and, for every
  * column in $this->colList, the change against the previous row is added
  * as 'rank_diff_<col>' (wrapped in a green/red <font> tag when non-zero).
  *
  * @param array|string $searchInfo optional filters: 'from_time' and 'to_time'
  *                                 (date strings) and 'website_id'
  * @return void
  */
 function showReports($searchInfo = '')
 {
     $userId = isLoggedIn();

     // Range start: the requested day at 00:00:00, or 30 days ago when missing or unparsable
     $fromTime = empty($searchInfo['from_time']) ? false : strtotime($searchInfo['from_time'] . ' 00:00:00');
     if ($fromTime === false) {
         $fromTime = mktime(0, 0, 0, (int) date('m'), (int) date('d') - 30, (int) date('Y'));
     }

     // Range end: the requested day at 23:59:59, or now. time() replaces the
     // argument-less mktime() call, which PHP 8 rejects with an ArgumentCountError.
     $toTime = empty($searchInfo['to_time']) ? false : strtotime($searchInfo['to_time'] . ' 23:59:59');
     if ($toTime === false) {
         $toTime = time();
     }

     $this->set('fromTime', date('Y-m-d', $fromTime));
     $this->set('toTime', date('Y-m-d', $toTime));

     $websiteController = new WebsiteController();
     $websiteList = $websiteController->__getAllWebsites($userId, true);
     $this->set('websiteList', $websiteList);

     // Default to the first website of the user; stay null when the user has none
     if (!empty($searchInfo['website_id'])) {
         $websiteId = intval($searchInfo['website_id']);
     } else {
         $websiteId = isset($websiteList[0]['id']) ? $websiteList[0]['id'] : null;
     }
     $this->set('websiteId', $websiteId);

     $conditions = empty($websiteId) ? "" : " and s.website_id={$websiteId}";
     $sql = "select s.* ,w.name\r\n\t\t\t\t\t\t\t\tfrom backlinkresults s,websites w \r\n\t\t\t\t\t\t\t\twhere s.website_id=w.id \r\n\t\t\t\t\t\t\t\tand result_time>= {$fromTime} and result_time<={$toTime} {$conditions}  \r\n\t\t\t\t\t\t\t\torder by result_time";
     $reportList = $this->db->select($sql);

     $colList = $this->colList;
     $prevRank = array();
     foreach ($colList as $col => $dbCol) {
         $prevRank[$col] = 0;
     }

     # loop through the rows, comparing each one with the row before it
     $isFirstRow = true;
     foreach ($reportList as $key => $repInfo) {
         foreach ($colList as $col => $dbCol) {
             $rankDiff = '';
             // The first row has no predecessor, so its difference stays empty
             if (!$isFirstRow) {
                 $rankDiff = ($prevRank[$col] - $repInfo[$dbCol]) * -1;
                 if ($rankDiff > 0) {
                     $rankDiff = "<font class='green'>({$rankDiff})</font>";
                 } elseif ($rankDiff < 0) {
                     $rankDiff = "<font class='red'>({$rankDiff})</font>";
                 }
             }
             // A difference of zero is shown as an empty string
             $reportList[$key]['rank_diff_' . $col] = empty($rankDiff) ? '' : $rankDiff;
             $prevRank[$col] = $repInfo[$dbCol];
         }
         $isFirstRow = false;
     }

     $websiteInfo = $websiteController->__getWebsiteInfo($websiteId);
     $websiteUrl = @Spider::removeTrailingSlash(formatUrl($websiteInfo['url']));
     $websiteUrl = urldecode($websiteUrl);
     $this->set('directLinkList', array('google' => $this->backUrlList['google'] . $websiteUrl, 'msn' => $this->backUrlList['msn'] . $websiteUrl, 'alexa' => $this->backUrlList['alexa'] . $websiteUrl));
     // Newest results first, keeping the original keys
     $this->set('list', array_reverse($reportList, true));
     $this->render('backlink/backlinkreport');
 }
Example #6
0
    /**
     * Run the installation and print the success page.
     *
     * Steps, in order: check the database connection, the writability of the
     * config file and the detected web path; import the schema and language
     * SQL files; write the config file; create the API key; report the
     * installation to seopanel.in when that host resolves; store the admin
     * e-mail address; finally print the form leading to the admin login.
     * Any failing check re-displays the relevant installation step and returns.
     *
     * @param array $info installation input; keys read here: db_host, db_user,
     *                    db_pass, db_name, email ('web_path' is filled in here)
     * @return void
     */
    function proceedInstallation($info)
    {
        $db = new DB();
        # checking db settings
        $errMsg = $db->connectDatabase($info['db_host'], $info['db_user'], $info['db_pass'], $info['db_name']);
        if ($db->error) {
            $this->startInstallation($info, $errMsg);
            return;
        }
        # checking config file settings
        if (!is_writable(SP_INSTALL_CONFIG_FILE)) {
            $this->checkRequirements(true);
            return;
        }
        # checking seo panel web path
        $info['web_path'] = $this->getWebPath();
        if (empty($info['web_path'])) {
            $errMsg = "Error occured while parsing installation url. Please <a href='http://www.seopanel.in/contact/' target='_blank'>contact</a> Seo Panel team.<br> or <br> Try manual installation by steps specified in <a href='http://www.seopanel.in/install/manual/' target='_blank'>http://www.seopanel.in/install/manual/</a>";
            $this->startInstallation($info, $errMsg);
            return;
        }
        # importing data to db
        $errMsg = $db->importDatabaseFile(SP_INSTALL_DB_FILE);
        if ($db->error) {
            $errMsg = "Error occured while importing data: " . $errMsg;
            $this->startInstallation($info, $errMsg);
            return;
        }
        # importing text file
        $errMsg = $db->importDatabaseFile(SP_INSTALL_DB_LANG_FILE);
        if ($db->error) {
            $errMsg = "Error occured while importing data: " . $errMsg;
            $this->startInstallation($info, $errMsg);
            return;
        }
        # write to config file
        $this->writeConfigFile($info);
        # create API Key if not exists
        $this->createSeoPanelAPIKey($db);
        # report the installation, but only when the update host can be resolved
        if (gethostbynamel('seopanel.in')) {
            include_once SP_INSTALL_DIR . '/../libs/spider.class.php';
            include_once SP_INSTALL_CONFIG_FILE;
            // SERVER_ADDR is not provided by every SAPI, and every query value must be URL-encoded
            $serverIp = isset($_SERVER['SERVER_ADDR']) ? $_SERVER['SERVER_ADDR'] : '';
            $installUpdateUrl = "http://www.seopanel.in/installupdate.php?url=" . urlencode($info['web_path']) . "&ip=" . urlencode($serverIp) . "&email=" . urlencode($info['email']);
            $installUpdateUrl .= "&version=" . SP_INSTALLED;
            $spider = new Spider();
            $spider->getContent($installUpdateUrl, false);
        }
        // A fresh connection is opened for the remaining queries
        $db = new DB();
        $db->connectDatabase($info['db_host'], $info['db_user'], $info['db_pass'], $info['db_name']);
        // update email for admin
        $sql = "update users set email='" . addslashes($info['email']) . "' where id=1";
        $db->query($sql);
        // select languages list
        $sql = "select * from languages where translated=1";
        $langList = $db->select($sql);
        // The web path is escaped because it is printed inside an HTML attribute
        $loginUrl = htmlspecialchars($info['web_path'] . "/login.php", ENT_QUOTES);
        ?>
		
		<form method="post" action="<?php 
        echo $loginUrl;
        ?>
">
		<h1 class="BlockHeader">Seo Panel Installation Success</h1>
		<table width="100%" cellspacing="8px" cellpadding="0px" class="formtab">
			<tr><th colspan="2" class="headersuccess">Seo Panel installed successfully!</th></tr>
			<tr>
				<td class="warning" colspan="2">Warning!</td>
			</tr>
			<tr>
				<td style="border: none;" colspan="2">
					<ul class="list">
						<li> Please change permission of config file <b><?php 
        echo SP_CONFIG_FILE;
        ?>
</b> to avoid security issues.</li>
						<li>Please remove installation directory <b>install</b> to avoid security issues.</li>
					</ul>
				</td>
			</tr>
			<tr>
				<td class="warning" style="color:black;" colspan="2">Admin Login</td>
			</tr>
			<tr>
				<td style="border-left: none;">Default Language:</td>
				<td>
					<select name="lang_code">
            			<?php 
        // English is pre-selected in the language list
        foreach ($langList as $langInfo) {
            $selected = $langInfo['lang_code'] == 'en' ? "selected" : "";
            ?>
			
            				<option value="<?php 
            echo $langInfo['lang_code'];
            ?>
" <?php 
            echo $selected;
            ?>
><?php 
            echo $langInfo['lang_name'];
            ?>
</option>
            				<?php 
        }
        ?>
            		</select>
				</td>
			</tr>
			<tr>
				<td style="border: none;font-weight: normal;font-size: 13px;" colspan="2">
					<b>Username:</b> <?php 
        echo SP_ADMIN_USER;
        ?>
<br>
					<b>Password:</b> <?php 
        echo SP_ADMIN_PASS;
        ?>
<br><br>
					<b>Note:</b> Please change password of admin after first login.
				</td>
			</tr>
		</table>
		<input type="hidden" name="sec" value="login">
		<input type="hidden" name="userName" value="spadmin">
		<input type="hidden" name="password" value="spadmin">
		<input type="submit" value="Proceed to admin login >>" name="submit" class="button">
		</form>
		<?php 
    }
Example #7
0
<?php

// Demo script: apply the IMDB template to one title page and dump the result.
// NOTE(review): Spider and IMDBTemplate are presumably defined by the two includes — confirm.
include 'globals.inc';
include 'template.IMDB.php';
$s = new Spider();
$imdb = new IMDBTemplate();
// applyTemplate() receives the template object and the page URL
$movie = $s->applyTemplate($imdb, 'http://www.imdb.com/title/tt0433362/');
print_r($movie);
Example #8
0
 /**
  * Destructor: does nothing of its own, it only forwards to the parent
  * class's destructor.
  */
 function __destruct()
 {
     parent::__destruct();
 }
Example #9
0
 /**
  * Extract the title, meta description and meta keywords of a page.
  *
  * The page is taken from $pageContent when given, otherwise it is fetched
  * from $websiteUrl. Title and description are only extracted when
  * $keyInput is empty; keywords are always extracted. Each value found is
  * either collected in the returned array ($returVal true) or handed to
  * WebsiteController::addInputValue() ($returVal false).
  *
  * @param string $websiteUrl  URL to fetch when no page content is supplied
  * @param string $keyInput    when non-empty, only the keywords are extracted
  * @param string $pageContent HTML to parse instead of fetching the URL
  * @param bool   $returVal    true to return the values instead of pushing them
  * @return array values found, under the keys 'page_title', 'page_description'
  *               and 'page_keywords'; empty when nothing was collected
  */
 public function crawlMetaData($websiteUrl, $keyInput = '', $pageContent = '', $returVal = false)
 {
     // Always defined, so every path returns an array (previously undefined on the fetch path)
     $metaInfo = array();
     if (empty($pageContent)) {
         if (!preg_match('/\\w+/', $websiteUrl)) {
             return $metaInfo;
         }
         // Only add a scheme when none is present; this keeps https:// URLs intact
         if (!preg_match('#^\\s*https?://#i', $websiteUrl)) {
             $websiteUrl = "http://" . $websiteUrl;
         }
         $spider = new Spider();
         $ret = $spider->getContent($websiteUrl);
     } else {
         $ret = array('page' => $pageContent);
     }
     if (!empty($ret['page'])) {
         if (empty($keyInput)) {
             # meta title
             preg_match('/<TITLE>(.*?)<\\/TITLE>/si', $ret['page'], $matches);
             if (!empty($matches[1])) {
                 if ($returVal) {
                     $metaInfo['page_title'] = $matches[1];
                 } else {
                     WebsiteController::addInputValue($matches[1], 'webtitle');
                 }
             }
             # meta description: double-quoted, then single-quoted, then content-before-name
             preg_match('/<META.*?name="description".*?content="(.*?)"/si', $ret['page'], $matches);
             if (empty($matches[1])) {
                 preg_match("/<META.*?name='description'.*?content='(.*?)'/si", $ret['page'], $matches);
             }
             if (empty($matches[1])) {
                 preg_match('/<META content="(.*?)" name="description"/si', $ret['page'], $matches);
             }
             if (!empty($matches[1])) {
                 if ($returVal) {
                     $metaInfo['page_description'] = $matches[1];
                 } else {
                     WebsiteController::addInputValue($matches[1], 'webdescription');
                 }
             }
         }
         # meta keywords: same three pattern variants as the description
         preg_match('/<META.*?name="keywords".*?content="(.*?)"/si', $ret['page'], $matches);
         if (empty($matches[1])) {
             preg_match("/<META.*?name='keywords'.*?content='(.*?)'/si", $ret['page'], $matches);
         }
         if (empty($matches[1])) {
             preg_match('/<META content="(.*?)" name="keywords"/si', $ret['page'], $matches);
         }
         if (!empty($matches[1])) {
             if ($returVal) {
                 $metaInfo['page_keywords'] = $matches[1];
             } else {
                 WebsiteController::addInputValue($matches[1], 'webkeywords');
             }
         }
     }
     return $metaInfo;
 }
Example #10
0
    /**
     * Check whether a directory's submit page is working and uses a captcha,
     * store both flags and print the outcome.
     *
     * The directory counts as working when isCategoryExists() finds a match
     * in the fetched submit page; the captcha flag is set when the page
     * contains the directory's captcha script marker.
     *
     * @param int $dirId   id of the directory to check
     * @param int $nodebug when truthy, print the inline captcha update script and
     *                     the status link; otherwise print a "saved" note
     * @return void
     */
    function checkDirectoryStatus($dirId, $nodebug = 0)
    {
        // The id ends up in SQL and in inline JavaScript, so force it to an integer
        $dirId = intval($dirId);
        $dirInfo = $this->getDirectoryInfo($dirId);
        $active = 0;
        $captcha = 0;
        $spider = new Spider();
        $ret = $spider->getContent(addHttpToUrl($dirInfo['submit_url']));
        if (empty($ret['error']) && !empty($ret['page'])) {
            $page = $ret['page'];
            $matches = $this->isCategoryExists($page, $dirInfo['category_col']);
            $active = empty($matches[0]) ? 0 : 1;
            // An empty marker can never be "found"; searching for it would warn or always match
            $captchaMarker = isset($dirInfo['captcha_script']) ? (string) $dirInfo['captcha_script'] : '';
            $captcha = ($captchaMarker !== '' && stripos($page, $captchaMarker) !== false) ? 1 : 0;
        }
        $sql = "update directories set working={$active},is_captcha={$captcha} where id={$dirId}";
        $this->db->query($sql);
        if ($nodebug) {
            $captchaLabel = $captcha ? "Yes" : "No";
            ?>
			<script type="text/javascript">
				document.getElementById('captcha_<?php 
            echo $dirId;
            ?>
').innerHTML = '<?php 
            echo $captchaLabel;
            ?>
';
			</script>
			<?php 
            echo $this->getStatusLink($dirId, $active);
        } else {
            echo "<p class='note notesuccess'>Saved status of directory <b>{$dirInfo['domain']}</b>.....</p>";
        }
    }
<?php

// Reader page: filter out the core information of an article.
// The main body of the page is extracted first.
// http://localhost/github/php_web_spider/application/reader.php?url=http://www.phbs.pku.edu.cn/content-419-2333-1.html
// $for = $_GET['for'];
// $_GET values are already URL-decoded by PHP; decoding a second time would
// corrupt URLs that contain a literal '%' or '+'.
$url = (isset($_GET['url']) && is_string($_GET['url'])) ? $_GET['url'] : '';
header("Content-type:text/html;charset=utf-8");
if ($url === '') {
    // Nothing to fetch without a target URL
    header('HTTP/1.1 400 Bad Request');
    exit('Missing "url" parameter');
}
// NOTE(review): $url is user-controlled and handed to the fetcher unchecked —
// consider restricting it to http(s) targets.
define('SPIDER_PATH', '../core/');
require_once SPIDER_PATH . 'php_web_spider.php';
require_once SPIDER_PATH . 'simple_html_dom.php';
// Extract the core content
$sp = new Spider();
$article = $sp->fetch_main_content($url);
$info = $sp->fetch_info();
// $echo $article;exit(0);
// UI rendering
require_once 'third_party/php_simple_ui/php_simple_ui.php';
$page = new ui_JMPage('新闻详情', $article);
$page->header->appendText('<a href="javascript:history.go(-1);" data-role="button" data-icon="home">返回</a>');
$ui = new ui_jQueryMobile($page);
echo $ui;
<?php

// Fetches the IEEE Xplore search results of every author listed below and
// builds a jQuery Mobile list view from them.
header("Content-type:text/html;charset=utf-8");
define('SPIDER_PATH', '../core/');
require_once SPIDER_PATH . 'php_web_spider.php';
require_once SPIDER_PATH . 'simple_html_dom.php';
//$url = 'http://ieeexplore.ieee.org/search/searchresult.jsp?searchWithin%3Dp_Authors%3A.QT.Zhenyu+Wang.QT.%26refinements%3D4274688882%2C4268599920%2C4268757412%2C4274050053%2C4269644358%2C4269643024%2C4262616522&removeRefinement=4274688882&pageNumber=1&resultAction=REFINE';
// If the data is not refreshed, show static data
$sp = new Spider();
//echo $sp->fetch($url);
//print_r($sp->fetch_results($url));
$papers = array();
// Display name => author name as it is inserted into the search URL
$authors = array('王振宇' => 'Zhenyu+Wang', '辛柏成' => 'Baicheng+Xin', '蔡砚刚' => 'Yangang+Cai', '崔同兵' => 'Tongbing+Cui', '文浩丞' => 'Haocheng+Wen', '邢培银' => 'Peiyin+Xing', '韩冰杰' => 'Bingjie+Han', '焦剑波' => 'Jianbo+Jiao', '高璇' => 'Xuan+Gao', '李旭峰' => 'Xufeng+Li', '赵龙' => 'Long+Zhao', '万杰' => 'Jie+Wan', '吕浩' => 'Hao+Lv', '唐骋洲' => 'Chengzhou+Tang', '王磊' => 'Lei+Wang', '杨明辉' => 'Minghui+Yang', '杨爽' => 'Shuang+Yang', '张雷' => 'Lei+Zhang', '刘中欣' => 'Zhongxin+Liu', '彭祎' => 'Yi+Peng', '汤传新' => 'Chuanxin+Tang', '向国庆' => 'Guoqing+Xiang', '张艺' => 'Yi+Zhang', '杜实现' => 'Shixian+Du', '郭梦婷' => 'Mengting+Guo', '黄颖' => 'Ying+Huang', '魏莹荔' => 'Yingli+Wei', '张申' => 'Shen+Zhang', '张欣欣' => 'Xinxin+Zhang', '张杨' => 'Yang+Zhang', '张若楠' => 'Ruonan+Zhang', '黄泽湖' => 'Zehu+Huang', '罗佳佳' => 'Jiajia+Luo', '赵洋' => 'Yang+Zhao');
// Pagination is not handled; only the first page of results is fetched
// One request per author, so the execution time limit is lifted
set_time_limit(0);
foreach ($authors as $key => $author) {
    $papers[$key] = $sp->fetch_results('http://ieeexplore.ieee.org/search/searchresult.jsp?searchWithin%3Dp_Authors%3A.QT.' . $author . '.QT.%26refinements%3D4274688882%2C4268599920%2C4268757412%2C4274050053%2C4269644358%2C4269643024%2C4262616522&removeRefinement=4274688882&pageNumber=1&resultAction=REFINE');
    //    print_r($papers[$key]);
}
//exit();
/*
 * View layer -----------------------------------------------
 */
require_once 'third_party/php_simple_ui/php_simple_ui.php';
$list = new ui_JMListView($papers);
$list->addFilter('Search');
$page = new ui_JMPage('IEEE Xplore Papers', array($list));
$ui = new ui_jQueryMobile($page);
/**
 * Dump ui ------------------------------------------------
 * NOTE(review): no statement that outputs $ui is visible after this point.
 */
Example #13
0
    /**
     * Download an image into the local img/ directory.
     *
     * The file is always stored with a .jpg extension under a generated,
     * timestamp-based name.
     *
     * @param string $img_url URL of the image to fetch
     * @return string the generated file name (relative to img/), or an empty
     *                string when the download or the write failed
     */
    public function download_img($img_url)
    {
        $curl = curl_init($img_url);
        if ($curl === false) {
            return '';
        }
        curl_setopt($curl, CURLOPT_RETURNTRANSFER, 1);
        $imageData = curl_exec($curl);
        curl_close($curl);

        // curl_exec() returns false on failure; do not create a file for an empty download
        if ($imageData === false || $imageData === '') {
            return '';
        }

        // 'H' is the 24-hour clock; the former 'h' produced the same stamp twice a day
        $filename = date('YmdHis') . '_' . uniqid() . '.jpg';

        // Replaces the unchecked fopen()/fwrite()/fclose() sequence, which went on
        // to write to an invalid handle when the file could not be opened
        if (file_put_contents('img/' . $filename, $imageData) === false) {
            return '';
        }
        return $filename;
    }
}
// Entry point: create the spider and run its main() method.
$spider = new Spider();
$spider->main();
//save_spot('http://www.lvmama.com/lvyou/d-chengdu279.html');
//$spider->save_view('http://www.lvmama.com/lvyou/poi/sight-151780.html', 1);
//$spider->save_spot('http://www.lvmama.com/lvyou/d-chengdu279.html');
/*
*
* Province 1 (the original note reads "份1", presumably a truncated "省份1"): d-sichuan278.html
Scenic-area listing page 1: http://www.lvmama.com/lvyou/scenery/d-sichuan278.html
Scenic-area URL 1: http://www.lvmama.com/lvyou/d-chengdu279.html
Attraction URL 1: http://www.lvmama.com/lvyou/poi/sight-151780.html
http://www.lvmama.com/lvyou/poi/sight-151780.html
*/
Example #14
0
        //$uid = $matches[1][0];
        $urlVote = 'http://fotostrana.ru/meeting/index/click/?ajax=true&uId=' . $uid . '&val=3&rate=5&fake=0&uid=' . $uid;
        $setAgeUrl = 'http://fotostrana.ru/meeting/?change=1&gender=&age=21&ageTo=25';
        $x = $this->spider($setAgeUrl);
        //var_dump($matches[1][0]);
        var_dump($res);
    }
}
// Search URL handed to the Spider constructor below.
$searchUrl = "http://fotostrana.ru/search/?cityId=8&otherCity=&gender=w&age=22&ageTo=28&height%5Bmetric%5D=0&height%5Benglish%5D=0&weight%5Bmetric%5D=0&weight%5Benglish%5D=0&newOnly=0&online=1&ajax=true&change=1";
//$url = "http://fotostrana.ru/profile/ajax/freeVote?value=1&ftoken-all=63655255ba&userId=30911742";
// $url = "http://fotostrana.ru/profile/ajax/freeVote/?userId=30911742&value=1&ftoken-all=63655255ba";
//$url = "http://fotostrana.ru/contest/vote/votedata/?userId=36306794&dir=up&nominationId=0&ftoken-f-contestSendVote-36306794=wfzzieiefc&ajax=1&sendFreeVote=1";
//$url = "http://fotostrana.ru/contest/new/votePopup/_ajax=1&ajax=1&userId=29504825&from=profile";
//$url = "http://fotostrana.ru/contest/new/votePopup/_ajax=1&ajax=1&userId=206878&from=profile";
//$url = "http://fotostrana.ru/contest/new/votePopup/_ajax=1&ajax=1&userId=36429493&from=profile";
//$url = "http://fotostrana.ru/contest/new/votePopup/?_ajax=1&ajax=1&userId=39607735&from=profile";
// NOTE(review): $guessurl is assigned twice in a row, so the first value (the one
// with send_message=on) is overwritten at once, and the variable is not read by
// any active statement below.
$guessurl = "http://fotostrana.ru/guess/ajax/saveGuess?guess_user=39402139&answer%5B1%5D=16&uanswer%5B1%5D=&answer%5B2%5D=41&uanswer%5B2%5D=&answer%5B3%5D=17992&uanswer%5B3%5D=&send_message=on&ajax=1&isPopup=0";
$guessurl = "http://fotostrana.ru/guess/ajax/saveGuess?guess_user=39402139&answer%5B1%5D=16&uanswer%5B1%5D=&answer%5B2%5D=41&uanswer%5B2%5D=&answer%5B3%5D=17992&uanswer%5B3%5D=&ajax=1&isPopup=0";
// Only preconstruct() is active; every other call is commented out.
$sp = new Spider($searchUrl);
$sp->preconstruct();
//$sp->meeting();
//$sp->guess($argv[1],$argv[2]);
//$x = $sp->visit($argv[1],$argv[2]);
//$x = $sp->spider($url);
//$y = json_decode($x);
//echo $y->html;
//var_dump($y);
//~ $sp->visit($argv[1],$argv[2]);
//~ $j = file_get_contents($url);
//~ $x = json_decode($j);
//~ var_dump($j);
Example #15
0
 /**
  * Build the sitemap of a site-auditor project.
  *
  * Collects the report pages of the project, drops every page that matches
  * the exclude setting and passes the remaining URLs to createSitemap().
  *
  * @param array $sitemapInfo form data; keys read here: project_id, sm_type,
  *                           exclude_url, freq, priority
  * @return void
  */
 function generateSitemapFile($sitemapInfo)
 {
     $projectId = intval($sitemapInfo['project_id']);
     $sitemapInfo['project_id'] = $projectId;

     // Without a project there is nothing to generate
     if (empty($projectId)) {
         hideDiv('message');
         showErrorMsg("No Website Found!");
         return;
     }

     # the sitemap directory has to be writable
     $sitemapPath = SP_TMPPATH . "/" . $this->sitemapDir;
     if (!is_writable($sitemapPath)) {
         hideDiv('message');
         showErrorMsg("Directory '<b>" . $sitemapPath . "</b>' is not <b>writable</b>. Please change its <b>permission</b> !");
     }

     $auditorCtrler = $this->createController('SiteAuditor');
     $project = $auditorCtrler->__getProjectInfo($projectId);
     $this->section = formatFileName($project['name']);
     $this->smType = $sitemapInfo['sm_type'];
     $this->excludeUrl = $sitemapInfo['exclude_url'];

     // Change frequency and priority keep their current values unless the form overrides them
     if (!empty($sitemapInfo['freq'])) {
         $this->changefreq = $sitemapInfo['freq'];
     }
     if (!empty($sitemapInfo['priority'])) {
         $this->priority = $sitemapInfo['priority'];
     }

     $auditor = $this->createComponent('AuditorComponent');
     $pages = $auditor->getAllreportPages(" and project_id=" . $projectId);

     // Keep every page URL (with a trailing slash added) that is not excluded
     $urlList = array();
     foreach ($pages as $page) {
         $pageUrl = Spider::addTrailingSlash($page['page_url']);
         if (!$auditor->isExcludeLink($pageUrl, trim($sitemapInfo['exclude_url']))) {
             $urlList[] = $pageUrl;
         }
     }

     $this->createSitemap($this->smType, $urlList);
 }
 /**
  * Crawl one page of an audit project and store its report data and links.
  *
  * Nothing is stored unless a report row for the page already exists in the
  * project. For such a page the meta data and link counts are saved,
  * followed by the optional pagerank, backlink, indexed-page and broken-link
  * checks. Site links found on the page are queued as new report rows until
  * the project's link limit is reached, and the page and project scores are
  * recalculated at the end.
  *
  * @param string $reportUrl   URL of the page to audit
  * @param array  $projectInfo project settings; keys read here: id, url, check_pr,
  *                            check_backlinks, check_indexed, check_brocken,
  *                            store_links_in_page, max_links, exclude_links
  * @param int    $totalLinks  number of links already saved for the project
  * @return void
  */
 function runReport($reportUrl, $projectInfo, $totalLinks)
 {
     $spider = new Spider();
     $pageInfo = $spider->getPageInfo($reportUrl, $projectInfo['url'], true);

     // URLs are escaped before being placed in the condition, as importLinks() does
     $rInfo = $this->getReportInfo(" and project_id={$projectInfo['id']} and page_url='" . addslashes($reportUrl) . "'");
     if (!$rInfo) {
         return;
     }

     $reportInfo = array();
     $reportInfo['id'] = $rInfo['id'];
     $reportInfo['page_title'] = addslashes($pageInfo['page_title']);
     $reportInfo['page_description'] = addslashes($pageInfo['page_description']);
     $reportInfo['page_keywords'] = addslashes($pageInfo['page_keywords']);
     $reportInfo['total_links'] = $pageInfo['total_links'];
     $reportInfo['external_links'] = $pageInfo['external'];
     $reportInfo['crawled'] = 1;

     // google pagerank check
     if ($projectInfo['check_pr']) {
         $rankCtrler = $this->createController('Rank');
         $reportInfo['pagerank'] = $rankCtrler->__getGooglePageRank(Spider::addTrailingSlash($reportUrl));
     }

     // backlinks page check
     if ($projectInfo['check_backlinks']) {
         $backlinkCtrler = $this->createController('Backlink');
         $backlinkCtrler->url = Spider::addTrailingSlash($reportUrl);
         $reportInfo['bing_backlinks'] = $backlinkCtrler->__getBacklinks('msn');
         $reportInfo['google_backlinks'] = $backlinkCtrler->__getBacklinks('google');
     }

     // indexed page check
     if ($projectInfo['check_indexed']) {
         $saturationCtrler = $this->createController('SaturationChecker');
         $saturationCtrler->url = Spider::addTrailingSlash($reportUrl);
         $reportInfo['bing_indexed'] = $saturationCtrler->__getSaturationRank('msn');
         $reportInfo['google_indexed'] = $saturationCtrler->__getSaturationRank('google');
     }

     // broken link check on the audited page itself; the previous code read
     // $linkInfo here, which is not defined until the link loops below
     if ($projectInfo['check_brocken']) {
         $reportInfo['brocken'] = Spider::isLInkBrocken($reportUrl);
     }

     $this->saveReportInfo($reportInfo, 'update');

     // to store sitelinks in page and links reports
     // $i counts the links stored for this page; only the first stored link passes $delete = true
     $i = 0;
     if (!empty($pageInfo['site_links'])) {
         // loop through site links
         foreach ($pageInfo['site_links'] as $linkInfo) {
             // if store links
             if ($projectInfo['store_links_in_page']) {
                 $delete = $i++ ? false : true;
                 $linkInfo['report_id'] = $rInfo['id'];
                 $this->storePagelLinks($linkInfo, $delete);
             }

             // only queue new pages while the project is below its link limit
             if ($totalLinks >= $projectInfo['max_links']) {
                 continue;
             }

             // skip links to archives, images, media, PDFs and bare "#" anchors
             if (preg_match('/\\.zip$|\\.gz$|\\.tar$|\\.png$|\\.jpg$|\\.jpeg$|\\.gif$|\\.mp3$|\\.flv$|\\.pdf$|\\.m4a$|#$/i', $linkInfo['link_url'])) {
                 continue;
             }

             // skip links that are empty after formatting
             $linkInfo['link_url'] = Spider::formatUrl($linkInfo['link_url']);
             if (!preg_match('/\\S+/', $linkInfo['link_url'])) {
                 continue;
             }

             // check whether url needs to be excluded
             if ($this->isExcludeLink($linkInfo['link_url'], $projectInfo['exclude_links'])) {
                 continue;
             }

             // save links for the project report unless the page is already known
             if (!$this->getReportInfo(" and project_id={$projectInfo['id']} and page_url='" . addslashes($linkInfo['link_url']) . "'")) {
                 $repInfo = array();
                 $repInfo['page_url'] = $linkInfo['link_url'];
                 $repInfo['project_id'] = $projectInfo['id'];
                 $this->saveReportInfo($repInfo);
                 $totalLinks++;
             }
         }
     }

     // to store external links in page
     if ($projectInfo['store_links_in_page'] && !empty($pageInfo['external_links'])) {
         foreach ($pageInfo['external_links'] as $linkInfo) {
             $delete = $i++ ? false : true;
             $linkInfo['report_id'] = $rInfo['id'];
             $linkInfo['extrenal'] = 1;
             $this->storePagelLinks($linkInfo, $delete);
         }
     }

     // calculate score of the page and update it
     $this->updateReportPageScore($rInfo['id']);
     // calculate score of the project and update it
     $this->updateProjectPageScore($projectInfo['id']);
 }
Example #17
0
 /**
  * Return the effective URL of the current element.
  *
  * The parent's current element is looked up through Spider::getURIInfo()
  * with this object's options, and the 'effective_url' entry of the result
  * is returned.
  *
  * @return string
  */
 public function current()
 {
     $uri = parent::current();
     $details = Spider::getURIInfo($uri, $this->options);

     return $details['effective_url'];
 }
Example #18
0
    /**
     * Run the installation and print the success page.
     *
     * Steps, in order: check the database connection, the writability of the
     * config file and the detected web path; import the database file; write
     * the config file; report the installation to seopanel.in when that host
     * resolves; finally print the form leading to the admin login. Any
     * failing check re-displays the relevant installation step and returns.
     *
     * @param array $info installation input; keys read here: db_host, db_user,
     *                    db_pass, db_name, email ('web_path' is filled in here)
     * @return void
     */
    function proceedInstallation($info)
    {
        $db = new DB();
        # checking db settings
        $errMsg = $db->connectDatabase($info['db_host'], $info['db_user'], $info['db_pass'], $info['db_name']);
        if ($db->error) {
            $this->startInstallation($info, $errMsg);
            return;
        }
        # checking config file settings
        if (!is_writable(SP_INSTALL_CONFIG_FILE)) {
            $this->checkRequirements(true);
            return;
        }
        # checking seo panel web path
        $info['web_path'] = $this->getWebPath();
        if (empty($info['web_path'])) {
            $errMsg = "Error occured while parsing installation url. Please <a href='http://www.seopanel.in/contact/' target='_blank'>contact</a> Seo Panel team.";
            $this->startInstallation($info, $errMsg);
            return;
        }
        # importing data to db
        $errMsg = $db->importDatabaseFile(SP_INSTALL_DB_FILE);
        if ($db->error) {
            $errMsg = "Error occured while importing data: " . $errMsg;
            $this->startInstallation($info, $errMsg);
            return;
        }
        # write to config file
        $this->writeConfigFile($info);
        # report the installation, but only when the update host can be resolved
        if (gethostbynamel('seopanel.in')) {
            include_once SP_INSTALL_DIR . '/../libs/spider.class.php';
            // SERVER_ADDR is not provided by every SAPI, and every query value must be URL-encoded
            $serverIp = isset($_SERVER['SERVER_ADDR']) ? $_SERVER['SERVER_ADDR'] : '';
            $installUpdateUrl = "http://www.seopanel.in/installupdate.php?url=" . urlencode($info['web_path']) . "&ip=" . urlencode($serverIp) . "&email=" . urlencode($info['email']);
            $spider = new Spider();
            $spider->getContent($installUpdateUrl);
        }
        // The web path is escaped because it is printed inside an HTML attribute
        $loginUrl = htmlspecialchars($info['web_path'] . "/login.php", ENT_QUOTES);
        ?>
		
		<form method="post" action="<?php 
        echo $loginUrl;
        ?>
">
		<h1 class="BlockHeader">Seo Panel Installation Success</h1>
		<table width="100%" cellspacing="8px" cellpadding="0px" class="formtab">
			<tr><th colspan="2" class="headersuccess">Seo Panel installed successfully!</th></tr>
			<tr>
				<td class="warning">Warning!</td>
			</tr>
			<tr>
				<td style="border: none;">
					<ul class="list">
						<li> Please change permission of config file <b><?php 
        echo SP_CONFIG_FILE;
        ?>
</b> to avoid security issues.</li>
						<li>Please remove installation directory <b>install</b> to avoid security issues.</li>
					</ul>
				</td>
			</tr>
			<tr>
				<td class="warning" style="color:black;">Admin Login</td>
			</tr>
			<tr>
				<td style="border: none;font-weight: normal;font-size: 13px;">
					<b>Username:</b> <?php 
        echo SP_ADMIN_USER;
        ?>
<br>
					<b>Password:</b> <?php 
        echo SP_ADMIN_PASS;
        ?>
<br><br>
					<b>Note:</b> Please change password of admin after first login.
				</td>
			</tr>
		</table>				
		<input type="submit" value="Proceed to admin login >>" name="submit" class="button">
		</form>
		<?php 
    }
 /**
  * Import a comma-separated list of page links into a site auditor project.
  *
  * The submitted list must be non-blank, the project must still be below its
  * 'max_links' limit and checkExcludeLinks() must report no error. Each link
  * is normalised with Spider::formatUrl(); empty links, links repeated inside
  * the submitted list and links matching the project's exclude list are
  * skipped. If any remaining link already has a report for the project, or
  * the import would push the project past 'max_links', nothing is saved and
  * the import form is shown again with an error message. Otherwise every
  * link is saved, the project list is rendered and the script exits.
  *
  * @param array $listInfo Submitted form data; 'project_id' and 'links' are read.
  * @return void
  */
 function importLinks($listInfo)
 {
     $userId = isLoggedIn();
     $listInfo['project_id'] = intval($listInfo['project_id']);
     $this->set('post', $listInfo);
     $errMsg = array();
     $errMsg['links'] = formatErrorMsg($this->validate->checkBlank($listInfo['links']));
     if (!$this->validate->flagErr) {
         $totalLinks = $this->getCountcrawledLinks($listInfo['project_id']);
         $projectInfo = $this->__getProjectInfo($listInfo['project_id']);
         // if total links greater than max links of a project
         if ($totalLinks >= $projectInfo['max_links']) {
             $errMsg['links'] = formatErrorMsg($this->spTextSA['totallinksgreaterallowed'] . " - {$projectInfo['max_links']}");
         } else {
             // check whether links are pages of website
             $linkInfo = $this->checkExcludeLinks($listInfo['links'], $projectInfo['url'], false);
             if (!empty($linkInfo['err_msg'])) {
                 $errMsg['links'] = formatErrorMsg($linkInfo['err_msg']);
             } else {
                 $auditorComp = $this->createComponent('AuditorComponent');
                 $links = explode(",", $listInfo['links']);
                 $error = false;
                 $linkList = array();
                 foreach ($links as $link) {
                     $link = Spider::formatUrl(trim($link));
                     if (empty($link)) {
                         continue;
                     }
                     // A link repeated in the submitted list is saved only once,
                     // so it must also count only once towards 'max_links'.
                     if (isset($linkList[$link])) {
                         continue;
                     }
                     if ($auditorComp->isExcludeLink($link, $projectInfo['exclude_links'])) {
                         continue;
                     }
                     // check whether url exists or not
                     if ($auditorComp->getReportInfo(" and project_id={$projectInfo['id']} and page_url='" . addslashes($link) . "'")) {
                         $errMsg['links'] = formatErrorMsg($this->spTextSA['Page Link'] . " '<b>{$link}</b>' " . $_SESSION['text']['label']['already exist']);
                         $error = true;
                         break;
                     }
                     $totalLinks++;
                     // if total links greater than max links of a project
                     if ($totalLinks > $projectInfo['max_links']) {
                         $error = true;
                         $errMsg['links'] = formatErrorMsg($this->spTextSA['totallinksgreaterallowed'] . " - {$projectInfo['max_links']}");
                         break;
                     }
                     $linkList[$link] = 1;
                 }
                 // to save the page if no error occurs
                 if (!$error) {
                     $reportInfo = array();
                     foreach ($linkList as $link => $val) {
                         $reportInfo['page_url'] = $link;
                         $reportInfo['project_id'] = $projectInfo['id'];
                         $auditorComp->saveReportInfo($reportInfo);
                     }
                     $this->showAuditorProjects();
                     exit;
                 }
             }
         }
     }
     $this->set('errMsg', $errMsg);
     $this->showImportProjectLinks();
 }
Example #20
0
    /**
     * Re-check a directory's submission page and store the result.
     *
     * Fetches the directory's 'submit_url', derives whether the directory is
     * working (a category match was found on the page) and whether the page
     * contains the configured captcha script, optionally looks up the page
     * rank of the directory's domain, and writes these values to the
     * `directories` table with `checked=1`.
     *
     * @param int|string $dirId   Directory id; cast to int before use.
     * @param int|bool   $nodebug When truthy, inline JavaScript updating the
     *                            captcha (and page-rank) cells is printed
     *                            followed by the status link; otherwise a
     *                            "Saved status" note is printed.
     * @return void
     */
    function checkDirectoryStatus($dirId, $nodebug = 0)
    {
        $dirId = intval($dirId);
        $dirInfo = $this->getDirectoryInfo($dirId);
        $active = 0;
        $captcha = 0;
        // Defined up front: the page-rank cell below is printed whenever
        // checkPR is on, even if the fetch failed and no rank was looked up.
        $pagerank = '';
        $prUpdate = '';
        $spider = new Spider();
        $ret = $spider->getContent(addHttpToUrl($dirInfo['submit_url']));
        if (empty($ret['error']) && !empty($ret['page'])) {
            $page = $ret['page'];
            $matches = $this->isCategoryExists($page, $dirInfo['category_col']);
            $active = empty($matches[0]) ? 0 : 1;
            // stristr() with an empty needle returns the whole haystack on
            // PHP 8+, which would flag every directory as having a captcha.
            $captchaScript = (string) $dirInfo['captcha_script'];
            $captcha = ($captchaScript !== '' && stristr($page, $captchaScript)) ? 1 : 0;
            if ($this->checkPR) {
                include_once SP_CTRLPATH . "/rank.ctrl.php";
                $rankCtrler = new RankController();
                $pagerank = $rankCtrler->__getGooglePageRank($dirInfo['domain']);
                // Cast for SQL: the value is interpolated unquoted.
                $prUpdate = ",google_pagerank=" . intval($pagerank);
            }
        }
        $sql = "update directories set working={$active},is_captcha={$captcha},checked=1 {$prUpdate} where id={$dirId}";
        $this->db->query($sql);
        if ($nodebug) {
            $captchaLabel = $captcha ? $_SESSION['text']['common']['Yes'] : $_SESSION['text']['common']['No'];
            ?>
			<script type="text/javascript">
				document.getElementById('captcha_<?php echo $dirId; ?>').innerHTML = '<?php echo $captchaLabel; ?>';
			</script>
			<?php 
            if ($this->checkPR) {
                ?>
				<script type="text/javascript">
					document.getElementById('pr_<?php echo $dirId; ?>').innerHTML = '<?php echo $pagerank; ?>';
				</script>
				<?php 
            }
            echo $this->getStatusLink($dirId, $active);
        } else {
            echo "<p class='note notesuccess'>Saved status of directory <b>{$dirInfo['domain']}</b>.....</p>";
        }
    }
Example #21
0
<?php

// Demo: crawl a site with the Spider class and dump every link it collected.

// include Spider class file
require_once 'spider.class.php';
// create new Spider object for the site to crawl
$spider = new Spider('http://onestopshopmarket.com');
// allow files with extension *.txt to be spidered
$spider->allowType('txt');
// and then disable files with that same extension again
$spider->restrictType('txt');
// set it to true if you want to see what is happening on the screen
$spider->setVerbose(true);
// start spidering website
$spider->startSpider();
// all found and fetched links are in that property
$links = $spider->all_links;
// print it out (was print_f(), which is not a PHP function)
print_r($links);
?>
 
Example #22
0
/**
 * Render HTML content as a PDF with mPDF, send it to the client and exit.
 *
 * The application's screen.css is fetched through Spider::getContent(), its
 * "../../../images" references are rewritten to SP_IMGPATH, and the result is
 * written to the document as its stylesheet before the content itself.
 *
 * @param string $content  HTML to render into the PDF body.
 * @param string $fileName File name passed to mPDF's Output() (destination "I").
 * @return void This function never returns; it ends the script with exit.
 */
function exportToPdf($content, $fileName = "reports.pdf")
{
    include_once SP_LIBPATH . "/mpdf/mpdf.php";
    $mpdf = new mPDF();
    $mpdf->useAdobeCJK = true;
    $mpdf->SetAutoFont(AUTOFONT_ALL);
    $spider = new Spider();
    $ret = $spider->getContent(SP_CSSPATH . "/screen.css");
    // Only apply the stylesheet when the fetch actually returned one; reading
    // a missing 'page' entry would raise a notice in the middle of PDF output.
    if (!empty($ret['page'])) {
        $stylesheet = str_replace("../../../images", SP_IMGPATH, $ret['page']);
        $mpdf->WriteHTML($stylesheet, 1);
    }
    $mpdf->SetDisplayMode('fullpage');
    $mpdf->WriteHTML($content, 2);
    $mpdf->Output($fileName, "I");
    exit;
}
Example #23
0
        //$like_user_id =
        //$post = "action=user.like&params[like]=5&params[like_user_id]={$like_user_id}";
    }
}
// Driver script: creates a Spider, requests the auto-login URL and then runs
// the meeting() routine. Most alternative URLs and calls are left commented
// out below as manual switches.
// NOTE(review): $searchUrl, $guessurl and the second $url are not used in the
// visible statements; presumably Spider methods read them as globals — confirm
// before removing any of them.
$searchUrl = "http://fotostrana.ru/search/?cityId=8&otherCity=&gender=w&age=22&ageTo=28&height%5Bmetric%5D=0&height%5Benglish%5D=0&weight%5Bmetric%5D=0&weight%5Benglish%5D=0&newOnly=0&online=1&ajax=true&change=1";
//$url = "http://fotostrana.ru/profile/ajax/freeVote?value=1&ftoken-all=63655255ba&userId=30911742";
// $url = "http://fotostrana.ru/profile/ajax/freeVote/?userId=30911742&value=1&ftoken-all=63655255ba";
//$url = "http://fotostrana.ru/contest/vote/votedata/?userId=36306794&dir=up&nominationId=0&ftoken-f-contestSendVote-36306794=wfzzieiefc&ajax=1&sendFreeVote=1";
//$url = "http://fotostrana.ru/contest/new/votePopup/_ajax=1&ajax=1&userId=29504825&from=profile";
//$url = "http://fotostrana.ru/contest/new/votePopup/_ajax=1&ajax=1&userId=206878&from=profile";
//$url = "http://fotostrana.ru/contest/new/votePopup/_ajax=1&ajax=1&userId=36429493&from=profile";
//$url = "http://fotostrana.ru/contest/new/votePopup/?_ajax=1&ajax=1&userId=39607735&from=profile";
// The second assignment replaces the first; the two differ only in the
// send_message=on parameter, which the second URL omits.
$guessurl = "http://fotostrana.ru/guess/ajax/saveGuess?guess_user=39402139&answer%5B1%5D=16&uanswer%5B1%5D=&answer%5B2%5D=41&uanswer%5B2%5D=&answer%5B3%5D=17992&uanswer%5B3%5D=&send_message=on&ajax=1&isPopup=0";
$guessurl = "http://fotostrana.ru/guess/ajax/saveGuess?guess_user=39402139&answer%5B1%5D=16&uanswer%5B1%5D=&answer%5B2%5D=41&uanswer%5B2%5D=&answer%5B3%5D=17992&uanswer%5B3%5D=&ajax=1&isPopup=0";
// Overrides the search URL assigned at the top of the script with an empty string.
$searchUrl = '';
$sp = new Spider();
// NOTE(review): this URL embeds a user id and what looks like an auto-login
// hash — consider loading it from configuration instead of source code.
$url = "http://fotostrana.ru/user/autologin/?u=11935779&h=10aaaec8776a62a";
$x = $sp->spider($url);
// Profile URL; only consumed by the commented-out spider() call below.
$url = "http://fotostrana.ru/user/11935779";
//$x = $sp->spider($url);
//$sp->collectIds(1,2);
//$sp->visit($argv[1],$argv[2]);
$sp->meeting();
//$sp->guess(10,20);
//$sp->guess(21,30);
//$sp->guess($argv[1],$argv[2]);
//$sp->guess($argv[1],$argv[2]);
//$x = $sp->visit($argv[1],$argv[2]);
//$y = json_decode($x);
//echo $y->html;
//var_dump($y);
Example #24
0
<?php

/** Print the Daybreakers rating scraped from several review sites. **/
include 'globals.inc';
echo '<h2>Daybreakers ratings</h2>';
$s = new Spider();
// Site label => (XPath-style query for the rating node, page to query).
// Entries are fetched and printed in the order listed here.
$ratingSources = array(
    'IMDB' => array(".//*[@id='tn15rating']/div[1]/div/div[2]/b", 'http://www.imdb.com/title/tt0433362/'),
    'Metacritic' => array(".//*[@id='metascore']", 'http://www.metacritic.com/film/titles/daybreakers'),
    'Rotten Tomatoes' => array(".//*[@id='tomatometer_score']/span[1]", 'http://www.rottentomatoes.com/m/daybreakers/'),
);
foreach ($ratingSources as $siteLabel => $lookup) {
    $ratingNode = $s->qf($lookup[0], $lookup[1]);
    echo $siteLabel . ': ' . $ratingNode->inner . '<br/>';
}
Example #25
0
<?php

// Manual smoke test for the Spider, Controller and Analyse classes.
include 'Controller.class.php';

// Spider: construct with a search phrase and print the result of get(1)
$spider = new Spider('深圳娃娃鱼事件');
echo $spider->get(1);

// Controller: run work(5) three times in a row, then hand over a task
$controller = new Controller();
for ($round = 0; $round < 3; $round++) {
    $controller->work(5);
}
$controller->task('深圳多名官员吃娃娃鱼');

// Analyse: print the total, then dump the results
$analyse = new Analyse('深圳多名官员吃娃娃鱼', 1);
echo $analyse->total();
var_dump($analyse->results());
Example #26
0
    /**
     * Re-check a directory's submission page and store the result.
     *
     * Fetches the directory's 'submit_url' and derives from the page:
     * whether the directory is working (a category match was found), whether
     * the configured captcha script is present, whether a search field named
     * "search" exists, and — for script type 1 with a numeric LINK_TYPE
     * field — a rewritten 'extra_val'. Optionally looks up the page rank of
     * the directory's domain. The values are written to the `directories`
     * table together with `checked=1`.
     *
     * @param int|string $dirId   Directory id; cast to int before use.
     * @param int|bool   $nodebug When truthy, inline JavaScript updating the
     *                            captcha (and page-rank) cells is printed
     *                            followed by the status link; otherwise a
     *                            "Saved status" note is printed.
     * @return void
     */
    function checkDirectoryStatus($dirId, $nodebug = 0)
    {
        $dirId = intval($dirId);
        $dirInfo = $this->getDirectoryInfo($dirId);
        $active = 0;
        $captcha = 0;
        // Defined up front: the page-rank cell below is printed whenever
        // checkPR is on, even if the fetch failed and no rank was looked up.
        $pagerank = '';
        $prUpdate = '';
        $searchUpdate = '';
        $extraValUpdate = '';
        $spider = new Spider();
        $ret = $spider->getContent(addHttpToUrl($dirInfo['submit_url']));
        if (empty($ret['error']) && !empty($ret['page'])) {
            $page = $ret['page'];
            $matches = $this->isCategoryExists($page, $dirInfo['category_col']);
            $active = empty($matches[0]) ? 0 : 1;
            // stristr() with an empty needle returns the whole haystack on
            // PHP 8+, which would flag every directory as having a captcha.
            $captchaScript = (string) $dirInfo['captcha_script'];
            $captcha = ($captchaScript !== '' && stristr($page, $captchaScript)) ? 1 : 0;
            // to check search script
            if (stristr($page, 'name="search"')) {
                $searchUpdate = ",search_script='index.php?search=[--keyword--]'";
            }
            // to check the value of the LINK_TYPE if phpld directory
            if ($dirInfo['script_type_id'] == 1 && preg_match('/name="LINK_TYPE" value="(\\d)"/s', $page)) {
                $subject = array('LINK_TYPE=reciprocal', 'LINK_TYPE=normal', 'LINK_TYPE=free');
                $replace = array('reciprocal=1&LINK_TYPE=1', 'LINK_TYPE=2', 'LINK_TYPE=3');
                $dirInfo['extra_val'] = str_replace($subject, $replace, $dirInfo['extra_val']);
                // Escape before embedding in the quoted SQL string.
                $extraValUpdate = ",extra_val='" . addslashes($dirInfo['extra_val']) . "'";
            }
            if ($this->checkPR) {
                include_once SP_CTRLPATH . "/rank.ctrl.php";
                $rankCtrler = new RankController();
                $pagerank = $rankCtrler->__getGooglePageRank($dirInfo['domain']);
                // Cast for SQL: the value is interpolated unquoted.
                $prUpdate = ",google_pagerank=" . intval($pagerank);
            }
        }
        $sql = "update directories set working={$active},is_captcha={$captcha},checked=1 {$prUpdate} {$searchUpdate} {$extraValUpdate} where id={$dirId}";
        $this->db->query($sql);
        if ($nodebug) {
            $captchaLabel = $captcha ? $_SESSION['text']['common']['Yes'] : $_SESSION['text']['common']['No'];
            ?>
			<script type="text/javascript">
				document.getElementById('captcha_<?php echo $dirId; ?>').innerHTML = '<?php echo $captchaLabel; ?>';
			</script>
			<?php 
            if ($this->checkPR) {
                ?>
				<script type="text/javascript">
					document.getElementById('pr_<?php echo $dirId; ?>').innerHTML = '<?php echo $pagerank; ?>';
				</script>
				<?php 
            }
            echo $this->getStatusLink($dirId, $active);
        } else {
            echo "<p class='note notesuccess'>Saved status of directory <b>{$dirInfo['domain']}</b>.....</p>";
        }
    }
Example #27
0
 /**
  * Tell whether the header fetched for a URL reports "404 Not Found".
  *
  * @param string $url URL handed to Spider::getHeader().
  * @return bool true when the header contains "404 Not Found"
  *              (case-insensitive), false otherwise.
  */
 function isLInkBrocken($url)
 {
     $responseHeader = Spider::getHeader($url);
     return stripos($responseHeader, '404 Not Found') !== false;
 }
 /**
  * Construct the object for the given URL by delegating to the parent
  * constructor.
  *
  * @param mixed $url Passed through unchanged to parent::__construct().
  */
 function __construct($url)
 {
     // Original author's note: if a configuration for this URL can be found
     // in the database it is loaded; otherwise new data is created and loaded.
     // NOTE(review): that logic is not visible here — presumably it lives in
     // the parent constructor; confirm.
     parent::__construct($url);
 }
Example #29
0
<?php

// Demo: query pages with Spider's qa()/qq()/qf() helpers and print the results.

// Init
include 'globals.inc';
$s = new Spider();
// Get an array of search results from Google
$a = $s->qa('.//*[@id="res"]/div/ol/li/h3/a', 'http://www.google.co.nz/search?q=daybreakers');
// Print the headers and the array. print_r() is told to return its dump:
// without the second argument it prints directly and returns true, which
// echo would then output as a stray "1" before the closing </pre>.
echo '<pre>', print_r($s->getHead(), true), '</pre>';
echo '<pre>', print_r($a, true), '</pre>';
// Get a full DOMList of the search results
$list = $s->qq('.//*[@id="res"]/div/ol/li/h3/a');
// Echo the first node's inner text
echo $list(0)->inner;
// Print a list of the search results from Google
echo '<ul>';
foreach ($list() as $a) {
    echo '<li><a href="' . $a->href . '">' . $a->inner . '</a></li>';
}
echo '</ul>';
// Get the score from metacritic
$score = $s->qf('.//*[@id="metascore"]', 'http://www.metacritic.com/film/titles/daybreakers')->inner;
echo "<p>Score for Daybreakers: {$score}</p>";
// Feature notes / TODO list:
// - user and community features
// - widget: filter by time via AJAX
// - configure search options by submitting a form, without AJAX
// - using appendText is not recommended
// - collapsible: collapsible lists do not support count bubbles
// - pagination: "show more"
// - auto-fill common filter keywords: news, notices, academic lectures
// Read the configuration: the time filter comes from the query string and
// defaults to "week".
// NOTE(review): $filter is passed to fetch_news() unvalidated — confirm that
// fetch_news() restricts it to known values.
$filter = isset($_GET['filter']) ? $_GET['filter'] : "week";
header("Content-type:text/html;charset=utf-8");
// Fetch data according to the configuration
define('SPIDER_PATH', '../core/');
require_once SPIDER_PATH . 'php_web_spider.php';
require_once SPIDER_PATH . 'simple_html_dom.php';
// Will parse the submitted UI configuration
$sp = new Spider();
// $tmp = $sp-> fetch_news('http://www.phbs.pku.edu.cn/index.php?m=content&c=index&a=lists&catid=419');
// print_r($tmp);exit(0);
// URL data: better loaded from a database; pages of the same school differ
// only in path and share the same root address.
$news['信息工程学院'] = $sp->fetch_news('http://www.ece.pku.edu.cn/index.php?m=content&c=index&a=lists&catid=502', $filter);
// Provides links in a special form
$news['汇丰商学院'] = $sp->fetch_news('http://www.phbs.pku.edu.cn/index.php?m=content&c=index&a=lists&catid=419', $filter);
// $news['化学生物学与生物技术学院'] = $sp-> fetch_news('http://www.scbb.pkusz.edu.cn/index.php?m=content&c=index&a=lists&catid=862');
// $news['环境与能源学院'] = $sp-> fetch_news('http://see.pkusz.edu.cn/news_cn.aspx');
// $news['城市规划与设计学院'] = $sp-> fetch_news('http://see.pkusz.edu.cn/news_cn.aspx');
// $news['城市规划与设计学院'] = $sp-> fetch_news('http://sam.pkusz.edu.cn/index.php?m=content&c=index&a=lists&catid=395');
// Lecture information
$lecture['信息工程学院'] = $sp->fetch_news('http://www.ece.pku.edu.cn/index.php?m=content&c=index&a=lists&catid=503', $filter);
$lecture['汇丰商学院'] = $sp->fetch_news('http://www.phbs.pku.edu.cn/list-812-1.html', $filter);
// $lecture['新材料学院'] = $sp-> fetch_news('http://sam.pkusz.edu.cn/index.php?m=content&c=index&a=lists&catid=809');
// Undefined variable: find_link in D:\Program Files\xampp\htdocs\GitHub\php_web_spider\core\php_web_spider.php on line 259