示例#1
0
/**
 * Recursively crawl a page and collect every link that mentions a domain.
 *
 * Each link on $url that contains $domain (case-insensitive substring match)
 * and has not been seen before is appended to the global $domainLinks array,
 * echoed followed by "\r\n", and then crawled in turn. The global array acts
 * as the "already visited" set, which is what stops the recursion.
 *
 * @param string $url    Page whose links are fetched via Snoopy::fetchlinks().
 * @param string $domain Substring a link must contain to be followed.
 *
 * @return void Results are accumulated in the global $domainLinks.
 */
function getDomainLinks($url, $domain)
{
    global $domainLinks;
    // The caller may never have initialised the global; in_array() requires an array.
    if (!is_array($domainLinks)) {
        $domainLinks = array();
    }

    // Send browser-like request headers.
    $snoopy = new Snoopy();
    $snoopy->agent = "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.17 (KHTML, like Gecko) Chrome/24.0.1312.57 Safari/537.17";
    $snoopy->rawheaders['Accept'] = 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8';
    $snoopy->rawheaders['Accept-Charset'] = 'GBK,utf-8;q=0.7,*;q=0.3';
    $snoopy->rawheaders['Connection'] = 'keep-alive';
    $snoopy->rawheaders['Accept-Language'] = 'zh-CN,zh;q=0.8';
    $snoopy->rawheaders['Cache-Control'] = 'max-age=0';

    // Nothing to walk when the fetch failed or did not produce a link list.
    if (!$snoopy->fetchlinks($url) || !is_array($snoopy->results)) {
        return;
    }

    foreach ($snoopy->results as $link) {
        if (stripos($link, $domain) === false) {
            continue;
        }
        // Strict comparison: two different numeric-looking strings must not be treated as the same link.
        if (in_array($link, $domainLinks, true)) {
            continue;
        }
        $domainLinks[] = $link;
        echo $link . "\r\n";
        getDomainLinks($link, $domain);
    }
}
示例#2
0
文件: index.php 项目: edmundwong/V604
/**
 * Fetch the links on a page and keep those matching a pattern, without repeats.
 *
 * @param string $s_url Page to fetch links from via Snoopy::fetchlinks().
 * @param string $re    PCRE pattern (with delimiters) a link must match to be kept.
 *
 * @return array Sequentially indexed list of matching links; a link is skipped
 *               when isSameLink() reports it against the links kept so far.
 *               Empty when nothing could be fetched.
 */
function fetchLinks($s_url, $re)
{
    $o_snoopy = new Snoopy();
    $o_snoopy->fetchlinks($s_url);
    $a_candidates = $o_snoopy->results;

    // results is not guaranteed to be a list (other callers of Snoopy check this too);
    // iterating a non-array would only raise a warning.
    if (!is_array($a_candidates)) {
        return array();
    }

    $a_links = array();
    foreach ($a_candidates as $s_candidate) {
        if (preg_match($re, $s_candidate) && !isSameLink($a_links, $s_candidate)) {
            $a_links[] = $s_candidate;
        }
    }
    return $a_links;
}
示例#3
0
// Top-level script for the news "gather" page: reads a four-line rule set from
// the submitted form, fetches the links of the remote page with Snoopy and keeps
// the ones matching the URL rule. (This excerpt ends inside the final loop.)
$news = new Newses();
$newstype = new Newstypes();
$snoopy = new Snoopy();
$typeoption = new Typeoption();
$conditions = array();
// presumably the template used to render this page — not used within this excerpt
$tpl_file = "news.gather";
// Only run when the gather form was submitted with a non-empty rule set.
if (isset($_POST['gather']) && !empty($_POST['rules'])) {
    // Allow up to 180 seconds for the remote fetching below.
    set_time_limit(180);
    $rules = stripslashes($_POST['rules']);
    // Rules are CRLF-separated: remote URL, URL pattern, title pattern, content pattern.
    $tmp_rules = explode("\r\n", $rules);
    if (!empty($tmp_rules) && count($tmp_rules) == 4) {
        list($remote_url, $remote_url_match, $remote_title_match, $remote_content_match) = $tmp_rules;
    } else {
        // NOTE(review): unless flash() ends the request, $remote_url and the
        // match variables are undefined from here on — confirm.
        flash();
    }
    // Collect every link found on the remote page.
    $snoopy->fetchlinks($remote_url);
    $urls = array();
    $urls = $snoopy->results;
    if (empty($urls)) {
        flash();
    }
    foreach ($urls as $key => $value) {
        // Drop every fetched URL that does not match the URL rule (used as a PCRE pattern).
        if (!preg_match($remote_url_match, $value)) {
            unset($urls[$key]);
        }
    }
    // Remove duplicates; array keys are preserved, so they may be non-contiguous.
    $urls = array_unique($urls);
    $u = 0;
    // NOTE(review): this initialises $title and $content to empty arrays and also
    // APPENDS an empty array to $sql — confirm "$sql[]" (rather than "$sql") is intended.
    $sql[] = $title = $content = array();
    // Process each remaining URL (loop body continues beyond this excerpt).
    foreach ($urls as $key => $value) {
示例#4
0
/**
 * Return the links found on a page with duplicates removed.
 *
 * @param string $url Page to fetch links from via Snoopy::fetchlinks().
 *
 * @return mixed The de-duplicated link array (original keys preserved), or
 *               Snoopy's raw result unchanged when it is not an array.
 */
function get_page_link($url)
{
    $fetcher = new Snoopy();
    $fetcher->fetchlinks($url);
    $found = $fetcher->results;

    // Pass a non-array result through untouched.
    if (!is_array($found)) {
        return $found;
    }

    return array_unique($found);
}
/**
 * Validate an incoming trackback and decide how it should be treated.
 *
 * Non-trackback comments are returned untouched. For trackbacks the author URL
 * must start with "http"; then, depending on the plugin options in the global
 * $stbv_opt, the sender's IP is compared with the IP of the author URL's host
 * (phase 1) and the author's page is fetched with Snoopy to check that it links
 * back here (phase 2). Working values and log lines are stored in the global
 * $stbv_val ('is_spam', 'snoopy_problem', 'log_info', ...).
 *
 * Depending on the outcome a 'pre_comment_approved' filter is registered so the
 * trackback ends up approved, moderated ('0') or marked as 'spam'. With the
 * 'delete' action the script is terminated via die().
 *
 * @param array $incomingTB Comment data; reads 'comment_type', 'comment_author',
 *                          'comment_author_url' and 'comment_post_ID'.
 *
 * @return array The comment data, with 'comment_author' prefixed by
 *               "[BLOCKED BY STBV] " when blocked and that option is enabled.
 */
function stbv_main($incomingTB)
{
    global $stbv_opt, $stbv_val;
    ####################################
    # We only deal with trackbacks
    ####################################
    if ($incomingTB['comment_type'] != 'trackback') {
        return $incomingTB;
    }
    ####################################
    # Get trackback information
    ####################################
    $stbv_val['comment_author'] = $incomingTB['comment_author'];
    $stbv_val['comment_author_url'] = $incomingTB['comment_author_url'];
    // Permalink of the target post, stored without its trailing slash
    $stbv_val['comment_post_permalink'] = preg_replace('/\\/$/', '', get_permalink($incomingTB['comment_post_ID']));
    $stbv_val['comment_post_ID'] = $incomingTB['comment_post_ID'];
    ####################################
    # Get Plugin options
    ####################################
    # In 'open' mode a space-separated list of blog URLs (longer than 9
    # characters) may be given; a link starting with any of them is accepted.
    # NULL means "no list": the permalink is then compared strictly.
    $stbv_blogurlsArray = null;
    if ($stbv_opt['stbv_accuracy'] == 'open' && is_string($stbv_opt['stbv_blogurls']) && strlen($stbv_opt['stbv_blogurls']) > 9) {
        // Drop empty entries caused by double or trailing spaces: an empty
        // string is a prefix of every URL and would make every link match.
        $stbv_blogurlsArray = array_values(array_filter(explode(' ', $stbv_opt['stbv_blogurls']), 'strlen'));
        if (empty($stbv_blogurlsArray)) {
            $stbv_blogurlsArray = null;
        }
    }
    ####################################
    # 'Is Spam' flag is FALSE by default. Below we check several things
    # and this flag will become true as soon as we have any doubts.
    ####################################
    $stbv_val['is_spam'] = false;
    ####################################
    # If a Snoopy problem occurs (Snoopy can't be loaded or a Snoopy error
    # occurred), this variable will be set to TRUE
    ####################################
    $stbv_val['snoopy_problem'] = false;
    ####################################
    # If Author's URL is not correct, it will be considered as spam.
    ####################################
    if (substr($stbv_val['comment_author_url'], 0, 4) != 'http') {
        $stbv_val['log_info'][]['warning'] = 'Author\'s URL was found not to be correct';
        $stbv_val['is_spam'] = true;
    }
    ####################################
    # Phase 1 (IP) -  Verify IP address
    ####################################
    if (!$stbv_val['is_spam'] && $stbv_opt['stbv_validateIP'] == '1') {
        $tmpSender_IP = preg_replace('/[^0-9.]/', '', $_SERVER['REMOTE_ADDR']);
        $authDomainname = stbv_get_domainname_from_uri($stbv_val['comment_author_url']);
        $tmpURL_IP = preg_replace('/[^0-9.]/', '', gethostbyname($authDomainname));
        if ($tmpSender_IP != $tmpURL_IP) {
            $stbv_val['log_info'][]['info'] = 'Sender\'s IP address (' . $tmpSender_IP . ') not equal to IP address of host (' . $tmpURL_IP . ').';
            $stbv_val['is_spam'] = true;
        } else {
            $stbv_val['log_info'][]['info'] = 'IP address (' . $tmpSender_IP . ') was found to be valid.';
        }
    } elseif ($stbv_opt['stbv_validateIP'] != '1') {
        $stbv_val['log_info'][]['info'] = 'IP address validation (Phase 1) skipped since it is not enabled in the plugin\'s options.';
    }
    ####################################
    # Phase 2 (URL) -  Snoopy
    ####################################
    if ($stbv_opt['stbv_validateURL'] == '1') {
        # Nothing left to verify once the trackback is already known to be
        # spam, so Snoopy is only loaded and instantiated when still needed.
        if (!$stbv_val['is_spam']) {
            if (!stbv_loadSnoopy()) {
                // Loading snoopy failed
                $stbv_val['log_info'][]['warning'] = 'Loading PHP Snoopy class failed. Phase 2 skipped.';
                $stbv_val['snoopy_problem'] = true;
            } else {
                $stbvSnoopy = new Snoopy();
                # Fetch all URLs of the author's web page ("@": fetch problems
                # are handled through the return value, not through warnings)
                if (!@$stbvSnoopy->fetchlinks($stbv_val['comment_author_url'])) {
                    // Snoopy couldn't reach the target website, Snoopy error occurred, or something else...
                    $stbv_val['log_info'][]['warning'] = 'Snoopy couldn\'t find something on the source website or Snoopy error occurred. Phase 2 skipped.';
                    $stbv_val['snoopy_problem'] = true;
                } elseif (is_array($stbvSnoopy->results)) {
                    # Check if URL array contains link to website
                    $loopSuccess = false;
                    foreach ($stbvSnoopy->results as $loopUrl) {
                        // Remove trailing slash, "/trackback" and "/trackback/"
                        $loopUrl = preg_replace('/(\\/|\\/trackback|\\/trackback\\/)$/', '', $loopUrl);
                        if (is_array($stbv_blogurlsArray)) {
                            // Open mode: accept a link that starts with any of the configured URLs
                            foreach ($stbv_blogurlsArray as $loopOptionsURL) {
                                if (substr($loopUrl, 0, strlen($loopOptionsURL)) == $loopOptionsURL) {
                                    $loopSuccess = true;
                                    break 2;
                                }
                            }
                        } elseif ($loopUrl == $stbv_val['comment_post_permalink']) {
                            // Strict mode or no URLs provided so we check strictly the permalink only!
                            $loopSuccess = true;
                            break;
                        }
                    }
                    if (!$loopSuccess) {
                        $stbv_val['log_info'][]['info'] = 'The target URL was not found on the source website, therefore the trackback is considered to be spam.';
                        $stbv_val['is_spam'] = true;
                    } else {
                        $stbv_val['log_info'][]['info'] = 'The trackback is considered to be valid: URL was found on the source website.';
                    }
                }
            }
        }
    } else {
        $stbv_val['log_info'][]['info'] = 'URL validation (Phase 2) skipped since it is not enabled in the plugin\'s options.';
    }
    ####################################
    # Now we know if we have a trackback spam or not.
    ####################################
    if ($stbv_opt['stbv_moderrors'] == '1' && $stbv_val['snoopy_problem']) {
        # Could not verify: hold the trackback for moderation
        if ($stbv_opt['stbv_enablelog'] == '1') {
            stbv_log_addentry('Trackback placed into comment moderation due to an occurred problem while retrieving URLs from source website.');
        }
        if ($stbv_opt['stbv_addblockinfo'] == '1') {
            $incomingTB['comment_author'] = '[BLOCKED BY STBV] ' . $incomingTB['comment_author'];
        }
        // Closures replace create_function(), which no longer exists in PHP 8
        add_filter('pre_comment_approved', function ($a) {
            return '0';
        });
        return $incomingTB;
    }
    if (!$stbv_val['is_spam']) {
        # **** No Trackback Spam ***
        if ($stbv_opt['stbv_enablelog'] == '1') {
            stbv_log_addentry('Trackback approved.');
        }
        return $incomingTB;
    }
    # **** It is Trackback Spam ***
    # We put trackback in moderation queue, mark as spam or delete right away
    switch ($stbv_opt['stbv_action']) {
        case 'delete':
            if ($stbv_opt['stbv_enablelog'] == '1') {
                stbv_log_addentry('Trackback discarded.');
            }
            die('Your trackback has been rejected.');
        case 'spam':
            if ($stbv_opt['stbv_enablelog'] == '1') {
                stbv_log_addentry('Trackback marked as spam.');
            }
            if ($stbv_opt['stbv_addblockinfo'] == '1') {
                $incomingTB['comment_author'] = '[BLOCKED BY STBV] ' . $incomingTB['comment_author'];
            }
            add_filter('pre_comment_approved', function ($a) {
                return 'spam';
            });
            return $incomingTB;
        default:
            if ($stbv_opt['stbv_enablelog'] == '1') {
                stbv_log_addentry('Trackback placed into comment moderation.');
            }
            if ($stbv_opt['stbv_addblockinfo'] == '1') {
                $incomingTB['comment_author'] = '[BLOCKED BY STBV] ' . $incomingTB['comment_author'];
            }
            add_filter('pre_comment_approved', function ($a) {
                return '0';
            });
            return $incomingTB;
    }
}
示例#6
0
/**
 * Fetch the links of one page and record them in the crawl tables.
 *
 * Every link returned by check_wai() is stored in ve123_links_temp if it is
 * not there yet. Links that also pass check_include() are reported in the
 * HTML output and, when new and valid, inserted into ve123_links (level 1).
 *
 * @param string $link             Page to fetch; nothing is done unless is_url() accepts it.
 * @param string $old              Original page; only its site URL is printed.
 * @param mixed  $numm             Passed through to check_wai().
 * @param mixed  $ooo              Unused; kept for backward compatibility with callers.
 * @param mixed  $site_id          Site id stored with every inserted row.
 * @param string $include_word     Passed through to check_include().
 * @param string $not_include_word Passed through to check_include().
 *
 * @return false|null False when $link is not a URL; otherwise nothing.
 */
function add_links_insite($link, $old, $numm, $ooo, $site_id, $include_word, $not_include_word)
{
    if (!is_url($link)) {
        return false;
    }
    global $db;

    $url_old = GetSiteUrl($old);
    echo "原始页=" . $url_old . " - - <";
    echo "首层 id=" . $site_id . "> - - <";
    echo "包含字段=" . $include_word . ">";
    echo "<br>";

    // Links are fetched with the third-party Snoopy class.
    $snoopy = new Snoopy();
    $snoopy->fetchlinks($link);
    $links = check_wai($snoopy->results, $numm, $link);
    // A failed fetch may leave us without a list; array_unique() needs an array.
    $links = is_array($links) ? array_values(array_unique($links)) : array();

    foreach ($links as $value) {
        // The links come from a remote page, so they must never reach the SQL text
        // or the HTML output verbatim.
        // NOTE(review): addslashes() is only a stop-gap; switch to the escaping or
        // parameter binding of the $db layer if it offers one.
        $row = $db->get_one("select * from ve123_links_temp where url='" . addslashes($value) . "'");
        if (empty($row)) {
            $db->insert("ve123_links_temp", array('url' => $value, 'site_id' => $site_id));
        }

        // ve123_links stores URLs without a trailing slash.
        $value = rtrim($value, "/");
        if (!check_include($value, $include_word, $not_include_word)) {
            continue;
        }

        $row = $db->get_one("select * from ve123_links where url='" . addslashes($value) . "'");
        if (empty($row) && is_url($value)) {
            echo "<font color=#C60A00><b>抓取到:</b></font>";
            $db->insert("ve123_links", array('url' => $value, 'site_id' => $site_id, 'level' => '1'));
        } else {
            echo "<b>已存在了:</b>";
        }
        $safe = htmlspecialchars($value, ENT_QUOTES);
        echo "<a href=\"" . $safe . "\" target=_blank>" . $safe . "</a><br>";

        // Push progress to the browser; ob_flush() raises a notice without an active buffer.
        if (ob_get_level() > 0) {
            ob_flush();
        }
        flush();
    }
}
示例#7
0
/**
 * Decide whether an incoming comment, trackback or pingback is spam.
 *
 * Trackbacks/pingbacks are optionally checked two ways: the sender's IP must
 * equal the IP of the author URL's host, and the author's page must link to the
 * post's permalink. Ordinary comments must carry one of the currently valid
 * hashcash keys in $_POST['wphc_value']. Spam/ham counters are saved through
 * wphc_option(), and for spam a filter is registered that deletes, marks as
 * spam or moderates the comment according to the 'moderation' option.
 *
 * @param array $comment Comment data; reads 'comment_type', 'comment_author_url'
 *                       and 'comment_post_ID'; may append to 'comment_content'.
 *
 * @return array The comment data, with a log note appended to 'comment_content'
 *               when logging is enabled and the comment was judged spam.
 */
function wphc_check_hidden_tag($comment)
{
    // admins can do what they like
    if (is_admin()) {
        return $comment;
    }
    // get our options
    $type = $comment['comment_type'];
    $options = wphc_option();
    $spam = false;
    if ($type == "trackback" || $type == "pingback") {
        // check the website's IP against the url it's sending as a trackback
        if ($options['validate-ip']) {
            // NOTE(review): X-Forwarded-For is supplied by the client and can be
            // forged; only trust it when running behind a known proxy.
            $server_ip = isset($_SERVER['HTTP_X_FORWARDED_FOR']) ? $_SERVER['HTTP_X_FORWARDED_FOR'] : $_SERVER['REMOTE_ADDR'];
            // parse_url() yields no host for malformed URLs; such a URL resolves to nothing.
            $host = parse_url($comment['comment_author_url'], PHP_URL_HOST);
            $web_ip = is_string($host) ? gethostbyname($host) : '';
            $ipv = $server_ip != $web_ip;
            $spam = $spam || $ipv;
            if ($options['logging'] && $ipv) {
                $comment['comment_content'] .= "\n\n[WORDPRESS HASHCASH] The comment's server IP (" . $server_ip . ") doesn't match the" . " comment's URL host IP (" . $web_ip . ") and so is spam.";
            }
        }
        // look for our link in the page itself
        if (!$spam && $options['validate-url']) {
            if (!class_exists('Snoopy')) {
                require_once ABSPATH . WPINC . '/class-snoopy.php';
            }
            $permalink = get_permalink($comment['comment_post_ID']);
            $permalink = preg_replace('/\\/$/', '', $permalink);
            $snoop = new Snoopy();
            if (@$snoop->fetchlinks($comment['comment_author_url'])) {
                $found = false;
                if (is_array($snoop->results)) {
                    foreach ($snoop->results as $url) {
                        // Ignore a trailing slash, "/trackback" or "/trackback/" when comparing
                        $url = preg_replace('/(\\/|\\/trackback|\\/trackback\\/)$/', '', $url);
                        if ($url == $permalink) {
                            $found = true;
                            break;
                        }
                    }
                }
                if ($options['logging'] && !$found) {
                    $comment['comment_content'] .= "\n\n[WORDPRESS HASHCASH] The comment's actual post text did not contain your blog url (" . $permalink . ") and so is spam.";
                }
                $spam = $spam || !$found;
            } else {
                $spam = true;
                if ($options['logging']) {
                    $comment['comment_content'] .= "\n\n[WORDPRESS HASHCASH] Snoopy failed to fetch results for the comment blog url (" . $comment['comment_author_url'] . ") with error '" . $snoop->error . "' and so is spam.";
                }
            }
        }
    } else {
        // Check the wphc value against the currently valid keys. A missing or
        // non-scalar value never matches, and values are compared as exact
        // strings so that look-alikes such as "123abc" are not accepted.
        $submitted = (isset($_POST["wphc_value"]) && is_scalar($_POST["wphc_value"])) ? trim((string) $_POST["wphc_value"]) : '';
        $valid_keys = array_map('strval', (array) $options['key']);
        $spam = !in_array($submitted, $valid_keys, true);
        if ($options['logging'] && $spam) {
            $comment['comment_content'] .= "\n\n[WORDPRESS HASHCASH] The poster sent us '" . intval($submitted) . "' which is not a hashcash value.";
        }
    }
    if ($spam) {
        $options['comments-spam'] = (int) $options['comments-spam'] + 1;
        wphc_option($options);
        // Closures replace create_function(), which no longer exists in PHP 8
        switch ($options['moderation']) {
            case 'delete':
                add_filter('comment_post', function ($id) {
                    wp_delete_comment($id);
                    die('This comment has been deleted by WP Hashcash');
                });
                break;
            case 'akismet':
                add_filter('pre_comment_approved', function ($a) {
                    return 'spam';
                });
                break;
            case 'moderate':
            default:
                add_filter('pre_comment_approved', function ($a) {
                    return 0;
                });
                break;
        }
    } else {
        $options['comments-ham'] = (int) $options['comments-ham'] + 1;
        wphc_option($options);
    }
    return $comment;
}
示例#8
0
文件: crawler.php 项目: Jiumiking/qw
 /**
  * Collect the JD item links found on a page and store their scraped data.
  *
  * Links are URL-decoded and kept when their host is exactly "item.jd.com"
  * and they carry no fragment. The unique links are handed to jd_data(), and
  * any data it returns is passed on to jd_inserts().
  *
  * @param string $url Page to fetch links from via Snoopy::fetchlinks().
  *
  * @return bool|array True when at least one item link was found; an empty
  *                    array otherwise.
  */
 public function jd_item($url)
 {
     $snoopy = new Snoopy();
     $snoopy->fetchlinks($url);
     $link_all = $snoopy->results;
     if (empty($link_all) || !is_array($link_all)) {
         return array();
     }

     $link_item = array();
     foreach ($link_all as $value) {
         $link_decode = urldecode($value);
         $link_info = parse_url($link_decode);
         // parse_url() returns false for seriously malformed URLs and leaves out
         // 'host' for relative or mailto:-style links; neither can be an item page.
         if (!is_array($link_info) || !isset($link_info['host'])) {
             continue;
         }
         if ($link_info['host'] == 'item.jd.com' && empty($link_info['fragment'])) {
             $link_item[] = $link_decode;
         }
     }
     if (empty($link_item)) {
         return array();
     }

     $link_item = array_unique($link_item);
     $data = $this->jd_data($link_item);
     if (!empty($data)) {
         $this->jd_inserts($data);
     }
     return true;
 }