コード例 #1
0
// Looks up the share count of the URL passed in $_GET['url'] on the network
// selected by $type and prints the accumulated $json array as JSON.
// NOTE(review): $type, $url and $json are expected to be initialised earlier
// in this script (not visible in this chunk) -- confirm before relying on them.
if (isset($_GET['url']) && filter_var($_GET['url'], FILTER_VALIDATE_URL)) {
    if ($type == 'stumbleupon') {
        // NOTE(review): $url is interpolated as-is; verify it is already URL-encoded.
        $content = parse("http://www.stumbleupon.com/services/1.01/badge.getinfo?url={$url}");
        $result = json_decode($content);
        if (isset($result->result->views)) {
            $json['count'] = $result->result->views;
        }
    }
    if ($type == 'googlePlus') {
        //source http://www.helmutgranda.com/2011/11/01/get-a-url-google-count-via-php/
        $content = file_get_contents("https://plusone.google.com/u/0/_/+1/fastbutton?url=" . urlencode($_GET['url']) . "&count=true");
        $num = '';
        // file_get_contents() returns false on failure and loadHTML() rejects
        // empty input, so only parse when there is a real document.
        if ($content !== false && $content !== '') {
            $doc = new DOMDocument();
            // Keep libxml parse warnings out of the JSON response body.
            libxml_use_internal_errors(true);
            $doc->loadHTML($content);
            libxml_clear_errors();
            // getElementById() yields null when the counter element is absent;
            // reading a property on null would emit a warning into the output.
            $counter = $doc->getElementById('aggregateCount');
            if ($counter !== null) {
                $num = $counter->textContent;
            }
        }
        // An empty or missing counter is reported as the string '0'.
        $json['count'] = $num ? $num : '0';
    }
}
// Emit the JSON with escaped slashes ("\/") turned back into plain "/".
echo str_replace('\\/', '/', json_encode($json));
function parse($encUrl)
{
    $options = array(CURLOPT_RETURNTRANSFER => true, CURLOPT_HEADER => false, CURLOPT_FOLLOWLOCATION => true, CURLOPT_ENCODING => "", CURLOPT_USERAGENT => 'sharrre', CURLOPT_AUTOREFERER => true, CURLOPT_CONNECTTIMEOUT => 5, CURLOPT_TIMEOUT => 10, CURLOPT_MAXREDIRS => 3, CURLOPT_SSL_VERIFYHOST => 0, CURLOPT_SSL_VERIFYPEER => false);
    $ch = curl_init();
    $options[CURLOPT_URL] = $encUrl;
    curl_setopt_array($ch, $options);
    $content = curl_exec($ch);
    $err = curl_errno($ch);
    $errmsg = curl_error($ch);
    curl_close($ch);
    if ($errmsg != '' || $err != '') {
コード例 #2
0
    foreach ($nodeParent->childNodes as $filho) {
        if ($filho->hasAttributes()) {
            if (strcmp($filho->getAttribute('class'), $className) === 0) {
                return $filho;
            }
        }
    }
}
/**
 * Prints a plain-text summary of each Reddit post node to standard output.
 *
 * For every element in $redditPosts this echoes the subreddit, author, rank,
 * score and the href of the first <a> descendant, framed by separator lines.
 * Nodes that are not DOMElement instances (for example whitespace text nodes)
 * are skipped because they have no attributes to read. A value whose node
 * cannot be found is printed as an empty string.
 *
 * @param DOMNodeList|array $redditPosts Post nodes to print.
 * @return void
 */
function imprimirPosts($redditPosts)
{
    foreach ($redditPosts as $post) {
        if (!($post instanceof DOMElement)) {
            // getAttribute() only exists on elements; text/comment nodes would be fatal.
            continue;
        }

        // Rank: value of the post's second child node.
        $rankNode = $post->firstChild !== null ? $post->firstChild->nextSibling : null;
        $rank = $rankNode !== null ? $rankNode->nodeValue : '';

        // Score: third child node of the "midcol unvoted" container. The lookup
        // helper may return null, so walk the chain one step at a time.
        $scoreNode = obterFilhoPeloValorDoAtributoClass($post, "midcol unvoted");
        if ($scoreNode !== null) {
            $scoreNode = $scoreNode->firstChild;
        }
        for ($step = 0; $step < 2 && $scoreNode !== null; $step++) {
            $scoreNode = $scoreNode->nextSibling;
        }
        $score = $scoreNode !== null ? $scoreNode->nodeValue : '';

        // Link: href of the first anchor inside the post, if there is one.
        $anchor = $post->getElementsByTagName('a')->item(0);
        $link = $anchor !== null ? $anchor->getAttribute('href') : '';

        echo "---------------------------------------" . "\n";
        echo "subrredit: " . $post->getAttribute('data-subreddit') . "\n";
        echo "author:    " . $post->getAttribute('data-author') . "\n";
        echo "rank:      " . $rank . "\n";
        echo "score:     " . $score . "\n";
        echo "link:      " . $link . "\n";
        // The closing separator is deliberately printed without a newline,
        // exactly as before, so the output format is unchanged.
        echo "---------------------------------------";
    }
}
// Download the Reddit page, isolate the post container and print every post.
$html = consumirUrl(REDDIT_URL);
if (!is_string($html) || $html === '') {
    // loadHTML() cannot parse a failed/empty download; fail with a clear message.
    throw new RuntimeException('Could not download ' . REDDIT_URL);
}
$dom = new DOMDocument();
// Badly structured HTML is expected: collect libxml warnings internally
// rather than hiding every error with the @ operator.
libxml_use_internal_errors(true);
$dom->loadHTML($html);
libxml_clear_errors();
// siteTable is the div that holds the posts
$redditMainContent = $dom->getElementById('siteTable');
if ($redditMainContent === null) {
    // Without the container there is nothing to clean up or print.
    throw new RuntimeException("Element 'siteTable' was not found in the downloaded page");
}
// the goal is to remove the div elements that have nothing inside them
retirarChildNodesSemId($redditMainContent);
$redditPosts = $redditMainContent->childNodes;
imprimirPosts($redditPosts);
コード例 #3
0
/**
 * Scrapes one Amazon product page and appends the extracted fields as a row
 * to a CSV file in the WordPress upload directory.
 *
 * CSV columns, in order: ASIN, title, brand, list price, price, you-save,
 * shipping, category, description, feature, image1 ... image10.
 *
 * Nothing is written when the page cannot be downloaded, when it contains no
 * span with class "a-size-medium a-color-success" (used here as the
 * availability marker), or when the CSV file cannot be opened.
 *
 * Side effects: increments $GLOBALS['count'] for every product that passes
 * the availability check and sleeps 5 seconds on every 100th one.
 *
 * @param string $ASIN_URL URL of the product page to fetch.
 * @param string $ASIN     Product identifier, written as the first CSV column.
 * @param string $uniqid   Base name (without extension) of the CSV file.
 * @return void
 */
function crawl_product_details($ASIN_URL, $ASIN, $uniqid)
{
    // Crawl details from amazon
    try {
        $body = file_get_contents($ASIN_URL);
        if ($body === false || $body === '') {
            // Download failed; loadHTML() rejects empty input, so stop here.
            return;
        }

        $doc = new DOMDocument();
        libxml_use_internal_errors(true);
        $doc->loadHTML($body);
        // Drop the buffered parse errors so they do not pile up across calls.
        libxml_clear_errors();
        $docxpath = new DOMXPath($doc);

        // Skip products that do not show the availability marker.
        $availability = $docxpath->query('//span[@class="a-size-medium a-color-success"]');
        if ($availability === false || $availability->length === 0) {
            return;
        }

        // Text content of the element with the given id, or '' when absent.
        $textById = function ($id) use ($doc) {
            $node = $doc->getElementById($id);
            return $node !== null ? $node->textContent : '';
        };
        $title = $textById('productTitle');
        $brand = $textById('brand');
        $price = $textById('priceblock_ourprice');
        $you_save = $textById('regularprice_savings');
        $feature = $textById('feature-bullets');

        // Collect up to ten image URLs containing 'L.jpg' from the main
        // image's data-a-dynamic-image attribute; unused slots stay ''.
        $images = array_fill(0, 10, '');
        $image_element = $doc->getElementById('landingImage');
        if ($image_element !== null) {
            $regex = '/\\b(https?|ftp|file|http):\\/\\/[-A-Z0-9+&@#\\/%?=~_|$!:,.;]*[A-Z0-9+&@#\\/%=~_|$]/i';
            preg_match_all($regex, $image_element->getAttribute('data-a-dynamic-image'), $matches);
            $slot = 0;
            foreach ($matches[0] as $url) {
                if ($slot >= 10) {
                    break;
                }
                if (strpos($url, 'L.jpg') !== false) {
                    $images[$slot] = $url;
                    $slot++;
                }
            }
        }

        // List price: the last matching cell wins.
        $list_price = '';
        $list_prices = $docxpath->query('//td[@class="a-span12 a-color-secondary a-size-base a-text-strike"]');
        foreach ($list_prices as $list_price_node) {
            $list_price = $list_price_node->textContent;
        }

        // Shipping defaults to "FREE" and is replaced by the page's shipping
        // amount (letters and '+' stripped) when one is present. Previously
        // the computed amount was thrown away and "FREE" was always written.
        $shipping = "FREE";
        $shipping_prices = $docxpath->query('//span[@class="a-size-small a-color-secondary shipping3P"]');
        foreach ($shipping_prices as $shipping_node) {
            $shipping_price = preg_replace('/[a-zA-Z]+/', '', $shipping_node->textContent);
            $shipping_price = trim(str_replace('+', '', $shipping_price));
            if ($shipping_price !== '') {
                $shipping = $shipping_price;
            }
        }

        // Category: the last matching navigation link wins.
        $category = '';
        $category_element = $docxpath->query('//a[@class="nav-a nav-b"]');
        foreach ($category_element as $category_meta) {
            $category = $category_meta->textContent;
        }

        // Description comes from <meta name="description" content="...">.
        $description = '';
        $description_element = $doc->getElementsByTagName('meta');
        foreach ($description_element as $description_meta) {
            if ($description_meta->getAttribute('name') === 'description') {
                $description = $description_meta->getAttribute('content');
            }
        }

        $product = array_merge(
            array($ASIN, $title, $brand, $list_price, $price, $you_save, $shipping, $category, $description, $feature),
            $images
        );

        // Throttle: pause for five seconds after every 100 crawled products.
        $GLOBALS['count'] = (isset($GLOBALS['count']) ? $GLOBALS['count'] : 0) + 1;
        if ($GLOBALS['count'] % 100 == 0) {
            sleep(5);
        }

        $upload_dir = wp_upload_dir();
        $fileOutput = $upload_dir['path'] . '/' . $uniqid . '.csv';
        $fp = fopen($fileOutput, "a");
        if ($fp === false) {
            // fopen() has already raised a warning; there is no handle to write to.
            return;
        }
        fputcsv($fp, $product);
        fclose($fp);
    } catch (Exception $e) {
        print $e;
    }
}