Example #1
0
 /**
  * Parse a stored HTML page into an Ad, persist it and mark the page as parsed.
  *
  * Loads the HTML record via Ad::oGetHtml(), extracts size, prices, address,
  * images, description and offer date from the DOM, removes previously stored
  * ads for the same URL, saves the new ad and flags the 'ads_htmls' row as
  * parsed.
  *
  * NOTE(review): the DOM lookups with index 0 are dereferenced without a null
  * check, so a page whose layout lacks those elements will still fail here.
  *
  * @param mixed $iHtmlID Identifier handed to Ad::oGetHtml().
  * @return Ad The populated ad after vSave() has been called.
  */
 public static function oParseHtml($iHtmlID)
 {
     $oHtml = Ad::oGetHtml($iHtmlID);
     $oAd = new Ad();

     // Copy page metadata from the stored HTML record.
     $oAd->oPage->iHtmlID = $oHtml->id;
     $oAd->oPage->sListID = $oHtml->list;
     $oAd->oPage->sFetched = $oHtml->fetched;
     $oAd->oPage->sDomain = $oHtml->domain_hash;
     $oAd->oPage->sUrl = $oHtml->url;

     $oDom = HtmlDomParser::str_get_html($oHtml->html);
     $oMainInfo = $oDom->find('.panel-body > .row', 0);

     // Size: the pattern captures digits and commas, so a decimal comma must be
     // turned into a dot before floatval(), which would otherwise stop at it.
     $sOrange = $oDom->find('.headline-key-facts', 0)->innertext;
     $sSquareMeters = Utilitu::sPregRead('#röße:\\s+([,\\d]+)m#', $sOrange);
     $oAd->oPhysical->nSquareMeters = floatval(str_replace(',', '.', (string) $sSquareMeters));

     // Costs table: index every row by its label.
     $aKostenRows = $oMainInfo->find('.col-sm-5 tbody tr');
     $aKostenRowsByLabel = array();
     foreach ($aKostenRows as $oKostenRow) {
         $aCells = $oKostenRow->find('td');
         if (count($aCells) < 2) {
             // Not a label/value row (e.g. a header or spacer row).
             continue;
         }
         $sLabel = trim($aCells[0]->plaintext, "\t\n :");
         $sValue = trim($aCells[1]->plaintext, "\t\n ");
         // Price in cents. floatval() + round() keeps the fractional part that
         // intval() used to drop ("350,50 €" is 35050, not 35000).
         // NOTE(review): values using '.' as a thousands separator are not handled.
         $iPrice = (int) round(100 * floatval(str_replace(array(',', '€'), array('.', ''), $sValue)));
         $aKostenRowsByLabel[$sLabel] = array('oDom' => $oKostenRow, 'sLabel' => $sLabel, 'sValue' => $sValue, 'iPrice' => $iPrice);
     }

     // Map table labels onto the price fields of the ad.
     $aKostenMap = array('iCold' => 'Miete', 'iNeben' => 'Nebenkosten', 'iOther' => 'Sonstige Kosten', 'iBail' => 'Kaution', 'iBuy' => 'Abschlagszahlung');
     foreach ($aKostenMap as $sTarget => $sSource) {
         if (isset($aKostenRowsByLabel[$sSource])) {
             $oAd->oPrice->{$sTarget} = $aKostenRowsByLabel[$sSource]['iPrice'];
         }
     }
     $oAd->oPrice->iWarm = $oAd->oPrice->iCold + $oAd->oPrice->iNeben;

     // Address: the paragraph is split on <br> into one entry per line; the
     // first line is read for the zip code, the second is used as the street.
     $sAddressHtml = $oMainInfo->find('.col-sm-4 > p', 0)->innertext;
     $sAddress = trim($sAddressHtml);
     $sAddress = str_replace("\n", '', $sAddress);
     $sAddress = preg_replace('#<br ?/?>\\s+#', "\n", $sAddress);
     $aAddress = explode("\n", $sAddress);
     $oAd->oAddress->sCity = 'Aachen';
     $oAd->oAddress->sZip = Utilitu::sPregRead('#\\s*(\\d+)#', $aAddress[0]);
     // A paragraph without a <br> yields a single entry; fall back to an empty street.
     $oAd->oAddress->sStreet = isset($aAddress[1]) ? trim($aAddress[1]) : '';
     $sGeocodeAddress = $oAd->oAddress->sStreet . ', ' . $oAd->oAddress->sZip . ' ' . 'Aachen';
     $oCoords = Maps::oGetCoords($sGeocodeAddress);
     $oAd->oAddress->oCoords = $oCoords;

     // Images: download each large image once, keyed by the md5 of its URL.
     $aImageDoms = $oDom->find('img.sp-image');
     foreach ($aImageDoms as $oImageDom) {
         if (!isset($oImageDom->attr['data-large'])) {
             continue;
         }
         $oImage = new StdClass();
         $oImage->sUrl = str_replace('/./', '/', $oImageDom->attr['data-large']);
         $sFileType = Utilitu::sPregRead('#\\.([^\\.]+)$#', $oImage->sUrl);
         $oImage->sFile = self::$sImagesFolder . md5($oImage->sUrl) . '.' . $sFileType;
         if (!file_exists($oImage->sFile)) {
             $sImage = Curl::sGet($oImage->sUrl);
             if (Curl::iGetLastStatus() == 200) {
                 file_put_contents($oImage->sFile, $sImage);
             }
         }
         // Only images that actually exist on disk are attached to the ad.
         if (file_exists($oImage->sFile)) {
             $oAd->oPage->aImages[] = $oImage;
         }
     }

     // Description: join all free-text blocks next to the message box.
     $aDescription = array();
     $aDescriptionBlocks = $oDom->find('#infobox_nachrichtsenden', 0)->parent->find('.freitext');
     foreach ($aDescriptionBlocks as $oDescriptionBlock) {
         $aDescription[] = $oDescriptionBlock->plaintext;
     }
     $oAd->oPage->sDescription = implode("\n\n", $aDescription);
     $oAd->oPage->sDescription = preg_replace('#\\n\\s+#', "\n", $oAd->oPage->sDescription);
     $oAd->oPage->sDescription = str_replace('&nbsp;', '', $oAd->oPage->sDescription);

     // Offer date: used for both the created and the changed timestamp.
     $aPotentialDates = $oDom->find('.col-sm-4 .col-sm-12');
     foreach ($aPotentialDates as $oPotentialDate) {
         if (preg_match('#^\\s*Angebot vom:\\s*(.+)\\s*$#', $oPotentialDate->plaintext, $aMatch)) {
             $iTimestamp = strtotime($aMatch[1]);
             if ($iTimestamp === false) {
                 // Unparseable date: leave the fields untouched instead of
                 // storing the 1970 epoch.
                 continue;
             }
             $sDate = date('Y-m-d H:i:s', $iTimestamp);
             $oAd->oPage->sCreated = $sDate;
             $oAd->oPage->sChanged = $sDate;
         }
     }

     /// TODO: oContact
     // Replace any earlier ad stored for this URL, then mark the page as parsed.
     Ad::iRemoveAdsByUrl($oHtml->url);
     $oAd->vSave();
     DirectDB::bUpdate('ads_htmls', array('parsed' => true), $oHtml->id);
     return $oAd;
 }
Example #2
0
            $ad = new App\Ad();
            $ad->id = $id;
            $ad->title = $title;
            $ad->description = $description;
            $ad->price = $price;
            $ad->link = $link;
            $ad->save();
            echo "Added " . $id . " {$price}<br/>";
        } else {
            echo "Item already on database " . $id . "<br/>";
        }
    }
    #return false;
});
// Demo route: fetch one fixed Kijiji listing, print its plain text, its price
// and its thumbnail images (rewritten from the '$_14' to the '$_27' variant).
Route::get('htmlparser', function () {
    $parser = new HtmlDomParser();
    $html = $parser->file_get_html('http://www.kijiji.ca/v-mountain-bike/kitchener-waterloo/eranger-electric-mid-drive-fat-bike-48v-750w/1101241419');
    if (!$html) {
        // file_get_html() yields false when the page cannot be loaded or parsed.
        return 'Unable to load the listing page.';
    }
    echo $html->plaintext;
    // Default to an empty price so the echo below is defined even when the
    // page has no matching span; with several matches the last one wins.
    $price = '';
    foreach ($html->find('span[itemprop=price]') as $span) {
        $price = $span->plaintext;
    }
    echo $price;
    foreach ($html->find('div[id=ImageThumbnails] img') as $img) {
        $src = str_replace('$_14', '$_27', $img->src);
        // The URL comes from a scraped page: escape it before placing it in
        // a single-quoted HTML attribute.
        $src = htmlspecialchars($src, ENT_QUOTES, 'UTF-8');
        echo "<img src='{$src}'>";
    }
});
Route::get('mailtest', function () {
    $ads = App\Ad::whereEmailed(false)->get();
    foreach ($ads as $ad) {
        $data['ad'] = $ad;
 /**
  * Look up the search-result rank of a site for a keyword.
  *
  * Fetches Baidu result pages for the keyword and walks the result containers
  * in order, counting them across pages, until one whose displayed URL
  * contains $url is found.
  *
  * @param  string $keyword Keyword to search for
  * @param  string $url     Site to look for in the results
  * @param  int    $limit   Number of result pages to scan
  * @return string|int      1-based position of the first matching result, or
  *                         a message string when no result matched within
  *                         $limit pages
  */
 public function getRank($keyword = '阿瑞吡坦', $url = 'www.c-aring.com', $limit = 1)
 {
     Vendor('HtmlDomParser.HtmlDomParser');
     $retry = 0;
     $index = 0;
     for ($p = 0; $p < $limit; $p++) {
         $html = \HtmlDomParser::file_get_html('http://www.baidu.com/s?wd=' . urlencode($keyword) . '&pn=' . $p * 10);
         // file_get_html() returns false on failure, so test that before
         // touching ->root.
         if (!$html || !$html->root) {
             if ($retry < 3) {
                 // Fetch failed: step back so the loop increment requests the
                 // same page again (up to 3 retries per page).
                 $p--;
                 $retry++;
             } else {
                 // Retries exhausted: reset the counter and move on to the next page.
                 $retry = 0;
             }
             continue;
         }
         // Fetch succeeded: reset the retry counter.
         $retry = 0;
         $ret = $html->find('div[class=c-container]');
         foreach ($ret as $e) {
             $index++;
             $url_span = $e->find('span[class=g]');
             if (count($url_span) == 0) {
                 // Fall back to the alternative markup for the displayed URL.
                 $url_span = $e->find('span[class=c-showurl]');
             }
             if (count($url_span) == 0) {
                 // No displayed URL in this result: it still counts towards the
                 // rank but cannot match.
                 continue;
             }
             if (strpos($url_span[0]->plaintext, $url) !== false) {
                 return $index;
             }
         }
     }
     return '不在前' . $limit . '页';
 }