/**
 * Parses a stored ad HTML page into an Ad, saves it and flags the HTML row as parsed.
 *
 * Reads, in order: page metadata, living area, the cost table, the address
 * (geocoded through Maps::oGetCoords), the gallery images (downloaded into
 * self::$sImagesFolder when not already on disk), the free-text description
 * and the "Angebot vom" date. Existing ads for the same URL are removed
 * before the new one is saved.
 *
 * @param int $iHtmlID Identifier handed to Ad::oGetHtml() to load the stored page.
 *
 * @return Ad The populated ad, after vSave() has been called on it.
 *
 * @throws \RuntimeException When the HTML cannot be parsed or the main info row is missing.
 */
public static function oParseHtml($iHtmlID)
{
    $oHtml = Ad::oGetHtml($iHtmlID);

    $oAd = new Ad();
    $oAd->oPage->iHtmlID = $oHtml->id;
    $oAd->oPage->sListID = $oHtml->list;
    $oAd->oPage->sFetched = $oHtml->fetched;
    $oAd->oPage->sDomain = $oHtml->domain_hash;
    $oAd->oPage->sUrl = $oHtml->url;

    $oDom = HtmlDomParser::str_get_html($oHtml->html);
    if (!$oDom) {
        // Fail with a catchable error instead of a fatal "member function on non-object".
        throw new \RuntimeException('Could not parse HTML #' . $iHtmlID);
    }

    $oMainInfo = $oDom->find('.panel-body > .row', 0);
    if (!$oMainInfo) {
        // Costs and address both live in this row; without it there is nothing to parse.
        throw new \RuntimeException('Main info row not found in HTML #' . $iHtmlID);
    }

    // --- Living area ---------------------------------------------------
    $oHeadline = $oDom->find('.headline-key-facts', 0);
    $sOrange = $oHeadline ? $oHeadline->innertext : '';
    $sSquareMeters = Utilitu::sPregRead('#röße:\\s+([,\\d]+)m#', $sOrange);
    // The pattern captures a decimal comma; floatval() only understands a dot.
    $oAd->oPhysical->nSquareMeters = floatval(str_replace(',', '.', (string) $sSquareMeters));

    // --- Costs -----------------------------------------------------------
    $aKostenRowsByLabel = array();
    foreach ($oMainInfo->find('.col-sm-5 tbody tr') as $oKostenRow) {
        $aCells = $oKostenRow->find('td');
        if (count($aCells) < 2) {
            // Not a label/value row (e.g. a header or spacer row).
            continue;
        }

        $sLabel = trim($aCells[0]->plaintext, "\t\n :");
        $sValue = trim($aCells[1]->plaintext, "\t\n ");

        // Normalise the amount: drop the currency sign, treat dots as thousands
        // separators when a decimal comma is present, then switch to a dot.
        $sNumber = str_replace('€', '', $sValue);
        if (strpos($sNumber, ',') !== false) {
            $sNumber = str_replace('.', '', $sNumber);
        }
        $sNumber = str_replace(',', '.', $sNumber);
        // Convert to cents AFTER parsing the fraction so "450,50" becomes 45050, not 45000.
        $iPrice = (int) round(100 * floatval($sNumber));

        $aKostenRowsByLabel[$sLabel] = array(
            'oDom' => $oKostenRow,
            'sLabel' => $sLabel,
            'sValue' => $sValue,
            'iPrice' => $iPrice,
        );
    }

    // Target price property => label used in the cost table.
    $aKostenMap = array(
        'iCold' => 'Miete',
        'iNeben' => 'Nebenkosten',
        'iOther' => 'Sonstige Kosten',
        'iBail' => 'Kaution',
        'iBuy' => 'Abschlagszahlung',
    );
    foreach ($aKostenMap as $sTarget => $sSource) {
        if (isset($aKostenRowsByLabel[$sSource])) {
            $oAd->oPrice->{$sTarget} = $aKostenRowsByLabel[$sSource]['iPrice'];
        }
    }
    $oAd->oPrice->iWarm = $oAd->oPrice->iCold + $oAd->oPrice->iNeben;

    // --- Address ---------------------------------------------------------
    $oAddressDom = $oMainInfo->find('.col-sm-4 > p', 0);
    $sAddressHtml = $oAddressDom ? $oAddressDom->innertext : '';
    $sAddress = trim($sAddressHtml);
    $sAddress = str_replace("\n", '', $sAddress);
    // Each <br> followed by whitespace becomes a line break, so the block splits into lines.
    $sAddress = preg_replace('#<br ?/?>\\s+#', "\n", $sAddress);
    $aAddress = explode("\n", $sAddress);

    $sCity = 'Aachen';
    $oAd->oAddress->sCity = $sCity;
    $oAd->oAddress->sZip = Utilitu::sPregRead('#\\s*(\\d+)#', $aAddress[0]);
    // The second line may be absent when the address block has no <br>.
    $oAd->oAddress->sStreet = isset($aAddress[1]) ? trim($aAddress[1]) : '';

    $sGeocodeAddress = $oAd->oAddress->sStreet . ', ' . $oAd->oAddress->sZip . ' ' . $sCity;
    $oAd->oAddress->oCoords = Maps::oGetCoords($sGeocodeAddress);

    // --- Images ----------------------------------------------------------
    foreach ($oDom->find('img.sp-image') as $oImageDom) {
        if (!isset($oImageDom->attr['data-large'])) {
            continue;
        }

        $oImage = new StdClass();
        $oImage->sUrl = str_replace('/./', '/', $oImageDom->attr['data-large']);
        $sFileType = Utilitu::sPregRead('#\\.([^\\.]+)$#', $oImage->sUrl);
        // The local file name is derived from the URL hash, so a given URL is fetched once.
        $oImage->sFile = self::$sImagesFolder . md5($oImage->sUrl) . '.' . $sFileType;

        if (!file_exists($oImage->sFile)) {
            $sImage = Curl::sGet($oImage->sUrl);
            if (Curl::iGetLastStatus() == 200) {
                file_put_contents($oImage->sFile, $sImage);
            }
        }

        // Only images that actually exist on disk are attached to the ad.
        if (file_exists($oImage->sFile)) {
            $oAd->oPage->aImages[] = $oImage;
        }
    }

    // --- Description -----------------------------------------------------
    $aDescription = array();
    $oInfobox = $oDom->find('#infobox_nachrichtsenden', 0);
    $aDescriptionBlocks = ($oInfobox && $oInfobox->parent) ? $oInfobox->parent->find('.freitext') : array();
    foreach ($aDescriptionBlocks as $oDescriptionBlock) {
        $aDescription[] = $oDescriptionBlock->plaintext;
    }
    $sDescription = implode("\n\n", $aDescription);
    $sDescription = preg_replace('#\\n\\s+#', "\n", $sDescription);
    // NOTE(review): the previous code called str_replace(' ', '', ...), which as written
    // deletes every space and glues all words together. Presumably only non-breaking-space
    // residue was meant, so only that is removed now — confirm against real pages.
    $sDescription = str_replace(array('&nbsp;', "\xC2\xA0"), '', $sDescription);
    $oAd->oPage->sDescription = $sDescription;

    // --- Offer date ------------------------------------------------------
    foreach ($oDom->find('.col-sm-4 .col-sm-12') as $oPotentialDate) {
        if (!preg_match('#^\\s*Angebot vom:\\s*(.+)\\s*$#', $oPotentialDate->plaintext, $aMatch)) {
            continue;
        }
        $iTimestamp = strtotime(trim($aMatch[1]));
        if ($iTimestamp === false) {
            // Unparseable date: leave the fields untouched rather than storing 1970-01-01.
            continue;
        }
        $sDate = date('Y-m-d H:i:s', $iTimestamp);
        $oAd->oPage->sCreated = $sDate;
        $oAd->oPage->sChanged = $sDate;
    }

    /// TODO: oContact

    // Replace any previously stored ad for this URL, then mark the HTML row as processed.
    Ad::iRemoveAdsByUrl($oHtml->url);
    $oAd->vSave();
    DirectDB::bUpdate('ads_htmls', array('parsed' => true), $oHtml->id);

    return $oAd;
}
$ad = new App\Ad(); $ad->id = $id; $ad->title = $title; $ad->description = $description; $ad->price = $price; $ad->link = $link; $ad->save(); echo "Added " . $id . " {$price}<br/>"; } else { echo "Item already on database " . $id . "<br/>"; } } #return false; }); Route::get('htmlparser', function () { $parser = new HtmlDomParser(); $html = $parser->file_get_html('http://www.kijiji.ca/v-mountain-bike/kitchener-waterloo/eranger-electric-mid-drive-fat-bike-48v-750w/1101241419'); echo $html->plaintext; foreach ($html->find('span[itemprop=price]') as $span) { $price = $span->plaintext; } echo $price; foreach ($html->find('div[id=ImageThumbnails] img') as $img) { $src = str_replace('$_14', '$_27', $img->src); echo "<img src='{$src}'>"; } }); Route::get('mailtest', function () { $ads = App\Ad::whereEmailed(false)->get(); foreach ($ads as $ad) { $data['ad'] = $ad;
/**
 * Looks up where a site ranks in Baidu search results for a keyword.
 *
 * Fetches up to $limit result pages, walks the `div.c-container` entries in
 * page order and returns the running position of the first entry whose
 * displayed-URL span contains $url. A page that cannot be loaded is retried
 * up to three times and then skipped.
 *
 * @param string $keyword Keyword to search for.
 * @param string $url     Site to look for; matched as a substring of the result's displayed URL text.
 * @param int    $limit   Maximum number of result pages to scan.
 *
 * @return string|int 1-based position among the scanned results, or a
 *                    message string when the site was not found within $limit pages.
 */
public function getRank($keyword = '阿瑞吡坦', $url = 'www.c-aring.com', $limit = 1)
{
    Vendor('HtmlDomParser.HtmlDomParser');

    $maxRetries = 3; // one initial attempt plus up to three retries per page
    $index = 0;

    for ($p = 0; $p < $limit; $p++) {
        $pageUrl = 'http://www.baidu.com/s?wd=' . urlencode($keyword) . '&pn=' . ($p * 10);

        // Fetch the page, retrying on failure; give up on this page once retries are exhausted.
        $html = false;
        for ($attempt = 0; $attempt <= $maxRetries; $attempt++) {
            $candidate = \HtmlDomParser::file_get_html($pageUrl);
            // file_get_html may hand back a non-object on failure, so test it before reading ->root.
            if ($candidate && $candidate->root) {
                $html = $candidate;
                break;
            }
        }
        if (!$html) {
            continue;
        }

        foreach ($html->find('div[class=c-container]') as $e) {
            $index++;

            // The displayed URL sits in span.g, with span.c-showurl as the fallback.
            $url_span = $e->find('span[class=g]');
            if (count($url_span) == 0) {
                $url_span = $e->find('span[class=c-showurl]');
            }
            if (count($url_span) == 0) {
                // Entry without a displayed URL: it still counts as a position but cannot match.
                continue;
            }

            if (strpos($url_span[0]->plaintext, $url) !== false) {
                return $index;
            }
        }
    }

    return '不在前' . $limit . '页';
}