/**
 * Fetch one page of search results and wrap every result node in a ListItem.
 *
 * The request URL is the plain concatenation of the config values
 * `maltapark.url` and `maltapark.pageSearch`, the query, the config value
 * `maltapark.pageNum` and the page number, in that order.
 *
 * @param string $query   Search text. When it carries no text at all
 *                        (null, '', false, empty array) nothing is fetched.
 * @param int    $pageNum Page number appended to the URL.
 * @return array One ListItem per node matched by the `#item_list` selector;
 *               an empty array when no query was given.
 */
public static function searchFromNet($query = "", $pageNum = 1)
{
    // empty() alone is also true for "0" and 0, which are valid search
    // terms; only short-circuit on values that hold no search text.
    if (empty($query) && !is_numeric($query)) {
        return [];
    }

    // NOTE(review): $query is inserted into the URL exactly as received;
    // confirm that callers pass it already URL-encoded.
    $url = Config::get('maltapark.url')
        . Config::get('maltapark.pageSearch')
        . $query
        . Config::get('maltapark.pageNum')
        . $pageNum;

    $content = Helpers\FakeBrowser::get($url);

    $crawler = new Crawler(null, $url);
    $crawler->addContent($content, "text/html");

    $items = [];
    foreach ($crawler->filter('#item_list') as $node) {
        $items[] = new ListItem(Helpers\Dom::getHtml($node));
    }

    return $items;
}
/**
 * Crawl a results page and collect every e-mail address that can be found
 * for the entries listed on it.
 *
 * The page URL is read from $this->argument[1]. Progress is echoed while the
 * method goes through these stages:
 *  1. Fetch the page and keep the HTML of every `.result_box` node.
 *  2. Split the entries into those exposing a "?v=e" link and the others.
 *  3. For each "?v=e" link: fetch it, extract the send-email query string,
 *     fetch the send-email iframe and read its `company_email` value.
 *  4. For each other entry: fetch its page and look for a redirector link
 *     pointing at a website.
 *  5. For each website found: look for a `mailto:` on the page returned by
 *     searchContactPage(), then on the home page.
 *
 * Side effects: found addresses are appended to $this->resultMails;
 * $this->noWebSiteOnYP receives the entry URLs without a website link and
 * $this->websiteNoEmail the websites on which no address was found.
 */
public function run()
{
    $url = $this->argument[1];
    $content = FakeBrowser::get($url);

    // Stage 1: HTML of every result box on the page.
    $crawler = new Crawler(null, $url);
    $crawler->addContent($content, "text/html");

    $items = [];
    foreach ($crawler->filter('.result_box') as $node) {
        $items[] = Helpers\Dom::getHtml($node);
    }

    echo "Found " . count($items) . " items";
    echo "\n";
    echo "parsing items\n";

    // Stage 2: classify each entry by the kind of link it exposes.
    $i = 1;
    $withEmail = [];
    $notEmail = [];
    foreach ($items as $item) {
        echo $i . " ";
        $emailUrl = RegExp::getFirstMatch('/;" href="(.+?)\\?v=e/', $item);
        if ($emailUrl !== null) {
            // The capture stops right before the marker, so put it back.
            $emailUrl .= "?v=e";
            $withEmail[] = $emailUrl;
            echo $emailUrl;
            echo " had email";
        } else {
            $notEmailUrl = RegExp::getFirstMatch('/;" href="(.+?)".+><h2>/', $item);
            // Only queue entries whose link was actually found: a null here
            // would later be handed to FakeBrowser::get() as a URL and end
            // up in $this->noWebSiteOnYP.
            if ($notEmailUrl !== null) {
                $notEmail[] = $notEmailUrl;
            }
            echo $notEmailUrl;
            echo " had no email";
        }
        echo "\n";
        $i++;
    }

    // Stage 3: resolve the address behind every "?v=e" link.
    echo "Fetching the email from YellowPage (total:" . count($withEmail) . ")";
    echo "\n";
    $i = 1;
    foreach ($withEmail as $emailUrl) {
        echo "{$i}. ";
        $content = FakeBrowser::get($emailUrl);
        $idMail = RegExp::getFirstMatch('/href=".+send-email.html\\?(.+?)"/', $content);
        if (null !== $idMail) {
            // The address is read from the send-email iframe, which is
            // requested with the query string captured above.
            $ypEmailUrl = 'https://www.yellow.com.mt/development-tools/send-email-iframe.html?';
            $ypEmailUrl .= $idMail;
            $iframe = FakeBrowser::get($ypEmailUrl);
            $email = RegExp::getFirstMatch('/company_email" value="(.+?)"/', $iframe);
            if ($email !== null) {
                echo " found email: {$email}\n";
                $this->resultMails[] = $email;
            } else {
                echo " error on iframe\n";
            }
        } else {
            echo "Error on" . $emailUrl . "\n";
        }
        $i++;
    }

    // Stage 4: for the remaining entries, look for a website link.
    $i = 1;
    $websites = [];
    $notWebsite = [];
    echo "\nAttempt to get other email (total:" . count($notEmail) . ")\n";
    foreach ($notEmail as $item) {
        echo "{$i}. ";
        $entryPage = FakeBrowser::get($item);
        $websiteUrl = RegExp::getFirstMatch('/redirector.php\\?yelref=[^"](.+?)"/', $entryPage);
        if ($websiteUrl !== null) {
            // The pattern's [^"] consumes the first character after
            // "yelref=" outside the capture group, so it is re-added here
            // (presumably the "h" of "http" - TODO confirm).
            $websiteUrl = "h" . $websiteUrl;
            echo "found webSite: {$websiteUrl}\n";
            $websites[] = $websiteUrl;
        } else {
            $notWebsite[] = $item;
            echo "{$item} does not have a website apparently\n";
        }
        $i++;
    }
    $this->noWebSiteOnYP = $notWebsite;

    // Stage 5: search the websites themselves for a mailto: link.
    echo "\nChecking in the company websites (total:" . count($websites) . ")\n";
    $i = 1;
    $notMailOnIndex = [];
    foreach ($websites as $website) {
        echo "{$i}. ";
        $contactPage = $this->searchContactPage($website);
        if (!empty($contactPage)) {
            echo " contact page not Empty for {$website}, searching for email\n";
            $email = RegExp::getFirstMatch('/mailto:(.+?)"/', $contactPage);
            if (!empty($email)) {
                echo "\t found {$email}\n";
                $this->resultMails[] = $email;
            } else {
                echo "\t no mailto found\n";
                echo "\t trying from homePage\n";
                $contactPage = FakeBrowser::get($website);
                $email = RegExp::getFirstMatch('/mailto:(.+?)"/', $contactPage);
                if (!empty($email)) {
                    echo "\t found {$email}\n";
                    $this->resultMails[] = $email;
                } else {
                    echo "\t no mailto found\n";
                    $notMailOnIndex[] = $website;
                }
            }
        } else {
            // No contact page content: the home page is not tried here.
            $notMailOnIndex[] = $website;
        }
        $i++;
    }
    $this->websiteNoEmail = $notMailOnIndex;
    // TODO: crawl the sites in $notMailOnIndex from scratch.
}