示例#1
0
 /**
  * Handles one document received by the crawler: stores new pages and refreshes share counts of known articles.
  *
  * Every time the crawler has found and received a document, this method is called with all information
  * about that page or file in a PHPCrawlerDocumentInfo object. Please see the {@link PHPCrawlerDocumentInfo}
  * documentation for a list of all properties describing the document.
  *
  * What this override does:
  *  - Content that cannot be parsed into a DOM is logged as a faulty URL and skipped.
  *  - A URL with no row in the `results` table is stored as a new record (title, plain-text body, description,
  *    host, crawl time). For the known portals a portal-specific rule extracts the publication date and flags
  *    the record as an article; articles additionally get their social counters from CountShares().
  *    When a portal rule finds its date element but cannot make sense of the text, the record is still
  *    flagged as an article and the date is left unset instead of storing a malformed value.
  *  - A URL that already has a row is not inserted again; if that row is an article dated within 30 days of
  *    now (or has no date), only its share counters are updated.
  *
  * Progress markers are printed and flushed as a side effect.
  *
  * @param PHPCrawlerDocumentInfo $PageInfo A PHPCrawlerDocumentInfo object containing all information about the currently received document.
  *                                         This method reads its url, content, host and meta_attributes properties.
  * @return void                            Nothing is returned. (Per the crawler's contract, the crawling process would stop
  *                                         immediately if this method returned any negative value.)
  */
 public function handleDocumentInfo(PHPCrawlerDocumentInfo $PageInfo)
 {
     $pagehtml = str_get_html($PageInfo->content);
     $previousURL = DB::select('select * from results where url = ?', [$PageInfo->url]);

     if (!$pagehtml) {
         // No usable DOM could be built from the content; record the URL and move on.
         Log::info('FAULTY URL:' . $PageInfo->url);
     } elseif (empty($previousURL)) {
         $body = $pagehtml->find('body', 0);
         if ($body != NULL) {
             print $PageInfo->url;
             print ' ||| ';

             $result = new results();
             $title = $pagehtml->find('title', 0);
             $result->page_title = ($title != NULL) ? $title->innertext : "Untitled";
             $result->url = $PageInfo->url;
             $result->content = $body->plaintext;
             if (array_key_exists('description', $PageInfo->meta_attributes)) {
                 $result->description = $PageInfo->meta_attributes['description'];
             } else {
                 // NOTE(review): substr() counts bytes, so a multi-byte character may be cut in half at
                 // the 255 boundary - confirm the page encoding before switching to a multi-byte cut.
                 $result->description = substr($result->content, 0, 255);
             }
             $result->portal = $PageInfo->host;
             // "H" is the 24-hour clock; the 12-hour "h" without a meridiem made afternoon times ambiguous.
             $result->last_update = date("Y/m/d H:i:s");

             // Month-name lookup tables (keys are lower-case).
             $months_trans = array('januar' => '01', 'februar' => '02', 'mart' => '03', 'april' => '04', 'maj' => '05', 'juni' => '06', 'juli' => '07', 'august' => '08', 'septembar' => '09', 'oktobar' => '10', 'novembar' => '11', 'decembar' => '12');
             $months_trans_CR = array('sijecnja' => '01', 'veljace' => '02', 'ozujka' => '03', 'travnja' => '04', 'svibnja' => '05', 'lipnja' => '06', 'srpnja' => '07', 'kolovoza' => '08', 'rujna' => '09', 'listopada' => '10', 'studenog' => '11', 'prosinca' => '12');
             // 'oct' is accepted in addition to the original 'okt' spelling.
             $months_trans_EN = array('jan' => '01', 'feb' => '02', 'mar' => '03', 'apr' => '04', 'may' => '05', 'jun' => '06', 'jul' => '07', 'aug' => '08', 'sep' => '09', 'okt' => '10', 'oct' => '10', 'nov' => '11', 'dec' => '12');

             // Builds "year-month-day" from a month NAME; yields null when the name is not in the table,
             // so callers never store a date with an empty month.
             $dateFromMonthName = function ($year, $monthName, $day, array $monthTable) {
                 $key = strtolower((string) $monthName);
                 if (!isset($monthTable[$key])) {
                     return null;
                 }
                 return $year . '-' . $monthTable[$key] . '-' . $day;
             };

             // Compare hosts case-insensitively and without a leading "www." so both spellings of a
             // portal share one rule. The stored portal value itself is left exactly as received.
             $host = preg_replace('/^www\\./', '', strtolower((string) $result->portal));
             // Host spellings the previous version matched explicitly; kept so they still resolve.
             $hostAliases = array(
                 'biscani.ba' => 'biscani.net',
                 'cazin.ba' => 'cazin.net',
                 'glassrpske.ba' => 'glassrpske.com',
                 'n1info.ba' => 'n1info.com',
                 'ba.n1info.com' => 'n1info.com',
                 'bh-index.ba' => 'bh-index.com',
             );
             if (isset($hostAliases[$host])) {
                 $host = $hostAliases[$host];
             }

             // Portals whose date element, reduced to digits, starts with DDMMYYYY: host => selector.
             $digitDateSelectors = array(
                 'avaz.ba' => 'div[class=box article] div[class=misc] span',
                 'abc.ba' => 'div[class=all datum]',
                 'cazin.net' => 'p[class=postmeta] span time',
                 'glassrpske.com' => 'div[class=vijest] cite',
                 'oslobodjenje.ba' => 'div[id=noviarticle] p[class=author]',
                 'vijesti.ba' => 'div[class=articleView article] h2',
                 'novi.ba' => 'div[class=article-view] header div div[class=misc] div[class=info] div[class=date] span',
             );
             // Portals whose date is the first 10 characters of an attribute: host => [selector, attribute].
             $attributeDateSelectors = array(
                 'balkans.aljazeera.net' => array('meta[property=article:published_time]', 'content'),
                 'fokus.ba' => array('meta[property=article:modified_time]', 'content'),
                 'faktor.ba' => array('meta[property=article:modified_time]', 'content'),
                 'haber.ba' => array('meta[property=article:modified_time]', 'content'),
                 'bh-index.com' => array('meta[property=article:modified_time]', 'content'),
                 'nezavisne.com' => array('time[class=dateline text-muted]', 'datetime'),
                 'n1info.com' => array('article time', 'datetime'),
                 'krajina.ba' => array('article div header div[class=td-post-date] time', 'datetime'),
                 'sportsport.ba' => array('div[id=news_page] article time', 'datetime'),
             );

             // This section grabs the date of the article based on the portal's markup.
             //---------------------------------
             if (isset($digitDateSelectors[$host])) {
                 $container = $pagehtml->find($digitDateSelectors[$host], 0);
                 if ($container != NULL) {
                     $digits = preg_replace('/[^0-9]/', "", $container->plaintext);
                     $result->date = substr($digits, 4, 4) . '-' . substr($digits, 2, 2) . '-' . substr($digits, 0, 2);
                     $result->isArticle = true;
                 }
             } elseif (isset($attributeDateSelectors[$host])) {
                 list($selector, $attribute) = $attributeDateSelectors[$host];
                 $container = $pagehtml->find($selector, 0);
                 if ($container != NULL) {
                     $result->date = substr($container->$attribute, 0, 10);
                     $result->isArticle = true;
                 }
             } else {
                 switch ($host) {
                     case 'klix.ba':
                         // The date is read from fixed positions near the end of the twitter:url meta
                         // value: two-digit year, month and day followed by three more characters.
                         if (array_key_exists('twitter:url', $PageInfo->meta_attributes)) {
                             $twitterUrl = $PageInfo->meta_attributes['twitter:url'];
                             $year = '20' . substr($twitterUrl, -9, -7);
                             $month = substr($twitterUrl, -7, -5);
                             $day = substr($twitterUrl, -5, -3);
                             // checkdate() only accepts integers, so make sure all three parts are digits
                             // before calling it; other URLs are simply not treated as articles.
                             if (preg_match('/^\\d{8}\\z/', $year . $month . $day) && checkdate((int) $month, (int) $day, (int) $year)) {
                                 $result->date = $year . '-' . $month . '-' . $day;
                                 $result->isArticle = true;
                             }
                         }
                         break;

                     case 'radiosarajevo.ba':
                         $container = $pagehtml->find('div[class=news_date]', 0);
                         if ($container != NULL) {
                             $text = trim($container->plaintext);
                             $dayyear = preg_replace("/[^0-9]/", "", $text);
                             // Letters only, with the last letter dropped before the table lookup.
                             $month = substr(preg_replace("/[^A-Za-z]/", "", $text), 0, -1);
                             $date = $dateFromMonthName(substr($dayyear, 2, 4), $month, substr($dayyear, 0, 2), $months_trans);
                             if ($date !== null) {
                                 $result->date = $date;
                             }
                             $result->isArticle = true;
                         }
                         break;

                     case 'bljesak.info':
                         $container = $pagehtml->find('div[class=info] div', 0);
                         if ($container != NULL) {
                             // Fold the diacritics used in the month names so they match the table keys;
                             // the intermediate values are printed as progress output.
                             $text = preg_replace('/(Č|č)/', 'c', $container->plaintext);
                             print $text . '||';
                             $text = preg_replace('/(Ž|ž)/', 'z', $text);
                             print $text . '||';
                             $text = preg_replace('/\\w+: \\w+, /', "", $text);
                             print $text . '||';
                             $dayyear = preg_replace('/[^0-9]/', "", $text);
                             $month = preg_replace("/[0-9\\s\\.:]/", "", $text);
                             $date = $dateFromMonthName(substr($dayyear, 2, 4), $month, substr($dayyear, 0, 2), $months_trans_CR);
                             if ($date !== null) {
                                 $result->date = $date;
                             }
                             $result->isArticle = true;
                         }
                         break;

                     case 'ekskluziva.ba':
                         $container = $pagehtml->find('div[class=boxNaslov clanaknaslov] div[class=datum]', 0);
                         if ($container != NULL) {
                             // Fixed-position text: day at 0, month at 3, two-digit year at 6.
                             $text = trim($container->plaintext);
                             $result->date = '20' . substr($text, 6, 2) . '-' . substr($text, 3, 2) . '-' . substr($text, 0, 2);
                             $result->isArticle = true;
                         }
                         break;

                     case 'vecernji.ba':
                         $container = $pagehtml->find('header[class=detail_head cf] div[class=top cf] aside[class=meta] p', 0);
                         if ($container != NULL) {
                             $stamp = preg_replace('/[^0-9\\.]/', "", $container->plaintext);
                             // Day and month may each have one or two digits. Matching D.M.YYYY directly
                             // covers every combination; the former fixed-offset slicing mis-read the
                             // two-digit-day/two-digit-month case because the dots shift the offsets.
                             if (preg_match('/(\\d{1,2})\\.(\\d{1,2})\\.(\\d{4})/', $stamp, $parts)) {
                                 $result->date = sprintf('%04d-%02d-%02d', (int) $parts[3], (int) $parts[2], (int) $parts[1]);
                             }
                             $result->isArticle = true;
                         }
                         break;

                     case 'biscani.net':
                         $container = $pagehtml->find('div[id=post-info22] a', 0);
                         if ($container != NULL) {
                             // TODO: date extraction for this portal is not implemented yet; the record
                             // is flagged as an article without a date.
                             $result->isArticle = true;
                         }
                         break;

                     case 'hayat.ba':
                         $container = $pagehtml->find('div[id=content] div[class=date]', 0);
                         if ($container != NULL) {
                             // The last four digits are dropped; what remains is used as the day.
                             $day = substr(preg_replace('/[^0-9]/', "", $container->plaintext), 0, -4);
                             if (strlen($day) == 1) {
                                 $day = '0' . $day;
                             }
                             // No year is read from the page; the current year is used.
                             $year = date('Y');
                             $month = preg_replace('/[^A-Za-z]/', "", $container->plaintext);
                             $date = $dateFromMonthName($year, $month, $day, $months_trans_EN);
                             if ($date !== null) {
                                 $result->date = $date;
                             }
                             $result->isArticle = true;
                         }
                         break;

                     case 'source.ba':
                         $container = $pagehtml->find('div[class=vrijemeObjaveClanka]', 0);
                         if ($container != NULL) {
                             $stamp = preg_replace('/[^0-9\\.]/', "", $container->plaintext);
                             $stamp = substr($stamp, 0, -4);
                             $year = substr($stamp, -4);
                             $day = preg_replace('/\\..*/', "", $stamp);
                             if (strlen($day) == 1) {
                                 $day = '0' . $day;
                             }
                             // The month is whatever sits between the first and the last dot.
                             $month = '';
                             if (preg_match('/\\..*\\./', $stamp, $between)) {
                                 $month = preg_replace('/[^0-9]/', "", $between[0]);
                             }
                             if (strlen($month) == 1) {
                                 $month = '0' . $month;
                             }
                             if ($month !== '') {
                                 $result->date = $year . '-' . $month . '-' . $day;
                             }
                             $result->isArticle = true;
                         }
                         break;

                     case 'depo.ba':
                         $container = $pagehtml->find('p[class=writtenBy]', 0);
                         if ($container != NULL) {
                             // Digits are read as DDMMYY.
                             $digits = preg_replace('/[^0-9]/', "", $container->plaintext);
                             $result->date = '20' . substr($digits, 4, 2) . '-' . substr($digits, 2, 2) . '-' . substr($digits, 0, 2);
                             $result->isArticle = true;
                         }
                         break;
                 }
             }

             if ($result->isArticle) {
                 $socialshares = $this->CountShares($PageInfo->url);
                 $result->fb_likes = $socialshares['fb_likes'];
                 $result->fb_shares = $socialshares['fb_shares'];
                 $result->fb_comments = $socialshares['fb_comments'];
                 $result->gp_shares = $socialshares['gp_shares'];
                 $result->total_shares = $socialshares['fb_likes'] + $socialshares['fb_shares'] + $socialshares['fb_comments'] + $socialshares['gp_shares'];
             }
             // NOTE(review): "@" silences warnings/notices only; an exception thrown by save() still
             // propagates. Kept as-is so that saving stays best-effort with respect to warnings.
             @$result->save();
         }
     } else {
         $existing = $previousURL[0];
         if ($existing->isArticle) {
             $now = new DateTime("now");
             // Articles stored without a date are treated as "published now" (age 0), which is what
             // passing null used to produce; the explicit string avoids handing null to DateTime.
             $published = new DateTime(isset($existing->date) ? $existing->date : "now");
             if (date_diff($published, $now)->days <= 30) {
                 $socialshares = $this->CountShares($PageInfo->url);
                 $total_shares = $socialshares['fb_likes'] + $socialshares['fb_shares'] + $socialshares['fb_comments'] + $socialshares['gp_shares'];
                 DB::table('results')->where("url", $PageInfo->url)->update(['fb_likes' => $socialshares['fb_likes'], 'fb_shares' => $socialshares['fb_shares'], 'fb_comments' => $socialshares['fb_comments'], 'gp_shares' => $socialshares['gp_shares'], 'total_shares' => $total_shares]);
             }
         }
         print 'Duplicate Record ||| ';
     }

     if ($pagehtml) {
         // Release the parsed DOM explicitly; the crawler calls this method once per document and the
         // parser's documentation recommends clear() to avoid memory build-up.
         $pagehtml->clear();
     }
     flush();
 }
示例#2
0
 /**
  * Searches the stored results for a keyword within the given portals, ordered by total shares (descending).
  *
  * The keyword is matched anywhere inside the `content` column (SQL LIKE with a wildcard on both sides).
  * Only when BOTH date bounds are supplied is the search narrowed to rows whose date lies between them or
  * is NULL; slashes in the bounds are replaced by dashes first. The per-counter sums over all matches are
  * handed to update_social_count() before the matches are returned.
  *
  * @param string $keyword  Text to look for in the page content.
  * @param mixed  $datefrom Lower date bound, or NULL to skip date filtering.
  * @param mixed  $dateto   Upper date bound, or NULL to skip date filtering.
  * @param mixed  $portal   Portal hosts to search, passed straight to whereIn() - presumably an array; confirm with callers.
  * @return mixed           The matches as returned by get(), limited to the url, portal, page_title, date and share-counter columns.
  */
 public function searchportals($keyword, $datefrom, $dateto, $portal)
 {
     $columns = ['url', 'portal', 'page_title', 'date', 'fb_likes', 'fb_shares', 'fb_comments', 'gp_shares', 'total_shares'];

     $matches = results::where('content', 'LIKE', '%' . $keyword . '%');

     // The date restriction applies only when both bounds are present.
     if ($datefrom != NULL && $dateto != NULL) {
         $from = preg_replace('#\\/#', '-', $datefrom);
         $to = preg_replace('#\\/#', '-', $dateto);
         $matches = $matches->where(function ($dated) use ($from, $to) {
             $dated->whereBetween('date', [$from, $to])->orWhere('date', '=', NULL);
         });
     }

     $search_items = $matches->whereIn('portal', $portal)->orderBy('total_shares', 'desc')->get($columns);

     // Total up every social counter across the matches.
     $social_stats = array();
     foreach (array('fb_likes', 'fb_shares', 'fb_comments', 'gp_shares', 'total_shares') as $counter) {
         $social_stats[$counter] = $search_items->sum($counter);
     }
     $this->update_social_count($social_stats);

     return $search_items;
 }