/**
 * Build a content analyzer for the given URL.
 *
 * Returns false, after recording the matching status through
 * Providers::change_url_status(), when Robots::robots_allowed() rejects the
 * URL for Config::$agent_name or when the analyzer reports no content data.
 *
 * @param string $url URL to analyze
 * @return ContentAnalyzer|false analyzer instance, or false if the URL was skipped
 */
public static function getAnalyzer($url)
{
    if (Robots::robots_allowed($url, Config::$agent_name)) {
        $analyzer = new ContentAnalyzer($url);

        if ($analyzer->getCONTENT_DATA()) {
            return $analyzer;
        }

        // The analyzer produced no content data: flag the URL accordingly.
        Providers::change_url_status($url, Providers::URLS_TYPE_ERROR_NO_DATA);

        return false;
    }

    // Rejected by the robots check: flag the URL and log the reason.
    Providers::change_url_status($url, Providers::URLS_TYPE_ROBOTS_NOT_ALLOWED);
    _w('Robots not allowed');

    return false;
}
<?php include 'libs/general.php'; CronManager::init(__FILE__, 10); try { // GET URLs $urls = Providers::get_lead_urls(); _w('Got ' . count($urls) . ' urls'); /// PROCESS URLS foreach ($urls as $url) { // Parse array to url $url_w = urls::create_url($url); _w('Getting url ' . $url_w); // Init content analyzer $ca = ContentAnalyzer::getAnalyzer($url_w); // If content ignored if (!$ca) { _w('ignored'); // Skip continue; } // _w($ca->getOgDescription()); // die; _w('Createing general CA data'); _w('populating search table'); if (Providers::create_search_item($ca)) { _w("search data inserted"); } else { _w("unable to insert search data"); } _w('setting status to indexed');
/**
 * Store a full-text search entry for an analyzed page.
 *
 * The description is taken from the "description" meta tag when it is set;
 * an array value is joined with single spaces. Without that tag the
 * description is an empty string.
 *
 * @param ContentAnalyzer $ca analyzer holding the page data
 * @return mixed whatever db::create_fulltext_item() returns
 */
public static function create_search_item(ContentAnalyzer $ca)
{
    $metaTags = $ca->getMetaTags();
    $description = '';

    if (isset($metaTags['description'])) {
        $description = $metaTags['description'];
        if (is_array($description)) {
            $description = implode(' ', $description);
        }
    }

    return db::create_fulltext_item(
        $ca->getUrlId(),
        $ca->getTitle(),
        $description,
        $ca->getPlainContent()
    );
}
/**
 * Create a full-text search entry for an analyzed page.
 *
 * The title falls back to the Open Graph title; the content falls back to
 * the Open Graph description and then to the "description" meta tag.
 * Keywords come from the "keywords" meta tag when it is non-empty, otherwise
 * from self::create_keywords_from_content().
 *
 * Nothing is stored, and false is returned, when:
 *  - no title is available;
 *  - the content has fewer than 5 words according to str_word_count()
 *    (this also covers empty content);
 *  - the content's byte length changes under utf8_decode(), i.e. it contains
 *    multi-byte UTF-8 sequences.
 *
 * @param ContentAnalyzer $ca analyzer holding the page data
 * @return mixed false when the page is skipped, otherwise the result of
 *               db::create_fulltext_item()
 */
public static function create_search_item(ContentAnalyzer $ca)
{
    $meta = $ca->getMetaTags();
    $title = $ca->getTitle();
    $url = $ca->getUrl();

    if ($title == '') {
        $title = $ca->getOgTitle();
    }
    // Without any title there is nothing worth indexing; bail out before
    // touching the content getters.
    if ($title == '') {
        return false;
    }

    $content = $ca->getPlainContent();
    if ($content == '') {
        $content = $ca->getOgDescription();
    }

    $desc = '';
    if (isset($meta['description'])) {
        $desc = is_array($meta['description'])
            ? implode(' ', $meta['description'])
            : $meta['description'];
    }

    // Generated keywords are built from the content as it stands here, i.e.
    // before the meta-description fallback applied further down.
    // NOTE(review): the generated value is discarded when the page supplies
    // its own keywords; the call is kept unconditional because its side
    // effects, if any, are not visible from here.
    $keywords = self::create_keywords_from_content($content . ' ' . $title, $url);
    if (!empty($meta['keywords'])) {
        $keywords = is_array($meta['keywords'])
            ? implode(' ', $meta['keywords'])
            : $meta['keywords'];
    }

    if ($content == '' && $desc != '') {
        $content = $desc;
    }

    // Too little text to be useful; str_word_count('') is 0, so empty
    // content is rejected here as well.
    if (str_word_count($content) < 5) {
        return false;
    }

    // Differing lengths mean the content holds multi-byte UTF-8 sequences;
    // such content is skipped.
    // NOTE(review): utf8_decode() is deprecated as of PHP 8.2 - confirm the
    // intended filter before replacing it.
    if (strlen($content) != strlen(utf8_decode($content))) {
        return false;
    }

    return db::create_fulltext_item($title, $content, $keywords, $url, $desc);
}