/**
 * Builds link-preview data for the first URL matched in $text.
 *
 * When the matched URL is not itself an image, the page is fetched through
 * $this->getPage() and its title, description, images and video embed are
 * extracted with the Content helpers and $this->getMedia().
 *
 * @param string $text          Text that is searched with Regex::$URL_REGEX.
 * @param int    $imageQuantity Passed through to Content::getImages().
 * @param mixed  $header        Passed to Json::jsonSafe(); replaced by the
 *                              fetched page's "header" entry when a page is fetched.
 *
 * @return mixed The value returned by Json::jsonSafe() for the preview data,
 *               or null when $text contains no URL match.
 */
function crawl($text, $imageQuantity, $header)
{
    if (!preg_match(Regex::$URL_REGEX, $text, $match)) {
        return null;
    }

    $title = "";
    $description = "";
    $videoIframe = "";
    $video = false;

    // A match that begins with a space has that space replaced by "http://".
    if (strpos($match[0], " ") === 0) {
        $match[0] = "http://" . substr($match[0], 1);
    }

    $finalUrl = $match[0];
    $pageUrl = $finalUrl;

    if (Content::isImage($pageUrl)) {
        $images = [$pageUrl];
    } else {
        $urlData = $this->getPage($pageUrl);

        // No content and no "www." in the URL: retry once with "www." inserted.
        if (!$urlData["content"] && strpos($pageUrl, "//www.") === false) {
            if (strpos($pageUrl, "http://") !== false) {
                $pageUrl = str_replace("http://", "http://www.", $pageUrl);
            } elseif (strpos($pageUrl, "https://") !== false) {
                $pageUrl = str_replace("https://", "https://www.", $pageUrl);
            }
            $urlData = $this->getPage($pageUrl);
        }

        $pageUrl = $finalUrl = $urlData["url"];
        $raw = $urlData["content"];
        $header = $urlData["header"];

        $metaTags = Content::getMetaTags($raw);

        // Title: meta tag first, then the title regex over the raw page.
        $tempTitle = Content::extendedTrim($metaTags["title"]);
        if ($tempTitle != "") {
            $title = $tempTitle;
        }
        if ($title == "" && preg_match(Regex::$TITLE_REGEX, str_replace("\n", " ", $raw), $matching)) {
            $title = $matching[2];
        }

        // Description: meta tag first, then Content::crawlCode() over the raw page.
        $tempDescription = Content::extendedTrim($metaTags["description"]);
        $description = $tempDescription != "" ? $tempDescription : Content::crawlCode($raw);

        $descriptionUnderstood = ($description != "");

        // NOTE(review): the first clause needs $description to be both empty
        // ($descriptionUnderstood == false) and non-empty, so in practice only
        // the "$title == $description" test can trigger. Kept as-is.
        if (($descriptionUnderstood == false
                && strlen($title) > strlen($description)
                && !preg_match(Regex::$URL_REGEX, $description)
                && $description != ""
                && !preg_match('/[A-Z]/', $description))
            || $title == $description
        ) {
            $title = $description;
            $description = Content::crawlCode($raw);
        }

        if (Content::isJson($title)) {
            $title = "";
        }
        if (Content::isJson($description)) {
            $description = "";
        }

        $media = $this->getMedia($pageUrl);

        // empty() instead of count(): the code below already allows for a null
        // $media, and count(null) throws a TypeError on PHP 8.
        if (empty($media)) {
            $images = [Content::extendedTrim($metaTags["image"])];
        } else {
            $images = [$media[0]];
        }

        // Only read the iframe slot when it exists; an empty media result used
        // to raise an undefined-offset notice here.
        $videoIframe = isset($media[1]) ? $media[1] : "";

        if (count($images) == 0 || $images[0] === "") {
            $images = Content::getImages($raw, $pageUrl, $imageQuantity);
        }

        if (!empty($media) && isset($media[0], $media[1]) && $media[0] != "" && $media[1] != "") {
            $video = true;
        }

        $title = Content::extendedTrim($title);
        $pageUrl = Content::extendedTrim($pageUrl);
        $description = Content::extendedTrim($description);
        $description = preg_replace(Regex::$SCRIPT_REGEX, "", $description);
    }

    // "url" is everything before the first "&" of the final URL.
    $finalLink = explode("&", $finalUrl)[0];
    $description = strip_tags($description);
    $videoIframe = $videoIframe == null ? "" : $videoIframe;

    // Content::getImages() may yield nothing; fall back to null rather than
    // reading a missing offset.
    $firstImage = isset($images[0]) ? $images[0] : null;

    // Builds the response payload for a given title/description pair.
    $buildAnswer = function ($title, $description) use ($finalLink, $finalUrl, $pageUrl, $firstImage, $images, $video, $videoIframe) {
        return [
            "title" => $title,
            "url" => $finalLink,
            "pageUrl" => $finalUrl,
            "canonicalUrl" => Url::canonicalPage($pageUrl),
            "description" => $description,
            "image" => $firstImage,
            "images" => $images,
            "video" => $video,
            "videoIframe" => $videoIframe,
        ];
    };

    $answer = $buildAnswer($title, $description);
    $resultJson = Json::jsonSafe($answer, $header);
    $decoded = json_decode($resultJson);

    // If title or description is missing after a decode round-trip, re-encode
    // that field with utf8_encode() and serialise again.
    // NOTE(review): utf8_encode() is deprecated as of PHP 8.2.
    $flagged = false;
    if (!isset($decoded->title)) {
        $title = utf8_encode($title);
        $flagged = true;
    }
    if (!isset($decoded->description)) {
        $description = utf8_encode($description);
        $flagged = true;
    }

    if ($flagged) {
        $answer = $buildAnswer($title, $description);
        return Json::jsonSafe($answer, $header);
    }

    return $resultJson;
}
/**
 * Builds link-preview data for $text when it matches Regex::$urlRegex.
 *
 * The whole of $text (not just the matched substring) is used as the URL.
 * When that URL is not itself an image, the page is fetched through
 * $this->getPage() and its title, description, images and video embed are
 * extracted with the Content helpers and getMedia().
 *
 * @param string $text          Text tested against Regex::$urlRegex and then used as the URL.
 * @param int    $imageQuantity Passed through to Content::getImages().
 * @param mixed  $header        Passed to Json::jsonSafe(); replaced by the
 *                              fetched page's "header" entry when a page is fetched.
 *
 * @return mixed The value returned by Json::jsonSafe() for the preview data,
 *               or null when $text does not match.
 */
function crawl($text, $imageQuantity, $header)
{
    if (!preg_match(Regex::$urlRegex, $text)) {
        return null;
    }

    $title = "";
    $description = "";
    $videoIframe = "";
    $video = "no";
    $headers = [];
    $images = [];

    $finalUrl = $text;
    // The page is requested over plain http even when the input was https.
    $pageUrl = str_replace("https://", "http://", $finalUrl);

    if (Content::isImage($pageUrl)) {
        $images[] = $pageUrl;
    } else {
        $urlData = $this->getPage($pageUrl);

        // No content and no "www." in the URL: retry once with "www." inserted.
        if (!$urlData["content"] && strpos($pageUrl, "//www.") === false) {
            if (strpos($pageUrl, "http://") !== false) {
                $pageUrl = str_replace("http://", "http://www.", $pageUrl);
            } elseif (strpos($pageUrl, "https://") !== false) {
                $pageUrl = str_replace("https://", "https://www.", $pageUrl);
            }
            $urlData = $this->getPage($pageUrl);
        }

        $pageUrl = $finalUrl = $urlData["url"];
        $raw = $urlData["content"];
        $header = $urlData["header"];
        $headers = isset($urlData["headers"]) ? $urlData["headers"] : [];

        $metaTags = Content::getMetaTags($raw);

        // Title: meta tag first, then the title regex over the raw page.
        $tempTitle = Content::extendedTrim($metaTags["title"]);
        if ($tempTitle != "") {
            $title = $tempTitle;
        }
        if ($title == "" && preg_match(Regex::$titleRegex, str_replace("\n", " ", $raw), $matching)) {
            $title = $matching[2];
        }

        // Description: meta tag first, then Content::crawlCode() over the raw page.
        $tempDescription = Content::extendedTrim($metaTags["description"]);
        $description = $tempDescription != "" ? $tempDescription : Content::crawlCode($raw);

        $descriptionUnderstood = ($description != "");

        // NOTE(review): the first clause needs $description to be both empty
        // ($descriptionUnderstood == false) and non-empty, so in practice only
        // the "$title == $description" test can trigger. Kept as-is.
        if (($descriptionUnderstood == false
                && strlen($title) > strlen($description)
                && !preg_match(Regex::$urlRegex, $description)
                && $description != ""
                && !preg_match('/[A-Z]/', $description))
            || $title == $description
        ) {
            $title = $description;
            $description = Content::crawlCode($raw);
        }

        if (Content::isJson($title)) {
            $title = "";
        }
        if (Content::isJson($description)) {
            $description = "";
        }

        $media = self::getMedia($pageUrl);

        // empty() instead of count(): the code below already allows for a null
        // $media, and count(null) throws a TypeError on PHP 8.
        if (empty($media)) {
            // Tolerate a page whose meta tags carry no "images" list.
            $metaImages = (isset($metaTags['images']) && is_array($metaTags['images']))
                ? $metaTags['images']
                : [];
            foreach ($metaImages as $metaImage) {
                // Image URLs not matching Regex::$httpRegex are resolved against the page URL.
                $images[] = preg_match(Regex::$httpRegex, $metaImage)
                    ? $metaImage
                    : Url::canonicalLink(Content::extendedTrim($metaImage), $pageUrl);
            }
        } else {
            $images[] = $media[0];
            // Keep the default "" when the media result has no iframe slot.
            $videoIframe = array_key_exists(1, $media) ? $media[1] : "";
        }

        $images = array_merge($images, Content::getImages($raw, $pageUrl, $imageQuantity));
        // Filter out duplicate image URLs, keeping first-seen order.
        $images = array_keys(array_flip($images));

        if (!empty($media) && isset($media[0], $media[1]) && $media[0] != "" && $media[1] != "") {
            $video = "yes";
        }

        $title = Content::extendedTrim($title);
        $pageUrl = Content::extendedTrim($pageUrl);
        $description = Content::extendedTrim($description);
        $description = preg_replace(Regex::$scriptRegex, "", $description);
    }

    // "url" is everything before the first "&" of the final URL.
    $finalLink = explode("&", $finalUrl)[0];
    $description = strip_tags($description);

    // NOTE(review): only an exact "SAMEORIGIN" value clears canBrand; other
    // X-Frame-Options values (e.g. DENY) or different casing do not. Confirm
    // whether that is intended.
    $canBrand = !(isset($headers["x-frame-options"]) && $headers["x-frame-options"] == "SAMEORIGIN");

    // Builds the response payload for a given title/description pair.
    $buildAnswer = function ($title, $description) use ($finalLink, $finalUrl, $pageUrl, $images, $video, $videoIframe, $canBrand) {
        return [
            "title" => $title,
            "url" => $finalLink,
            "pageUrl" => $finalUrl,
            "canonicalUrl" => Url::canonicalPage($pageUrl),
            "description" => $description,
            "images" => $images,
            "video" => $video,
            "videoIframe" => $videoIframe,
            "canBrand" => $canBrand,
        ];
    };

    $answer = $buildAnswer($title, $description);
    $resultJson = Json::jsonSafe($answer, $header);
    $decoded = json_decode($resultJson);

    // If title or description is missing after a decode round-trip, re-encode
    // that field with utf8_encode() and serialise again.
    // NOTE(review): utf8_encode() is deprecated as of PHP 8.2.
    $flagged = false;
    if (!isset($decoded->title)) {
        $title = utf8_encode($title);
        $flagged = true;
    }
    if (!isset($decoded->description)) {
        $description = utf8_encode($description);
        $flagged = true;
    }

    if ($flagged) {
        $answer = $buildAnswer($title, $description);
        return Json::jsonSafe($answer, $header);
    }

    return $resultJson;
}
<?php

/**
 * Copyright (c) 2014 Leonardo Cardoso
 * Dual licensed under the MIT and GPL licenses.
 *
 * Version: 1.3.0
 */

include_once "classes/Database.php";
include_once "classes/SetUp.php";
include_once "classes/Json.php";

// Runs SetUp::headers() before any output is echoed.
SetUp::headers();

// Echo whatever Database::select() returns, serialised by Json::jsonSafe()
// with an empty header value.
$header = "";
$records = Database::select();

echo Json::jsonSafe($records, $header);
/**
 * Builds link-preview data for the first URL matched in $text and returns it
 * as a plain array (no JSON encoding is done here).
 *
 * When the matched URL is not itself an image, the page is fetched through
 * $this->getPage() and its title, description, keywords, images and video
 * embed are extracted with the Content helpers and $this->getMedia().
 *
 * @param string $text          Text that is searched with Regex::$urlRegex.
 * @param int    $imageQuantity Passed through to Content::getImages().
 * @param mixed  $header        Unused; kept so existing callers keep working.
 *
 * @return array|null Array with the keys title, url, keywords, pageUrl,
 *                    canonicalUrl, description, images, video and videoIframe,
 *                    or null when $text contains no URL match. "images" is a
 *                    single URL string for an image URL, a meta image or a
 *                    media thumbnail; otherwise it is whatever
 *                    Content::getImages() returns.
 */
function crawl($text, $imageQuantity, $header)
{
    if (!preg_match(Regex::$urlRegex, $text, $match)) {
        return null;
    }

    $title = "";
    $description = "";
    $videoIframe = "";
    $video = 0;
    // Initialised up-front: the image-URL path never sets it, yet it is always
    // part of the returned array.
    $keywords = "";

    // A match that begins with a space has that space replaced by "http://".
    if (strpos($match[0], " ") === 0) {
        $match[0] = "http://" . substr($match[0], 1);
    }

    $finalUrl = $match[0];
    $wasHttps = strpos($finalUrl, "https://") !== false;

    // The first request always goes out over plain http.
    $pageUrl = str_replace("https://", "http://", $finalUrl);

    if (Content::isImage($pageUrl)) {
        $images = $pageUrl;
    } else {
        $urlData = $this->getPage($pageUrl);

        // A "301 Moved Permanently" body triggers a second fetch; when the
        // input was https, that fetch uses the https form of the URL.
        // NOTE(review): when the input was not https this requests the same
        // URL again. Behaviour kept as-is.
        // The cast avoids passing null/false to strpos(); the next check
        // already expects "content" may be falsy.
        if (strpos((string) $urlData["content"], "301 Moved Permanently") !== false) {
            if ($wasHttps) {
                $pageUrl = str_replace("http://", "https://", $finalUrl);
            }
            $urlData = $this->getPage($pageUrl);
        }

        // No content and no "www." in the URL: retry once with "www." inserted.
        if (!$urlData["content"] && strpos($pageUrl, "//www.") === false) {
            if (strpos($pageUrl, "http://") !== false) {
                $pageUrl = str_replace("http://", "http://www.", $pageUrl);
            } elseif (strpos($pageUrl, "https://") !== false) {
                $pageUrl = str_replace("https://", "https://www.", $pageUrl);
            }
            $urlData = $this->getPage($pageUrl);
        }

        $urlData = Content::stripIrrelevantTags($urlData);
        $pageUrl = $finalUrl = $urlData["url"];
        $raw = $urlData["content"];

        $metaTags = Content::getMetaTags($raw);

        if (isset($metaTags["keywords"])) {
            $keywords = $metaTags["keywords"];
        }

        // Title: meta tag first, then the title regex over the raw page.
        $tempTitle = Content::extendedTrim($metaTags["title"]);
        if ($tempTitle != "") {
            $title = $tempTitle;
        }
        if ($title == "" && preg_match(Regex::$titleRegex, str_replace("\n", " ", $raw), $matching)) {
            $title = $matching[2];
        }

        // Description: meta tag first, then Content::crawlCode() over the raw page.
        $tempDescription = Content::extendedTrim($metaTags["description"]);
        $description = $tempDescription != "" ? $tempDescription : Content::crawlCode($raw);

        $descriptionUnderstood = ($description != "");

        // NOTE(review): the first clause needs $description to be both empty
        // ($descriptionUnderstood == false) and non-empty, so in practice only
        // the "$title == $description" test can trigger. Kept as-is.
        if (($descriptionUnderstood == false
                && strlen($title) > strlen($description)
                && !preg_match(Regex::$urlRegex, $description)
                && $description != ""
                && !preg_match('/[A-Z]/', $description))
            || $title == $description
        ) {
            $title = $description;
            $description = Content::crawlCode($raw);
        }

        $media = $this->getMedia($pageUrl);

        // empty() instead of count(): the code below already allows for a null
        // $media, and count(null) throws a TypeError on PHP 8.
        $images = empty($media) ? Content::extendedTrim($metaTags["image"]) : $media[0];

        // Keep the default "" when the media result has no such slot; this
        // used to read a missing offset.
        // NOTE(review): the iframe is read from index 2 while the video check
        // below tests index 1. Confirm against getMedia()'s return shape.
        if (is_array($media) && array_key_exists(2, $media)) {
            $videoIframe = $media[2];
        }

        if ($images == "") {
            $images = Content::getImages($raw, $pageUrl, $imageQuantity);
        }

        if (!empty($media) && isset($media[0], $media[1]) && $media[0] != "" && $media[1] != "") {
            $video = 1;
        }

        $title = Content::extendedTrim($title);
        $pageUrl = Content::extendedTrim($pageUrl);
        $description = Content::extendedTrim($description);
        $description = preg_replace(Regex::$scriptRegex, "", $description);
    }

    // "url" is everything before the first "&" of the final URL.
    $finalLink = explode("&", $finalUrl)[0];
    $description = strip_tags($description);

    // The raw array is returned directly. The JSON-encoding code that used to
    // follow this return was unreachable and has been removed.
    return [
        "title" => $title,
        "url" => $finalLink,
        "keywords" => $keywords,
        "pageUrl" => $finalUrl,
        "canonicalUrl" => Url::canonicalPage($pageUrl),
        "description" => $description,
        "images" => $images,
        "video" => $video,
        "videoIframe" => $videoIframe,
    ];
}