/**
 * Scrapes the Courrier International article index and appends up to five
 * items to $this->items.
 *
 * For each ".type-normal" element on the index page an \Item is built with:
 *  - uri:       site base URL + the href of the element's first <a>
 *  - content:   tag-stripped text of the ".article-text" node of the linked page
 *  - title:     tag-stripped first <h2> of the element
 *  - timestamp: mktime() of the fields date_parse() extracts from the first <time>
 *
 * @param array $param Not read by this method.
 */
public function collectData(array $param)
{
    $baseUrl = 'http://www.courrierinternational.com';
    $maxItems = 5;

    // A closure is used on purpose. A *named* function declared inside a method
    // is registered globally the first time the method runs, so a second call to
    // collectData() -- or any other global function called fetchArticle() --
    // would abort with a fatal "Cannot redeclare" error.
    $fetchArticle = function ($link) {
        $page = file_get_html($link);
        if (!$page) {
            // Article page could not be loaded: keep the item, with empty content.
            return '';
        }

        $bodies = $page->find('.article-text');
        if (!isset($bodies[0])) {
            return '';
        }

        return strip_tags((string) $bodies[0]);
    };

    $html = file_get_html($baseUrl . '/article');
    if (!$html) {
        $this->returnError('Error.', 500);

        // NOTE(review): returnError() presumably aborts the request; the explicit
        // return only prevents calling find() on a falsy value if it does not.
        return;
    }

    $articleCount = 0;
    foreach ($html->find('.type-normal') as $article) {
        $anchors = $article->find('a');
        if (!isset($anchors[0])) {
            // No link means there is nothing to point the item at; skip it.
            continue;
        }

        // Build the absolute URL once and reuse it for both uri and content.
        $link = $baseUrl . $anchors[0]->getAttribute('href');

        $headings = $article->find('h2');
        $times = $article->find('time');

        // The <time> node is handed to date_parse() in its string form, as before.
        // NOTE(review): confirm that string form is something date_parse() can
        // read; reading the node's text or datetime attribute may be more reliable.
        $dateTime = date_parse(isset($times[0]) ? (string) $times[0] : '');

        $item = new \Item();
        $item->uri = $link;
        $item->content = $fetchArticle($link);
        $item->title = isset($headings[0]) ? strip_tags((string) $headings[0]) : '';
        $item->timestamp = mktime(
            $dateTime['hour'],
            $dateTime['minute'],
            $dateTime['second'],
            $dateTime['month'],
            $dateTime['day'],
            $dateTime['year']
        );

        $this->items[] = $item;

        $articleCount++;
        if ($articleCount >= $maxItems) {
            break;
        }
    }
}
/**
 * Downloads the current wikitext of an English Wikipedia article and reduces
 * it to plain running text.
 *
 * The article is requested through the MediaWiki query API (prop=revisions,
 * rvprop=content). If the wikitext is a redirect and $recurse is positive, the
 * redirect target is fetched instead (with $recurse = 0, so at most one hop
 * with the default argument).
 *
 * @param string $pagename Article title; it is URL-encoded here.
 * @param int    $recurse  Redirect is followed only while this is greater than 0.
 *
 * @return array Two elements: [0] the wikitext before clean-up (or a placeholder
 *               sentence when nothing usable was retrieved), [1] the cleaned text.
 */
function fetchArticle($pagename, $recurse = 1)
{
    $url = "https://en.wikipedia.org/w/api.php?action=query&prop=revisions&format=json&rvprop=content&titles="
        . urlencode($pagename) . "&utf8=";

    $ch = curl_init();
    curl_setopt($ch, CURLOPT_URL, $url);
    curl_setopt($ch, CURLOPT_RETURNTRANSFER, TRUE);
    curl_setopt($ch, CURLOPT_USERAGENT, 'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.1.2) Gecko/20090729 Firefox/3.5.2 GTB5');
    $result = curl_exec($ch);
    curl_close($ch);

    // curl_exec() yields false on a transport error; only decode a real body.
    $json_array = is_string($result) ? json_decode($result, true) : null;

    $extract = "There was nothing here.";
    if (isset($json_array['query']['pages']) && is_array($json_array['query']['pages'])) {
        foreach ($json_array['query']['pages'] as $page) {
            // Entries without a revision list (e.g. a title that does not exist)
            // are skipped so the placeholder above is kept.
            if (!isset($page['revisions']) || !is_array($page['revisions'])) {
                continue;
            }
            foreach ($page['revisions'] as $revision) {
                $extract = isset($revision['*']) ? $revision['*'] : "You see nothing special.";
            }
        }
    }

    // If it's a redirect, follow it (but only once). The keyword is matched
    // case-insensitively and the space before the link is optional.
    if ($recurse > 0 && preg_match("/#REDIRECT\s*\[\[([^\]]+)\]\]/i", $extract, $matches)) {
        $target = fetchArticle($matches[1], 0);
        $extract = $target[0]; // raw wikitext of the target; cleaned below
    }

    if (strlen($extract) < 5) {
        $extract = "You see nothing special here.";
    }

    $original = $extract;

    // Replace station templates with just the station name
    $extract = preg_replace("/{{([^|])+ stations\|station=([^{}]+)}}/", "$2", $extract);

    // Unpack conversion templates
    $extract = preg_replace("/{{convert\|([^|]+)\|([^|]+)\|[^{}]+}}/", "$1 $2", $extract);

    // Strip wiki template markup three times: each pass removes the innermost
    // templates, so up to three nesting levels are cleared.
    for ($pass = 0; $pass < 3; $pass++) {
        $extract = preg_replace("/{{[^{}]+}}/", "", $extract);
    }

    // Strip ref tags and their contents
    $extract = preg_replace("/<ref>[^<]+<\/ref>/", "", $extract);

    $extract = cleanpunctuation($extract);
    $extract = strip_tags($extract); // strip all remaining HTML

    $extract = preg_replace("/'''?/", "", $extract); // strip bold/italic markup

    // Strip headings, deepest level first so that the shorter patterns do not
    // match inside a longer run of "=".
    $extract = preg_replace("/=====[^=]+=====/", "", $extract);
    $extract = preg_replace("/====[^=]+====/", "", $extract);
    $extract = preg_replace("/===[^=]+===/", "", $extract);
    $extract = preg_replace("/==[^=]+==/", "", $extract);

    // Cleanup
    $extract = preg_replace("/\n/", " ", $extract);
    // NOTE(review): as written this replaces a space with a space (a no-op); the
    // pattern may originally have targeted a non-breaking space -- confirm.
    $extract = preg_replace("/ /", " ", $extract);
    $extract = preg_replace("/ +/", " ", $extract);

    $extract = preg_replace("/\[\[([^|\]]+)\]\]/", "$1", $extract); // strip brackets from unpiped links
    $extract = preg_replace("/\[\[([^|]+)\|([^]]+)\]\]/", "$2", $extract); // replace piped links with second term
    $extract = preg_replace("/\[([^\]]+)\]/", "$1", $extract); // strip anything left in a bracket (probably just external links)
    $extract = preg_replace("/\/[^ ]+\//", "", $extract); // strip IPA pronunciation

    // Avoid problematic abbreviations that look like they end sentences
    // NOTE(review): the three replacements below put back exactly what they
    // matched, so they currently change nothing. The replacement strings may
    // have been meant to use an escaped period/comma -- confirm before changing,
    // since callers may depend on the present output.
    $extract = preg_replace("/(St|Dr|Mr|Mrs|Ms|Rt Hon|pp|\(b|\(ca?|No)\./", "$1.", $extract);
    $extract = preg_replace("/\b([A-Za-z])\./", "$1.", $extract);
    $extract = preg_replace("/(\d),(\d\d\d)/", "$1,$2", $extract);

    return array($original, $extract);
}