/**
 * Scrapes the Courrier international article listing and appends up to five
 * entries to $this->items.
 *
 * For each ".type-normal" element on the listing page, an \Item is created
 * with the link target as URI, the tag-stripped ".article-text" of the linked
 * page as content, the tag-stripped first <h2> as title, and a timestamp
 * built from the first <time> element.
 *
 * @param array $param Not read by this method.
 */
public function collectData(array $param)
 {
     $baseUrl = 'http://www.courrierinternational.com';
     $maxArticles = 5;

     // A closure instead of a nested named function: a named function declared
     // inside a method is registered globally the first time the method runs,
     // so a second call to collectData() would die with "Cannot redeclare".
     $fetchArticle = function ($link) {
         $page = file_get_html($link);
         if (!$page) {
             return '';
         }
         $bodies = $page->find(".article-text");
         if (empty($bodies)) {
             // Article page without the expected container: no content.
             return '';
         }
         return strip_tags($bodies[0]);
     };

     $html = file_get_html($baseUrl . '/article');
     if (!$html) {
         $this->returnError('Error.', 500);
         // NOTE(review): returnError() is not visible here; the return below
         // only matters if it hands control back instead of aborting.
         return;
     }

     $article_count = 0;
     foreach ($html->find(".type-normal") as $article) {
         if ($article_count >= $maxArticles) {
             break;
         }

         $links = $article->find("a");
         if (empty($links)) {
             // Without a link there is neither a URI nor content to fetch.
             continue;
         }
         $uri = $baseUrl . $links[0]->getAttribute("href");

         $item = new \Item();
         $item->uri = $uri;
         $item->content = $fetchArticle($uri);

         $titles = $article->find("h2");
         $item->title = empty($titles) ? '' : strip_tags($titles[0]);

         $times = $article->find("time");
         if (!empty($times)) {
             // NOTE(review): the element itself is handed to date_parse(),
             // relying on its string conversion; confirm that text is a
             // parseable date.
             $dateTime = date_parse($times[0]);
             $item->timestamp = mktime(
                 $dateTime['hour'],
                 $dateTime['minute'],
                 $dateTime['second'],
                 $dateTime['month'],
                 $dateTime['day'],
                 $dateTime['year']
             );
         }

         $this->items[] = $item;
         $article_count++;
     }
 }
Example #2
0
/**
 * Fetches the current wikitext of an English Wikipedia page and derives a
 * plain-text version of it.
 *
 * The page content is requested through the MediaWiki "revisions" query. If
 * the content is a redirect and $recurse is greater than zero, the redirect
 * target is fetched instead (one hop only). The text is then stripped of
 * templates, <ref> tags, HTML, bold/italic and heading markup, link brackets
 * and slash-delimited pronunciations, and periods/commas that do not end a
 * sentence are replaced with the "&period;" / "&comma;" placeholders.
 *
 * When the request fails, the response cannot be decoded, or the page has no
 * revisions, a short fallback sentence is used as the text.
 *
 * @param string $pagename Page title; it is URL-encoded before use.
 * @param int    $recurse  Redirects are followed only while this is > 0.
 *
 * @return array Two elements: [0] the text before cleanup, [1] the cleaned text.
 */
function fetchArticle($pagename,$recurse = 1){

	$pagename = urlencode($pagename);
	$url = "https://en.wikipedia.org/w/api.php?action=query&prop=revisions&format=json&rvprop=content&titles=".$pagename."&utf8=";

	$ch = curl_init();
	curl_setopt($ch, CURLOPT_URL, $url);
	curl_setopt($ch, CURLOPT_RETURNTRANSFER, TRUE);
	curl_setopt($ch, CURLOPT_USERAGENT, 'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.1.2) Gecko/20090729 Firefox/3.5.2 GTB5');
	$result = curl_exec($ch);
	curl_close($ch);

	// curl_exec() yields false on transport failure; only decode real bodies.
	$json_array = is_string($result) ? json_decode($result, true) : null;

	$extract = "There was nothing here.";

	if (is_array($json_array) && isset($json_array['query']['pages']) && is_array($json_array['query']['pages'])) {
		foreach ($json_array['query']['pages'] as $page) {
			// Missing or invalid titles come back without a "revisions" key.
			if (!isset($page['revisions']) || !is_array($page['revisions'])) {
				continue;
			}
			foreach ($page['revisions'] as $revision) {
				if (isset($revision['*'])) {
					$extract = $revision['*'];
				} else {
					$extract = "You see nothing special.";
				}
			}
		}
	}

	// If it's a redirect, follow it (but only once). MediaWiki treats the
	// keyword case-insensitively and does not require a space before "[[".
	if ($recurse > 0 && preg_match("/#REDIRECT\s*\[\[([^\]]+)\]\]/i", $extract, $matches)) {
		$redirected = fetchArticle($matches[1], 0);
		// Element 0 is the target's raw text; it is cleaned below like any other.
		$extract = $redirected[0];
	}

	if (strlen($extract)<5) { $extract = "You see nothing special here."; }

	$original = $extract;

	// Replace station templates with just the station name
	$extract = preg_replace("/{{([^|])+ stations\|station=([^{}]+)}}/","$2",$extract);

	// Unpack conversion templates
	$extract = preg_replace("/{{convert\|([^|]+)\|([^|]+)\|[^{}]+}}/","$1 $2",$extract);

	// Strip wiki template markup; each pass removes the innermost templates,
	// so three passes cover templates nested up to three levels deep.
	for ($pass = 0; $pass < 3; $pass++) {
		$extract = preg_replace("/{{[^{}]+}}/","",$extract);
	}

	// Strip ref tags and their contents
	$extract = preg_replace("/<ref>[^<]+<\/ref>/","",$extract);

	$extract = cleanpunctuation($extract);
	$extract = strip_tags($extract); // strip all remaining HTML
	$extract = preg_replace("/'''?/","",$extract); // strip bold/italic markup

	// Strip headings, deepest level first so "=====" is not half-eaten by "==".
	$extract = preg_replace("/=====[^=]+=====/","",$extract);
	$extract = preg_replace("/====[^=]+====/","",$extract);
	$extract = preg_replace("/===[^=]+===/","",$extract);
	$extract = preg_replace("/==[^=]+==/","",$extract);

	// Cleanup: newlines and non-breaking-space entities become spaces, then
	// runs of spaces collapse to one.
	$extract = preg_replace("/\n/"," ",$extract);
	$extract = preg_replace("/&nbsp;/"," ",$extract);
	$extract = preg_replace("/ +/"," ",$extract);

	$extract = preg_replace("/\[\[([^|\]]+)\]\]/","$1",$extract); // strip brackets from unpiped links
	$extract = preg_replace("/\[\[([^|]+)\|([^]]+)\]\]/","$2",$extract); // replace piped links with second term
	$extract = preg_replace("/\[([^\]]+)\]/","$1",$extract); // strip anything left in a bracket (probably just external links)
	$extract = preg_replace("/\/[^ ]+\//","",$extract); // strip IPA pronunciation

	// Avoid problematic abbreviations that look like they end sentences
	$extract = preg_replace("/(St|Dr|Mr|Mrs|Ms|Rt Hon|pp|\(b|\(ca?|No)\./","$1&period;",$extract);
	$extract = preg_replace("/\b([A-Za-z])\./","$1&period;",$extract); // single-letter initials
	$extract = preg_replace("/(\d),(\d\d\d)/","$1&comma;$2",$extract); // thousands separators

	return array($original,$extract);
}