/**
 * Collect the blog links listed on a single wiki section page.
 *
 * Fetches $sectionURL, looks inside every element matching `#bodyContent`,
 * and keeps each anchor whose href is not in the global $spamsites list and
 * whose rel attribute is exactly 'nofollow'.
 *
 * Entries are keyed by slugger() of the anchor's inner markup, so a later
 * anchor producing the same slug overwrites an earlier one.
 *
 * @param string $sectionURL URL of the page to scrape.
 * @return array Map of slug => array('slug' => ..., 'url' => ..., 'title' => ...).
 *               Empty when the page could not be fetched or nothing matched.
 */
function getLinksFromSection($sectionURL) {
    global $spamsites;

    // Scraping a remote page can outlast the default execution limit.
    set_time_limit(0);

    $blogs = array();

    // NOTE(review): pf_file_get_html() presumably returns a falsy value when
    // the fetch or parse fails (as simple_html_dom's file_get_html() does) -
    // confirm. Without this guard ->find() would be called on a non-object.
    $html = pf_file_get_html($sectionURL);
    if (!$html) {
        return $blogs;
    }

    // The global may be unset if the spam list was never loaded; treat that as
    // "nothing blocked" instead of handing a non-array to in_array().
    $blocked = is_array($spamsites) ? $spamsites : array();

    foreach ($html->find('#bodyContent') as $body) {
        foreach ($body->find('a') as $link) {
            $URL = $link->href;

            // Only anchors marked rel="nofollow" that are not blocklisted are kept.
            if (in_array($URL, $blocked, true) || $link->rel !== 'nofollow') {
                continue;
            }

            $title = $link->innertext;
            $slug = slugger($title);

            $blogs[$slug] = array(
                'slug'  => $slug,
                'url'   => $URL,
                'title' => htmlspecialchars(strip_tags($title)),
            );
        }
    }

    return $blogs;
}
/**
 * Scrape the academicblogs.org wiki main page into a nested reference array.
 *
 * For every <h1> on the page, the children of its next sibling are walked:
 *  - an <h2> containing <span> elements opens a section, keyed by a slug
 *    built from the span text;
 *  - a <p> holding exactly one anchor (and no `a.new` anchor) is treated as a
 *    category link; the linked page is scraped via getLinksFromSection().
 *
 * Side effects: sets error_reporting(-1), lifts the execution time limit,
 * and echoes a 'begin' marker plus every child node it visits.
 *
 * @return array Map of section slug => array(
 *                 'slug'    => section slug,
 *                 'text'    => escaped section title,
 *                 'counter' => paragraph count (see note in the body),
 *                 'links'   => map of link slug => array('slug', 'title', 'link', 'blogs'),
 *               ). Empty when the main page could not be fetched.
 */
public function build_the_ref_array() {
    // -1 turns on every error level, so a separate E_ALL call adds nothing.
    error_reporting(-1);
    echo 'begin<br /><br />';

    $theWikiLink = 'http://academicblogs.org/index.php/Main_Page';
    $htmlCounter = array();

    // Slug of the section currently being filled. Starting with '' files any
    // paragraph that precedes the first <h2> under the '' key, which is what
    // an unset variable would resolve to, but without reading an undefined name.
    $spanSlug = '';

    // Spam list, loaded on first use and then reused for every anchor.
    $spamSites = null;

    // NOTE(review): pf_file_get_html() presumably returns a falsy value on a
    // failed fetch - confirm. Guarding avoids calling ->find() on a non-object.
    $html = pf_file_get_html($theWikiLink);
    if (!$html) {
        return $htmlCounter;
    }

    // Each <h1> is followed by the main content block we want to walk.
    foreach ($html->find('h1') as $heading) {
        set_time_limit(0);

        $nextBlock = $heading->next_sibling();
        if (!$nextBlock) {
            // An <h1> that is the last child has nothing to walk.
            continue;
        }

        // Number of category paragraphs seen since the last section heading.
        $counter = 0;

        foreach ($nextBlock->children() as $bodyChild) {
            echo $bodyChild;

            if ($bodyChild->tag == 'h2' && $bodyChild->find('span')) {
                foreach ($bodyChild->find('span') as $span) {
                    $spanText = $span->innertext;

                    // "some section name" -> "SomeSectionName", then sanitized.
                    $spanSlug = '';
                    foreach (explode(' ', $spanText) as $spanNamePart) {
                        $spanSlug .= htmlentities(ucfirst($spanNamePart));
                    }
                    $spanSlug = $this->sanitize($spanSlug, false, true);

                    $htmlCounter[$spanSlug]['slug'] = $spanSlug;
                    $htmlCounter[$spanSlug]['text'] = htmlspecialchars(strip_tags($spanText));
                    // NOTE(review): at this point $counter holds the tally
                    // accumulated BEFORE this heading, i.e. the previous
                    // section's paragraphs. Kept as-is; confirm the intent.
                    $htmlCounter[$spanSlug]['counter'] = $counter;
                    $counter = 0;
                }
            }

            $isCategoryParagraph = $bodyChild->tag == 'p'
                && count($bodyChild->find('a')) == 1
                && count($bodyChild->find('a[class=new]')) == 0;
            if (!$isCategoryParagraph) {
                continue;
            }

            $counter++;
            foreach ($bodyChild->find('a') as $childLink) {
                $href = $childLink->href;
                $title = $childLink->title;

                if ($spamSites === null) {
                    $spamSites = $this->get_spam_sites();
                    if (!is_array($spamSites)) {
                        $spamSites = array();
                    }
                }

                if (in_array($href, $spamSites, true)) {
                    // Blocklisted: undo the count and leave a marker entry.
                    $counter--;
                    $htmlCounter[$spanSlug]['links'][$counter]['error'] = false;
                    continue;
                }

                $titleSlug = '';
                foreach (explode(' ', $title) as $titlePart) {
                    $titleSlug .= htmlentities(ucfirst($titlePart));
                }
                $titleSlug = $this->sanitize($titleSlug, false, true);

                // Wiki hrefs are site-relative; make them absolute.
                $sectionUrl = 'http://academicblogs.org' . $href;

                $htmlCounter[$spanSlug]['links'][$titleSlug]['slug'] = $titleSlug;
                $htmlCounter[$spanSlug]['links'][$titleSlug]['title'] = htmlspecialchars(strip_tags($title));
                $htmlCounter[$spanSlug]['links'][$titleSlug]['link'] = $sectionUrl;
                $htmlCounter[$spanSlug]['links'][$titleSlug]['blogs'] = $this->getLinksFromSection($sectionUrl);
            }
        }
    }

    return $htmlCounter;
}
// Content-extraction probes: fetch a few sample pages, print the inner markup
// of the first element matching a candidate selector, and separate each
// result with an <hr />.
//
// Each probe is array(url, selector, scrub invalid UTF-8 before printing?).
$wordsInSpaceUrl = 'http://www.wordsinspace.net/urban-media-archaeology/2012-fall/?page_id=9';
$probes = array(
    array('http://oha2012.thatcamp.org/', 'article', false),
    array($wordsInSpaceUrl, 'section', false),
    array($wordsInSpaceUrl, '#content', false),
    array($wordsInSpaceUrl, '.page-content', true),
);

// Parsed documents keyed by URL, so a page probed with several selectors is
// downloaded only once.
$fetchedDocuments = array();

foreach ($probes as $probe) {
    list($probeUrl, $probeSelector, $scrubUtf8) = $probe;

    if (!array_key_exists($probeUrl, $fetchedDocuments)) {
        $fetchedDocuments[$probeUrl] = pf_file_get_html($probeUrl);
    }
    $contentHtml = $fetchedDocuments[$probeUrl];

    // NOTE(review): pf_file_get_html() presumably yields a falsy value when
    // the fetch fails - confirm. A failed fetch or an empty match list prints
    // nothing rather than dereferencing a missing element.
    $content = $contentHtml ? $contentHtml->find($probeSelector) : array();
    if (isset($content[0])) {
        $markup = $content[0]->innertext;
        // Converting UTF-8 to UTF-8 replaces byte sequences that are not valid UTF-8.
        echo $scrubUtf8 ? mb_convert_encoding($markup, 'UTF-8', 'UTF-8') : $markup;
    }
    echo '<hr />';
}

// OG check goes here.
// get_meta_tags() returns false when the page cannot be read, and the
// description tag may be absent, so fall back to an empty string.
$contentHtml = get_meta_tags('http://www.nytimes.com/2012/09/04/us/politics/democrats-say-us-is-better-off-than-4-years-ago.html?_r=1&hp');
$content = (is_array($contentHtml) && isset($contentHtml['description'])) ? $contentHtml['description'] : '';
echo $content;
echo '<hr />';

// Case 1 - .hentry http://oha2012.thatcamp.org/
// Case 2 - .entry-content http://www.freshandnew.org/2012/08/museum-datasets-un-comprehensive-ness-data-mining/