Beispiel #1
0
/**
 * Collect the "nofollow" links found inside a section page.
 *
 * Loads $sectionURL through pf_file_get_html(), walks every anchor inside the
 * elements matching `#bodyContent`, and keeps those whose `rel` attribute is
 * exactly "nofollow" and whose `href` is not listed in the global $spamsites.
 *
 * @param string $sectionURL URL of the section page to scrape.
 *
 * @return array Map keyed by slugger($title). Each entry holds:
 *               'slug'  => the same slug used as the key,
 *               'url'   => the anchor's href,
 *               'title' => the anchor's inner text with tags stripped and
 *                          HTML special characters escaped.
 *               An empty array is returned when pf_file_get_html() yields a
 *               falsy value. Two links producing the same slug overwrite
 *               each other (last one wins).
 */
function getLinksFromSection($sectionURL)
{
    global $spamsites;

    // Scraping can be slow; lift the script execution time limit.
    set_time_limit(0);

    $blogs = array();

    $html = pf_file_get_html($sectionURL);
    if (!$html) {
        // Fetch/parse failed: calling ->find() on a non-object would be fatal.
        return $blogs;
    }

    // Tolerate an unset or malformed global so in_array() always gets an array.
    $blocked = is_array($spamsites) ? $spamsites : array();

    foreach ($html->find('#bodyContent') as $body) {
        foreach ($body->find('a') as $link) {
            if ($link->rel !== 'nofollow') {
                continue;
            }
            // Strict comparison avoids PHP's loose string/number matching.
            if (in_array($link->href, $blocked, true)) {
                continue;
            }

            $title = $link->innertext;
            $slug = slugger($title);

            $blogs[$slug] = array(
                'slug'  => $slug,
                'url'   => $link->href,
                'title' => htmlspecialchars(strip_tags($title)),
            );
        }
    }

    return $blogs;
}
Beispiel #2
0
 /**
  * Scrape the academicblogs.org main page into a nested section/link array.
  *
  * For every <h1> on the page, the element that follows it is treated as the
  * content block. Its children are walked in order:
  *  - each <span> inside an <h2> starts a new section entry keyed by a
  *    sanitized slug of the span text;
  *  - each <p> that contains exactly one link, and no link with class "new",
  *    is recorded under the current section, and the linked page is scraped
  *    with getLinksFromSection().
  *
  * Side effects: sets error_reporting(-1), lifts the time limit, and echoes
  * 'begin<br /><br />' plus every child element visited (debug output).
  *
  * @return array Map of section slug => array with keys:
  *               'slug', 'text', 'counter', and 'links'. 'links' is a map of
  *               title slug => array('slug', 'title', 'link', 'blogs').
  *               Links whose href is in get_spam_sites() instead produce
  *               'links'[<int>]['error'] = false.
  *               Links met before any <h2> are stored under the '' key.
  *               Empty when the page could not be loaded.
  */
 public function build_the_ref_array()
 {
     // -1 enables every error level; a preceding E_ALL call was redundant.
     error_reporting(-1);
     echo 'begin<br /><br />';
     $theWikiLink = 'http://academicblogs.org/index.php/Main_Page';
     $htmlCounter = array();

     // Current section key. Starts as '' so paragraphs that precede the first
     // <h2> land under the same key an undefined variable used to produce,
     // without raising an "undefined variable" notice.
     $spanSlug = '';

     $html = pf_file_get_html($theWikiLink);
     if (!$html) {
         // Fetch/parse failed: nothing to walk.
         return $htmlCounter;
     }

     // The spam list is consulted for every link; fetch it once.
     $spamSites = $this->get_spam_sites();
     if (!is_array($spamSites)) {
         $spamSites = array();
     }

     foreach ($html->find('h1') as $heading) {
         set_time_limit(0);

         // The main content block is the element right after the <h1>.
         $nextBlock = $heading->next_sibling();
         if (!$nextBlock) {
             continue;
         }

         $counter = 0;

         // Walk through the DOM and count paragraphs between H2 tags.
         foreach ($nextBlock->children() as $bodyChild) {
             echo $bodyChild;

             if ($bodyChild->tag == 'h2') {
                 foreach ($bodyChild->find('span') as $span) {
                     $spanText = $span->innertext;

                     // CamelCase the words of the heading, then sanitize.
                     $spanSlug = '';
                     foreach (explode(' ', $spanText) as $spanNamePart) {
                         $spanSlug .= htmlentities(ucfirst($spanNamePart));
                     }
                     $spanSlug = $this->sanitize($spanSlug, false, true);

                     $htmlCounter[$spanSlug]['slug'] = $spanSlug;
                     $htmlCounter[$spanSlug]['text'] = htmlspecialchars(strip_tags($spanText));
                     // NOTE(review): this stores the paragraph count gathered
                     // *before* this heading, i.e. the previous section's
                     // count - confirm that is intended.
                     $htmlCounter[$spanSlug]['counter'] = $counter;
                     $counter = 0;
                 }
             }

             if ($bodyChild->tag != 'p') {
                 continue;
             }
             $anchors = $bodyChild->find('a');
             if (count($anchors) != 1 || count($bodyChild->find('a[class=new]')) != 0) {
                 continue;
             }

             $counter++;
             foreach ($anchors as $childLink) {
                 $href = $childLink->href;
                 $title = $childLink->title;

                 if (in_array($href, $spamSites, true)) {
                     // Spam link: undo the count and leave a marker entry.
                     $counter--;
                     $htmlCounter[$spanSlug]['links'][$counter]['error'] = false;
                     continue;
                 }

                 $titleSlug = '';
                 foreach (explode(' ', $title) as $titlePart) {
                     $titleSlug .= htmlentities(ucfirst($titlePart));
                 }
                 $titleSlug = $this->sanitize($titleSlug, false, true);

                 // hrefs on the wiki are site-relative; make them absolute.
                 $sectionLink = 'http://academicblogs.org' . $href;

                 $htmlCounter[$spanSlug]['links'][$titleSlug]['slug'] = $titleSlug;
                 $htmlCounter[$spanSlug]['links'][$titleSlug]['title'] = htmlspecialchars(strip_tags($title));
                 $htmlCounter[$spanSlug]['links'][$titleSlug]['link'] = $sectionLink;
                 $htmlCounter[$spanSlug]['links'][$titleSlug]['blogs'] = $this->getLinksFromSection($sectionLink);
             }
         }
     }

     return $htmlCounter;
 }
Beispiel #3
0
// Probe script: fetch a few pages and echo the inner HTML of the first element
// matching a candidate "main content" selector, separated by <hr /> rules.
$wordsInSpaceUrl = 'http://www.wordsinspace.net/urban-media-archaeology/2012-fall/?page_id=9';

// Each probe is array(url, selector, scrub invalid UTF-8 before echoing?).
$probes = array(
    array('http://oha2012.thatcamp.org/', 'article', false),
    array($wordsInSpaceUrl, 'section', false),
    array($wordsInSpaceUrl, '#content', false),
    array($wordsInSpaceUrl, '.page-content', true),
);

foreach ($probes as $probe) {
    list($probeUrl, $probeSelector, $scrubUtf8) = $probe;

    $contentHtml = pf_file_get_html($probeUrl);
    // Guard both a failed fetch and a selector that matched nothing, so a
    // miss prints an empty section instead of aborting the whole script.
    $content = $contentHtml ? $contentHtml->find($probeSelector) : array();

    if (!empty($content)) {
        $markup = $content[0]->innertext;
        // Converting UTF-8 to UTF-8 replaces invalid byte sequences.
        // Wrap in htmlspecialchars() instead to display the markup as text.
        echo $scrubUtf8 ? mb_convert_encoding($markup, 'UTF-8', 'UTF-8') : $markup;
    }
    echo '<hr />';
}

//OG Check goes here.
$contentHtml = get_meta_tags('http://www.nytimes.com/2012/09/04/us/politics/democrats-say-us-is-better-off-than-4-years-ago.html?_r=1&hp');
// get_meta_tags() returns false on failure, and the page may carry no
// description meta tag; fall back to an empty string in both cases.
$content = (is_array($contentHtml) && isset($contentHtml['description'])) ? $contentHtml['description'] : '';
echo $content;
echo '<hr />';
//Case 1 - .hentry http://oha2012.thatcamp.org/
//Case 2 - .entry-content  http://www.freshandnew.org/2012/08/museum-datasets-un-comprehensive-ness-data-mining/