/**
 * Parses one page and converts every parsed triple into an RDF triple.
 *
 * The page is parsed by the global parsePage() function, which reports its
 * output through the global $parseResult. Each entry is read positionally:
 *   [0] subject URI, [1] predicate URI, [2] object value,
 *   [3] "r" when the object is a resource URI (anything else: literal),
 *   [4] second argument to RDFtriple::literal() (presumably the datatype -
 *       TODO confirm), [5] literal language, defaulting to $this->language.
 *
 * Every predicate that ends up in the result is also registered with
 * $this->allPredicates.
 *
 * @param mixed  $pageID     Page identifier; passed to ExtractionResult and parsePage().
 * @param string $pageTitle  Page title; published through the global $pagetitle.
 * @param string $pageSource Page source; passed to parsePage().
 *
 * @return ExtractionResult The collected triples (empty when nothing was parsed).
 */
public function extractPage($pageID, $pageTitle, $pageSource)
{
    // Needed for image extraction in catchObjectDatatype.php (catchLogo()).
    global $pagetitle;
    $pagetitle = $pageTitle;

    $result = new ExtractionResult($pageID, $this->language, self::extractorID);

    // parsePage() stores the extraction result in this global.
    global $parseResult;
    $parseResult = null;
    parsePage($pageID, $pageSource);

    // empty() instead of count(): the global is still null when the parser
    // produced nothing, and count(null) raises a TypeError on PHP 8.
    if (empty($parseResult)) {
        return $result;
    }

    foreach ($parseResult as $myTriple) {
        $subject = RDFtriple::URI($myTriple[0]);

        // Rename properties like LeaderName1, LeaderName2, ... to LeaderName.
        // Exactly one trailing digit is dropped, and only when the character
        // before it is neither a digit nor an underscore.
        if (preg_match("/(.*[^0-9_]+)([0-9])\$/", $myTriple[1], $matches)) {
            $myTriple[1] = $matches[1];
        }
        $predicate = RDFtriple::URI($myTriple[1]);

        if ($myTriple[3] == "r") {
            $object = RDFtriple::URI($myTriple[2]);
        } else {
            // Literals without an explicit language inherit the extractor's language.
            if ($myTriple[5] == null) {
                $myTriple[5] = $this->language;
            }
            $object = RDFtriple::literal($myTriple[2], $myTriple[4], $myTriple[5]);
        }

        $result->addTriple($subject, $predicate, $object);
        $this->allPredicates->addPredicate($myTriple[1]);
    }

    return $result;
}
#parsePage($html1);
#print "Next page link: " . $nextPageLink . "\n";

// Print a plain-text header naming the scraped fields.
print "title, author, year, addinfo, publisher, abstract" . "\n";
#scraperwiki::sqliteexecute("create table acmdata1 (a int, `title` string, 'author' string, 'year' string, 'addinfo' string, 'publisher' string, 'abstract' string)");

// Follow "next page" links for as long as one is available, for at most six
// iterations ($maxPages counts the iterations performed so far).
$maxPages = 0;
$numrecords = 1;
while (strlen($nextPageLink) > 0 and $maxPages < 6) {
    $maxPages++;
    #print "Moving on to next page" . "\n";
    $html_content = scraperWiki::scrape($nextPageLink);
    $html1 = str_get_html($html_content);
    #print $html1 . "\n";
    // The first <td colspan="2"> element is handed to getNextLink(); whatever
    // it returns is the URL scraped by the next iteration (an empty value ends
    // the loop).
    $nextLink = $html1->find("td[@colspan='2']", 0);
    # print "Next link: " . $nextLink->innertext . "\n";
    $nextPageLink = getNextLink($nextLink);
    // $numrecords is assigned here but never read within this part of the script.
    $numrecords = parsePage($html1, $maxPages);
    // Wait two minutes before requesting the next page.
    sleep(120);
}
print "No further pages" . "\n";

// Dump every row of the acmdata1 table as an HTML table.
$data = scraperwiki::select("* from acmdata1");
print "<html><table>";
// NOTE(review): the header declares six columns and never closes its <tr>,
// while each data row below starts with an extra "ACM" cell (seven cells) -
// confirm the intended layout.
print "<tr><th>Title</th><th>Author</th><th>Year</th><th>AddInfo</th><th>Publisher</th><th>Abstract</th>";
foreach ($data as $d) {
    // NOTE(review): the stored values are printed without HTML escaping -
    // confirm they are plain text / trusted, or escape them.
    print "<tr>";
    print "<td>" . "ACM" . "</td>";
    print "<td>" . $d["title"] . "</td>";
    print "<td>" . $d["author"] . "</td>";
    print "<td>" . $d["year"] . "</td>";
    print "<td>" . $d["addinfo"] . "</td>";
    print "<td>" . $d["publisher"] . "</td>";
    print "<td>" . $d["abstract"] . "</td>";
/**
 * Expands a design template, recursively replacing §name§ placeholders.
 *
 * When $out is false the template text is loaded with getDesign($key) and, for
 * the duration of the call, '/'.$key is appended to the global $designPath and
 * $commonDesignPath (both are restored before returning). Every §name§
 * placeholder found in the text is replaced by the result of parsePage('name').
 * An opening marker without a matching closing marker is left untouched.
 *
 * Side effects:
 *  - the global $global_current_file is set to $key (nested calls overwrite it
 *    with the placeholder names and it is not restored);
 *  - when getDesignCache(':addition') is not false, the cached fragments for
 *    head, body, js, jquery, title, meta, style, keywords and description are
 *    appended to the matching entries of the global $renderInclude.
 *
 * @param string       $key Name of the design to render.
 * @param string|false $out Template text to expand, or false to load it via getDesign($key).
 *
 * @return string|false The expanded text; false when no text was supplied and
 *                      getDesign() returned false.
 */
function parsePage($key, $out = false)
{
    global $global_current_file, $designPath, $commonDesignPath, $renderInclude;

    // Remember the search paths so they can be restored on the way out.
    $dpath = $designPath;
    $cdpath = $commonDesignPath;
    $global_current_file = $key;

    if ($out === false) {
        $out = getDesign($key);
        $designPath .= '/' . $key;
        $commonDesignPath .= '/' . $key;
    }

    if ($out !== false) {
        // The marker's byte length depends on the encoding this file is saved
        // in, so it is measured rather than hard-coded.
        $marker = '§';
        $markerLen = strlen($marker);

        $offset = 0;
        while (true) {
            $start = strpos($out, $marker, $offset);
            if ($start === false) {
                break;
            }
            $nameStart = $start + $markerLen;
            $end = strpos($out, $marker, $nameStart);
            if ($end === false) {
                break;
            }
            $nameLen = $end - $nameStart;
            $word = substr($out, $nameStart, $nameLen);

            // Replace both markers and the name with the rendered sub-design.
            $out = substr_replace($out, parsePage($word), $start, $nameLen + 2 * $markerLen);

            // Resume at the insertion point so the inserted text is scanned too.
            $offset = $start;
        }
    }

    if (getDesignCache(':addition') !== false) {
        // $renderInclude slot => design cache key, in the order they are collected.
        // NOTE(review): 'meta' is the only cache key without a leading ':'.
        // It is kept exactly as it was; confirm whether ':meta' was intended.
        $sections = array(
            'head'        => ':head',
            'body'        => ':body',
            'js'          => ':js',
            'jquery'      => ':jquery',
            'title'       => ':title',
            'meta'        => 'meta',
            'style'       => ':style',
            'keywords'    => ':keywords',
            'description' => ':description',
        );

        foreach ($sections as $slot => $cacheKey) {
            $d = getDesignCache($cacheKey);
            if ($d === false) {
                continue;
            }
            if (!isset($renderInclude[$slot])) {
                $renderInclude[$slot] = $d;
            } else {
                $renderInclude[$slot] .= $d;
            }
        }
    }

    $designPath = $dpath;
    $commonDesignPath = $cdpath;

    return $out;
}
//*** Parse the HTML Header. $strOutput .= parseHeader($intCatId, $strCommand, $intElmntId); //*** Route to the correct HTML Body Parser. switch ($intCatId) { case NAV_MYPUNCH_LOGIN: if ($_CONF['app']['secureLogin']) { header("Location: " . Request::getURI("https") . "/?cid=" . NAV_MYPUNCH_LOGIN); exit; } else { require_once 'inc.tplparse_login.php'; $strOutput .= parseLogin($intElmntId, $strCommand); } break; case NAV_MYPUNCH_NOACCOUNT: require_once 'includes/inc.tplparse_noaccount.php'; $strOutput .= parsePage($intElmntId, $strCommand); break; case NAV_MYPUNCH_USERS: require_once 'includes/inc.tplparse_user.php'; if ($intElmntId == 0) { $intElmntId = NAV_MYPUNCH_USERS_USER; } $strOutput .= parseMenu($intCatId, $strCommand); $strOutput .= parseUsers($intElmntId, $strCommand); break; case NAV_MYPUNCH_PROFILE: require_once 'includes/inc.tplparse_profile.php'; $strOutput .= parseMenu($intCatId, $strCommand); $strOutput .= parseProfile($intElmntId, $strCommand); break; case NAV_MYPUNCH_ANNOUNCEMENTS: