/**
 * Search for a tag within an XML document and return the text of the first match.
 *
 * The lookup matches on local-name(), so namespace prefixes are ignored, and it
 * searches the whole document ("//*").
 *
 * @param string $tagname The local name of the tag to search for
 * @param object $xpath XPath evaluator for the document; must provide query() returning a
 *                      node list with "length" and item() (presumably a DOMXPath - confirm
 *                      against callers)
 * @param string|null $attribute If given and non-empty, only elements whose "name" attribute
 *                               equals this value are considered
 * @return string|null The nodeValue of the first matching node, or null if nothing matched
 * @since Moodle 3.1
 */
function get_tag($tagname, $xpath, $attribute = null) {
    // XPath 1.0 string literals have no escape sequence, so pick a delimiter the value
    // does not contain, or fall back to concat() when it contains both quote characters.
    $quote = function ($value) {
        $value = (string) $value;
        if (strpos($value, "'") === false) {
            return "'" . $value . "'";
        }
        if (strpos($value, '"') === false) {
            return '"' . $value . '"';
        }
        return "concat('" . str_replace("'", "', \"'\", '", $value) . "')";
    };

    $query = '//*[local-name() = ' . $quote($tagname) . ']';

    // Compare as a string so that the attribute value "0" is not mistaken for "no attribute".
    if ($attribute !== null && (string) $attribute !== '') {
        $query .= '[@name=' . $quote($attribute) . ']';
    }

    $result = $xpath->query($query);

    // DOMXPath::query() returns false for an expression it cannot evaluate; treat that as "not found".
    if ($result && $result->length > 0) {
        return $result->item(0)->nodeValue;
    }

    return null;
}
/**
 * Scrape one IMDb listing page, append each row to "imdb.txt" and follow the "Next" link.
 *
 * For every title link found on the page, four values are collected (image title, image
 * source, link title, absolute link URL). Each value carries a trailing "*" which doubles
 * as the field separator in the text file; rows are terminated by four newlines.
 *
 * NOTE(review): the returned array also contains the trailing "*" on every value, and the
 * rows of follow-up pages are written to the file but NOT merged into the return value.
 * Both behaviours are kept as they were; confirm with callers whether they are intended.
 *
 * @param string $url Location handed to the XPath constructor
 * @return array Rows of this page only, each with the keys imageTitle, imageSource,
 *               linkTitle and linkHref
 */
function scrapeImdb($url) {
    $baseurl = "http://www.imdb.com";
    $array = array();

    $xpath = new XPath($url);
    $imgsrcQuery = $xpath->query("//td[@class='image']//img/@src");
    $imgtitleQuery = $xpath->query("//td[@class='image']//img/@title");
    $linktitleQuery = $xpath->query("//td[@class='title']/a/text()");
    $linkhrefQuery = $xpath->query("//td[@class='title']/a/@href");

    // The four node lists are walked in parallel but need not have the same length;
    // a missing node contributes an empty value instead of a null dereference.
    $valueAt = function ($nodes, $index) {
        $node = $nodes->item($index);
        return $node === null ? '' : $node->nodeValue;
    };

    // Append-only access is all that is needed ("a++" is not a documented fopen mode).
    $fh = fopen("imdb.txt", "a");

    for ($x = 0; $x < $linkhrefQuery->length; $x++) {
        $row = array(
            'imageTitle' => $valueAt($imgtitleQuery, $x) . "*",
            'imageSource' => $valueAt($imgsrcQuery, $x) . "*",
            'linkTitle' => $valueAt($linktitleQuery, $x) . "*",
            'linkHref' => $baseurl . $valueAt($linkhrefQuery, $x) . "*",
        );
        $array[$x] = $row;

        // fopen() has already raised a warning if the file could not be opened;
        // keep collecting rows but skip the writes in that case.
        if ($fh !== false) {
            fwrite($fh, implode('', $row) . "\n\n\n\n");
        }
    }

    if ($fh !== false) {
        fclose($fh);
    }

    // check for the "Next" link
    $nextPageQuery = $xpath->query("(//span[@class='pagination']/a[contains(., 'Next')])[1]/@href");
    if ($nextPageQuery->length) {
        // Recurse for the side effect only (appending to the file); the result is discarded.
        scrapeImdb($baseurl . $nextPageQuery->item(0)->nodeValue);
    }

    return $array;
}
/**
 * Fetch a comic's overview page from bengou.cm and extract its metadata and chapter list.
 *
 * The part of $bookid before the first "_" is used as the path segment of the page URL.
 * A UTF-8 content-type <meta> tag is injected after "<head>" before the markup is parsed.
 *
 * NOTE(review): if the "update time" paragraph does not match the expected pattern, the
 * date fields are null and "datetime" is whatever date()/mktime() produce from them.
 *
 * @param string $bookid Book identifier of the form "<name>_<id>"
 * @return array|bool false when the fetched page is empty; otherwise an array with the keys
 *                    icon, author, status, catalog, tags, region, datetime, summary and
 *                    section (list of sections, each with "name" and "chapters")
 */
function GetBook($bookid) {
    list($slug, $unusedId) = explode("_", $bookid);

    $page = $this->http->get(sprintf('http://bengou.cm/cartoon/%s/', $slug), "email.gif");
    $page = str_replace("<head>", '<head><meta http-equiv="content-type" content="text/html; charset=utf-8" />', $page);
    if (strlen($page) < 1) {
        return False;
    }

    $xpath = new XPath($page);

    $sections = array();
    foreach ($xpath->query("//div[@class='section-list mark']") as $sectionNode) {
        $chapters = array();
        foreach ($xpath->query("span/a", $sectionNode) as $anchor) {
            // Chapter links look like http://bengou.cm/cartoon/xiudougaoxiao/7278_134945.html;
            // the part after the "_" is the chapter id.
            list($unusedId, $chapterId) = explode("_", basename($anchor->getAttribute("href"), ".html"));
            $chapters[] = array("name" => $anchor->nodeValue, "id" => $chapterId);
        }

        $sections[] = array(
            "name" => $xpath->get_value("h6", $sectionNode),
            "chapters" => $chapters,
        );
    }

    // The sixth intro paragraph holds the "update time" label followed by y/m/d h:m:s.
    $updated = $xpath->get_value("//div[@class='cartoon-intro']/div/p[6]");
    list($year, $month, $day, $hour, $minute, $second) = sscanf($updated, "更新时间:%d/%d/%d %d:%d:%d");

    return array(
        "icon" => $xpath->get_attribute("//div[@class='cartoon-intro']/a/img", "src"),
        "author" => $xpath->get_value("//div[@class='cartoon-intro']/div/p[1]/a"),
        "status" => $xpath->get_value("//div[@class='cartoon-intro']/div/p[2]"),
        "catalog" => $xpath->get_value("//div[@class='cartoon-intro']/div/p[3]"),
        "tags" => $xpath->get_value("//div[@class='cartoon-intro']/div/p[4]"),
        "region" => $xpath->get_value("//div[@class='cartoon-intro']/div/p[5]/a"),
        "datetime" => date("Y-m-d H:i:s", mktime($hour, $minute, $second, $month, $day, $year)),
        "summary" => $xpath->get_value("//p[@id='cartoon_digest2']"),
        "section" => $sections,
    );
}
/**
 * Unmarshal XML read from an input source into an instance of the given class.
 *
 * If the class carries an "xmlmapping" annotation with a "factory" key, the method
 * named there is invoked statically to determine the class actually used. It is
 * called either with the values of the XPath expressions listed under the "pass"
 * key (evaluated against the document element), or - without "pass" - with the
 * document element's node name.
 *
 * @param   xml.parser.InputSource input
 * @param   string classname
 * @param   [:var] inject
 * @return  lang.Object
 * @throws  lang.ClassNotFoundException
 * @throws  xml.XMLFormatException in case the XML cannot be loaded
 * @throws  lang.reflect.TargetInvocationException
 * @throws  lang.IllegalArgumentException
 */
public function unmarshalFrom(InputSource $input, $classname, $inject = array()) {
    libxml_clear_errors();

    $document = new DOMDocument();
    $loaded = $document->load(Streams::readableUri($input->getStream()));
    if (!$loaded) {
        // Surface the most recent libxml error together with its position
        $error = libxml_get_last_error();
        throw new XMLFormatException(
            trim($error->message),
            $error->code,
            $input->getSource(),
            $error->line,
            $error->column
        );
    }

    $xpath = new XPath($document);
    $root = $document->documentElement;

    // Class factory: a static method which returns the XPClass instance to use
    $class = XPClass::forName($classname);
    if ($class->hasAnnotation('xmlmapping', 'factory')) {
        if (!$class->hasAnnotation('xmlmapping', 'pass')) {
            $factoryArgs = array($root->nodeName);
        } else {
            $factoryArgs = array();
            foreach ($class->getAnnotation('xmlmapping', 'pass') as $expression) {
                $factoryArgs[] = self::contentOf($xpath->query($expression, $root));
            }
        }

        $factory = $class->getMethod($class->getAnnotation('xmlmapping', 'factory'));
        $class = $factory->invoke(NULL, $factoryArgs);
    }

    return self::recurse($xpath, $root, $class, $inject);
}
/**
 * Extract the book links from a listing page.
 *
 * The declared charset gb2312 is rewritten to gb18030 before parsing. Every anchor
 * below div.border that has both an href and a title yields one entry, keyed by
 * "<name of the href's parent directory>-<href basename without .html, minus its
 * first 8 characters>".
 *
 * @param string $html Markup of the listing page
 * @return array Map of book key => book title
 */
private function __ParseBooks($html) {
    $markup = str_replace("text/html; charset=gb2312", "text/html; charset=gb18030", $html);
    $xpath = new XPath($markup);

    $books = array();
    foreach ($xpath->query("//div[@class='border']/div/ul/li/a") as $anchor) {
        $href = $anchor->getAttribute('href');
        $title = $anchor->getAttribute('title');

        // Skip anchors that lack either the link target or the title
        if (strlen($href) < 1 || strlen($title) < 1) {
            continue;
        }

        $key = basename(dirname($href)) . '-' . substr(basename($href, ".html"), 8);
        $books[$key] = $title;
    }

    return $books;
}
/**
 * Extract the chapter list from a book page.
 *
 * The declared charset gb2312 is rewritten to gb18030 before parsing. Every anchor
 * inside li.a1 that has both an href and a text yields one entry; the chapter id is
 * the part after the "_" in the href's basename (without ".html").
 *
 * @param string $html Markup of the book page
 * @return array List of arrays with the keys "name" (anchor text) and "uri" (chapter id);
 *               empty when $html is empty
 */
function __ParseChapter($html) {
    $markup = str_replace("text/html; charset=gb2312", "text/html; charset=gb18030", $html);
    if (strlen($markup) < 1) {
        return array();
    }

    $xpath = new XPath($markup);

    $chapters = array();
    foreach ($xpath->query("//li[@class='a1']/a") as $anchor) {
        $href = $anchor->getAttribute('href');
        $title = $anchor->nodeValue;

        // Skip anchors that lack either the link target or the text
        if (strlen($href) < 1 || strlen($title) < 1) {
            continue;
        }

        list($prefix, $chapterId) = explode("_", basename($href, ".html"));
        $chapters[] = array("name" => $title, "uri" => $chapterId);
    }

    return $chapters;
}
/**
 * Builds a document from the UTF-8 bytes of a string containing umlauts, passes it
 * to XPath without naming an encoding and verifies that the node's string value
 * equals the original string.
 */
public function queryTreeWithDefaultEncoding() {
    $expected = new String('value öäü', 'utf-8');
    $markup = sprintf('<document><node>%s</node></document>', $expected->getBytes('utf-8'));

    $xpath = new XPath($markup);
    $actual = new String($xpath->query('string(/document/node)'), 'utf-8');

    $this->assertEquals($expected, $actual);
}
/**
 * Collect the page URLs offered by the <select name="select"> element of a catalog page.
 *
 * The declared charset gb2312 is rewritten to gb18030 before parsing. Each non-empty
 * option value is appended to the directory part of $uri.
 *
 * @param string $uri Address of the catalog page
 * @return array List of URLs, one per option with a non-empty value
 */
function GetCatalogUrls($uri) {
    $page = $this->http->get($uri, "Ysjs/bot.js");
    $xpath = new XPath(str_replace("text/html; charset=gb2312", "text/html; charset=gb18030", $page));

    // Option values are resolved against the directory of the requested page
    $base = dirname($uri) . '/';

    $urls = array();
    foreach ($xpath->query("//select[@name='select']/option") as $option) {
        $value = $option->getAttribute('value');
        if (strlen($value) > 0) {
            $urls[] = $base . $value;
        }
    }

    return $urls;
}
/**
 * Build the catalog (category name => link) from the xxbh.net front page.
 *
 * Two fixed entries ("recently updated" and "ranking") are always present; the
 * remaining ones come from the anchors inside ul.ul4. An anchor is taken only if
 * its href is longer than one character and its text is non-empty; an anchor whose
 * text equals an existing key replaces that entry.
 *
 * NOTE(review): the first fixed path starts with "/" while the second does not -
 * confirm which form callers expect.
 *
 * @return array Map of category name => href
 */
function GetCatalog() {
    $page = http_proxy_get('http://www.xxbh.net/', "template/xxbh", 10);
    $page = str_replace("text/html; charset=gb2312", "text/html; charset=gb18030", $page);

    $catalog = array(
        "最近更新" => '/comicone/page_a.html',
        "排行榜" => 'comicone/page_b.html',
    );

    $xpath = new XPath($page);
    foreach ($xpath->query("//ul[@class='ul4']/li/a") as $anchor) {
        $href = $anchor->getAttribute('href');
        $label = $anchor->nodeValue;

        // Single-character hrefs and anchors without text are ignored
        if (strlen($href) <= 1 || strlen($label) < 1) {
            continue;
        }

        $catalog[$label] = $href;
    }

    return $catalog;
}
/**
 * Resolve the picture URLs of one chapter on imanhua.com.
 *
 * The chapter page is fetched, its first <head> script is executed in V8 and the
 * resulting "cInfo" object (bid, cid, files) is turned into absolute image URLs.
 *
 * @param string $bookid Book identifier of the form "<name>_<id>"; with an empty name
 *                       the "list_<chapterid>.html" URL form is used
 * @param string $chapterid Chapter identifier
 * @return array|bool false when the fetched page is empty; an empty array when the page has
 *                    no head script or the script yields no file list; otherwise the list
 *                    of absolute picture URLs
 */
function GetChapter($bookid, $chapterid) {
    list($bname, $bid) = explode("_", $bookid);
    if (strlen($bname) > 0) {
        $uri = "http://www.imanhua.com/comic/{$bid}/{$bname}{$chapterid}.shtml";
    } else {
        $uri = "http://www.imanhua.com/comic/{$bid}/list_{$chapterid}.html";
    }

    $html = $this->http->get($uri, "foot_chapter.js");
    $html = str_replace("charset=gb2312", "charset=gb18030", $html);
    if (strlen($html) < 1) {
        return false;
    }

    $xpath = new XPath($html);
    $scripts = $xpath->query("/html/head/script");
    if ($scripts->length < 1) {
        return array();
    }

    // NOTE(security): this runs JavaScript taken verbatim from the remote page.
    $js = new V8Js();
    $js->executeString($scripts->item(0)->nodeValue, "imanhua", V8Js::FLAG_FORCE_ARRAY);
    $cInfo = $js->executeString("cInfo;", "imanhua", V8Js::FLAG_FORCE_ARRAY);

    // Without a usable file list there is nothing to resolve.
    if (!is_array($cInfo) || !isset($cInfo["files"]) || !is_array($cInfo["files"])) {
        return array();
    }

    if (isset($cInfo["cid"]) && $cInfo["cid"] > 7910) {
        // Newer chapters list bare file names:
        //   http://www.imanhua.com/comic/76/list_61224.html
        //   http://c4.mangafiles.com/Files/Images/76/61224/imanhua_001.png
        $prefix = "http://c4.mangafiles.com" . "/Files/Images/" . $cInfo["bid"] . "/" . $cInfo["cid"] . "/";
    } else {
        // Older chapters list host-relative paths:
        //   http://www.imanhua.com/comic/135/list_7198.html
        //   "/pictures/135/7198/trdh01.jpg"
        $prefix = "http://t4.mangafiles.com";
    }

    // Both branches return the prefixed URLs; the legacy branch used to build them
    // and then return the raw, host-less file list instead.
    $pictures = array();
    foreach ($cInfo["files"] as $file) {
        $pictures[] = $prefix . $file;
    }

    return $pictures;
}