/**
 * Returns the first usable link found in this item's content.
 *
 * The `link` node is tried first, then `guid`. For each candidate:
 *  - the query string (everything from the last `?`) is cut off, unless
 *    inStr() reports a match against the pass-through host list
 *    (presumably a "contains any of" check; inStr() is defined elsewhere);
 *  - the candidate is rejected when it is empty or contains `rss`.
 *
 * @return string|null The cleaned link, or null when no candidate qualifies.
 */
public function getLink()
{
    $candidates = array('link', 'guid');
    // URLs on these hosts keep their query string.
    $through = array('headlines.yahoo.co.jp', 'youtube.com', 'groups.google.com/forum', 'facebook.com');

    foreach ($candidates as $v) {
        $c = xpath($this->content, $v);

        // A `?` at offset 0 is deliberately left alone, as before: stripping
        // it would reduce the whole value to an empty string.
        $queryPos = strrpos($c, '?');
        if (is_int($queryPos) && $queryPos > 0 && !inStr($c, $through)) {
            $c = substr($c, 0, $queryPos);
        }

        // strpos() returns 0 for a match at the very start, so it must be
        // compared against false explicitly; `!strpos()` let values that
        // begin with "rss" slip through.
        if ($c && strpos($c, 'rss') === false) {
            return $c;
        }
    }

    return null;
}
/**
 * Exercises the xpath() helper through the call shapes used in this suite:
 *  - xpath($xml, $query) yields a list of matches;
 *  - xpath($xml) with no query yields the serialized document;
 *  - xpath($xml, $query, $callback) hands each matched node to the callback
 *    (or to the "remove" shortcut) and yields the resulting document.
 */
public function testXpath()
{
    $rootPattern = "~^\\s*<root>\\s*<a> 1 </a>\\s*<b>2</b>\\s*</root>\\s*\$~";
    $withoutBPattern = "~^\\s*<root>\\s*<a>1</a>\\s*<c>3</c>\\s*</root>\\s*\$~";

    // Selecting the root element, serializing without a query, selecting text.
    $source = "<root> <a> 1 </a> <b>2</b> </root>";
    $matches = xpath($source, '/*');
    list($first) = $matches;
    $this->assertRegexp($rootPattern, $first);
    $serialized = xpath($source);
    $this->assertRegexp($rootPattern, $serialized);
    $matches = xpath($source, '//a/text()');
    list($first) = $matches;
    $this->assertEquals(' 1 ', $first);

    // Whitespace-only text content is kept.
    $source = "<root><test> \n </test></root>";
    $serialized = xpath($source);
    $this->assertRegexp("~^\\s*<root>\\s*<test> \n </test>\\s*</root>\\s*\$~", $serialized);

    // The class(...) shorthand selects by class name.
    $source = "<root> <a class='cl1 cl2 cl3'> 1 </a> <b>2</b> </root>";
    $matches = xpath($source, '//*[class(cl2)]');
    list($first) = $matches;
    $this->assertRegexp("~^\\s*<a\\b[^>]*>.*?</a>\\s*\$~", $first);

    // A callback may detach the matched node.
    $source = "<root> <a>1</a> <b>2</b> <c>3</c> </root>";
    $source = xpath($source, '//b', function ($node) {
        $node->parentNode->removeChild($node);
    });
    $this->assertRegexp($withoutBPattern, $source);

    // A callback may run a nested query against the node and rewrite its value.
    $source = "<root><a><one>1</one><two>2</two></a></root>";
    $source = xpath($source, '//a', function ($node) {
        $pieces = xpath($node, '//text()');
        $node->nodeValue = implode(' ', $pieces);
    });
    $this->assertRegexp("~^\\s*<root>\\s*<a>1 2</a>\\s*</root>\\s*\$~", $source);

    // A callback may unwrap a node: hoist its children, then drop the node.
    $source = "<root><a><one>1</one><two>2</two></a></root>";
    $source = xpath($source, '//a', function ($node) {
        if ($node->hasChildNodes()) {
            // Snapshot the children first; the list is moved out of below.
            $children = array();
            foreach ($node->childNodes as $child) {
                $children[] = $child;
            }
            foreach ($children as $child) {
                $node->parentNode->insertBefore($child, $node);
            }
        }
        $node->parentNode->removeChild($node);
    });
    $this->assertRegexp("~^\\s*<root>\\s*<one>1</one>\\s*<two>2</two>\\s*</root>\\s*\$~", $source);

    // The "remove" string shortcut behaves like the detaching callback.
    $source = "<root> <a>1</a> <b>2</b> <c>3</c> </root>";
    $source = xpath($source, '//b', "remove");
    $this->assertRegexp($withoutBPattern, $source);

    // Once an ancestor has been removed, the callback is not invoked again
    // for the matching descendant: it runs exactly once here.
    $source = "<root><a remove='1'><b>b</b><c remove='1'></c></a></root>";
    $calls = 0;
    $source = xpath($source, '//*[@remove="1"]', function ($node) use (&$calls) {
        $calls++;
        $node->parentNode->removeChild($node);
    });
    $this->assertRegexp("~^\\s*<root/>\\s*\$~", $source);
    $this->assertEquals(1, $calls);

    // An unclosed <br> splits the surrounding text into two text nodes.
    $source = "<root><p>a<br>b</p></root>";
    $textNodes = xpath($source, '//p/text()');
    $this->assertEquals("a", $textNodes[0]);
    $this->assertEquals("b", $textNodes[1]);

    // A document with a default XHTML namespace is queried with plain names.
    $source = <<<END
<!DOCTYPE html>
<html xmlns="http://www.w3.org/1999/xhtml">
<head>
<title>title</title>
<meta http-equiv="Content-Type" content="text/html;charset=utf-8" />
</head>
<body>
content
</body>
</html>
END;
    $matches = xpath($source, '/html/body/text()');
    list($bodyText) = $matches;
    $this->assertEquals("content", trim($bodyText));
}
/**
 * Rule-driven lexer: consumes $string from the left and turns it into an
 * XML-like token stream.
 *
 * Plain characters are collected into `<string>…</string>` runs (each
 * character is passed through esc(), defined elsewhere — presumably an
 * XML/HTML escaper), line breaks become `<nl/>`, other whitespace becomes
 * `<space/>`, and everything else is decided by $rules.
 *
 * Recognised $rules keys (all optional):
 *  - escapeChar      string that escapes the character following it
 *  - escapeSequence  string or map of escapable characters => replacement
 *  - ignoreBegin / ignoreEnd  delimiters of a span that is dropped entirely
 *  - endDomain       string or callable that ends this lexer level
 *  - wrapper         [open, close]; `close` is emitted when a string
 *                    endDomain matches (`open` is emitted by the parent level)
 *  - tags            list of tag rules, each one of:
 *                      * an array with `beginDomain` (string|callable) — a
 *                        nested rule set, lexed recursively;
 *                      * a two-element array [needle, replacement];
 *                      * a callable receiving the string by reference.
 *                    A nested rule's `tags` may be the string 'inherit'.
 *  - autoFix         truthy: balance the opening/closing tags of the output
 *  - modify          assoc array with `scheme`, `map` and optional `debug`
 *  - stringify       assoc map nodeName => callable used to render the tree
 *
 * @param string $string Input; consumed in place (passed by reference).
 * @param array  $rules  Rule set described above.
 *
 * @return array Always has an `output` key holding the produced string;
 *               assoc results returned by callables are merged in with
 *               array_merge_recursive().
 */
function lexer(&$string, $rules = array())
{
    $escape = false;
    $ignore = false;
    $return = array('output' => array());

    // Applies the transformation scheme to $output. A scheme is a
    // comma-separated list of map keys or parenthesised groups; a trailing
    // `*` repeats the step until the output stops changing.
    $modify = function ($output, $scheme, $map, $debug) use (&$modify) {
        while ($scheme) {
            if (preg_match('~^[\\w0-9]+~', $scheme, $match) or preg_match('~^\\((((?>[^\\(\\)]+)|(?R))*)\\)~', $scheme, $match)) {
                $scheme = substr($scheme, strlen($match[0]));
                // The remainder may be empty; do not index into it blindly.
                $while = (isset($scheme[0]) && $scheme[0] === '*');
                $scheme = ltrim($scheme, '*,');
                do {
                    $old = $output;
                    if (isset($match[1])) {
                        // Parenthesised group: recurse on its content.
                        if (is_callable($_ = $modify)) {
                            $output = $_($output, $match[1], $map, $debug);
                        }
                    } else {
                        if (isset($map[$match[0]]) and is_callable($_ = $map[$match[0]])) {
                            $output = $_($output);
                        }
                        if (is_callable($debug) and $old != $output) {
                            $debug($match[0], $old, $output);
                        }
                    }
                } while ($while and $old != $output);
            } else {
                // Unparseable scheme remainder: stop.
                $scheme = '';
            }
        }
        return $output;
    };

    // Renders the XML tree in $output to a string by calling, for every
    // element, the callable registered under its node name in $map. The
    // callable receives the rendered body plus attributes, the ancestor
    // chain and the names of the previous/next siblings.
    $stringify = function ($output, $map, $extra = array()) use (&$stringify) {
        $parents = isset($extra['parents']) ? $extra['parents'] : array();
        $next = isset($extra['next']) ? $extra['next'] : null;
        $prev = isset($extra['prev']) ? $extra['prev'] : null;
        $collector = array();
        xpath($output, '/*', function ($tag) use (&$collector, $stringify, $map, $parents, $prev, $next) {
            $body = array();
            $parents[] = $tag->nodeName;
            $length = $tag->childNodes->length ?: 0;
            for ($i = 0; $i < $length; $i++) {
                if ($tag->childNodes->item($i)->nodeType === XML_TEXT_NODE) {
                    $body[] = $tag->childNodes->item($i)->nodeValue;
                } else {
                    // Preceding siblings, nearest first.
                    for ($j = 0, $p = array(); $j < $i; $j++) {
                        $p[] = $tag->childNodes->item($j)->nodeName;
                    }
                    $p = array_reverse($p);
                    // Following siblings, nearest first.
                    for ($j = $i + 1, $n = array(); $j < $length; $j++) {
                        $n[] = $tag->childNodes->item($j)->nodeName;
                    }
                    $body[] = $_ = call_user_func($stringify, xpath($tag->childNodes->item($i)), $map, array('parents' => $parents, 'prev' => $p, 'next' => $n));
                }
            }
            array_pop($parents);
            $body = implode('', $body);
            $attr = array();
            if ($tag->hasAttributes()) {
                foreach ($tag->attributes as $_) {
                    $attr[$_->nodeName] = $_->nodeValue;
                }
            }
            if (isset($map[$tag->nodeName]) and is_callable($map[$tag->nodeName])) {
                $collector[] = call_user_func($map[$tag->nodeName], $body, array('attr' => $attr, 'parents' => $parents, 'prev' => $prev, 'next' => $next));
            } elseif (!isset($map[$tag->nodeName])) {
                _warn(__FUNCTION__ . ": NO {$tag->nodeName} IN MAP!");
            } elseif (!is_callable($map[$tag->nodeName])) {
                _warn(__FUNCTION__ . ": {$tag->nodeName} IN MAP IS NOT CALLABLE!");
            }
        });
        return implode('', $collector);
    };

    // Balances tags in $output: drops closing tags that have no opener,
    // inserts the closing tags implied by an out-of-order closer, and
    // appends closers for everything still open at the end.
    $autoFix = function ($output) {
        $reg = '~<(?P<close>/?)(?P<tag>\\w+)\\b[^>]*>~';
        $order = array(); // stack of currently open tag names
        $output = preg_replace_callback($reg, function ($match) use (&$order) {
            if (substr($match[0], -2) === '/>') {
                return $match[0];
            }
            $isClose = (isset($match['close']) and $match['close']);
            if (!$isClose) {
                $order[] = $match['tag'];
                return $match[0];
            }
            if (!$order) {
                return '';
            }
            // Locate the innermost open tag with this name; the previous
            // array_search() picked the outermost one.
            $index = false;
            for ($i = count($order) - 1; $i >= 0; $i--) {
                if ($order[$i] === $match['tag']) {
                    $index = $i;
                    break;
                }
            }
            if ($index === false) {
                return '';
            }
            // Close everything opened after it, innermost first, so the
            // result is properly nested (it used to be emitted outermost
            // first, which produced e.g. `<b><c></b></c>`).
            $return = "";
            while (count($order) > $index + 1) {
                $return .= "</" . array_pop($order) . ">";
            }
            array_pop($order); // the matched opener itself
            return $return . $match[0];
        }, $output);
        foreach (array_reverse($order) as $tag) {
            $output .= "</{$tag}>";
        }
        return $output;
    };

    // Appends one character to the output, extending the trailing
    // `<string>` run when possible. A `<string>` run followed only by
    // `<space/>` tokens absorbs those spaces together with the new character.
    $pushChar = function ($char) use (&$return) {
        $char = esc($char);
        $space = '<space/>';
        $begin = '<string>';
        $bl = mb_strlen($begin);
        $end = '</string>';
        $el = mb_strlen($end);
        $count = count($return['output']);
        if ($count > 0 and $part =& $return['output'][$count - 1] and mb_substr($part, 0, $bl) === $begin and mb_substr($part, -$el) === $end) {
            $part = mb_substr($part, 0, -$el) . $char . $end;
        } elseif ($count > 1) {
            $triggerSpace = false;
            for ($i = $count - 1; $i >= 0; $i--) {
                if ($return['output'][$i] === $space) {
                    $triggerSpace = true;
                } else {
                    break;
                }
            }
            if ($triggerSpace and $i >= 0 and mb_substr($return['output'][$i], 0, $bl) === $begin and mb_substr($return['output'][$i], -$el) === $end) {
                $spaces = str_repeat(' ', $count - $i - 1);
                $return['output'][$i] = mb_substr($return['output'][$i], 0, -$el) . $spaces . $char . $end;
                $return['output'] = array_slice($return['output'], 0, $i + 1);
            } else {
                $return['output'][] = "{$begin}{$char}{$end}";
            }
        } else {
            $return['output'][] = "{$begin}{$char}{$end}";
        }
    };

    // 0 = normal; 1 = an unknown escape was put back in front of the input,
    // give the tag rules one chance to match it; 2 = no rule matched, treat
    // the escape character as a literal on the next pass.
    $cloakEscape = 0;

    // Rule lookups: a missing key simply means "feature off".
    $escapeChar = isset($rules['escapeChar']) ? $rules['escapeChar'] : null;
    $escapeSequence = isset($rules['escapeSequence']) ? $rules['escapeSequence'] : null;
    if (is_string($escapeSequence)) {
        // "abc" is shorthand for array('a' => 'a', 'b' => 'b', 'c' => 'c').
        $_ = array();
        for ($i = 0; $i < strlen($escapeSequence); $i++) {
            $_[$escapeSequence[$i]] = $escapeSequence[$i];
        }
        $escapeSequence = $_;
    }
    $ignoreBegin = isset($rules['ignoreBegin']) ? $rules['ignoreBegin'] : null;
    $ignoreEnd = isset($rules['ignoreEnd']) ? $rules['ignoreEnd'] : null;
    $endDomain = isset($rules['endDomain']) ? $rules['endDomain'] : null;
    $wrapper = isset($rules['wrapper']) ? $rules['wrapper'] : null;
    $tags = isset($rules['tags']) ? (array) $rules['tags'] : array();

    $string = str_replace("\r\n", "\n", $string);

    while ($escape or mb_strlen($string) > 0) {
        if ($ignoreEnd and $ignore and mb_strpos($string, $ignoreEnd) === 0) {
            $ignore = false;
            $string = mb_substr($string, mb_strlen($ignoreEnd));
            continue;
        }
        if ($ignore) {
            $string = mb_substr($string, 1);
            continue;
        }
        // Escape character was the very last thing in the input: emit it
        // literally. Tested by length, not truthiness, so that a remaining
        // "0" is still handled as an escaped character.
        if ($escape and mb_strlen($string) === 0) {
            $escape = false;
            $pushChar($escapeChar);
            continue;
        }
        if ($escape) {
            $escape = false;
            $_ = mb_substr($string, 0, 1);
            $string = mb_substr($string, 1);
            if ($escapeSequence and array_key_exists($_, $escapeSequence)) {
                $pushChar($escapeSequence[$_]);
            } elseif ($cloakEscape === 0) {
                // Not a known escape: put it back and let the tag rules try.
                $cloakEscape = 1;
                $string = $escapeChar . $_ . $string;
            } else {
                // Nothing matched on the second pass either: literal escape.
                $cloakEscape = 0;
                $pushChar($escapeChar);
                $string = $_ . $string;
            }
            continue;
        }
        if ($escapeChar and mb_strpos($string, $escapeChar) === 0 and in_array($cloakEscape, array(0, 2))) {
            $escape = true;
            $string = mb_substr($string, mb_strlen($escapeChar));
            continue;
        }
        if ($ignoreBegin and mb_strpos($string, $ignoreBegin) === 0) {
            $ignore = true;
            $string = mb_substr($string, mb_strlen($ignoreBegin));
            continue;
        }
        if ($endDomain and is_callable($endDomain) and $result = call_user_func_array($endDomain, array(&$string))) {
            if (is_assoc($result)) {
                $return = array_merge_recursive($return, $result);
            } elseif (is_string($result)) {
                $return['output'][] = $result;
            }
            break;
        }
        if ($endDomain and is_string($endDomain) and mb_strpos($string, $endDomain) === 0) {
            $string = mb_substr($string, mb_strlen($endDomain));
            if (isset($wrapper[1])) {
                $return['output'][] = $wrapper[1];
            }
            break;
        }
        /* SEARCH TAGS - BEGIN */
        foreach ($tags as $tag) {
            if (is_array($tag) and isset($tag['beginDomain']) and ($callable = is_callable($tag['beginDomain']) and $result = call_user_func_array($tag['beginDomain'], array(&$string)) or is_string($tag['beginDomain']) and mb_strpos($string, $tag['beginDomain']) === 0)) {
                // Nested domain: lex the inside with the tag's own rule set.
                $cloakEscape = 0;
                if (isset($tag['tags']) and $tag['tags'] === 'inherit') {
                    $tag['tags'] = $rules['tags'];
                }
                if (!$callable) {
                    $string = mb_substr($string, mb_strlen($tag['beginDomain']));
                }
                if (!$callable and isset($tag['wrapper'][0])) {
                    $return['output'][] = $tag['wrapper'][0];
                }
                $value = lexer($string, $tag);
                if ($callable) {
                    if (is_assoc($result)) {
                        $return = array_merge_recursive($return, $result);
                    } elseif (is_string($result)) {
                        $return['output'][] = $result;
                    }
                }
                if (is_assoc($value)) {
                    $return = array_merge_recursive($return, $value);
                } else {
                    $return['output'][] = $value;
                }
                continue 2;
            } elseif (is_array($tag) and count($tag) === 2 and isset($tag[0]) and mb_strpos($string, $tag[0]) === 0) {
                // Simple [needle, replacement] pair.
                $cloakEscape = 0;
                $string = mb_substr($string, mb_strlen($tag[0]));
                $return['output'][] = $tag[1];
                continue 2;
            } elseif (is_callable($tag) and $result = call_user_func_array($tag, array(&$string))) {
                $cloakEscape = 0;
                if (is_assoc($result)) {
                    $return = array_merge_recursive($return, $result);
                } else {
                    $return['output'][] = $result;
                }
                continue 2;
            }
        }
        /* SEARCH TAGS - END */
        // A by-reference callable above may have consumed the whole input
        // without reporting a match, hence the isset() guards.
        if (isset($string[0]) and in_array($string[0], array("\n", "\r"), true)) {
            $string = mb_substr($string, 1);
            $return['output'][] = "<nl/>";
            continue;
        }
        if (isset($string[0]) and ctype_space($string[0])) {
            $string = mb_substr($string, 1);
            $return['output'][] = "<space/>";
            continue;
        }
        if ($cloakEscape === 1) {
            $cloakEscape = 2;
            continue;
        }
        if (mb_strlen($string)) {
            $_ = mb_substr($string, 0, 1);
            $string = mb_substr($string, 1);
            $pushChar($_);
            continue;
        }
    }

    // Post-processing, in fixed order: join, balance, modify, stringify.
    $return['output'] = implode('', $return['output']);
    if (isset($rules['autoFix']) and $rules['autoFix']) {
        $return['output'] = $autoFix($return['output']);
    }
    if (isset($rules['modify']) and $rules['modify'] and is_assoc($rules['modify'])) {
        $return['output'] = call_user_func_array($modify, array($return['output'], $rules['modify']['scheme'], $rules['modify']['map'], isset($rules['modify']['debug']) ? $rules['modify']['debug'] : null));
    }
    if (isset($rules['stringify']) and $rules['stringify'] and is_assoc($rules['stringify'])) {
        $return['output'] = call_user_func_array($stringify, array($return['output'], $rules['stringify']));
    }
    return $return;
}
/**
 * Collects the page images found inside the document's "viewer" container.
 *
 * Selects every <img> that is a descendant of a <div> whose id contains
 * "viewer", then applies f\map(elementToPageImage) to each element of the
 * xpath() result.
 *
 * @param \DOMDocument $doc Document to search.
 *
 * @return mixed Whatever f\map() produces for the xpath() result.
 */
function pageImageURL(\DOMDocument $doc)
{
    $viewerContainers = "//div[contains(normalize-space(@id), 'viewer')]";
    $imagesBelow = "//img";
    $xpath = $viewerContainers . $imagesBelow;

    // Build the per-element mapper before running the query, mirroring the
    // argument evaluation order of the nested call form.
    $toPageImages = f\map(elementToPageImage);
    $matches = xpath($doc, $xpath);

    return f\map($toPageImages, $matches);
}