/**
 * Method to parse a Vimeo RSS feed object
 *
 * Runs the parent parser, fills in the feed author from the feed title when
 * no author is set, and then enriches every item with the video ID taken
 * from the last path segment of its link plus the data returned by
 * http://vimeo.com/api/v2/video/{id}.php (views, likes, duration and
 * thumbnails, followed by every non-empty field of the payload).
 *
 * @return void
 */
public function parse()
{
    parent::parse();

    // An empty-string replacement strips the prefix exactly like the former
    // null argument did, without the PHP 8.1+ deprecation notice.
    if (!isset($this->feed['author'])) {
        $this->feed['author'] = str_replace('Vimeo / ', '', $this->feed['title']);
    }

    $items = $this->feed['items'];

    foreach ($items as $key => $item) {
        // The video ID is whatever follows the last slash of the item link.
        $id = substr($item['link'], strrpos($item['link'], '/') + 1);
        $items[$key]['id'] = $id;

        $vimeo = \Pop\Http\Response::parse('http://vimeo.com/api/v2/video/' . $id . '.php');
        if ($vimeo->isError()) {
            continue;
        }

        // NOTE(review): unserialize() runs on a body fetched over plain HTTP.
        // Deserializing remote data can instantiate arbitrary classes; consider
        // the endpoint's JSON variant with json_decode(), or the
        // ['allowed_classes' => false] option where PHP >= 7.0 is guaranteed.
        $info = unserialize($vimeo->getBody());
        if (!isset($info[0]) || !is_array($info[0])) {
            continue;
        }

        $video = $info[0];

        // Every field is optional in the payload; absent ones become null
        // instead of raising an undefined-index notice.
        $items[$key]['views']        = isset($video['stats_number_of_plays']) ? $video['stats_number_of_plays'] : null;
        $items[$key]['likes']        = isset($video['stats_number_of_likes']) ? $video['stats_number_of_likes'] : null;
        $items[$key]['duration']     = isset($video['duration']) ? $video['duration'] : null;
        $items[$key]['image_thumb']  = isset($video['thumbnail_small']) ? $video['thumbnail_small'] : null;
        $items[$key]['image_medium'] = isset($video['thumbnail_medium']) ? $video['thumbnail_medium'] : null;
        $items[$key]['image_large']  = isset($video['thumbnail_large']) ? $video['thumbnail_large'] : null;

        // Copy every non-empty payload field onto the item under its own key;
        // this can overwrite keys the parent parser already set.
        foreach ($video as $k => $v) {
            if ($v != '') {
                $items[$key][$k] = $v;
            }
        }
    }

    $this->feed['items'] = $items;
}
/**
 * Method to parse Youtube Atom feed object
 *
 * Runs the parent parser, then for every item: falls back to the title when
 * the content is empty, extracts the video ID from the `v=` query parameter
 * of the link, and merges in the statistics returned by
 * http://gdata.youtube.com/feeds/api/videos/{id}?v=2&alt=json together with
 * the three i.ytimg.com thumbnail URLs.
 *
 * @return void
 */
public function parse()
{
    parent::parse();

    $items = $this->feed['items'];

    foreach ($items as $key => $item) {
        if (!isset($item['content']) || ($item['content'] == '')) {
            $items[$key]['content'] = $item['title'];
        }

        // NOTE(review): when the link has no 'v=' marker strpos() returns
        // false and the offset evaluates to 2 - confirm every link carries one.
        $id = substr($item['link'], strpos($item['link'], 'v=') + 2);
        $ampersand = strpos($id, '&');
        if ($ampersand !== false) {
            // Drop any further query parameters after the ID.
            $id = substr($id, 0, $ampersand);
        }
        $items[$key]['id'] = $id;

        $youtube = \Pop\Http\Response::parse('http://gdata.youtube.com/feeds/api/videos/' . $id . '?v=2&alt=json');
        if ($youtube->isError()) {
            continue;
        }

        // json_decode() yields null on malformed input, and individual
        // sections of the payload may be missing; treat anything absent as
        // null rather than indexing into a non-array.
        $info  = json_decode($youtube->getBody(), true);
        $entry = (is_array($info) && isset($info['entry']) && is_array($info['entry'])) ? $info['entry'] : [];

        $items[$key]['views']    = isset($entry['yt$statistics']['viewCount']) ? $entry['yt$statistics']['viewCount'] : null;
        $items[$key]['likes']    = isset($entry['yt$rating']['numLikes']) ? $entry['yt$rating']['numLikes'] : null;
        $items[$key]['duration'] = isset($entry['media$group']['yt$duration']['seconds'])
            ? $entry['media$group']['yt$duration']['seconds']
            : null;

        $items[$key]['image_thumb']  = 'http://i.ytimg.com/vi/' . $id . '/default.jpg';
        $items[$key]['image_medium'] = 'http://i.ytimg.com/vi/' . $id . '/mqdefault.jpg';
        $items[$key]['image_large']  = 'http://i.ytimg.com/vi/' . $id . '/hqdefault.jpg';

        // Copy every non-empty top-level field of the decoded payload onto
        // the item under its own key.
        if (is_array($info)) {
            foreach ($info as $k => $v) {
                if ($v != '') {
                    $items[$key][$k] = $v;
                }
            }
        }
    }

    $this->feed['items'] = $items;
}
/**
 * Method to parse Youtube JSON feed object
 *
 * Runs the parent parser, flattens the feed-level fields out of their nested
 * `$t` / `href` wrappers, and then rebuilds every item from the matching
 * entry in $this->obj['feed']['entry']: title, decoded content (falling back
 * to the title), link, published date and calculated time. Each item is
 * finally enriched with the video ID from the `v=` query parameter, the
 * statistics returned by
 * http://gdata.youtube.com/feeds/api/videos/{id}?v=2&alt=json and the three
 * i.ytimg.com thumbnail URLs.
 *
 * @return void
 */
public function parse()
{
    parent::parse();

    $this->feed['title']       = $this->feed['title']['$t'];
    $this->feed['url']         = $this->feed['url'][0]['href'];
    // The description mirrors the (already flattened) title.
    $this->feed['description'] = $this->feed['title'];
    $this->feed['date']        = $this->feed['date']['$t'];
    $this->feed['generator']   = $this->feed['generator']['$t'];
    $this->feed['author']      = $this->feed['author'][0]['name']['$t'];

    $items = $this->feed['items'];

    foreach ($items as $key => $item) {
        $entry = isset($this->obj['feed']['entry'][$key]) ? $this->obj['feed']['entry'][$key] : [];

        $title     = isset($entry['title']['$t']) ? $entry['title']['$t'] : null;
        $published = isset($entry['published']['$t']) ? $entry['published']['$t'] : null;

        // Prefer the entity-decoded entry content; fall back to the title.
        if (isset($entry['content']['$t'])) {
            $content = html_entity_decode($entry['content']['$t'], ENT_QUOTES, 'UTF-8');
        } else {
            $content = $title;
        }

        $items[$key]['title']     = $title;
        $items[$key]['content']   = $content;
        $items[$key]['link']      = $items[$key]['link'][0]['href'];
        $items[$key]['published'] = $published;
        $items[$key]['time']      = self::calculateTime($published);

        // NOTE(review): when the link has no 'v=' marker strpos() returns
        // false and the offset evaluates to 2 - confirm every link carries one.
        $id = substr($items[$key]['link'], strpos($items[$key]['link'], 'v=') + 2);
        $ampersand = strpos($id, '&');
        if ($ampersand !== false) {
            // Drop any further query parameters after the ID.
            $id = substr($id, 0, $ampersand);
        }
        $items[$key]['id'] = $id;

        $youtube = \Pop\Http\Response::parse('http://gdata.youtube.com/feeds/api/videos/' . $id . '?v=2&alt=json');
        if ($youtube->isError()) {
            continue;
        }

        // json_decode() yields null on malformed input, and individual
        // sections of the payload may be missing; treat anything absent as
        // null rather than indexing into a non-array.
        $info  = json_decode($youtube->getBody(), true);
        $video = (is_array($info) && isset($info['entry']) && is_array($info['entry'])) ? $info['entry'] : [];

        $items[$key]['views']    = isset($video['yt$statistics']['viewCount']) ? $video['yt$statistics']['viewCount'] : null;
        $items[$key]['likes']    = isset($video['yt$rating']['numLikes']) ? $video['yt$rating']['numLikes'] : null;
        $items[$key]['duration'] = isset($video['media$group']['yt$duration']['seconds'])
            ? $video['media$group']['yt$duration']['seconds']
            : null;

        $items[$key]['image_thumb']  = 'http://i.ytimg.com/vi/' . $id . '/default.jpg';
        $items[$key]['image_medium'] = 'http://i.ytimg.com/vi/' . $id . '/mqdefault.jpg';
        $items[$key]['image_large']  = 'http://i.ytimg.com/vi/' . $id . '/hqdefault.jpg';

        // Copy every non-empty top-level field of the decoded payload onto
        // the item under its own key.
        if (is_array($info)) {
            foreach ($info as $k => $v) {
                if ($v != '') {
                    $items[$key][$k] = $v;
                }
            }
        }
    }

    $this->feed['items'] = $items;
}
/**
 * Fetch $this->url and collect the requested elements from its HTML.
 *
 * The response is stored in $this->response and its content type in
 * $this->contentType. When the response is `text/html` with status 200 the
 * body is loaded into a DOMDocument and each entry of $tags is harvested into
 * $this->elements:
 *  - 'title': trimmed text of the first <title>, or null
 *  - 'meta' : name/content pairs of <meta> tags carrying both attributes
 *  - 'a'    : href/value/title/name/rel per anchor; valid hrefs are resolved
 *             against $baseUrl and, when they stay under $baseUrl, are new and
 *             differ from $this->url, appended to $this->children
 *  - 'img'  : src/alt/title per image
 *  - other  : the nodeValue of every element with that tag name
 *
 * @param  string $baseUrl URL prefix used to resolve and filter anchor hrefs
 * @param  mixed  $context forwarded as the second argument of Response::parse()
 * @param  array  $tags    tag names to collect
 * @return array  $this->elements after processing
 */
public function parse($baseUrl, $context, array $tags)
{
    $dom = null;

    $this->response = Response::parse($this->url, $context);

    // The header may be stored under either capitalisation.
    if (null !== $this->response->getHeader('Content-type')) {
        $this->contentType = $this->response->getHeader('Content-type');
    } elseif (null !== $this->response->getHeader('Content-Type')) {
        $this->contentType = $this->response->getHeader('Content-Type');
    }

    if ((null !== $this->contentType)
        && (stripos($this->contentType, 'text/html') !== false)
        && ($this->response->getCode() == 200)
    ) {
        $body = $this->response->getBody();

        // Silence markup warnings while loading; error_reporting() returns the
        // previous level as an integer, so it can be restored verbatim.
        $oldError = error_reporting(0);

        $dom = new \DOMDocument();
        $dom->recover = true;
        $dom->strictErrorChecking = false;

        // loadHTML() rejects an empty source (ValueError on PHP 8, warning
        // before); skip it so an empty page yields an empty document and the
        // error level is always restored.
        if ((null !== $body) && ('' !== $body)) {
            $dom->loadHTML($body);
        }

        error_reporting($oldError);
    }

    if (null !== $dom) {
        foreach ($tags as $tag) {
            switch ($tag) {
                case 'title':
                    $title = $dom->getElementsByTagName('title');
                    $this->elements['title'] = (null !== $title->item(0))
                        ? trim($title->item(0)->nodeValue)
                        : null;
                    break;

                case 'meta':
                    foreach ($dom->getElementsByTagName('meta') as $m) {
                        if ($m->hasAttribute('name') && $m->hasAttribute('content')) {
                            if (!isset($this->elements['meta'])) {
                                $this->elements['meta'] = [];
                            }
                            $this->elements['meta'][] = [
                                'name'    => $m->getAttribute('name'),
                                'content' => $m->getAttribute('content'),
                            ];
                        }
                    }
                    break;

                case 'a':
                    foreach ($dom->getElementsByTagName('a') as $a) {
                        if (!isset($this->elements['a'])) {
                            $this->elements['a'] = [];
                        }

                        $href = $a->hasAttribute('href') ? $a->getAttribute('href') : null;

                        if ((null !== $href) && $this->isValidHref($href)) {
                            // Reduce absolute links under $baseUrl to their relative part.
                            if (substr($href, 0, strlen($baseUrl)) == $baseUrl) {
                                $href = substr($href, strlen($baseUrl));
                            }

                            // Path of the current page relative to $baseUrl; recomputed
                            // per anchor because the '../' branch trims it.
                            $url = substr($this->url, strlen($baseUrl));

                            if (substr($href, 0, 1) == '/') {
                                $href = $baseUrl . $href;
                            } elseif (substr($href, 0, 2) == './') {
                                $href = $baseUrl . $url . substr($href, 1);
                            } elseif (strpos($href, '../') !== false) {
                                $depth  = substr_count($url, '/');
                                $levels = substr_count($href, '../');
                                if ($depth > $levels) {
                                    // Walk up one path segment per '../'.
                                    for ($i = 0; $i < $levels; $i++) {
                                        $url = substr($url, 0, strrpos($url, '/'));
                                    }
                                    $href = $baseUrl . $url . '/' . str_replace('../', '', $href);
                                } else {
                                    // More '../' than path segments: clamp to the base.
                                    $href = $baseUrl . '/' . str_replace('../', '', $href);
                                }
                            }

                            if ((substr($href, 0, strlen($baseUrl)) == $baseUrl)
                                && !in_array($href, $this->children)
                                && ($this->url != $href)
                            ) {
                                $this->children[] = $href;
                            }
                        }

                        // Anchor text, or an '[image]' placeholder for image-only links.
                        if ($a->nodeValue != '') {
                            $value = $a->nodeValue;
                        } else {
                            $imgs  = $a->getElementsByTagName('img');
                            $value = (null !== $imgs->item(0)) ? '[image]' : null;
                        }

                        $this->elements['a'][] = [
                            'href'  => $href,
                            'value' => $value,
                            'title' => $a->hasAttribute('title') ? $a->getAttribute('title') : null,
                            'name'  => $a->hasAttribute('name') ? $a->getAttribute('name') : null,
                            'rel'   => $a->hasAttribute('rel') ? $a->getAttribute('rel') : null,
                        ];
                    }
                    break;

                case 'img':
                    foreach ($dom->getElementsByTagName('img') as $image) {
                        if (!isset($this->elements['img'])) {
                            $this->elements['img'] = [];
                        }
                        $this->elements['img'][] = [
                            'src'   => $image->hasAttribute('src') ? $image->getAttribute('src') : null,
                            'alt'   => $image->hasAttribute('alt') ? $image->getAttribute('alt') : null,
                            'title' => $image->hasAttribute('title') ? $image->getAttribute('title') : null,
                        ];
                    }
                    break;

                default:
                    foreach ($dom->getElementsByTagName($tag) as $e) {
                        $this->elements[$tag][] = $e->nodeValue;
                    }
            }
        }
    }

    return $this->elements;
}
/**
 * Response::parse() is exercised twice against a live URL (without and with
 * a stream-context option array) and once against a raw response string
 * assembled from a locally built Response.
 */
public function testParse()
{
    $versionUrl = 'http://www.popphp.org/version';

    // First call runs without context options; its result is not inspected.
    Response::parse($versionUrl);

    $remote = Response::parse($versionUrl, ['header' => "Accept-language: en\r\n"]);

    $this->assertEquals('200', $remote->getCode());
    $this->assertEquals('OK', $remote->getMessage());
    $this->assertEquals('1.7.0', trim($remote->getBody()));
    $this->assertEquals('text/plain', $remote->getHeader('Content-Type'));
    $this->assertTrue($remote->isSuccessful());
    $this->assertTrue(is_array($remote->getHeaders()));
    $this->assertFalse($remote->isError());
    $this->assertFalse($remote->isRedirect());

    // Round-trip: serialise a hand-built response and parse it back.
    $built = new Response(200, ['Content-Type' => 'text/plain']);
    $built->setBody('This is a test.');
    $raw = implode(PHP_EOL, [$built->getHeadersAsString(), $built->getBody()]);

    $reparsed = Response::parse($raw);

    $this->assertEquals('200', $reparsed->getCode());
    $this->assertEquals('OK', $reparsed->getMessage());
    $this->assertEquals('This is a test.', trim($reparsed->getBody()));
}
/**
 * Constructor
 *
 * Instantiate the document object
 *
 * Splits the URL into a schema (everything up to and including the first
 * `//`) and a base directory URL that always ends in `/`; a trailing path
 * segment containing a dot is treated as a file name and dropped from the
 * base. The URL is then requested, the redirect/error flags, status code and
 * body length are recorded, and - for a non-error `text/html` response - the
 * body is loaded into $this->dom and handed to parseElements().
 *
 * @param  string $url
 * @param  array  $elements forwarded to parseElements()
 */
public function __construct($url, array $elements = null)
{
    $this->url = $url;

    // Without a '//' there is no schema; the previous unchecked
    // strpos() + 2 took the first two characters of the URL instead.
    $schemaEnd    = strpos($this->url, '//');
    $this->schema = (false !== $schemaEnd) ? substr($this->url, 0, $schemaEnd + 2) : '';

    // Strip the schema as a prefix only; str_replace() also removed later
    // occurrences (e.g. a URL embedded in the query string).
    $this->base = (string)substr($this->url, strlen($this->schema));

    if (substr($this->base, -1) == '/') {
        $this->base = substr($this->base, 0, -1);
    }

    // A last path segment containing a dot is treated as a file name and
    // removed, leaving its directory as the base.
    $lastSlash = strrpos($this->base, '/');
    if (false !== $lastSlash) {
        $tail = substr($this->base, $lastSlash + 1);
        if (strpos($tail, '.') !== false) {
            $this->base = substr($this->base, 0, $lastSlash + 1);
        }
    }

    $this->base = $this->schema . $this->base;
    if (substr($this->base, -1) != '/') {
        $this->base .= '/';
    }

    // Forward the caller's user agent when present, else a fixed fallback.
    $ua = isset($_SERVER['HTTP_USER_AGENT'])
        ? $_SERVER['HTTP_USER_AGENT']
        : 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:16.0) Gecko/20100101 Firefox/16.0';

    $opts = [
        'method'     => 'GET',
        'header'     => "Accept-language: en\r\n" . "User-Agent: " . $ua . "\r\n",
        'user_agent' => $ua,
    ];

    $response = \Pop\Http\Response::parse($this->url, $opts);
    $body     = $response->getBody();

    $this->redirect      = $response->isRedirect();
    $this->error         = $response->isError();
    $this->code          = $response->getCode();
    $this->contentLength = strlen((string)$body);

    if ($this->error) {
        return;
    }

    // Get content type; the header may be stored under either capitalisation.
    if (null !== $response->getHeader('Content-type')) {
        $this->contentType = $response->getHeader('Content-type');
    } elseif (null !== $response->getHeader('Content-Type')) {
        $this->contentType = $response->getHeader('Content-Type');
    }

    // If an HTML page, parse it. The null guard avoids handing a missing
    // content type to stripos().
    if ((null !== $this->contentType) && (stripos($this->contentType, 'text/html') !== false)) {
        // Silence markup warnings while loading; error_reporting() returns the
        // previous level as an integer, so it can be restored verbatim.
        $oldError = error_reporting(0);

        $this->dom = new \DOMDocument();
        $this->dom->recover = true;
        $this->dom->strictErrorChecking = false;

        // loadHTML() rejects an empty source (ValueError on PHP 8, warning
        // before); skip it so an empty page yields an empty document and the
        // error level is always restored.
        if ((null !== $body) && ('' !== $body)) {
            $this->dom->loadHTML($body);
        }

        error_reporting($oldError);

        $this->parseElements($elements);
    }
}