<?php
/**
 * Created by PhpStorm.
 * User: yakov
 * Date: 05.06.16
 * Time: 12:19
 *
 * Example: parse a single Wikipedia article with HyperParser and print the
 * heading, the first image URL, the first paragraph (tags stripped) and the
 * category links.
 *
 * Every field is read defensively: when a selector matches nothing (or the
 * page could not be parsed) an empty value is printed instead of triggering
 * undefined index/offset errors.
 */

require __DIR__ . '/../vendor/autoload.php';

$parser = new \jakulov\HyperParser\Parser();

$url = 'https://en.wikipedia.org/wiki/Adam_Smith';

// Result key => selector handed to the parser.
$pattern = [
    'name' => '#firstHeading',
    'img' => '.image img|src',
    'bio' => '#mw-content-text p|innertext',
    'tags' => '#mw-normal-catlinks a',
];

$data = $parser->parseUrl($url, $pattern);

// Returns the first match stored under $field, or '' when nothing matched.
$first = function ($field) use ($data) {
    return isset($data[$field][0]) ? $data[$field][0] : '';
};

// array_slice() needs a real array, so fall back to an empty list.
$tags = (isset($data['tags']) && is_array($data['tags'])) ? $data['tags'] : [];

echo 'Name: ' . $first('name') . PHP_EOL;
echo 'Photo: ' . $first('img') . PHP_EOL . PHP_EOL;
echo 'Bio: ' . strip_tags($first('bio')) . PHP_EOL . PHP_EOL;
echo '===================== ' . PHP_EOL;
// The first matched link is skipped, as in the original script
// (presumably the "Categories" label link rather than a category - confirm).
echo 'Tags: ' . join(', ', array_slice($tags, 1)) . PHP_EOL . PHP_EOL;
<?php
/**
 * Created by PhpStorm.
 * User: yakov
 * Date: 09.06.16
 * Time: 4:59
 *
 * Example: scrape car listings from an avito.ru search page with HyperParser.
 *
 * Each page is parsed with a nested pattern: every `.item_table` element
 * yields one "cars" entry with url/photo/title/price/date fields, and the
 * `.pagination-page_current` element is captured under "current".
 * Parsed cars are accumulated in $cars, keyed by their absolute URL.
 *
 * NOTE(review): only the beginning of the script is visible in this excerpt.
 * How $found, $pageLimit, $currentPage, $timeStart and $bulkUrls are used
 * further down, and where dateRusToEn() is defined, cannot be confirmed here.
 */
require __DIR__ . '/../vendor/autoload.php';

$parser = new \jakulov\HyperParser\Parser();
$url = 'https://www.avito.ru/kazan/avtomobili';
// Nested pattern: 'selector' picks the repeated listing element and 'fields'
// are evaluated for each of them.
// NOTE(review): the `|href` / `|src` suffixes presumably select an attribute
// of the matched element - confirm against the HyperParser documentation.
$pattern = ['cars' => ['selector' => '.item_table', 'fields' => ['url' => '.item-description-title-link|href', 'photo' => '.photo-count-show|src', 'title' => '.item-description-title-link', 'price' => '.about', 'date' => '.date']], 'current' => '.pagination-page_current'];
$found = true;
// Listings collected so far, keyed by absolute listing URL.
$cars = [];
$parseUrl = $url;
$pageLimit = 5;
$currentPage = 0;
$timeStart = microtime(true);
while ($found) {
    try {
        $data = $parser->parseUrl($parseUrl, $pattern);
        // URLs of the listings found on the current page only.
        $bulkUrls = [];
        if ($data && $data['cars']) {
            foreach ($data['cars'] as $car) {
                $price = '';
                // Keep only the text in front of "руб" and drop the spaces in it.
                if (preg_match('/(.*)руб/', $car['price'][0], $m)) {
                    $price = str_replace(' ', '', isset($m[1]) ? $m[1] : $price);
                }
                // Listing links are prefixed with the site origin to make them absolute.
                $carUrl = 'https://www.avito.ru' . $car['url'][0];
                // The date text goes through dateRusToEn() before strtotime() and is
                // stored in 'Y-m-d H:i:s' format.
                $cars[$carUrl] = ['url' => $carUrl, 'photo' => $car['photo'][0], 'title' => $car['title'][0], 'price' => $price, 'date' => date('Y-m-d H:i:s', strtotime(dateRusToEn($car['date'][0])))];
                $bulkUrls[] = $carUrl;
/**
 * A nested pattern (selector + fields) must produce one result row per
 * matched item, each row holding the list of texts found for every field.
 *
 * The DOM layer is fully mocked: the parser mock hands back a DOM whose
 * find() returns two items, and each item yields a single title element.
 */
public function testExtractDataByPatternWithArray()
{
    // Fixture values.
    $html = '';
    $itemSelector = '.item';
    $titleSelector = '.title';
    $firstTitle = 'Test Title 1';
    $secondTitle = 'Test Title 2';

    $pattern = [
        'items' => [
            'selector' => $itemSelector,
            'fields' => ['title' => $titleSelector],
        ],
    ];
    $expected = [
        'items' => [
            ['title' => [$firstTitle]],
            ['title' => [$secondTitle]],
        ],
    ];

    // Leaf element: returns a different text on the first and second call.
    $titleNode = $this->getMockBuilder(DOMMock::class)
        ->setMethods(['text'])
        ->getMock();
    $titleNode->expects($this->at(0))
        ->method('text')
        ->will($this->returnValue($firstTitle));
    $titleNode->expects($this->at(1))
        ->method('text')
        ->will($this->returnValue($secondTitle));

    // Item element: the same mock stands in for both matched items, so the
    // field selector is looked up exactly twice.
    $itemNode = $this->getMockBuilder(DOMMock::class)
        ->setMethods(['find'])
        ->getMock();
    $itemNode->expects($this->exactly(2))
        ->method('find')
        ->with($this->equalTo($titleSelector))
        ->will($this->returnValue([$titleNode]));

    // Document root: queried once with the item selector.
    $document = $this->getMockBuilder(DOMMock::class)
        ->setMethods(['find'])
        ->getMock();
    $document->expects($this->once())
        ->method('find')
        ->with($this->equalTo($itemSelector))
        ->will($this->returnValue([$itemNode, $itemNode]));

    // DOM parser: builds the document from the raw content once.
    $domParser = $this->getMockBuilder(DOMParserMock::class)
        ->setMethods(['getDOM'])
        ->getMock();
    $domParser->expects($this->once())
        ->method('getDOM')
        ->with($this->equalTo($html))
        ->will($this->returnValue($document));

    $parser = new \jakulov\HyperParser\Parser($domParser);
    $actual = $parser->extractDataByPattern($html, $pattern);

    $this->assertEquals($expected, $actual);
}
<?php /** * Created by PhpStorm. * User: yakov * Date: 05.06.16 * Time: 15:30 */ require __DIR__ . '/../vendor/autoload.php'; $parser = new \jakulov\HyperParser\Parser(); $url1 = 'http://lenta.ru'; $pattern1 = ['links' => '.b-yellow-box .item a|href']; $data1 = $parser->parseUrl($url1, $pattern1); $urls = []; foreach ($data1['links'] as $link) { if ($link) { $urls[] = $url1 . $link; } } //var_dump($urls); $pattern2 = ['title' => 'title', 'img' => '.b-topic__title-image img|src', 'text' => '.b-text|innertext']; $data2 = $parser->bulkParse($urls, $pattern2, false); foreach ($data2 as $url => $news) { if (is_array($news)) { echo PHP_EOL . '==============' . PHP_EOL; echo $url . PHP_EOL; echo 'Title: ' . $news['title'][0] . PHP_EOL; echo 'IMG: ' . $news['img'][0] . PHP_EOL . PHP_EOL; echo 'Text: ' . $news['text'][0] . PHP_EOL . PHP_EOL; } else { echo 'ERROR: ' . $news . PHP_EOL . PHP_EOL;