Example #1
0
<?php

/**
 * Created by PhpStorm.
 * User: yakov
 * Date: 05.06.16
 * Time: 12:19
 */
require __DIR__ . '/../vendor/autoload.php';

// Page to scrape and the fields to pull from it. Each entry maps an output key
// to a CSS selector; the "|src" / "|innertext" suffix presumably selects an
// attribute or property of the matched element instead of its text (confirm
// against the Parser documentation).
$url = 'https://en.wikipedia.org/wiki/Adam_Smith';
$pattern = [
    'name' => '#firstHeading',
    'img' => '.image img|src',
    'bio' => '#mw-content-text p|innertext',
    'tags' => '#mw-normal-catlinks a',
];

$parser = new \jakulov\HyperParser\Parser();
$data = $parser->parseUrl($url, $pattern);

// Every field is indexed as a list of matches; only the first match is printed.
$blankLine = PHP_EOL . PHP_EOL;

$name = $data['name'][0];
echo 'Name: ' . $name . PHP_EOL;

$photo = $data['img'][0];
echo 'Photo: ' . $photo . $blankLine;

// The bio is fetched with "|innertext", so markup is stripped before printing.
$bio = strip_tags($data['bio'][0]);
echo 'Bio: ' . $bio . $blankLine;

echo '===================== ' . PHP_EOL;

// The first matched category link is skipped (presumably the "Categories"
// label link itself rather than a real tag - verify against the page markup).
$tags = array_slice($data['tags'], 1);
echo 'Tags: ' . implode(', ', $tags) . $blankLine;
Example #2
0
<?php

/**
 * Created by PhpStorm.
 * User: yakov
 * Date: 09.06.16
 * Time: 4:59
 */
require __DIR__ . '/../vendor/autoload.php';

// Listing page to start from; $parseUrl is the URL handed to the parser on
// each pass of the loop below and starts out as the listing URL.
$url = 'https://www.avito.ru/kazan/avtomobili';
$parseUrl = $url;

// Nested pattern: 'cars' selects every listing row and extracts the named
// fields from each row; 'current' targets the current-page pagination element.
// The "|href" / "|src" suffix presumably selects an attribute of the matched
// element rather than its text (confirm against the Parser documentation).
$pattern = [
    'cars' => [
        'selector' => '.item_table',
        'fields' => [
            'url' => '.item-description-title-link|href',
            'photo' => '.photo-count-show|src',
            'title' => '.item-description-title-link',
            'price' => '.about',
            'date' => '.date',
        ],
    ],
    'current' => '.pagination-page_current',
];

$parser = new \jakulov\HyperParser\Parser();

// Loop state: the scrape loop runs while $found is true and collects the
// parsed cars into $cars, keyed by each car's absolute URL.
$cars = [];
$found = true;

// Paging counters (presumably the loop stops after $pageLimit pages - the
// code that uses them is further down the script).
$currentPage = 0;
$pageLimit = 5;

// Wall-clock start of the run, as a float number of seconds.
$timeStart = microtime(true);
while ($found) {
    try {
        $data = $parser->parseUrl($parseUrl, $pattern);
        $bulkUrls = [];
        if ($data && $data['cars']) {
            foreach ($data['cars'] as $car) {
                $price = '';
                if (preg_match('/(.*)руб/', $car['price'][0], $m)) {
                    $price = str_replace(' ', '', isset($m[1]) ? $m[1] : $price);
                }
                $carUrl = 'https://www.avito.ru' . $car['url'][0];
                $cars[$carUrl] = ['url' => $carUrl, 'photo' => $car['photo'][0], 'title' => $car['title'][0], 'price' => $price, 'date' => date('Y-m-d H:i:s', strtotime(dateRusToEn($car['date'][0])))];
                $bulkUrls[] = $carUrl;
Example #3
0
 /**
  * extractDataByPattern() with a nested pattern ('selector' plus 'fields')
  * must return one entry per matched item, each holding the list of texts
  * found for every requested field.
  *
  * The DOM layer is fully mocked: the root DOM yields the same item mock
  * twice, and each item yields one title element whose text() returns
  * 'Test Title 1' on the first call and 'Test Title 2' on the second.
  */
 public function testExtractDataByPatternWithArray()
 {
     // Fixture values.
     $html = '';
     $itemSelector = '.item';
     $titleSelector = '.title';
     $firstTitle = 'Test Title 1';
     $secondTitle = 'Test Title 2';

     $pattern = [
         'items' => [
             'selector' => $itemSelector,
             'fields' => ['title' => $titleSelector],
         ],
     ];
     $expected = [
         'items' => [
             ['title' => [$firstTitle]],
             ['title' => [$secondTitle]],
         ],
     ];

     // Title element shared by both items: consecutive text() calls return
     // the first and then the second title.
     $titleElement = $this->getMockBuilder(DOMMock::class)
         ->setMethods(['text'])
         ->getMock();
     $titleElement->expects($this->at(0))
         ->method('text')
         ->will($this->returnValue($firstTitle));
     $titleElement->expects($this->at(1))
         ->method('text')
         ->will($this->returnValue($secondTitle));

     // Item node: find() must be called exactly twice with the field selector,
     // because the same mock instance stands in for both items.
     $item = $this->getMockBuilder(DOMMock::class)
         ->setMethods(['find'])
         ->getMock();
     $item->expects($this->exactly(2))
         ->method('find')
         ->with($this->equalTo($titleSelector))
         ->will($this->returnValue([$titleElement]));

     // Document root: a single find() with the item selector returns two items.
     $dom = $this->getMockBuilder(DOMMock::class)
         ->setMethods(['find'])
         ->getMock();
     $dom->expects($this->once())
         ->method('find')
         ->with($this->equalTo($itemSelector))
         ->will($this->returnValue([$item, $item]));

     // DOM parser: asked exactly once to build the DOM from the raw content.
     $domParser = $this->getMockBuilder(DOMParserMock::class)
         ->setMethods(['getDOM'])
         ->getMock();
     $domParser->expects($this->once())
         ->method('getDOM')
         ->with($this->equalTo($html))
         ->will($this->returnValue($dom));

     $parser = new \jakulov\HyperParser\Parser($domParser);
     $actual = $parser->extractDataByPattern($html, $pattern);

     $this->assertEquals($expected, $actual);
 }
Example #4
0
<?php

/**
 * Created by PhpStorm.
 * User: yakov
 * Date: 05.06.16
 * Time: 15:30
 */
require __DIR__ . '/../vendor/autoload.php';
$parser = new \jakulov\HyperParser\Parser();

// Step 1: parse the front page and collect the href of every link matched by
// the pattern.
$url1 = 'http://lenta.ru';
$pattern1 = ['links' => '.b-yellow-box .item a|href'];
$data1 = $parser->parseUrl($url1, $pattern1);

// Scheme of the start page, reused for protocol-relative links ("//host/path").
$baseScheme = parse_url($url1, PHP_URL_SCHEME);

$urls = [];
foreach ($data1['links'] as $link) {
    if (!$link) {
        // Skip empty hrefs.
        continue;
    }
    if (preg_match('#^[a-z][a-z0-9+.-]*://#i', $link)) {
        // Already an absolute URL: prefixing the site root would produce a
        // broken address such as "http://lenta.ruhttps://...".
        $urls[] = $link;
    } elseif (strpos($link, '//') === 0) {
        // Protocol-relative link: only the scheme is missing.
        $urls[] = $baseScheme . ':' . $link;
    } else {
        // Site-relative link: join with exactly one slash, so both "/path"
        // and "path" resolve against the site root.
        $urls[] = rtrim($url1, '/') . '/' . ltrim($link, '/');
    }
}
//var_dump($urls);

// Step 2: fetch all collected article URLs in one bulk call and extract the
// title, lead image and body text from each.
$pattern2 = [
    'title' => 'title',
    'img' => '.b-topic__title-image img|src',
    'text' => '.b-text|innertext',
];
$data2 = $parser->bulkParse($urls, $pattern2, false);
foreach ($data2 as $url => $news) {
    if (is_array($news)) {
        echo PHP_EOL . '==============' . PHP_EOL;
        echo $url . PHP_EOL;
        echo 'Title: ' . $news['title'][0] . PHP_EOL;
        echo 'IMG: ' . $news['img'][0] . PHP_EOL . PHP_EOL;
        echo 'Text: ' . $news['text'][0] . PHP_EOL . PHP_EOL;
    } else {
        echo 'ERROR: ' . $news . PHP_EOL . PHP_EOL;