Пример #1
0
<?php

/**
 * Created by PhpStorm.
 * User: gabrielgagno
 * Date: 11/27/15
 * Time: 9:26 AM
 */
include 'MetaParser.php';
use Parser\MetaParser;
// Back-fills webpage.aq_md_description from the <meta name="description"> tag
// found in each stored page's HTML, then asks MetaParser to add the search string.
$conn = mysqli_connect('localhost', 'root', '', 'nutch');
if (!$conn) {
    // A failed connect yields no link handle, so the reason has to come from
    // mysqli_connect_error(); mysqli_error() requires a valid connection.
    die('Not connected : ' . mysqli_connect_error());
}
$selectQuery = "select * from webpage where content is not null and baseUrl is not null";
$results = $conn->query($selectQuery);
if ($results === false) {
    die('Select failed : ' . $conn->error);
}
// NOTE(review): not read anywhere in this script; kept in case included code relies on it — confirm.
$curlUrl = 'http://50.18.169.52/globe_search/webpage';

// The description is scraped from arbitrary HTML, so it must never be spliced into
// the SQL text. A prepared statement keeps quotes and backslashes in it harmless.
$update = $conn->prepare('update webpage set aq_md_description = ? where id = ?');
if ($update === false) {
    die('Prepare failed : ' . $conn->error);
}
$description = '';
$id = '';
// bind_param binds by reference: assigning to $description / $id inside the loop
// is enough to change what the next execute() sends.
$update->bind_param('ss', $description, $id);

while ($row = mysqli_fetch_assoc($results)) {
    // Skip rows without usable HTML (null or empty string).
    if ($row['content'] === null || $row['content'] === '') {
        continue;
    }
    $meta = MetaParser::parseMetaTagsFromHtmlString($row['content'], array('description'));
    if (!isset($meta['description'])) {
        continue;
    }
    $description = $meta['description'];
    $id = $row['id'];
    if (!$update->execute()) {
        // Report and carry on: one bad row should not abort the whole back-fill.
        echo 'Update failed for id ' . $row['id'] . ' : ' . $update->error . "\n";
    }
}
$update->close();
$results->free();
MetaParser::addSearchString();
Пример #2
0
 /**
  * Parses meta tags out of a string that already contains HTML markup.
  *
  * Convenience entry point: both arguments are handed to MetaParser::parse()
  * untouched and its result is returned as-is.
  *
  * @param mixed $string HTML markup to inspect
  * @param mixed $names  optional value forwarded to MetaParser::parse(); callers
  *                      pass a list of meta names such as array('description')
  *                      (presumably to restrict which tags are returned — confirm
  *                      against MetaParser::parse())
  * @return array whatever MetaParser::parse() produces; callers read it as a map
  *               keyed by meta name
  */
 public static function parseMetaTagsFromHtmlString($string, $names = NULL)
 {
     $parsedTags = MetaParser::parse($string, $names);

     return $parsedTags;
 }
Пример #3
0
# Bulk-indexes every webpage row with status = 2 into Elasticsearch through the
# _bulk endpoint, sending the documents in batches.
echo "AUTOCOMMIT SET TO TRUE\n";
$query = "select id, title, baseUrl, text, content from webpage where status = 2";
$results = $conn->query($query);
if ($results === false) {
    # without a result set there is nothing to index; stop instead of failing later
    echo "QUERY FAILED: " . $conn->error . "\n";
    $conn->close();
    exit(1);
}
# bulk index using curl
$curlUrl = "http://" . $es_settings['host'] . ":" . $es_settings['port'] . "/" . $es_settings['index'] . "/webpage/_bulk";
echo "CURL URL: " . $curlUrl;
$ch = curl_init();
# number of documents sent per bulk request
$batchSize = 200;

# posts one newline-delimited bulk payload and reports the outcome
$sendBulk = function ($payload) use ($ch, $curlUrl) {
    curl_setopt_array($ch, array(
        CURLOPT_CUSTOMREQUEST => 'POST',
        CURLOPT_URL => $curlUrl,
        CURLOPT_RETURNTRANSFER => 1,
        # the bulk body is NDJSON; without this header curl announces
        # application/x-www-form-urlencoded, which newer Elasticsearch rejects
        CURLOPT_HTTPHEADER => array('Content-Type: application/x-ndjson'),
        CURLOPT_POSTFIELDS => $payload,
    ));
    echo "\nindexing job: starting...\n";
    $response = curl_exec($ch);
    if ($response === false) {
        # curl_exec() returns false on transport errors; say so instead of "done"
        echo "indexing job: failed: " . curl_error($ch) . "\n";
        return;
    }
    echo $response;
    echo "indexing job: done.\n";
};

# initialize curl payload
$payload = "";
$pending = 0;
$rowNum = 1;
# for all results of query
while ($row = mysqli_fetch_assoc($results)) {
    echo "DOCUMENT NUMBER: " . $rowNum . "\n";
    $rowNum++;
    $metas = MetaParser::parseMetaTagsFromHtmlString($row['content'], ['description', 'keywords']);
    # utf8_encode() treats its input as ISO-8859-1 (and is deprecated as of PHP 8.2);
    # kept so the indexed bytes stay exactly what they were before.
    $document = array(
        "id" => utf8_encode($row['id']),
        "title" => utf8_encode($row['title']),
        "url" => utf8_encode($row['baseUrl']),
        "content" => utf8_encode($row['text']),
        "desc" => empty($metas['description']) ? null : utf8_encode($metas['description']),
        "keywords" => empty($metas['keywords']) ? null : utf8_encode($metas['keywords']),
        "subdomain" => MetaParser::getSubdomain($row['baseUrl']),
    );
    $payloadHalf = json_encode($document, JSON_UNESCAPED_SLASHES);
    if ($payloadHalf === false) {
        # a failed encode would put an empty line into the payload and corrupt the batch
        echo "skipping document " . $row['id'] . ": json error code " . json_last_error() . "\n";
        continue;
    }
    # each document is an action line followed by its source line
    $payload = $payload . "{\"create\":{}}\n" . $payloadHalf . "\n";
    $pending++;
    if ($pending >= $batchSize) {
        #execute bulk index for one full batch
        $sendBulk($payload);
        $payload = "";
        $pending = 0;
    }
}
# send whatever is left over from the last, partially filled batch
if ($pending > 0) {
    $sendBulk($payload);
}
curl_close($ch);
# close the connection
$conn->close();