<?php
/**
 * Created by PhpStorm.
 * User: gabrielgagno
 * Date: 11/27/15
 * Time: 9:26 AM
 *
 * Backfills webpage.aq_md_description: for every row of the `webpage` table
 * that has both content and baseUrl, the "description" meta tag is parsed out
 * of the stored HTML and written back onto the same row. Once all rows have
 * been processed, MetaParser::addSearchString() is invoked.
 */

include 'MetaParser.php';

use Parser\MetaParser;

$conn = mysqli_connect('localhost', 'root', '', 'nutch');
if (!$conn) {
    // No link exists when connecting fails, so the reason must come from
    // mysqli_connect_error(); mysqli_error() requires a valid link.
    die('Not connected : ' . mysqli_connect_error());
}

$selectQuery = "select * from webpage where content is not null and baseUrl is not null";
$results = $conn->query($selectQuery);
if ($results === false) {
    die('Select failed : ' . mysqli_error($conn));
}

// NOTE(review): not read anywhere in this script; kept in case included code
// relies on it as a global - confirm before removing.
$curlUrl = 'http://50.18.169.52/globe_search/webpage';

// The description comes from scraped HTML and may contain quotes or SQL
// fragments, so it is bound as a parameter instead of being concatenated.
$updateStmt = $conn->prepare('update webpage set aq_md_description = ? where id = ?');
if ($updateStmt === false) {
    die('Prepare failed : ' . mysqli_error($conn));
}

// bind_param binds by reference: the variables are reassigned per row and the
// current values are sent on each execute().
$description = null;
$rowId = null;
$updateStmt->bind_param('ss', $description, $rowId);

while ($row = mysqli_fetch_assoc($results)) {
    // Skip rows with no usable HTML (same rows the original condition rejected).
    if ($row['content'] === null || $row['content'] === '') {
        continue;
    }

    $meta = MetaParser::parseMetaTagsFromHtmlString($row['content'], array('description'));
    if (!isset($meta['description'])) {
        continue;
    }

    $description = $meta['description'];
    $rowId = $row['id'];

    if (!$updateStmt->execute()) {
        // Keep going: one bad row should not abort the whole backfill.
        error_log('Update failed for webpage id ' . $rowId . ' : ' . $updateStmt->error);
    }
}

$updateStmt->close();

MetaParser::addSearchString();
/**
 * Parses meta tags out of an HTML-formatted string.
 *
 * Thin convenience wrapper: both arguments are forwarded unchanged to
 * MetaParser::parse() and its result is returned as-is.
 *
 * @param string $string HTML markup to parse
 * @param array|null $names meta tag names to extract, e.g. array('description');
 *                          presumably null means "no name filter" - confirm
 *                          against MetaParser::parse()
 * @return array the result of MetaParser::parse()
 */
public static function parseMetaTagsFromHtmlString($string, $names = null)
{
    $parsedTags = MetaParser::parse($string, $names);

    return $parsedTags;
}
echo "AUTOCOMMIT SET TO TRUE\n";

# Select the rows to index (status = 2) and push them to the search index
# through the bulk endpoint, 200 documents per request.
$query = "select id, title, baseUrl, text, content from webpage where status = 2";
$results = $conn->query($query);

#bulk index using curl
$curlUrl = "http://" . $es_settings['host'] . ":" . $es_settings['port'] . "/" . $es_settings['index'] . "/webpage/_bulk";
echo "CURL URL: " . $curlUrl;

$ch = curl_init();

if ($results === false) {
    # query() returns false on failure; report it instead of dereferencing it
    echo "\nindexing job: query failed: " . $conn->error . "\n";
} else {
    # these options are the same for every batch, so set them once
    curl_setopt_array($ch, array(
        CURLOPT_CUSTOMREQUEST => 'POST',
        CURLOPT_URL => $curlUrl,
        CURLOPT_RETURNTRANSFER => 1,
    ));

    # initialize curl payload
    $payload = "";
    $rowNum = 1;
    $rowCount = $results->num_rows;

    # for all results of query
    while ($row = mysqli_fetch_assoc($results)) {
        echo "DOCUMENT NUMBER: " . $rowNum . "\n";

        $metas = MetaParser::parseMetaTagsFromHtmlString($row['content'], ['description', 'keywords']);

        # NOTE(review): utf8_encode() treats its input as ISO-8859-1 and is
        # deprecated as of PHP 8.2 - confirm the column encoding before replacing it.
        $document = array(
            "id" => utf8_encode($row['id']),
            "title" => utf8_encode($row['title']),
            "url" => utf8_encode($row['baseUrl']),
            "content" => utf8_encode($row['text']),
            "desc" => empty($metas['description']) ? null : utf8_encode($metas['description']),
            "keywords" => empty($metas['keywords']) ? null : utf8_encode($metas['keywords']),
            "subdomain" => MetaParser::getSubdomain($row['baseUrl']),
        );
        $payloadHalf = json_encode($document, JSON_UNESCAPED_SLASHES);

        if ($payloadHalf === false) {
            # a failed encode would put an empty line into the bulk body and
            # corrupt the whole batch, so leave this document out
            echo "indexing job: skipping document " . $rowNum . " (json error code " . json_last_error() . ")\n";
        } else {
            # one action line plus one source line per document
            $payload = $payload . "{\"create\":{}}\n" . $payloadHalf . "\n";
        }

        # flush every 200 rows and once more on the last row
        $batchComplete = ($rowNum % 200 == 0 || $rowNum == $rowCount);
        if ($batchComplete && $payload !== "") {
            curl_setopt($ch, CURLOPT_POSTFIELDS, $payload);
            echo "\nindexing job: starting...\n";

            #execute bulk index for 200 documents
            $response = curl_exec($ch);
            if ($response === false) {
                # curl_exec() returns false on transport failure; surface the reason
                echo "indexing job: failed: " . curl_error($ch) . "\n";
            } else {
                echo $response;
                echo "indexing job: done.\n";
            }

            $payload = "";
        }

        $rowNum++;
    }
}

curl_close($ch);

# close the connection
$conn->close();