while (!feof($fh)) { $line = fgetcsv($fh, 0, ',', '"'); if (isset($line[1]) && $line[1] !== '') { if (isset($line[2]) && $line[2] === '') { $timedoutresponsescount++; $sLine = $line[0] . $s . $line[1]; fwrite($fsw, $sLine . "\n"); } else { if (!isset($line[4])) { if ($prevLine !== '') { echo "[ERROR] Error parsing {$inputfilename} right after " . $prevLine . "\n"; } $sLine = ''; } else { if (!isset($line[5]) || $line[5] === -1) { $line[5] = pagerank('http://' . preg_replace('@http[s]?://@i', '', $line[1])); } $sLine = preg_replace('@\\t\\n@', "\n", "{$line[0]}{$s}{$line[1]}{$s}\"{$line[2]}\"{$s}\"{$line[3]}\"{$s}\"{$line[4]}\"{$s}{$line[5]}"); if (preg_match($pattern, $sLine)) { fwrite($fw, $sLine . "\n"); } } } $prevLine = $sLine; } } fclose($fsw); fclose($fh); if (filesize($swapOutputFile) === 0) { echo "No more timed out requests found after repass " . ($i + 1) . ".\n Exiting...\n"; break;
/**
 * Per-response callback: pulls the <title>, meta keywords and meta description out of
 * the fetched HTML and appends one CSV row to the request's output file.
 *
 * Row layout: rank,url,"title","keywords","description",pagerank
 * Double quotes inside the three text fields are escaped as \" (backslash-quote).
 * The row is written only if it consists solely of the characters allowed by the
 * whitelist pattern below; otherwise it is silently dropped.
 *
 * An empty (or non-string) response is not parsed: the row is still written, with
 * empty title/keywords/description columns, instead of letting
 * DOMDocument::loadHTML() fail on empty input.
 *
 * @param string $response Raw HTML body of the fetched page; may be empty.
 * @param mixed  $info     Transfer information supplied by the caller; not used here.
 * @param object $request  Must expose ->rank, ->url and ->outputfilehandler (a writable
 *                         stream); ->pagerank is used when set, otherwise pagerank()
 *                         is called with ->url.
 * @return void
 */
function request_callback($response, $info, $request)
{
    $s = ','; // CSV output file separator

    // Escape every double quote that is not already preceded by a backslash.
    $escapeQuotes = function ($value) {
        return preg_replace('@(?<!\\\\)"@', '\\"', $value);
    };

    $title = '';
    $description = '';
    $keywords = '';

    // loadHTML() rejects empty input (ValueError on PHP 8, which "@" cannot silence),
    // so only parse when there is something to parse.
    if (is_string($response) && $response !== '') {
        // Collect libxml parse warnings internally instead of muting them with "@",
        // then restore whatever error mode the caller had.
        $previousErrorMode = libxml_use_internal_errors(true);
        $doc = new DOMDocument();
        $doc->loadHTML($response);
        libxml_clear_errors();
        libxml_use_internal_errors($previousErrorMode);

        $titletags = $doc->getElementsByTagName('title');
        if ($titletags->length > 0) {
            $title = $escapeQuotes($titletags->item(0)->textContent);
        }

        // If several matching meta tags exist, the last one wins.
        foreach ($doc->getElementsByTagName('meta') as $metatag) {
            /** @noinspection PhpUndefinedMethodInspection */
            $metaName = strtolower($metatag->getAttribute('name'));
            if ($metaName === 'description') {
                /** @noinspection PhpUndefinedMethodInspection */
                $description = $escapeQuotes($metatag->getAttribute('content'));
            } elseif ($metaName === 'keywords') {
                /** @noinspection PhpUndefinedMethodInspection */
                $keywords = $escapeQuotes($metatag->getAttribute('content'));
            }
        }
    }

    $pagerank = isset($request->pagerank) ? $request->pagerank : pagerank($request->url);

    // This will work fine on concurrency as long as we're not trying to write more than 64KB
    // (see http://www.php.net/manual/en/function.stream-set-write-buffer.php)
    $line = preg_replace(
        '@\\t\\n@', // collapse "tab + newline" into a bare newline
        "\n",
        "{$request->rank}{$s}{$request->url}{$s}\"{$title}\"{$s}\"{$keywords}\"{$s}\"{$description}\"{$s}" . $pagerank
    );

    // Only English (ASCII printable and unicode general extensions) characters.
    $pattern = '@^[\\x{000a}\\x{000d}\\x{0020}-\\x{007e}\\x{2000}-\\x{27ff}]*$@u';
    if (preg_match($pattern, $line)) {
        fwrite($request->outputfilehandler, $line . "\n");
    }
}
/**
 * Prints the value pagerank() reports for the given URL.
 *
 * The value is echoed directly to output; nothing is returned to the caller.
 *
 * @param string $url URL handed unchanged to pagerank().
 * @return void
 */
function showpr($url)
{
    echo pagerank($url);
}