Example #1
0
<?php

require 'Article.php';
require 'Util.php';
// Collect libxml parse errors internally instead of emitting them as PHP warnings.
libxml_use_internal_errors(true);

// Make sure one output directory per format exists under data/.
$types = array('json', 'html', 'pdf');
foreach ($types as $type) {
    $dir = 'data/' . $type;
    // The trailing is_dir() re-check tolerates another process creating the
    // directory between our test and the mkdir() call.
    if (!is_dir($dir) && !mkdir($dir, 0700, true) && !is_dir($dir)) {
        file_put_contents('php://stderr', sprintf("Unable to create directory %s\n", $dir));
        exit(1);
    }
}

// A single cURL handle is configured once and handed to every Article.
$curl = curl_init();
curl_setopt_array($curl, array(
    CURLOPT_RETURNTRANSFER => true,
    CURLOPT_FOLLOWLOCATION => true,
    CURLOPT_CONNECTTIMEOUT => 10,
    CURLOPT_TIMEOUT => 60,
    // Empty string: advertise every content encoding this cURL build supports.
    CURLOPT_ENCODING => '',
    // false turns the cURL progress meter ON.
    CURLOPT_NOPROGRESS => false,
    CURLOPT_MAXREDIRS => 20,
    // NOTE(review): fixed, shared path in /tmp — concurrent runs will share cookies.
    CURLOPT_COOKIEJAR => '/tmp/cookies.txt',
    CURLOPT_COOKIEFILE => '/tmp/cookies.txt',
    CURLOPT_USERAGENT => 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/33.0.1750.152 Safari/537.36',
));

// Load the work list: one DOI per line, blank lines skipped.
$dois = file('dois.txt', FILE_IGNORE_NEW_LINES | FILE_SKIP_EMPTY_LINES);
if ($dois === false) {
    file_put_contents('php://stderr', "Unable to read dois.txt\n");
    exit(1);
}
printf("Fetching data for %d DOIs\n", count($dois));

// randomise the DOIs to distribute load
shuffle($dois);

// Selectors are decoded as associative arrays. Validate them before touching
// error.log so that a failed start does not wipe the previous run's log.
$selectorsJson = file_get_contents('selectors.json');
$selectors = ($selectorsJson === false) ? null : json_decode($selectorsJson, true);
if (!is_array($selectors)) {
    file_put_contents('php://stderr', "Unable to load selectors.json (missing, unreadable or not a JSON object/array)\n");
    exit(1);
}

// Mode 'w' truncates any existing error.log.
$log = fopen('error.log', 'w');
if ($log === false) {
    file_put_contents('php://stderr', "Unable to open error.log for writing\n");
    exit(1);
}

// Process every DOI in turn; Article receives the shared cURL handle, the log
// handle and the selectors. What fill() does is defined in Article.php.
foreach ($dois as $doi) {
    $article = new Article($curl, $log, $selectors, $doi);
    $article->fill();
}

fclose($log);