Пример #1
0
 /**
  * {@inheritdoc}
  *
  * Opens a Writer on $this->path in 'w' mode and writes every row
  * returned by generateRawData() to it, one writeRow() call per row.
  */
 public function writerTest()
 {
     $writer = new Writer($this->path, 'w');
     foreach ($this->generateRawData() as $record) {
         $writer->writeRow($record);
     }
 }
Пример #2
0
// Index the second data set by simplified title and by trimmed GUID.
// Records whose trimmed GUID is empty are only indexed by title.
foreach ($cmp2 as $name => $dataset) {
    $title = $dataset['title_simple'];
    if (!isset($cmp2_by_title[$title])) {
        $cmp2_by_title[$title] = [];
    }
    $cmp2_by_title[$title][] = $dataset;
    $guid = trim($dataset['guid']);
    if (!$guid) {
        continue;
    }
    if (!isset($cmp2_by_guid[$guid])) {
        $cmp2_by_guid[$guid] = [];
    }
    $cmp2_by_guid[$guid][] = $dataset;
}
echo 'comparison.csv' . PHP_EOL;
// Remove a report left over from a previous run before creating the writer.
if (is_file($results_dir . '/comparison.csv')) {
    unlink($results_dir . '/comparison.csv');
}
$csv = new Writer($results_dir . '/comparison.csv');
$cmp1_header = "DMS";
$cmp2_header = "NON-DMS";
$csv->writeRow([
    $cmp1_header . ' Title',
    $cmp1_header . ' URL',
    $cmp1_header . ' GUID',
    $cmp1_header . ' Topics',
    $cmp1_header . ' Categories',
    'Matched',
    $cmp2_header . ' Title',
    $cmp2_header . ' URL',
    $cmp2_header . ' GUID',
    'URL Match',
    'GUID Match',
]);
// For every record of the first set: look it up by GUID first, then by
// simplified title; write one row per match, or a single "unmatched" row.
foreach ($cmp1 as $name => $cmp1_dataset) {
    // Columns shared by every row written for this record.
    $cmp1_columns = [
        $cmp1_dataset['title'],
        $cmp1_dataset['url'],
        $cmp1_dataset['guid'],
        $cmp1_dataset['topics'],
        $cmp1_dataset['categories'],
    ];
    if (isset($cmp2_by_guid[$cmp1_dataset['guid']])) {
        foreach ($cmp2_by_guid[$cmp1_dataset['guid']] as $cmp2_dataset) {
            $csv->writeRow(array_merge($cmp1_columns, [
                true,
                $cmp2_dataset['title'],
                $cmp2_dataset['url'],
                $cmp2_dataset['guid'],
                (bool) ($cmp1_dataset['name'] && $cmp1_dataset['name'] == $cmp2_dataset['name']),
                true,
            ]));
        }
    } elseif (isset($cmp2_by_title[$cmp1_dataset['title_simple']])) {
        foreach ($cmp2_by_title[$cmp1_dataset['title_simple']] as $cmp2_dataset) {
            $csv->writeRow(array_merge($cmp1_columns, [
                true,
                $cmp2_dataset['title'],
                $cmp2_dataset['url'],
                $cmp2_dataset['guid'],
                true,
                (bool) ($cmp1_dataset['guid'] && $cmp1_dataset['guid'] == $cmp2_dataset['guid']),
            ]));
        }
    } else {
        $csv->writeRow(array_merge($cmp1_columns, [false, '', '', '', false, false]));
    }
}
Пример #3
0
 /**
  * Compares the datasets of one group on this CKAN instance with production
  * and writes the result to a CSV report.
  *
  * The report is created as "<resultsDir>/<category>_<Ymd-His>.csv". Packages
  * are fetched with the query "((groups:<category>) + dataset_type:dataset)",
  * 20 per request, until tryPackageSearch() returns an empty/falsy result.
  * For every package one row is written: its name, its source type
  * (DMS / GEO / JSON / OTHER), whether production has a package of the same
  * name, whether that production package carries the category, and the
  * production package's source type. Packages classified as OTHER are also
  * echoed to the output for manual inspection.
  *
  * @param             $category              Group name used in the query and looked for in the production groups
  * @param CkanManager $CkanManagerProduction Manager whose tryPackageShow() is used for the production lookup
  */
 public function checkGroupAgainstProd($category, self $CkanManagerProduction)
 {
     $csv = new Writer($this->resultsDir . '/' . $category . date('_Ymd-His') . '.csv');
     $csv->writeRow(['Staging dataset name', 'Staging Source', 'Prod exists', 'Prod has ' . $category, 'Prod Source']);
     $ckan_query = '((groups:' . $category . ') + dataset_type:dataset)';
     $start = 0;
     $per_page = 20;
     while (true) {
         $packages = $this->tryPackageSearch($ckan_query, '', $per_page, $start);
         if (!$packages) {
             echo "{$start} / {$per_page} :: finish" . PHP_EOL;
             break;
         }
         foreach ($packages as $package) {
             // A missing "extras" key is treated like empty extras instead of raising a notice.
             $extras = isset($package['extras']) ? $package['extras'] : null;
             $resource_type = $this->detectSourceTypeByExtras($extras);
             if ('OTHER' === $resource_type) {
                 echo json_encode($extras) . PHP_EOL;
                 echo "UNKNOWN: " . $package['name'] . PHP_EOL;
             }
             $prod_package = $CkanManagerProduction->tryPackageShow($package['name']);
             $exists = $prod_package ? 'EXISTS' : 'NOT FOUND';
             // Both production columns stay empty when the package does not exist there.
             $prod_category_found = '';
             $prod_resource_type = '';
             if ($prod_package) {
                 $prod_category_found = 'FALSE';
                 // Plain substring search of the category in the JSON-encoded groups.
                 // strpos() is used for its truthiness, exactly as before.
                 if (isset($prod_package['groups']) && count($prod_package['groups']) && strpos(json_encode($prod_package['groups']), $category)) {
                     $prod_category_found = 'HAS';
                 }
                 $prod_extras = isset($prod_package['extras']) ? $prod_package['extras'] : null;
                 $prod_resource_type = $this->detectSourceTypeByExtras($prod_extras);
                 if ('OTHER' === $prod_resource_type) {
                     echo json_encode($prod_extras) . PHP_EOL;
                     echo "UNKNOWN on PROD: " . $prod_package['name'] . PHP_EOL;
                 }
             }
             $csv->writeRow([$package['name'], $resource_type, $exists, $prod_category_found, $prod_resource_type]);
         }
         $start += $per_page;
     }
 }

 /**
  * Classifies a package by searching its JSON-encoded extras for known markers.
  *
  * Markers are tested in this order and the first hit wins:
  * '"dms"' => DMS, '"value":"geospatial"' => GEO,
  * 'source_datajson_identifier' => JSON. Anything else, including extras
  * that are not a non-empty array, is OTHER.
  *
  * @param mixed $extras The package's "extras" value
  *
  * @return string One of 'DMS', 'GEO', 'JSON', 'OTHER'
  */
 private function detectSourceTypeByExtras($extras)
 {
     if (!is_array($extras) || !count($extras)) {
         return 'OTHER';
     }
     // Encode once and reuse the string for every marker test.
     $extras_json = (string) json_encode($extras);
     $markers = [
         'DMS'  => '"dms"',
         'GEO'  => '"value":"geospatial"',
         'JSON' => 'source_datajson_identifier',
     ];
     foreach ($markers as $type => $marker) {
         if (false !== strpos($extras_json, $marker)) {
             return $type;
         }
     }

     return 'OTHER';
 }
Пример #4
0
        //        skip headers
        if (in_array(trim(strtolower($row['0'])), ['link', 'dataset', 'url', 'data.gov url'])) {
            continue;
        }
        if ($start > 0) {
            $start--;
            continue;
        }
        //        no anchors please
        list($dataset, ) = explode('#', basename(trim($row['0'])));
        //        echo $dataset.PHP_EOL;
        if (!$dataset) {
            continue;
        }
        //        double trouble check
        if (strpos($row['0'], '://')) {
            if (!strpos($row['0'], '/dataset/')) {
                file_put_contents($results_dir . '/' . $basename . '_export.log.csv', $row['0'] . ',WRONG URL' . PHP_EOL, FILE_APPEND | LOCK_EX);
                continue;
            }
        }
        $lines = $CkanManager->exportPackage($dataset);
        foreach ($lines as $line) {
            $tags_csv->writeRow($line);
        }
    }
}
//$brief = $CkanManager->exportShort('extras_harvest_source_title:Test ISO WAF AND (dataset_type:dataset)');
//$csv->writeFromArray($brief);
// show running time on finish
timer();
foreach ($json_backup_noaa as $name => $dataset) {
    $title = $dataset['title_simple'];
    $json_backup_noaa_by_title[$title] = isset($json_backup_noaa_by_title[$title]) ? $json_backup_noaa_by_title[$title] : [];
    $json_backup_noaa_by_title[$title][] = $dataset;
    $guid = trim($dataset['guid']);
    if ($guid) {
        $json_backup_noaa_by_guid[$guid] = isset($json_backup_noaa_by_guid[$guid]) ? $json_backup_noaa_by_guid[$guid] : [];
        $json_backup_noaa_by_guid[$guid][] = $dataset;
    }
}
echo 'prod_vs_json_backup.csv' . PHP_EOL;
is_file($results_dir . '/prod_vs_json_backup_noaa_geospatial.csv') && unlink($results_dir . '/prod_vs_json_backup_noaa_geospatial.csv');
$csv = new Writer($results_dir . '/prod_vs_json_backup_noaa_geospatial.csv');
$csv->writeRow(['UAT Title', 'UAT URL', 'UAT GUID', 'UAT Topics', 'UAT Categories', 'Matched', 'JSON Title', 'JSON URL', 'JSON GUID', 'URL Match', 'GUID Match']);
$csv_uat_tagging = new Writer($results_dir . '/uat_tagging.csv');
$csv_uat_tagging->writeRow(['url', 'group', 'tags']);
foreach ($uat_noaa as $name => $uat_dataset) {
    if (isset($json_backup_noaa_by_guid[$uat_dataset['guid']])) {
        foreach ($json_backup_noaa_by_guid[$uat_dataset['guid']] as $json_backup_dataset) {
            $csv->writeRow([$uat_dataset['title'], $uat_dataset['url'], $uat_dataset['guid'], $uat_dataset['topics'], $uat_dataset['categories'], true, $json_backup_dataset['title'], $json_backup_dataset['url'], $json_backup_dataset['guid'], (bool) ($uat_dataset['name'] == $json_backup_dataset['name']), true]);
            if (isset($json_backup_tags[$json_backup_dataset['title_simple']])) {
                foreach ($json_backup_tags[$json_backup_dataset['title_simple']] as $group => $tags) {
                    $csv_uat_tagging->writeRow([$uat_dataset['url'], $group, join(';', $tags)]);
                }
            }
        }
        continue;
    }
    if (isset($json_backup_noaa_by_title[$uat_dataset['title_simple']])) {
        foreach ($json_backup_noaa_by_title[$uat_dataset['title_simple']] as $json_backup_dataset) {
            $csv->writeRow([$uat_dataset['title'], $uat_dataset['url'], $uat_dataset['guid'], $uat_dataset['topics'], $uat_dataset['categories'], true, $json_backup_dataset['title'], $json_backup_dataset['url'], $json_backup_dataset['guid'], true, (bool) ($uat_dataset['guid'] == $json_backup_dataset['guid'])]);
// We don't want the header (use curl_getinfo())
curl_setopt($curl_ch, CURLOPT_HEADER, false);
// Track the handle's request string
curl_setopt($curl_ch, CURLINFO_HEADER_OUT, true);
// Attempt to retrieve the modification date of the remote document.
curl_setopt($curl_ch, CURLOPT_FILETIME, true);
// Initialize cURL headers
foreach (glob(CKANMNGR_DATA_DIR . '/redirects_*.csv') as $csv_file) {
    $status = PHP_EOL . PHP_EOL . basename($csv_file) . PHP_EOL . PHP_EOL;
    echo $status;
    $basename = str_replace('.csv', '', basename($csv_file));
    //    fix wrong END-OF-LINE
    file_put_contents($csv_file, preg_replace('/[\\r\\n]+/', "\n", file_get_contents($csv_file)));
    $csv_source = new EasyCSV\Reader($csv_file, 'r+', false);
    $csv_destination = new EasyCSV\Writer($results_dir . '/' . $basename . '_log.csv');
    $csv_destination->writeRow(['from', 'to', 'status', 'real_redirect']);
    $i = 0;
    while (true) {
        if (!($i++ % 100)) {
            echo $i . PHP_EOL;
        }
        $row = $csv_source->getRow();
        if (!$row) {
            break;
        }
        //        skip headers
        if (in_array(trim(strtolower($row[0])), ['socrata code', 'from', 'source url'])) {
            //            $csv_destination->writeRow($row);
            continue;
        }
        $socrata_url = $row[0];
                $d['status'] = 'deleted';
            }
            array_push($statistics, $d);
        }
    }
    $delete_csv = new Writer($results_dir . '/delete_' . $organization . '.csv');
    $delete_csv->writeRow(['url']);
    $delete_csv->writeFromArray($delete);
    $delete_full_csv = new Writer($results_dir . '/' . $organization . '_delete_full.csv');
    $headers = array_keys($delete_full[0]);
    $delete_full_csv->writeRow($headers);
    $delete_full_csv->writeFromArray($delete_full);
    $stats_csv = new Writer($results_dir . '/' . $organization . '_statistics.csv');
    $headers = array_keys($statistics[0]);
    $stats_csv->writeRow($headers);
    $stats_csv->writeFromArray($statistics);
    $survivors_csv = new Writer($results_dir . '/' . $organization . '_survivors.csv');
    $headers = array_keys($survivors[array_keys($survivors)[0]]);
    $survivors_csv->writeRow($headers);
    $survivors_csv->writeFromArray($survivors);
    $stitle = '';
    foreach ($delete_full as $dataset) {
        if ($dataset['title_simple'] !== $stitle) {
            $stitle = $dataset['title_simple'];
            //            echo PHP_EOL;
        }
        //        echo printf('%20s %20s',$dataset['title_simple'],$dataset['name']).PHP_EOL;
    }
}
// show running time on finish
timer();
foreach ($json_backup_epa as $name => $dataset) {
    $title = $dataset['title_simple'];
    $json_backup_epa_by_title[$title] = isset($json_backup_epa_by_title[$title]) ? $json_backup_epa_by_title[$title] : [];
    $json_backup_epa_by_title[$title][] = $dataset;
    $guid = trim($dataset['guid']);
    if ($guid) {
        $json_backup_epa_by_guid[$guid] = isset($json_backup_epa_by_guid[$guid]) ? $json_backup_epa_by_guid[$guid] : [];
        $json_backup_epa_by_guid[$guid][] = $dataset;
    }
}
echo 'prod_vs_json_backup.csv' . PHP_EOL;
is_file($results_dir . '/prod_vs_json_backup_epa_geospatial.csv') && unlink($results_dir . '/prod_vs_json_backup_epa_geospatial.csv');
$csv = new Writer($results_dir . '/prod_vs_json_backup_epa_geospatial.csv');
$csv->writeRow(['Prod Title', 'Prod URL', 'Prod GUID', 'Prod Topics', 'Prod Categories', 'Matched', 'JSON Title', 'JSON URL', 'JSON GUID', 'URL Match', 'Title Match', 'GUID Match']);
$csv_prod_tagging = new Writer($results_dir . '/prod_tagging.csv');
$csv_prod_tagging->writeRow(['url', 'group', 'tags', 'old_url', 'new_title', 'old_title', 'match_by']);
foreach ($prod_epa as $name => $prod_dataset) {
    if (isset($json_backup_epa_by_guid[$prod_dataset['guid']])) {
        foreach ($json_backup_epa_by_guid[$prod_dataset['guid']] as $json_backup_dataset) {
            $csv->writeRow([$prod_dataset['title'], $prod_dataset['url'], $prod_dataset['guid'], $prod_dataset['topics'], $prod_dataset['categories'], true, $json_backup_dataset['title'], $json_backup_dataset['url'], $json_backup_dataset['guid'], (bool) ($prod_dataset['name'] && $prod_dataset['name'] == $json_backup_dataset['name']), (bool) ($prod_dataset['title_simple'] && $prod_dataset['title_simple'] == $json_backup_dataset['title_simple']), true]);
            if (isset($json_backup_tags[$json_backup_dataset['title_simple']])) {
                foreach ($json_backup_tags[$json_backup_dataset['title_simple']] as $group => $tags) {
                    $csv_prod_tagging->writeRow([$prod_dataset['url'], $group, join(';', $tags), $json_backup_dataset['name'], $prod_dataset['title_simple'], $json_backup_dataset['title_simple'], 'guid: ' . $prod_dataset['guid']]);
                }
            }
        }
        continue;
    }
    if (isset($json_backup_epa_by_title[$prod_dataset['title_simple']])) {
        foreach ($json_backup_epa_by_title[$prod_dataset['title_simple']] as $json_backup_dataset) {
            $csv->writeRow([$prod_dataset['title'], $prod_dataset['url'], $prod_dataset['guid'], $prod_dataset['topics'], $prod_dataset['categories'], true, $json_backup_dataset['title'], $json_backup_dataset['url'], $json_backup_dataset['guid'], (bool) ($prod_dataset['name'] && $prod_dataset['name'] == $json_backup_dataset['name']), true, (bool) ($prod_dataset['guid'] == $json_backup_dataset['guid'])]);
    $uat = new Writer($results_dir . '/uat.csv');
    $uat->writeRow(['title', 'title_simple', 'name', 'url', 'topics', 'categories']);
    $UatCkanManager = new CkanManager(CKAN_UAT_API_URL);
    $UatCkanManager->resultsDir = $results_dir;
    $uat_nuclear = $UatCkanManager->exportBrief('extras_harvest_source_title:NRC data.json', '', 'http://uat-catalog-fe-data.reisys.com/dataset/');
    $uat->writeFromArray($uat_nuclear);
} else {
    $uat = new Reader($results_dir . '/uat.csv');
    $uat_nuclear = $uat->getAll();
}
// Group the UAT records by their simplified title.
$uat_nuclear_by_title = [];
foreach ($uat_nuclear as $name => $dataset) {
    $title = $dataset['title_simple'];
    if (!isset($uat_nuclear_by_title[$title])) {
        $uat_nuclear_by_title[$title] = [];
    }
    $uat_nuclear_by_title[$title][] = $dataset;
}
echo 'prod_vs_uat.csv' . PHP_EOL;
// Remove a report left over from a previous run before creating the writer.
if (is_file($results_dir . '/prod_vs_uat_nuclear_geospatial.csv')) {
    unlink($results_dir . '/prod_vs_uat_nuclear_geospatial.csv');
}
$csv = new Writer($results_dir . '/prod_vs_uat_nuclear_geospatial.csv');
$csv->writeRow([
    'Prod Title',
    'Prod URL',
    'Prod Topics',
    'Prod Categories',
    'Matched',
    'UAT Title',
    'UAT URL',
]);
// One row per UAT record sharing the production record's simplified title,
// or a single "unmatched" row when there is none.
foreach ($prod_nuclear as $name => $prod_dataset) {
    $prod_columns = [
        $prod_dataset['title'],
        $prod_dataset['url'],
        $prod_dataset['topics'],
        $prod_dataset['categories'],
    ];
    if (!isset($uat_nuclear_by_title[$prod_dataset['title_simple']])) {
        $csv->writeRow(array_merge($prod_columns, [false, '', '']));
        continue;
    }
    foreach ($uat_nuclear_by_title[$prod_dataset['title_simple']] as $uat_dataset) {
        $csv->writeRow(array_merge($prod_columns, [true, $uat_dataset['title'], $uat_dataset['url']]));
    }
}
// show running time on finish
timer();
mkdir($results_dir);
/**
 * Production
 */
$CkanManager = new CkanManager(CKAN_API_URL);
/**
 * Staging
 */
//$CkanManager = new CkanManager(CKAN_STAGING_API_URL);
foreach (glob(CKANMNGR_DATA_DIR . '/find_*.csv') as $csv_file) {
    $status = PHP_EOL . PHP_EOL . basename($csv_file) . PHP_EOL . PHP_EOL;
    echo $status;
    $basename = str_replace('.csv', '', basename($csv_file));
    $csv_source = new EasyCSV\Reader($csv_file, 'r+', false);
    $csv_destination = new EasyCSV\Writer($results_dir . '/' . $basename . '_results.csv');
    $csv_destination->writeRow(['url', 'exact match', 'title', 'found by title']);
    $i = 0;
    while (true) {
        if (!($i++ % 10)) {
            echo $i . PHP_EOL;
        }
        $row = $csv_source->getRow();
        if (!$row) {
            break;
        }
        //        skip headers
        if (in_array(trim(strtolower($row[0])), ['url', 'from', 'source url'])) {
            continue;
        }
        $title = $row[0];
        /**
}
// Index the production records by simplified title and by trimmed GUID.
// Records whose trimmed GUID is empty are only indexed by title.
$prod_epa_by_guid = [];
$prod_epa_by_title = [];
foreach ($prod_epa as $name => $dataset) {
    $title = $dataset['title_simple'];
    if (!isset($prod_epa_by_title[$title])) {
        $prod_epa_by_title[$title] = [];
    }
    $prod_epa_by_title[$title][] = $dataset;
    $guid = trim($dataset['guid']);
    if (!$guid) {
        continue;
    }
    if (!isset($prod_epa_by_guid[$guid])) {
        $prod_epa_by_guid[$guid] = [];
    }
    $prod_epa_by_guid[$guid][] = $dataset;
}
echo 'json_vs_prod.csv' . PHP_EOL;
// Remove a report left over from a previous run before creating the writer.
if (is_file($results_dir . '/json_vs_prod_epa.csv')) {
    unlink($results_dir . '/json_vs_prod_epa.csv');
}
$csv = new Writer($results_dir . '/json_vs_prod_epa.csv');
$csv->writeRow([
    'Backup Title',
    'Backup URL',
    'Backup GUID',
    'Backup Topics',
    'Backup Categories',
    'Matched',
    'Prod Title',
    'Prod URL',
    'Prod GUID',
    'URL Match',
    'GUID Match',
]);
// For every backup record: look it up by GUID first, then by simplified
// title; write one row per match, or a single "unmatched" row.
foreach ($json_backup_epa as $name => $backup_dataset) {
    // Columns shared by every row written for this backup record.
    $backup_columns = [
        $backup_dataset['title'],
        $backup_dataset['url'],
        $backup_dataset['guid'],
        $backup_dataset['topics'],
        $backup_dataset['categories'],
    ];
    if (isset($prod_epa_by_guid[$backup_dataset['guid']])) {
        foreach ($prod_epa_by_guid[$backup_dataset['guid']] as $prod_dataset) {
            $csv->writeRow(array_merge($backup_columns, [
                true,
                $prod_dataset['title'],
                $prod_dataset['url'],
                $prod_dataset['guid'],
                (bool) ($backup_dataset['name'] == $prod_dataset['name']),
                true,
            ]));
        }
    } elseif (isset($prod_epa_by_title[$backup_dataset['title_simple']])) {
        foreach ($prod_epa_by_title[$backup_dataset['title_simple']] as $prod_dataset) {
            $csv->writeRow(array_merge($backup_columns, [
                true,
                $prod_dataset['title'],
                $prod_dataset['url'],
                $prod_dataset['guid'],
                true,
                (bool) ($backup_dataset['guid'] == $prod_dataset['guid']),
            ]));
        }
    } else {
        $csv->writeRow(array_merge($backup_columns, [false, '', '', '', false, false]));
    }
}
// We don't want the header (use curl_getinfo())
curl_setopt($curl_ch, CURLOPT_HEADER, false);
// Track the handle's request string
curl_setopt($curl_ch, CURLINFO_HEADER_OUT, true);
// Attempt to retrieve the modification date of the remote document.
curl_setopt($curl_ch, CURLOPT_FILETIME, true);
// Initialize cURL headers
foreach (glob(CKANMNGR_DATA_DIR . '/redirects_ckan.csv') as $csv_file) {
    $status = PHP_EOL . PHP_EOL . basename($csv_file) . PHP_EOL . PHP_EOL;
    echo $status;
    $basename = str_replace('.csv', '', basename($csv_file));
    //    fix wrong END-OF-LINE
    file_put_contents($csv_file, preg_replace('/[\\r\\n]+/', "\n", file_get_contents($csv_file)));
    $csv_source = new EasyCSV\Reader($csv_file, 'r+', false);
    $csv_destination = new EasyCSV\Writer($results_dir . '/' . $basename . '_log.csv');
    $csv_destination->writeRow(['url_xyz', 'url', 'url status', 'api_rest_xyz status', 'api_rest_xyz url']);
    $i = 0;
    while (true) {
        if (!($i++ % 10)) {
            echo $i . PHP_EOL;
        }
        $row = $csv_source->getRow();
        if (!$row) {
            break;
        }
        //        skip headers
        if (in_array(trim(strtolower($row[0])), ['socrata code', 'from', 'source url'])) {
            //            $csv_destination->writeRow($row);
            continue;
        }
        $dataset_url_xyz = $row[0];
 //            $categories = explode(';', trim($row['2']));
 //            $categories = array_map('trim', $categories);
 //        }
 //        no anchors please
 $dataset = get_dataset_basename($row['0']);
 if (!$dataset) {
     continue;
 }
 //        echo "\tOriginal: ".$dataset . PHP_EOL;
 //        $CkanManager->assignGroupsAndCategoriesToDatasets(
 //            [$dataset],
 //            trim($row['1']),
 //            $categories,
 //            $basename
 //        );
 $output->writeRow([$dataset, trim($row['1']), $categories]);
 echo join(' , ', [$dataset, trim($row['1']), $categories]) . PHP_EOL;
 if (isset($brothers[$dataset])) {
     foreach ($brothers[$dataset] as $brother) {
         if (!strlen(trim($brother))) {
             continue;
         }
         $brother = get_dataset_basename($brother);
         if (!$brother) {
             continue;
         }
         $output->writeRow([$brother, trim($row['1']), $categories]);
         echo join(' , ', [$brother, trim($row['1']), $categories]) . PHP_EOL;
         //                echo "\tUat (s):" . PHP_EOL;
         //                $CkanManager->assignGroupsAndCategoriesToDatasets(
         //                    [$brother],
Пример #14
0
// We don't want the header (use curl_getinfo())
curl_setopt($curl_ch, CURLOPT_HEADER, false);
// Track the handle's request string
curl_setopt($curl_ch, CURLINFO_HEADER_OUT, true);
// Attempt to retrieve the modification date of the remote document.
curl_setopt($curl_ch, CURLOPT_FILETIME, true);
// Initialize cURL headers
foreach (glob(CKANMNGR_DATA_DIR . '/check_*.csv') as $csv_file) {
    $status = PHP_EOL . PHP_EOL . basename($csv_file) . PHP_EOL . PHP_EOL;
    echo $status;
    $basename = str_replace('.csv', '', basename($csv_file));
    //    fix wrong END-OF-LINE
    file_put_contents($csv_file, preg_replace('/[\\r\\n]+/', "\n", file_get_contents($csv_file)));
    $csv_source = new EasyCSV\Reader($csv_file, 'r+', false);
    $csv_destination = new EasyCSV\Writer($results_dir . '/' . $basename . '_log.csv');
    $csv_destination->writeRow(['dataset', 'status', 'aapi found']);
    $i = 0;
    while (true) {
        if (!($i++ % 100)) {
            echo $i . PHP_EOL;
        }
        $row = $csv_source->getRow();
        if (!$row) {
            break;
        }
        //        skip headers
        if (in_array(trim(strtolower($row[0])), ['data.gov url'])) {
            continue;
        }
        $url = strtolower($row[0]);
        if (!strpos($url, '/dataset/')) {
Пример #15
0
//    'url',
//    'identifier',
//    'org title',
//    'org name',
//    'topics',
//    'categories',
//]);
$CkanManager->resultsDir = $results_dir;
//$brief = $CkanManager->exportShort('extras_license:"https\://creativecommons.org/publicdomain/zero/1.0/" AND (dataset_type:dataset)');
//$brief = $CkanManager->exportShort('','((collection_package_id:* OR *:*) AND license_id:"cc-by-sa" AND license:"https\://creativecommons.org/publicdomain/zero/1.0/") AND (dataset_type:dataset)');
//$brief = $CkanManager->exportShort('%28%28collection_package_id:*%20OR%20*:*%29+AND+license_id:"cc-by-sa"+AND+license:"https://creativecommons.org/publicdomain/zero/1.0/"%29');
//$brief = $CkanManager->exportShort('organization:wake-county AND (dataset_type:dataset)');
//$brief = $CkanManager->exportShort('organization:gsa-gov AND harvest_source_title:Open* AND (dataset_type:dataset)',
//$brief = $CkanManager->exportShort('organization:doe-gov AND (dataset_type:dataset)');
//$brief = $CkanManager->exportShort('organization:dhs-gov AND (harvest_source_title:DHS*) AND (dataset_type:dataset)');
//$brief = $CkanManager->exportShort('organization:epa-gov AND (harvest_source_title:*Gateway) AND (dataset_type:dataset)');
//$brief = $CkanManager->exportShort('organization:epa-gov AND (metadata_type:geospatial) AND (dataset_type:dataset)');
//$brief = $CkanManager->exportShort('organization:nasa-gov AND (harvest_source_title:NASA*) AND (dataset_type:dataset)');
$brief = $CkanManager->exportShort('organization:ntsb-gov AND (dataset_type:dataset)');
//$brief = $CkanManager->exportShort('organization:noaa-gov AND metadata_type:geospatial AND (dataset_type:dataset) AND groups:*');
//$brief = $CkanManager->exportShort('metadata-source:dms AND (dataset_type:dataset)');
//$brief = $CkanManager->exportShort('organization:doj-gov AND (dataset_type:dataset)');
//    'http://uat-catalog-fe-data.reisys.com/dataset/');
//$brief = $CkanManager->exportShort('(extra_harvest_source_title:Open+*) AND (dataset_type:dataset)');
//$brief = $CkanManager->exportShort('organization:gsa-gov AND (dataset_type:dataset)');
//$brief = $CkanManager->exportShort('extras_harvest_source_title:Test ISO WAF AND (dataset_type:dataset)');
// The keys of the first exported record become the CSV header row.
// An empty (or non-array) export has no first record, so indexing into it
// would fail; in that case nothing is written and a short note is printed.
if (is_array($brief) && count($brief)) {
    $headers = array_keys($brief[array_keys($brief)[0]]);
    $csv->writeRow($headers);
    $csv->writeFromArray($brief);
} else {
    echo 'Nothing to export: the query returned no datasets' . PHP_EOL;
}
// show running time on finish
timer();
    $new = new Writer($results_dir . '/new.csv');
    $new->writeRow(['title', 'title_simple', 'name', 'url', 'topics', 'categories']);
    $UatCkanManager = new CkanManager(CKAN_API_URL);
    $UatCkanManager->resultsDir = $results_dir;
    $new_commerce = $UatCkanManager->exportBrief('extras_harvest_source_title:Commerce Non Spatial Data.json Harvest Source');
    $new->writeFromArray($new_commerce);
} else {
    $new = new Reader($results_dir . '/new.csv');
    $new_commerce = $new->getAll();
}
// Group the newly exported records by their simplified title.
$new_commerce_by_title = [];
foreach ($new_commerce as $name => $dataset) {
    $title = $dataset['title_simple'];
    if (!isset($new_commerce_by_title[$title])) {
        $new_commerce_by_title[$title] = [];
    }
    $new_commerce_by_title[$title][] = $dataset;
}
echo 'prod_vs_new.csv' . PHP_EOL;
// Remove a report left over from a previous run before creating the writer.
if (is_file($results_dir . '/prod_vs_prod_commerce.csv')) {
    unlink($results_dir . '/prod_vs_prod_commerce.csv');
}
$csv = new Writer($results_dir . '/prod_vs_prod_commerce.csv');
$csv->writeRow([
    'Prod Title',
    'Prod URL',
    'Prod Topics',
    'Prod Categories',
    'Matched',
    'NEW Title',
    'NEW URL',
    'URL Match',
]);
// One row per new record sharing the production record's simplified title,
// or a single "unmatched" row when there is none.
foreach ($prod_commerce as $name => $prod_dataset) {
    $prod_columns = [
        $prod_dataset['title'],
        $prod_dataset['url'],
        $prod_dataset['topics'],
        $prod_dataset['categories'],
    ];
    if (!isset($new_commerce_by_title[$prod_dataset['title_simple']])) {
        $csv->writeRow(array_merge($prod_columns, [false, '', '', false]));
        continue;
    }
    foreach ($new_commerce_by_title[$prod_dataset['title_simple']] as $new_dataset) {
        $csv->writeRow(array_merge($prod_columns, [true, $new_dataset['title'], $new_dataset['url'], true]));
    }
}
// show running time on finish
timer();
}
// Index the QA records by simplified title and by trimmed GUID.
// Records whose trimmed GUID is empty are only indexed by title.
$qa_epa_by_guid = [];
$qa_epa_by_title = [];
foreach ($qa_epa as $name => $dataset) {
    $title = $dataset['title_simple'];
    if (!isset($qa_epa_by_title[$title])) {
        $qa_epa_by_title[$title] = [];
    }
    $qa_epa_by_title[$title][] = $dataset;
    $guid = trim($dataset['guid']);
    if (!$guid) {
        continue;
    }
    if (!isset($qa_epa_by_guid[$guid])) {
        $qa_epa_by_guid[$guid] = [];
    }
    $qa_epa_by_guid[$guid][] = $dataset;
}
echo 'prod_vs_qa.csv' . PHP_EOL;
// Remove a report left over from a previous run before creating the writer.
if (is_file($results_dir . '/prod_vs_qa_epa.csv')) {
    unlink($results_dir . '/prod_vs_qa_epa.csv');
}
$csv = new Writer($results_dir . '/prod_vs_qa_epa.csv');
$csv->writeRow([
    'Prod Title',
    'Prod URL',
    'Prod GUID',
    'Prod Topics',
    'Prod Categories',
    'Matched',
    'QA Title',
    'QA URL',
    'QA GUID',
    'URL Match',
    'GUID Match',
]);
// For every production record: look it up by GUID first, then by simplified
// title; write one row per match, or a single "unmatched" row.
foreach ($prod_epa as $name => $prod_dataset) {
    // Columns shared by every row written for this production record.
    $prod_columns = [
        $prod_dataset['title'],
        $prod_dataset['url'],
        $prod_dataset['guid'],
        $prod_dataset['topics'],
        $prod_dataset['categories'],
    ];
    if (isset($qa_epa_by_guid[$prod_dataset['guid']])) {
        foreach ($qa_epa_by_guid[$prod_dataset['guid']] as $qa_dataset) {
            $csv->writeRow(array_merge($prod_columns, [
                true,
                $qa_dataset['title'],
                $qa_dataset['url'],
                $qa_dataset['guid'],
                (bool) ($prod_dataset['name'] == $qa_dataset['name']),
                true,
            ]));
        }
    } elseif (isset($qa_epa_by_title[$prod_dataset['title_simple']])) {
        foreach ($qa_epa_by_title[$prod_dataset['title_simple']] as $qa_dataset) {
            $csv->writeRow(array_merge($prod_columns, [
                true,
                $qa_dataset['title'],
                $qa_dataset['url'],
                $qa_dataset['guid'],
                true,
                (bool) ($prod_dataset['guid'] == $qa_dataset['guid']),
            ]));
        }
    } else {
        $csv->writeRow(array_merge($prod_columns, [false, '', '', '', false, false]));
    }
}
}
// Index the UAT records by simplified title and by trimmed GUID.
// Records whose trimmed GUID is empty are only indexed by title.
$uat_pbgc_by_title = $uat_pbgc_by_guid = [];
foreach ($uat_pbgc as $name => $dataset) {
    $title = $dataset['title_simple'];
    $uat_pbgc_by_title[$title] = isset($uat_pbgc_by_title[$title]) ? $uat_pbgc_by_title[$title] : [];
    $uat_pbgc_by_title[$title][] = $dataset;
    $guid = trim($dataset['guid']);
    if ($guid) {
        $uat_pbgc_by_guid[$guid] = isset($uat_pbgc_by_guid[$guid]) ? $uat_pbgc_by_guid[$guid] : [];
        $uat_pbgc_by_guid[$guid][] = $dataset;
    }
}
echo 'prod_vs_uat.csv' . PHP_EOL;
// Remove a report left over from a previous run before creating the writer.
is_file($results_dir . '/prod_vs_uat_pbgc_geospatial.csv') && unlink($results_dir . '/prod_vs_uat_pbgc_geospatial.csv');
$csv = new Writer($results_dir . '/prod_vs_uat_pbgc_geospatial.csv');
// The header has 12 columns; every data row below supplies 12 values.
$csv->writeRow(['Prod Title', 'Prod URL', 'Prod GUID', 'Prod Topics', 'Prod Categories', 'Matched', 'UAT Title', 'UAT URL', 'UAT GUID', 'URL Match', 'Title Match', 'GUID Match']);
// For every production record: look it up by GUID first, then by simplified
// title; write one row per match, or a single "unmatched" row.
foreach ($prod_pbgc as $name => $prod_dataset) {
    // Columns shared by every row written for this production record.
    $prod_columns = [
        $prod_dataset['title'],
        $prod_dataset['url'],
        $prod_dataset['guid'],
        $prod_dataset['topics'],
        $prod_dataset['categories'],
    ];
    if (isset($uat_pbgc_by_guid[$prod_dataset['guid']])) {
        foreach ($uat_pbgc_by_guid[$prod_dataset['guid']] as $uat_dataset) {
            $csv->writeRow(array_merge($prod_columns, [
                true,
                $uat_dataset['title'],
                $uat_dataset['url'],
                $uat_dataset['guid'],
                (bool) ($prod_dataset['name'] && $prod_dataset['name'] == $uat_dataset['name']),
                (bool) ($prod_dataset['title_simple'] && $prod_dataset['title_simple'] == $uat_dataset['title_simple']),
                true,
            ]));
        }
        continue;
    }
    if (isset($uat_pbgc_by_title[$prod_dataset['title_simple']])) {
        foreach ($uat_pbgc_by_title[$prod_dataset['title_simple']] as $uat_dataset) {
            $csv->writeRow(array_merge($prod_columns, [
                true,
                $uat_dataset['title'],
                $uat_dataset['url'],
                $uat_dataset['guid'],
                (bool) ($prod_dataset['name'] && $prod_dataset['name'] == $uat_dataset['name']),
                true,
                (bool) ($prod_dataset['guid'] && $prod_dataset['guid'] == $uat_dataset['guid']),
            ]));
        }
        continue;
    }
    // Unmatched: empty UAT columns and false for URL Match, Title Match and
    // GUID Match. The row previously carried only two trailing flags, leaving
    // it one column short of the header.
    $csv->writeRow(array_merge($prod_columns, [false, '', '', '', false, false, false]));
}
// We don't want the header (use curl_getinfo())
curl_setopt($curl_ch, CURLOPT_HEADER, false);
// Track the handle's request string
curl_setopt($curl_ch, CURLINFO_HEADER_OUT, true);
// Attempt to retrieve the modification date of the remote document.
curl_setopt($curl_ch, CURLOPT_FILETIME, true);
// Initialize cURL headers
foreach (glob(CKANMNGR_DATA_DIR . '/redirects*.csv') as $csv_file) {
    $status = PHP_EOL . PHP_EOL . basename($csv_file) . PHP_EOL . PHP_EOL;
    echo $status;
    $basename = str_replace('.csv', '', basename($csv_file));
    //    fix wrong END-OF-LINE
    file_put_contents($csv_file, preg_replace('/[\\r\\n]+/', "\n", file_get_contents($csv_file)));
    $csv_source = new EasyCSV\Reader($csv_file, 'r+', false);
    $csv_destination = new EasyCSV\Writer($results_dir . '/' . $basename . '_log.csv');
    $csv_destination->writeRow(['from', 'to', 'status']);
    $i = 0;
    while (true) {
        if (!($i++ % 10)) {
            echo $i . PHP_EOL;
        }
        $row = $csv_source->getRow();
        if (!$row) {
            break;
        }
        //        skip headers
        if (in_array(trim(strtolower($row[0])), ['from', 'source url'])) {
            //            $csv_destination->writeRow($row);
            continue;
        }
        $from = $row[0];