/**
 * {@inheritdoc}
 *
 * Opens a Writer on $this->path in 'w' mode and pushes every row yielded by
 * generateRawData() into it, one writeRow() call per row.
 */
public function writerTest()
{
    $writer = new Writer($this->path, 'w');

    $rows = $this->generateRawData();
    foreach ($rows as $record) {
        $writer->writeRow($record);
    }
}
$cmp2 = $cmp2_csv->getAll(); } $cmp2_by_title = $cmp2_by_guid = []; foreach ($cmp2 as $name => $dataset) { $title = $dataset['title_simple']; $cmp2_by_title[$title] = isset($cmp2_by_title[$title]) ? $cmp2_by_title[$title] : []; $cmp2_by_title[$title][] = $dataset; $guid = trim($dataset['guid']); if ($guid) { $cmp2_by_guid[$guid] = isset($cmp2_by_guid[$guid]) ? $cmp2_by_guid[$guid] : []; $cmp2_by_guid[$guid][] = $dataset; } } echo 'comparison.csv' . PHP_EOL; is_file($results_dir . '/comparison.csv') && unlink($results_dir . '/comparison.csv'); $csv = new Writer($results_dir . '/comparison.csv'); $cmp1_header = "DMS"; $cmp2_header = "NON-DMS"; $csv->writeRow([$cmp1_header . ' Title', $cmp1_header . ' URL', $cmp1_header . ' GUID', $cmp1_header . ' Topics', $cmp1_header . ' Categories', 'Matched', $cmp2_header . ' Title', $cmp2_header . ' URL', $cmp2_header . ' GUID', 'URL Match', 'GUID Match']); foreach ($cmp1 as $name => $cmp1_dataset) { if (isset($cmp2_by_guid[$cmp1_dataset['guid']])) { foreach ($cmp2_by_guid[$cmp1_dataset['guid']] as $cmp2_dataset) { $csv->writeRow([$cmp1_dataset['title'], $cmp1_dataset['url'], $cmp1_dataset['guid'], $cmp1_dataset['topics'], $cmp1_dataset['categories'], true, $cmp2_dataset['title'], $cmp2_dataset['url'], $cmp2_dataset['guid'], (bool) ($cmp1_dataset['name'] && $cmp1_dataset['name'] == $cmp2_dataset['name']), true]); } continue; } if (isset($cmp2_by_title[$cmp1_dataset['title_simple']])) { foreach ($cmp2_by_title[$cmp1_dataset['title_simple']] as $cmp2_dataset) { $csv->writeRow([$cmp1_dataset['title'], $cmp1_dataset['url'], $cmp1_dataset['guid'], $cmp1_dataset['topics'], $cmp1_dataset['categories'], true, $cmp2_dataset['title'], $cmp2_dataset['url'], $cmp2_dataset['guid'], true, (bool) ($cmp1_dataset['guid'] && $cmp1_dataset['guid'] == $cmp2_dataset['guid'])]); } continue;
/**
 * Compare the datasets of one group on this instance against another instance
 * and log the outcome to a CSV file.
 *
 * Pages through tryPackageSearch() (20 packages per request) using the query
 * "((groups:<category>) + dataset_type:dataset)", looks every found package up
 * by name via $CkanManagerProduction->tryPackageShow(), and writes one row per
 * package to "<resultsDir>/<category>_<Ymd-His>.csv" with the columns:
 * staging name, staging source, prod exists, prod has category, prod source.
 *
 * The "source" columns are a substring heuristic over the JSON-encoded
 * `extras` of a package: DMS, GEO, JSON or OTHER. Packages that end up as
 * OTHER are echoed to stdout together with their extras.
 *
 * NOTE(review): the category check is a substring match against
 * json_encode($prod_package['groups']); json_encode escapes "/" and non-ASCII
 * characters, so categories containing them may not match - confirm against
 * the callers' inputs.
 *
 * @param string      $category              Value used in the `groups:` search clause, in the CSV file name and in the group substring check
 * @param CkanManager $CkanManagerProduction Manager instance used for the per-package tryPackageShow() lookups
 *
 * @return void
 */
public function checkGroupAgainstProd($category, self $CkanManagerProduction)
{
    // Single implementation of the source-type heuristic, shared by the
    // staging and the production package. The extras are encoded once and
    // read through isset() so a package without an `extras` key is reported
    // as OTHER instead of raising an undefined-index notice.
    $detect_source = static function ($ckan_package, $unknown_label) {
        $extras = isset($ckan_package['extras']) ? $ckan_package['extras'] : null;

        if (is_array($extras) && count($extras)) {
            $extras_json = json_encode($extras);

            // Order matters: the first matching marker wins.
            if (strpos($extras_json, '"dms"') !== false) {
                return 'DMS';
            }
            if (strpos($extras_json, '"value":"geospatial"') !== false) {
                return 'GEO';
            }
            if (strpos($extras_json, 'source_datajson_identifier') !== false) {
                return 'JSON';
            }
        }

        // Unclassified: dump the extras and the package name for manual review.
        echo json_encode($extras) . PHP_EOL;
        echo $unknown_label . $ckan_package['name'] . PHP_EOL;

        return 'OTHER';
    };

    $csv = new Writer($this->resultsDir . '/' . $category . date('_Ymd-His') . '.csv');
    $csv->writeRow(['Staging dataset name', 'Staging Source', 'Prod exists', 'Prod has ' . $category, 'Prod Source']);

    $ckan_query = '((groups:' . $category . ') + dataset_type:dataset)';
    $start = 0;
    $per_page = 20;

    while (true) {
        $packages = $this->tryPackageSearch($ckan_query, '', $per_page, $start);
        if (!$packages) {
            // An empty/falsy page ends the pagination.
            echo "{$start} / {$per_page} :: finish" . PHP_EOL;
            break;
        }

        foreach ($packages as $package) {
            $resource_type = $detect_source($package, 'UNKNOWN: ');

            $prod_package = $CkanManagerProduction->tryPackageShow($package['name']);
            $exists = $prod_package ? 'EXISTS' : 'NOT FOUND';

            // Both prod columns stay empty when the package was not found.
            $prod_category_found = '';
            $prod_resource_type = '';

            if ($prod_package) {
                $has_category = !empty($prod_package['groups'])
                    && strpos(json_encode($prod_package['groups']), $category) !== false;

                $prod_category_found = $has_category ? 'HAS' : 'FALSE';
                $prod_resource_type = $detect_source($prod_package, 'UNKNOWN on PROD: ');
            }

            $csv->writeRow([$package['name'], $resource_type, $exists, $prod_category_found, $prod_resource_type]);
        }

        $start += $per_page;
    }
}
namespace CKAN\Manager; use EasyCSV\Reader; use EasyCSV\Writer; require_once dirname(__DIR__) . '/inc/common.php'; /** * Create results dir for logs and json results */ $results_dir = CKANMNGR_RESULTS_DIR . date('/Ymd-His') . '_EXPORT_SHORT'; mkdir($results_dir); $start = isset($argv[1]) ? trim($argv[1]) : 0; $CkanManager = new CkanManager(CKAN_API_URL, CKAN_API_KEY); //$CkanManager = new CkanManager(INVENTORY_CKAN_PROD_API_URL, INVENTORY_CKAN_PROD_API_KEY); //$CkanManager = new CkanManager(CKAN_STAGING_API_URL); $tags_csv = new Writer($results_dir . '/assign_tags.csv'); $CkanManager->resultsDir = $results_dir; foreach (glob(CKANMNGR_DATA_DIR . '/export_*.csv') as $csv_file) { $status = PHP_EOL . PHP_EOL . basename($csv_file) . PHP_EOL . PHP_EOL; echo $status; $basename = str_replace('.csv', '', basename($csv_file)); // fix wrong END-OF-LINE file_put_contents($csv_file, preg_replace('/[\\r\\n]+/', "\n", file_get_contents($csv_file))); $csv = new Reader($csv_file, 'r+', false); while (true) { $row = $csv->getRow(); if (!$row) { break; } // skip headers if (in_array(trim(strtolower($row['0'])), ['link', 'dataset', 'url', 'data.gov url'])) {
$json_backup_noaa_by_title = $json_backup_noaa_by_guid = []; foreach ($json_backup_noaa as $name => $dataset) { $title = $dataset['title_simple']; $json_backup_noaa_by_title[$title] = isset($json_backup_noaa_by_title[$title]) ? $json_backup_noaa_by_title[$title] : []; $json_backup_noaa_by_title[$title][] = $dataset; $guid = trim($dataset['guid']); if ($guid) { $json_backup_noaa_by_guid[$guid] = isset($json_backup_noaa_by_guid[$guid]) ? $json_backup_noaa_by_guid[$guid] : []; $json_backup_noaa_by_guid[$guid][] = $dataset; } } echo 'prod_vs_json_backup.csv' . PHP_EOL; is_file($results_dir . '/prod_vs_json_backup_noaa_geospatial.csv') && unlink($results_dir . '/prod_vs_json_backup_noaa_geospatial.csv'); $csv = new Writer($results_dir . '/prod_vs_json_backup_noaa_geospatial.csv'); $csv->writeRow(['UAT Title', 'UAT URL', 'UAT GUID', 'UAT Topics', 'UAT Categories', 'Matched', 'JSON Title', 'JSON URL', 'JSON GUID', 'URL Match', 'GUID Match']); $csv_uat_tagging = new Writer($results_dir . '/uat_tagging.csv'); $csv_uat_tagging->writeRow(['url', 'group', 'tags']); foreach ($uat_noaa as $name => $uat_dataset) { if (isset($json_backup_noaa_by_guid[$uat_dataset['guid']])) { foreach ($json_backup_noaa_by_guid[$uat_dataset['guid']] as $json_backup_dataset) { $csv->writeRow([$uat_dataset['title'], $uat_dataset['url'], $uat_dataset['guid'], $uat_dataset['topics'], $uat_dataset['categories'], true, $json_backup_dataset['title'], $json_backup_dataset['url'], $json_backup_dataset['guid'], (bool) ($uat_dataset['name'] == $json_backup_dataset['name']), true]); if (isset($json_backup_tags[$json_backup_dataset['title_simple']])) { foreach ($json_backup_tags[$json_backup_dataset['title_simple']] as $group => $tags) { $csv_uat_tagging->writeRow([$uat_dataset['url'], $group, join(';', $tags)]); } } } continue; } if (isset($json_backup_noaa_by_title[$uat_dataset['title_simple']])) { foreach ($json_backup_noaa_by_title[$uat_dataset['title_simple']] as $json_backup_dataset) {
curl_setopt($curl_ch, CURLOPT_TIMEOUT, 60 * 5); // We don't want the header (use curl_getinfo()) curl_setopt($curl_ch, CURLOPT_HEADER, false); // Track the handle's request string curl_setopt($curl_ch, CURLINFO_HEADER_OUT, true); // Attempt to retrieve the modification date of the remote document. curl_setopt($curl_ch, CURLOPT_FILETIME, true); // Initialize cURL headers foreach (glob(CKANMNGR_DATA_DIR . '/redirects_*.csv') as $csv_file) { $status = PHP_EOL . PHP_EOL . basename($csv_file) . PHP_EOL . PHP_EOL; echo $status; $basename = str_replace('.csv', '', basename($csv_file)); // fix wrong END-OF-LINE file_put_contents($csv_file, preg_replace('/[\\r\\n]+/', "\n", file_get_contents($csv_file))); $csv_source = new EasyCSV\Reader($csv_file, 'r+', false); $csv_destination = new EasyCSV\Writer($results_dir . '/' . $basename . '_log.csv'); $csv_destination->writeRow(['from', 'to', 'status', 'real_redirect']); $i = 0; while (true) { if (!($i++ % 100)) { echo $i . PHP_EOL; } $row = $csv_source->getRow(); if (!$row) { break; } // skip headers if (in_array(trim(strtolower($row[0])), ['socrata code', 'from', 'source url'])) { // $csv_destination->writeRow($row); continue; }
$d['status'] = 'deleted'; } array_push($statistics, $d); } } $delete_csv = new Writer($results_dir . '/delete_' . $organization . '.csv'); $delete_csv->writeRow(['url']); $delete_csv->writeFromArray($delete); $delete_full_csv = new Writer($results_dir . '/' . $organization . '_delete_full.csv'); $headers = array_keys($delete_full[0]); $delete_full_csv->writeRow($headers); $delete_full_csv->writeFromArray($delete_full); $stats_csv = new Writer($results_dir . '/' . $organization . '_statistics.csv'); $headers = array_keys($statistics[0]); $stats_csv->writeRow($headers); $stats_csv->writeFromArray($statistics); $survivors_csv = new Writer($results_dir . '/' . $organization . '_survivors.csv'); $headers = array_keys($survivors[array_keys($survivors)[0]]); $survivors_csv->writeRow($headers); $survivors_csv->writeFromArray($survivors); $stitle = ''; foreach ($delete_full as $dataset) { if ($dataset['title_simple'] !== $stitle) { $stitle = $dataset['title_simple']; // echo PHP_EOL; } // echo printf('%20s %20s',$dataset['title_simple'],$dataset['name']).PHP_EOL; } } // show running time on finish timer();
$json_backup_epa_by_title = $json_backup_epa_by_guid = []; foreach ($json_backup_epa as $name => $dataset) { $title = $dataset['title_simple']; $json_backup_epa_by_title[$title] = isset($json_backup_epa_by_title[$title]) ? $json_backup_epa_by_title[$title] : []; $json_backup_epa_by_title[$title][] = $dataset; $guid = trim($dataset['guid']); if ($guid) { $json_backup_epa_by_guid[$guid] = isset($json_backup_epa_by_guid[$guid]) ? $json_backup_epa_by_guid[$guid] : []; $json_backup_epa_by_guid[$guid][] = $dataset; } } echo 'prod_vs_json_backup.csv' . PHP_EOL; is_file($results_dir . '/prod_vs_json_backup_epa_geospatial.csv') && unlink($results_dir . '/prod_vs_json_backup_epa_geospatial.csv'); $csv = new Writer($results_dir . '/prod_vs_json_backup_epa_geospatial.csv'); $csv->writeRow(['Prod Title', 'Prod URL', 'Prod GUID', 'Prod Topics', 'Prod Categories', 'Matched', 'JSON Title', 'JSON URL', 'JSON GUID', 'URL Match', 'Title Match', 'GUID Match']); $csv_prod_tagging = new Writer($results_dir . 
'/prod_tagging.csv'); $csv_prod_tagging->writeRow(['url', 'group', 'tags', 'old_url', 'new_title', 'old_title', 'match_by']); foreach ($prod_epa as $name => $prod_dataset) { if (isset($json_backup_epa_by_guid[$prod_dataset['guid']])) { foreach ($json_backup_epa_by_guid[$prod_dataset['guid']] as $json_backup_dataset) { $csv->writeRow([$prod_dataset['title'], $prod_dataset['url'], $prod_dataset['guid'], $prod_dataset['topics'], $prod_dataset['categories'], true, $json_backup_dataset['title'], $json_backup_dataset['url'], $json_backup_dataset['guid'], (bool) ($prod_dataset['name'] && $prod_dataset['name'] == $json_backup_dataset['name']), (bool) ($prod_dataset['title_simple'] && $prod_dataset['title_simple'] == $json_backup_dataset['title_simple']), true]); if (isset($json_backup_tags[$json_backup_dataset['title_simple']])) { foreach ($json_backup_tags[$json_backup_dataset['title_simple']] as $group => $tags) { $csv_prod_tagging->writeRow([$prod_dataset['url'], $group, join(';', $tags), $json_backup_dataset['name'], $prod_dataset['title_simple'], $json_backup_dataset['title_simple'], 'guid: ' . $prod_dataset['guid']]); } } } continue; } if (isset($json_backup_epa_by_title[$prod_dataset['title_simple']])) { foreach ($json_backup_epa_by_title[$prod_dataset['title_simple']] as $json_backup_dataset) {
$uat = new Writer($results_dir . '/uat.csv'); $uat->writeRow(['title', 'title_simple', 'name', 'url', 'topics', 'categories']); $UatCkanManager = new CkanManager(CKAN_UAT_API_URL); $UatCkanManager->resultsDir = $results_dir; $uat_nuclear = $UatCkanManager->exportBrief('extras_harvest_source_title:NRC data.json', '', 'http://uat-catalog-fe-data.reisys.com/dataset/'); $uat->writeFromArray($uat_nuclear); } else { $uat = new Reader($results_dir . '/uat.csv'); $uat_nuclear = $uat->getAll(); } $uat_nuclear_by_title = []; foreach ($uat_nuclear as $name => $dataset) { $title = $dataset['title_simple']; $uat_nuclear_by_title[$title] = isset($uat_nuclear_by_title[$title]) ? $uat_nuclear_by_title[$title] : []; $uat_nuclear_by_title[$title][] = $dataset; } echo 'prod_vs_uat.csv' . PHP_EOL; is_file($results_dir . '/prod_vs_uat_nuclear_geospatial.csv') && unlink($results_dir . '/prod_vs_uat_nuclear_geospatial.csv'); $csv = new Writer($results_dir . '/prod_vs_uat_nuclear_geospatial.csv'); $csv->writeRow(['Prod Title', 'Prod URL', 'Prod Topics', 'Prod Categories', 'Matched', 'UAT Title', 'UAT URL']); foreach ($prod_nuclear as $name => $prod_dataset) { if (isset($uat_nuclear_by_title[$prod_dataset['title_simple']])) { foreach ($uat_nuclear_by_title[$prod_dataset['title_simple']] as $uat_dataset) { $csv->writeRow([$prod_dataset['title'], $prod_dataset['url'], $prod_dataset['topics'], $prod_dataset['categories'], true, $uat_dataset['title'], $uat_dataset['url']]); } continue; } $csv->writeRow([$prod_dataset['title'], $prod_dataset['url'], $prod_dataset['topics'], $prod_dataset['categories'], false, '', '']); } // show running time on finish timer();
$results_dir = CKANMNGR_RESULTS_DIR . date('/Ymd-His') . '_SEARCH_TITLES'; mkdir($results_dir); /** * Production */ $CkanManager = new CkanManager(CKAN_API_URL); /** * Staging */ //$CkanManager = new CkanManager(CKAN_STAGING_API_URL); foreach (glob(CKANMNGR_DATA_DIR . '/find_*.csv') as $csv_file) { $status = PHP_EOL . PHP_EOL . basename($csv_file) . PHP_EOL . PHP_EOL; echo $status; $basename = str_replace('.csv', '', basename($csv_file)); $csv_source = new EasyCSV\Reader($csv_file, 'r+', false); $csv_destination = new EasyCSV\Writer($results_dir . '/' . $basename . '_results.csv'); $csv_destination->writeRow(['url', 'exact match', 'title', 'found by title']); $i = 0; while (true) { if (!($i++ % 10)) { echo $i . PHP_EOL; } $row = $csv_source->getRow(); if (!$row) { break; } // skip headers if (in_array(trim(strtolower($row[0])), ['url', 'from', 'source url'])) { continue; } $title = $row[0];
$prod_epa = $prod->getAll(); } $prod_epa_by_title = $prod_epa_by_guid = []; foreach ($prod_epa as $name => $dataset) { $title = $dataset['title_simple']; $prod_epa_by_title[$title] = isset($prod_epa_by_title[$title]) ? $prod_epa_by_title[$title] : []; $prod_epa_by_title[$title][] = $dataset; $guid = trim($dataset['guid']); if ($guid) { $prod_epa_by_guid[$guid] = isset($prod_epa_by_guid[$guid]) ? $prod_epa_by_guid[$guid] : []; $prod_epa_by_guid[$guid][] = $dataset; } } echo 'json_vs_prod.csv' . PHP_EOL; is_file($results_dir . '/json_vs_prod_epa.csv') && unlink($results_dir . '/json_vs_prod_epa.csv'); $csv = new Writer($results_dir . '/json_vs_prod_epa.csv'); $csv->writeRow(['Backup Title', 'Backup URL', 'Backup GUID', 'Backup Topics', 'Backup Categories', 'Matched', 'Prod Title', 'Prod URL', 'Prod GUID', 'URL Match', 'GUID Match']); foreach ($json_backup_epa as $name => $backup_dataset) { if (isset($prod_epa_by_guid[$backup_dataset['guid']])) { foreach ($prod_epa_by_guid[$backup_dataset['guid']] as $prod_dataset) { $csv->writeRow([$backup_dataset['title'], $backup_dataset['url'], $backup_dataset['guid'], $backup_dataset['topics'], $backup_dataset['categories'], true, $prod_dataset['title'], $prod_dataset['url'], $prod_dataset['guid'], (bool) ($backup_dataset['name'] == $prod_dataset['name']), true]); } continue; } if (isset($prod_epa_by_title[$backup_dataset['title_simple']])) { foreach ($prod_epa_by_title[$backup_dataset['title_simple']] as $prod_dataset) { $csv->writeRow([$backup_dataset['title'], $backup_dataset['url'], $backup_dataset['guid'], $backup_dataset['topics'], $backup_dataset['categories'], true, $prod_dataset['title'], $prod_dataset['url'], $prod_dataset['guid'], true, (bool) ($backup_dataset['guid'] == $prod_dataset['guid'])]); } continue; } $csv->writeRow([$backup_dataset['title'], $backup_dataset['url'], $backup_dataset['guid'], $backup_dataset['topics'], $backup_dataset['categories'], false, '', '', '', false, false]);
curl_setopt($curl_ch, CURLOPT_TIMEOUT, 60 * 5); // We don't want the header (use curl_getinfo()) curl_setopt($curl_ch, CURLOPT_HEADER, false); // Track the handle's request string curl_setopt($curl_ch, CURLINFO_HEADER_OUT, true); // Attempt to retrieve the modification date of the remote document. curl_setopt($curl_ch, CURLOPT_FILETIME, true); // Initialize cURL headers foreach (glob(CKANMNGR_DATA_DIR . '/redirects_ckan.csv') as $csv_file) { $status = PHP_EOL . PHP_EOL . basename($csv_file) . PHP_EOL . PHP_EOL; echo $status; $basename = str_replace('.csv', '', basename($csv_file)); // fix wrong END-OF-LINE file_put_contents($csv_file, preg_replace('/[\\r\\n]+/', "\n", file_get_contents($csv_file))); $csv_source = new EasyCSV\Reader($csv_file, 'r+', false); $csv_destination = new EasyCSV\Writer($results_dir . '/' . $basename . '_log.csv'); $csv_destination->writeRow(['url_xyz', 'url', 'url status', 'api_rest_xyz status', 'api_rest_xyz url']); $i = 0; while (true) { if (!($i++ % 10)) { echo $i . PHP_EOL; } $row = $csv_source->getRow(); if (!$row) { break; } // skip headers if (in_array(trim(strtolower($row[0])), ['socrata code', 'from', 'source url'])) { // $csv_destination->writeRow($row); continue; }
/** * Sample csv * dataset,group,categories * https://catalog.data.gov/dataset/food-access-research-atlas,Agriculture,"Natural Resources and Environment" * download-crossing-inventory-data-highway-rail-crossing,Agriculture, "Natural Resources and Environment;Plants and Plant Systems Agriculture" */ $CkanManager->resultsDir = $results_dir; foreach (glob(CKANMNGR_DATA_DIR . '/assign*.csv') as $csv_file) { $csv_source = PHP_EOL . PHP_EOL . basename($csv_file) . PHP_EOL . PHP_EOL; echo $CkanManager->color->green($csv_source); $basename = str_replace('.csv', '', basename($csv_file)); // fix wrong END-OF-LINE file_put_contents($csv_file, preg_replace('/[\\r\\n]+/', "\n", file_get_contents($csv_file))); // file_put_contents($resultsDir . '/' . $basename . '_tags.log', $status, FILE_APPEND | LOCK_EX); $csv = new EasyCSV\Reader($csv_file, 'r+', false); $output = new EasyCSV\Writer($results_dir . '/' . $basename . '_clones.csv'); while (true) { $row = $csv->getRow(); if (!$row) { break; } // skip headers if (in_array(trim(strtolower($row['0'])), ['link', 'dataset', 'url', 'data.gov url'])) { continue; } if ($start > 0) { $start--; continue; } // format group tags $categories = isset($row['2']) ? trim($row['2']) : '';
curl_setopt($curl_ch, CURLOPT_TIMEOUT, 60 * 5); // We don't want the header (use curl_getinfo()) curl_setopt($curl_ch, CURLOPT_HEADER, false); // Track the handle's request string curl_setopt($curl_ch, CURLINFO_HEADER_OUT, true); // Attempt to retrieve the modification date of the remote document. curl_setopt($curl_ch, CURLOPT_FILETIME, true); // Initialize cURL headers foreach (glob(CKANMNGR_DATA_DIR . '/check_*.csv') as $csv_file) { $status = PHP_EOL . PHP_EOL . basename($csv_file) . PHP_EOL . PHP_EOL; echo $status; $basename = str_replace('.csv', '', basename($csv_file)); // fix wrong END-OF-LINE file_put_contents($csv_file, preg_replace('/[\\r\\n]+/', "\n", file_get_contents($csv_file))); $csv_source = new EasyCSV\Reader($csv_file, 'r+', false); $csv_destination = new EasyCSV\Writer($results_dir . '/' . $basename . '_log.csv'); $csv_destination->writeRow(['dataset', 'status', 'aapi found']); $i = 0; while (true) { if (!($i++ % 100)) { echo $i . PHP_EOL; } $row = $csv_source->getRow(); if (!$row) { break; } // skip headers if (in_array(trim(strtolower($row[0])), ['data.gov url'])) { continue; } $url = strtolower($row[0]);
namespace CKAN\Manager;

use EasyCSV\Writer;

require_once dirname(__DIR__) . '/inc/common.php';

/**
 * Create results dir for logs and json results.
 *
 * The directory name carries a second-resolution timestamp plus the
 * "_EXPORT_SHORT" suffix. Creation is verified here so that a failure is
 * reported at its cause instead of surfacing later when the CSV writer tries
 * to open a file inside a directory that does not exist.
 */
$results_dir = CKANMNGR_RESULTS_DIR . date('/Ymd-His') . '_EXPORT_SHORT';
if (!is_dir($results_dir) && !mkdir($results_dir) && !is_dir($results_dir)) {
    throw new \RuntimeException('Unable to create results directory: ' . $results_dir);
}

// CKAN instance the export runs against; the commented lines are the
// alternative endpoints kept for manual switching.
$CkanManager = new CkanManager(CKAN_API_URL);
//$CkanManager = new CkanManager(INVENTORY_CKAN_PROD_API_URL);
//$CkanManager = new CkanManager(INVENTORY_CKAN_PROD_API_URL, INVENTORY_CKAN_PROD_API_KEY);
//$CkanManager = new CkanManager(CKAN_STAGING_API_URL);
//$CkanManager = new CkanManager(CKAN_UAT_API_URL);

// Export target: "<results dir>/export.<Y-m-d>.csv". The header row below is
// currently disabled.
$csv = new Writer($results_dir . '/export.' . date('Y-m-d') . '.csv');
//$csv->writeRow([
//    'ckan id',
//    'title',
//    'name',
//    'url',
//    'identifier',
//    'org title',
//    'org name',
//    'topics',
//    'categories',
//]);

// Hand the results directory to the manager as well.
$CkanManager->resultsDir = $results_dir;

// Previously used export queries, kept for reference.
//$brief = $CkanManager->exportShort('extras_license:"https\://creativecommons.org/publicdomain/zero/1.0/" AND (dataset_type:dataset)');
//$brief = $CkanManager->exportShort('','((collection_package_id:* OR *:*) AND license_id:"cc-by-sa" AND license:"https\://creativecommons.org/publicdomain/zero/1.0/") AND (dataset_type:dataset)');
//$brief = $CkanManager->exportShort('%28%28collection_package_id:*%20OR%20*:*%29+AND+license_id:"cc-by-sa"+AND+license:"https://creativecommons.org/publicdomain/zero/1.0/"%29');
$new = new Writer($results_dir . '/new.csv'); $new->writeRow(['title', 'title_simple', 'name', 'url', 'topics', 'categories']); $UatCkanManager = new CkanManager(CKAN_API_URL); $UatCkanManager->resultsDir = $results_dir; $new_commerce = $UatCkanManager->exportBrief('extras_harvest_source_title:Commerce Non Spatial Data.json Harvest Source'); $new->writeFromArray($new_commerce); } else { $new = new Reader($results_dir . '/new.csv'); $new_commerce = $new->getAll(); } $new_commerce_by_title = []; foreach ($new_commerce as $name => $dataset) { $title = $dataset['title_simple']; $new_commerce_by_title[$title] = isset($new_commerce_by_title[$title]) ? $new_commerce_by_title[$title] : []; $new_commerce_by_title[$title][] = $dataset; } echo 'prod_vs_new.csv' . PHP_EOL; is_file($results_dir . '/prod_vs_prod_commerce.csv') && unlink($results_dir . '/prod_vs_prod_commerce.csv'); $csv = new Writer($results_dir . '/prod_vs_prod_commerce.csv'); $csv->writeRow(['Prod Title', 'Prod URL', 'Prod Topics', 'Prod Categories', 'Matched', 'NEW Title', 'NEW URL', 'URL Match']); foreach ($prod_commerce as $name => $prod_dataset) { if (isset($new_commerce_by_title[$prod_dataset['title_simple']])) { foreach ($new_commerce_by_title[$prod_dataset['title_simple']] as $new_dataset) { $csv->writeRow([$prod_dataset['title'], $prod_dataset['url'], $prod_dataset['topics'], $prod_dataset['categories'], true, $new_dataset['title'], $new_dataset['url'], true]); } continue; } $csv->writeRow([$prod_dataset['title'], $prod_dataset['url'], $prod_dataset['topics'], $prod_dataset['categories'], false, '', '', false]); } // show running time on finish timer();
$qa_epa = $qa->getAll(); } $qa_epa_by_title = $qa_epa_by_guid = []; foreach ($qa_epa as $name => $dataset) { $title = $dataset['title_simple']; $qa_epa_by_title[$title] = isset($qa_epa_by_title[$title]) ? $qa_epa_by_title[$title] : []; $qa_epa_by_title[$title][] = $dataset; $guid = trim($dataset['guid']); if ($guid) { $qa_epa_by_guid[$guid] = isset($qa_epa_by_guid[$guid]) ? $qa_epa_by_guid[$guid] : []; $qa_epa_by_guid[$guid][] = $dataset; } } echo 'prod_vs_qa.csv' . PHP_EOL; is_file($results_dir . '/prod_vs_qa_epa.csv') && unlink($results_dir . '/prod_vs_qa_epa.csv'); $csv = new Writer($results_dir . '/prod_vs_qa_epa.csv'); $csv->writeRow(['Prod Title', 'Prod URL', 'Prod GUID', 'Prod Topics', 'Prod Categories', 'Matched', 'QA Title', 'QA URL', 'QA GUID', 'URL Match', 'GUID Match']); foreach ($prod_epa as $name => $prod_dataset) { if (isset($qa_epa_by_guid[$prod_dataset['guid']])) { foreach ($qa_epa_by_guid[$prod_dataset['guid']] as $qa_dataset) { $csv->writeRow([$prod_dataset['title'], $prod_dataset['url'], $prod_dataset['guid'], $prod_dataset['topics'], $prod_dataset['categories'], true, $qa_dataset['title'], $qa_dataset['url'], $qa_dataset['guid'], (bool) ($prod_dataset['name'] == $qa_dataset['name']), true]); } continue; } if (isset($qa_epa_by_title[$prod_dataset['title_simple']])) { foreach ($qa_epa_by_title[$prod_dataset['title_simple']] as $qa_dataset) { $csv->writeRow([$prod_dataset['title'], $prod_dataset['url'], $prod_dataset['guid'], $prod_dataset['topics'], $prod_dataset['categories'], true, $qa_dataset['title'], $qa_dataset['url'], $qa_dataset['guid'], true, (bool) ($prod_dataset['guid'] == $qa_dataset['guid'])]); } continue; } $csv->writeRow([$prod_dataset['title'], $prod_dataset['url'], $prod_dataset['guid'], $prod_dataset['topics'], $prod_dataset['categories'], false, '', '', '', false, false]);
echo PHP_EOL . 'datasets from uat: ' . sizeof($uat_pbgc) . PHP_EOL . PHP_EOL; } $uat_pbgc_by_title = $uat_pbgc_by_guid = []; foreach ($uat_pbgc as $name => $dataset) { $title = $dataset['title_simple']; $uat_pbgc_by_title[$title] = isset($uat_pbgc_by_title[$title]) ? $uat_pbgc_by_title[$title] : []; $uat_pbgc_by_title[$title][] = $dataset; $guid = trim($dataset['guid']); if ($guid) { $uat_pbgc_by_guid[$guid] = isset($uat_pbgc_by_guid[$guid]) ? $uat_pbgc_by_guid[$guid] : []; $uat_pbgc_by_guid[$guid][] = $dataset; } } echo 'prod_vs_uat.csv' . PHP_EOL; is_file($results_dir . '/prod_vs_uat_pbgc_geospatial.csv') && unlink($results_dir . '/prod_vs_uat_pbgc_geospatial.csv'); $csv = new Writer($results_dir . '/prod_vs_uat_pbgc_geospatial.csv'); $csv->writeRow(['Prod Title', 'Prod URL', 'Prod GUID', 'Prod Topics', 'Prod Categories', 'Matched', 'UAT Title', 'UAT URL', 'UAT GUID', 'URL Match', 'Title Match', 'GUID Match']); foreach ($prod_pbgc as $name => $prod_dataset) { if (isset($uat_pbgc_by_guid[$prod_dataset['guid']])) { foreach ($uat_pbgc_by_guid[$prod_dataset['guid']] as $uat_dataset) { $csv->writeRow([$prod_dataset['title'], $prod_dataset['url'], $prod_dataset['guid'], $prod_dataset['topics'], $prod_dataset['categories'], true, $uat_dataset['title'], $uat_dataset['url'], $uat_dataset['guid'], (bool) ($prod_dataset['name'] && $prod_dataset['name'] == $uat_dataset['name']), (bool) ($prod_dataset['title_simple'] && $prod_dataset['title_simple'] == $uat_dataset['title_simple']), true]); } continue; } if (isset($uat_pbgc_by_title[$prod_dataset['title_simple']])) { foreach ($uat_pbgc_by_title[$prod_dataset['title_simple']] as $uat_dataset) { $csv->writeRow([$prod_dataset['title'], $prod_dataset['url'], $prod_dataset['guid'], $prod_dataset['topics'], $prod_dataset['categories'], true, $uat_dataset['title'], $uat_dataset['url'], $uat_dataset['guid'], (bool) ($prod_dataset['name'] && $prod_dataset['name'] == $uat_dataset['name']), true, (bool) ($prod_dataset['guid'] && 
$prod_dataset['guid'] == $uat_dataset['guid'])]); } continue; } $csv->writeRow([$prod_dataset['title'], $prod_dataset['url'], $prod_dataset['guid'], $prod_dataset['topics'], $prod_dataset['categories'], false, '', '', '', false, false]);
curl_setopt($curl_ch, CURLOPT_TIMEOUT, 60 * 5); // We don't want the header (use curl_getinfo()) curl_setopt($curl_ch, CURLOPT_HEADER, false); // Track the handle's request string curl_setopt($curl_ch, CURLINFO_HEADER_OUT, true); // Attempt to retrieve the modification date of the remote document. curl_setopt($curl_ch, CURLOPT_FILETIME, true); // Initialize cURL headers foreach (glob(CKANMNGR_DATA_DIR . '/redirects*.csv') as $csv_file) { $status = PHP_EOL . PHP_EOL . basename($csv_file) . PHP_EOL . PHP_EOL; echo $status; $basename = str_replace('.csv', '', basename($csv_file)); // fix wrong END-OF-LINE file_put_contents($csv_file, preg_replace('/[\\r\\n]+/', "\n", file_get_contents($csv_file))); $csv_source = new EasyCSV\Reader($csv_file, 'r+', false); $csv_destination = new EasyCSV\Writer($results_dir . '/' . $basename . '_log.csv'); $csv_destination->writeRow(['from', 'to', 'status']); $i = 0; while (true) { if (!($i++ % 10)) { echo $i . PHP_EOL; } $row = $csv_source->getRow(); if (!$row) { break; } // skip headers if (in_array(trim(strtolower($row[0])), ['from', 'source url'])) { // $csv_destination->writeRow($row); continue; }