/**
 * Initialization: read each project's datapackage.json and load its CSV
 * resources into a SQL+JSON database.
 *
 * @param array  &$items    map project-name => list of datasets; each dataset is an
 *                          array whose first element is the INSERT SQL and whose
 *                          remaining elements are CSV names, optionally suffixed
 *                          with '::bind' and/or '::strict'.
 * @param array  &$projects map project-name => project folder path.
 * @param PDO    &$db       open database handle (must support prepare/execute).
 * @param string $SEP       CSV field separator.
 * @param int    $nmax      maximum CSV lines to read per resource (0 = no limit).
 * @param int    $verbose   >= 2 echoes the SQL statement into the report.
 * @return array ($usedRecords, $scannedLines, $report) on success,
 *               (-1, 0, $report) on the first failed INSERT.
 */
function jsonCsv_to_sql(&$items, &$projects, &$db, $SEP = ',', $nmax = 0, $verbose = 2)
{
    $OUT_report = '';
    $n = $N = $N2 = 0;
    foreach ($items as $prj => $r) {
        foreach ($r as $dataset) {
            $folder = $projects[$prj];
            $sql = array_shift($dataset); // first element is the INSERT statement
            $OUT_report .= "\n\n---- PRJ {$prj}" . ($verbose >= 2 ? " {{ {$sql} }}" : '');
            $stmt = $db->prepare($sql);
            $jpack = json_decode(file_get_contents("{$folder}/datapackage.json"), true);

            // $ds maps "data/<csv>" => bind flag: bit0 = '::bind', bit1 = '::strict'.
            // Only used for "bind" and "strict" checkings.
            $ds = [];
            foreach ($dataset as $i) {
                $i = str_replace('::bind', '', $i, $bind);
                $i = str_replace('::strict', '', $i, $aux);
                $ds["data/{$i}"] = $bind + 2 * $aux;
            }

            foreach ($jpack['resources'] as $pack) {
                // BUGFIX: test membership directly in $ds (the original kept a stale
                // $ds_keys snapshot, so a path already unset from $ds could re-match
                // and then read an undefined index). Also guards a missing 'path' key.
                if (!isset($pack['path'], $ds[$pack['path']])) {
                    continue;
                }
                $useType = $ds[$pack['path']] > 1; // '::strict' requested typed binding
                $OUT_report .= "\n\t-- reading dataset '{$pack['path']}' ";
                $fields = $pack['schema']['fields'];
                list($sql_fields, $json_fields, $json_types) = fields_to_parts($fields, false, true); //,$useType);
                $has_jfields = count($json_fields);
                if ($has_jfields) {
                    $json_fields[] = 'source'; // admin field: provenance of the JSON blob
                }
                $n = $n2 = 0;
                $nsql = count($sql_fields);
                $file = "{$folder}/{$pack['path']}";
                $h = fopen($file, 'r');
                while ($h && !feof($h) && (!$nmax || $n < $nmax)) {
                    if (($lin0 = $lin = fgetcsv($h, 0, $SEP)) && $n++ > 0 && isset($lin[0])) {
                        // Columns beyond the SQL fields are packed into one JSON value.
                        $jsons = array_slice($lin, $nsql);
                        $types = array_slice($json_types, $nsql);
                        for ($t_i = 0; $t_i < count($types); $t_i++) {
                            settype($jsons[$t_i], $types[$t_i]); // cast to string or declared type
                        }
                        $sqls = array_slice($lin, 0, $nsql);
                        if ($has_jfields) {
                            $jsons[] = $pack['path']; // admin field value
                        }
                        // TODO(review): array_combine() fails when the datapackage schema
                        // and the CSV column count disagree — capture and report that error.
                        $info = json_encode(array_combine($json_fields, $jsons));
                        if ($ds[$pack['path']]) { // or ctype_digit()
                            // Explicit named binding ('::bind' / '::strict').
                            // BUGFIX: bindParam() takes its value by reference, so passing
                            // array_shift()'s result was illegal; bindValue() binds by value.
                            // The $useType branches were byte-identical (typed binding was
                            // never implemented) — collapsed; flag kept for future use.
                            $tmp = $sqls;
                            foreach ($sql_fields as $i) {
                                $stmt->bindValue(":{$i}", array_shift($tmp));
                            }
                            if ($has_jfields && $info) {
                                $stmt->bindValue(":json_info", $info);
                            }
                            $ok = $stmt->execute();
                        } else {
                            // Implicit bind (by array order and no datatype parsing).
                            $ok = $stmt->execute(array_merge($sqls, [$info]));
                        }
                        if (!$ok) {
                            $OUT_report .= "\n ---- ERROR (at csv line-{$n} (rec. {$n2}) with error info) ----\n";
                            $OUT_report .= var_export($lin0, true);
                            $OUT_report .= var_export($stmt->errorInfo(), true);
                            fclose($h); // BUGFIX: do not leak the handle on the error path
                            return [-1, 0, $OUT_report];
                        } else {
                            $n2++;
                        }
                    } elseif ($nsql > 0 && is_array($lin) && count($lin) && isset($lin[0])) {
                        // First (header) row: names must match the SQL part of the schema.
                        // BUGFIX: is_array() guard — fgetcsv() returns false at EOF and
                        // count(false) is a TypeError on PHP 8. The redundant
                        // count($sql_fields) / $lin != array() checks were dropped.
                        $lin_check = fields_to_parts(array_slice($lin, 0, $nsql));
                        if ($sql_fields != $lin_check) {
                            var_dump($lin_check);
                            var_dump($sql_fields);
                            die("\n --- ERROR: CSV header-basic not matches SQL/datapackage field names ---\n");
                        }
                    }
                }
                if ($h) {
                    fclose($h); // BUGFIX: handle was never closed
                }
                $OUT_report .= " {$n} lines scanned, {$n2} used.";
                $N += $n;
                $N2 += $n2;
                unset($ds[$pack['path']]);
            }

            // "data/samples" is a folder tree of XML repositories, not a CSV resource.
            $sampath = "data/samples";
            if (isset($ds[$sampath])) { // a kind of XML dataset
                unset($ds[$sampath]);
                $folder2 = "{$folder}/{$sampath}";
                foreach (scandir($folder2) as $ft) {
                    if (strlen($ft) > 2) { // skips '.'/'..'; folder-types sci and law
                        foreach (scandir("{$folder2}/{$ft}") as $rpfolder) {
                            if (strlen($rpfolder) > 2) {
                                intoDb_XMLs("{$folder2}/{$ft}/{$rpfolder}", $db, $rpfolder);
                            }
                        }
                    }
                } // scans repository's folder
            }
            foreach ($ds as $k => $v) {
                $OUT_report .= "\n --WARNING: pack '{$k}' (bind={$v}) not used";
            }
        }
    } // for $r
    return [$N2, $N, $OUT_report];
}
// NOTE(review): the line below appears to be an orphaned, partially-duplicated
// fragment of jsonCsv_to_sql() above — an older variant of its CSV-insert loop
// that reported via print/die instead of accumulating $OUT_report. It begins and
// ends mid-definition (dangling braces) and is not syntactically valid on its
// own. Presumably a merge/extraction artifact — confirm against upstream history
// and delete. Left byte-identical here.
$ok = $stmt->execute(); } else { // implicit bind (by array order and no datatype parsing) $ok = $stmt->execute(array_merge($sqls, array($info))); } if (!$ok) { print "\n ---- ERROR (at line-{$n} with error info) ----\n"; print_r($lin0); print_r($stmt->errorInfo()); die("\n"); } else { $n2++; } } elseif ($nsql > 0 && count($sql_fields) && isset($lin) && count($lin) && $lin != array()) { //debug print "\n-pk-$n2...0-$nsql \nlin=".count($lin);print_r($lin); $lin_check = fields_to_parts(array_slice($lin, 0, $nsql)); if ($sql_fields != $lin_check) { var_dump($lin_check); var_dump($sql_fields); die("\n --- ERROR: CSV header-basic not matches SQL/datapackage field names ---\n"); } } } print " {$n} lines scanned, {$n2} used."; unset($ds[$pack['path']]); } } // if $pack $sampath = "data/samples"; if (isset($ds[$sampath])) { // a kind of XML dataset
/**
 * Get all basic information needed for CSV interpretation of a tabular
 * data package. See also http://data.okfn.org/doc/tabular-data-package
 *
 * @param string $folder  project folder that must contain datapackage.json.
 * @param string $dftSEP  separator used when a resource declares no 'sep'.
 * @param bool   $prefURL when true (or when the resource has an empty path),
 *                        prefer the resource's 'url' over the local file.
 * @return array resource_name => (
 *               file, sep, fields_sql, fields_json, fields_json_types
 *               )
 */
function unpack_datapackage($folder, $dftSEP = ',', $prefURL = false)
{
    $packFile = "{$folder}/datapackage.json";
    if (!file_exists($packFile)) {
        die("\n\t-- ERROR: check folder ({$folder}) or add datapackage.json");
    }
    $package = json_decode(file_get_contents($packFile), true);
    $ret = [];
    foreach ($package['resources'] as $resource) {
        // Resources without a local path are skipped entirely.
        if (!isset($resource['path'])) {
            continue;
        }
        $rpath = $resource['path'];
        $name = isset($resource['name']) ? $resource['name'] : $rpath; // or get filename
        print "\n\t -- analysing pack-resource '{$name}'";
        $info = [];
        $info['sep'] = isset($resource['sep']) ? $resource['sep'] : $dftSEP;
        if (!isset($resource['schema']['fields'])) {
            die("\n\t -- ERROR at datapackage.json: no schema/fields in {$name}\n");
        }
        list($info['fields_sql'], $info['fields_json'], $info['fields_json_types']) =
            fields_to_parts($resource['schema']['fields'], false, true);
        // Prefer the remote URL when requested (or when no path), else resolve locally.
        $useUrl = (!$rpath || $prefURL) && isset($resource['url']);
        $info['file'] = $useUrl ? $resource['url'] : realpath("{$folder}/{$rpath}");
        $ret[$name] = $info;
    }
    return $ret;
}