/**
 * Validate a data.json catalog given either as a URL or as raw JSON text.
 *
 * When $datajson_url is set, the response headers are consulted first:
 * files no larger than the configured 'max_remote_size' are downloaded and
 * validated in memory, larger files are archived to disk and validated in
 * chunks by validate_datajson_lines(). When only $datajson is given, that
 * text is validated directly.
 *
 * @param string|null $datajson_url  URL of the data.json file to fetch.
 * @param string|null $datajson      Raw JSON text, used when nothing is downloaded.
 * @param array|null  $headers       Pre-fetched header info; only the
 *                                   'download_content_length' key is read here.
 *                                   When empty, $this->campaign->uri_header() is called.
 * @param string|null $schema        Schema name. When the document's conformsTo is the
 *                                   v1.1 schema URL, 'federal' / 'non-federal' / anything
 *                                   else is upgraded to the matching '-v1.1' name.
 * @param bool        $return_source When truthy, the filtered decoded catalog is
 *                                   returned under 'source'.
 * @param mixed       $quality       Passed to $this->campaign->datajson_qa(); QA is
 *                                   skipped entirely when this is exactly false.
 * @param mixed       $component     Passed through to datajson_qa().
 *
 * @return array On a completed validation: 'errors' (array, or false when there are
 *               none), optional 'qa', 'schema_version', 'valid', 'valid_json',
 *               'total_records', optional 'download_content_length' and 'source'.
 *               On failure: 'valid' => false plus a 'fail' list of messages.
 *
 * @throws Exception Re-thrown from the streaming parser on the large-file path.
 */
public function validate_datajson($datajson_url = null, $datajson = null, $headers = null, $schema = null, $return_source = false, $quality = false, $component = null) {

    // Pre-declare everything the shared tail of this method reads, so the
    // "raw JSON, no URL" path never touches an undefined variable. The
    // isset() checks further down still treat these nulls as "not set".
    $errors = array();
    $datajson_header = null;
    $datajson_processed = null;
    $valid_json = null;
    $raw_valid_json = null;

    if ($datajson_url) {

        $datajson_header = $headers ? $headers : $this->campaign->uri_header($datajson_url);

        // Max file size
        $max_remote_size = $this->config->item('max_remote_size');

        $header_length = isset($datajson_header['download_content_length']) ? $datajson_header['download_content_length'] : null;
        $length_unknown = empty($header_length) || $header_length < 0;

        // Only download the data.json if we need to. The upper bound is
        // inclusive: the chunked path below only takes over for sizes strictly
        // greater than the limit, so a file of exactly the limit must be
        // handled here or it would be handled nowhere.
        if ($length_unknown || $header_length <= $max_remote_size) {

            // Load the JSON, reading at most one byte past the limit so an
            // oversized body with no declared length is still detected.
            $opts = array('http' => array('method' => "GET", 'user_agent' => "Data.gov data.json crawler"));
            $context = stream_context_create($opts);
            $datajson = @file_get_contents($datajson_url, false, $context, -1, $max_remote_size + 1);

            if (!$datajson) {
                $datajson = curl_from_json($datajson_url, false, false);
                if (!$datajson) {
                    $errors[] = "File not found or couldn't be downloaded";
                }
            }
        }

        // Fall back to the size of what was actually downloaded
        if (!empty($datajson) && $length_unknown) {
            $datajson_header['download_content_length'] = strlen($datajson);
        }

        // May legitimately be null when the size is unknown and the download failed
        $content_length = isset($datajson_header['download_content_length']) ? $datajson_header['download_content_length'] : null;

        // See if it exceeds max size
        if ($content_length > $max_remote_size) {

            // Increase the timeout limit
            @set_time_limit(6000);
            $this->load->helper('file');

            $rawfile = $this->archive_file('datajson-lines', $this->current_office_id, $datajson_url);
            $response = $rawfile ? $this->validate_datajson_lines($rawfile, $content_length, $quality, $component) : null;

            if ($response !== null) {
                return $response;
            }

            $errors[] = "File not found or couldn't be downloaded";
        }

        // See if it's valid JSON
        if (!empty($datajson) && $content_length <= $max_remote_size) {

            // See if raw file is valid
            $raw_valid_json = is_json($datajson);

            // See if we can clean up the file to make it valid
            if (!$raw_valid_json) {
                $datajson_processed = json_text_filter($datajson);
                $valid_json = is_json($datajson_processed);
            } else {
                $valid_json = true;
            }

            if ($valid_json !== true) {
                $errors[] = 'The validator was unable to determine if this was valid JSON';
            }
        }

        if (!empty($errors)) {

            $response = array(
                'raw_valid_json' => $raw_valid_json,
                'valid_json' => $valid_json,
                'valid' => false,
                'fail' => $errors,
                'download_content_length' => $content_length
            );

            if ($valid_json && $return_source === false) {
                // The filtered text only exists when the raw text was invalid;
                // otherwise decode the raw text itself.
                $catalog = json_decode(empty($datajson_processed) ? $datajson : $datajson_processed);
                $response['total_records'] = $this->count_datajson_records($catalog, $schema);
            }

            return $response;
        }
    }

    // filter string for json conversion if we haven't already
    if ($datajson && empty($datajson_processed)) {
        $datajson_processed = json_text_filter($datajson);
    }

    // verify it's valid json
    if ($datajson_processed && !isset($valid_json)) {
        $valid_json = is_json($datajson_processed);
    }

    if (!$datajson_processed || !$valid_json) {
        $errors[] = "This does not appear to be valid JSON";
        $response = array('valid_json' => false, 'valid' => false, 'fail' => $errors);
        if (!empty($datajson_header['download_content_length'])) {
            $response['download_content_length'] = $datajson_header['download_content_length'];
        }
        return $response;
    }

    $datajson_decode = json_decode($datajson_processed);

    // A document that declares the v1.1 schema is always validated as v1.1
    if (!empty($datajson_decode->conformsTo) && $datajson_decode->conformsTo == 'https://project-open-data.cio.gov/v1.1/schema') {
        if ($schema !== 'federal-v1.1' && $schema !== 'non-federal-v1.1') {
            $schema = ($schema === 'non-federal') ? 'non-federal-v1.1' : 'federal-v1.1';
        }
        $this->schema = $schema;
    }

    if ($schema === 'federal-v1.1' && empty($datajson_decode->dataset)) {
        $errors[] = "This file does not appear to be using the federal-v1.1 schema";
        return array('raw_valid_json' => $raw_valid_json, 'valid_json' => $valid_json, 'valid' => false, 'fail' => $errors);
    }

    $is_v1_1 = ($schema === 'federal-v1.1' || $schema === 'non-federal-v1.1');

    // Older catalogs are a bare list of datasets and are validated 500 at a
    // time. v1.1 catalogs are a single object and go through as one chunk, as
    // does anything that is not a list (array_chunk() cannot split it).
    $chunk_size = 500;
    if ($is_v1_1 || !is_array($datajson_decode)) {
        $datajson_chunks = array($datajson_decode);
    } else {
        $datajson_chunks = array_chunk($datajson_decode, $chunk_size);
    }

    $response = array();
    $response['errors'] = array();

    if ($quality !== false) {
        $response['qa'] = array();
    }

    // save detected schema version to output
    $response['schema_version'] = $schema;

    foreach ($datajson_chunks as $chunk_index => $chunk) {

        $chunk = json_encode($chunk);
        $validator = $this->campaign->jsonschema_validator($chunk, $schema);

        if (!empty($validator['errors'])) {
            $key_offset = $chunk_size * $chunk_index;
            // NOTE(review): this path unions with "+" while the large-file path
            // uses array_merge(); which one is right depends on how
            // process_validation_errors() keys its result - confirm.
            $response['errors'] = $response['errors'] + $this->process_validation_errors($validator['errors'], $key_offset);
        }

        if ($quality !== false) {
            $datajson_qa = $this->campaign->datajson_qa($chunk, $schema, $quality, $component);
            if (!empty($datajson_qa)) {
                $response['qa'] = array_merge_recursive($response['qa'], $datajson_qa);
            }
        }
    }

    // Sum QA counts
    if (!empty($response['qa'])) {
        $response['qa'] = $this->sum_datajson_qa_counts($response['qa']);
    }

    // Report the validity of the raw text when it was checked
    $valid_json = isset($raw_valid_json) ? $raw_valid_json : $valid_json;

    $response['valid'] = empty($response['errors']);
    $response['valid_json'] = $valid_json;
    $response['total_records'] = $this->count_datajson_records($datajson_decode, $schema);

    if (!empty($datajson_header['download_content_length'])) {
        $response['download_content_length'] = $datajson_header['download_content_length'];
    }

    if (empty($response['errors'])) {
        $response['errors'] = false;
    }

    if ($return_source) {
        $response['source'] = filter_json($datajson_decode, $is_v1_1);
    }

    return $response;
}

/**
 * Validate an archived data.json file that is too large to decode in one go.
 *
 * The raw file is streamed through DataJsonParser into "<rawfile>.lines.json",
 * which is then read back 200 lines at a time; each chunk is prefixed with the
 * first line of that file, closed with "]}" and validated as 'federal-v1.1'.
 * The raw file and the lines file are both deleted once validation completes.
 *
 * @param string $rawfile        Path of the archived raw data.json file.
 * @param mixed  $content_length Size reported for the download; copied into the response.
 * @param mixed  $quality        Passed to datajson_qa(); QA is skipped when exactly false.
 * @param mixed  $component      Passed through to datajson_qa().
 *
 * @return array|null Response array, or null when a working file could not be opened.
 *
 * @throws Exception Re-thrown from the streaming parser after the handles are closed.
 */
private function validate_datajson_lines($rawfile, $content_length, $quality, $component) {

    $outfile = $rawfile . '.lines.json';
    $verbose = ($this->environment == 'terminal' or $this->environment == 'cron');

    $stream = fopen($rawfile, 'r');
    $out_stream = fopen($outfile, 'w+');

    if (!$stream || !$out_stream) {
        if ($stream) {
            fclose($stream);
        }
        if ($out_stream) {
            fclose($out_stream);
        }
        return null;
    }

    $listener = new DataJsonParser();
    $listener->out_file = $out_stream;

    if ($verbose) {
        echo 'Attempting to convert to JSON lines' . PHP_EOL;
    }

    try {
        $parser = new JsonStreamingParser_Parser($stream, $listener);
        $parser->parse();
    } catch (Exception $e) {
        if (is_resource($stream)) {
            fclose($stream);
        }
        if (is_resource($out_stream)) {
            fclose($out_stream);
        }
        throw $e;
    }

    // Release both handles before the files are reopened / deleted. The
    // is_resource() guards cover a parser or listener that already closed them.
    if (is_resource($stream)) {
        fclose($stream);
    }
    if (is_resource($out_stream)) {
        fclose($out_stream);
    }

    // Get the dataset count
    $datajson_lines_count = $listener->_array_count;

    // Delete temporary raw source file
    unlink($rawfile);

    $lines = fopen($outfile, 'r');
    if (!$lines) {
        return null;
    }

    $chunk_size = 200;
    $chunk_count = intval(ceil($datajson_lines_count / $chunk_size));

    $response = array();
    $response['errors'] = array();

    if ($quality !== false) {
        $response['qa'] = array();
    }

    // Progress output only belongs on a console, same as the message above
    if ($verbose) {
        echo "Analyzing {$datajson_lines_count} lines in {$chunk_count} chunks of {$chunk_size} lines each" . PHP_EOL;
    }

    // The first line of the lines file is prepended to every chunk
    $json_header = fgets($lines);

    for ($chunk_cycle = 0; $chunk_cycle < $chunk_count; $chunk_cycle++) {

        $key_offset = $chunk_size * $chunk_cycle;

        // Read exactly $chunk_size lines. The limit is tested before the read
        // so no extra line is consumed, which keeps $key_offset aligned with
        // the position of the chunk in the file.
        $buffer = '';
        $counter = 0;
        while ($counter < $chunk_size && ($line = fgets($lines)) !== false) {
            $buffer .= $line;
            $counter++;
        }

        // The file ran out before the expected number of chunks
        if ($counter === 0) {
            break;
        }

        // Drop the last two characters of the chunk and close the document
        $buffer = $json_header . $buffer;
        $buffer = substr($buffer, 0, strlen($buffer) - 2) . ']}';

        $validator = $this->campaign->jsonschema_validator($buffer, 'federal-v1.1');

        if (!empty($validator['errors'])) {
            $response['errors'] = array_merge($response['errors'], $this->process_validation_errors($validator['errors'], $key_offset));
        }

        if ($quality !== false) {
            $datajson_qa = $this->campaign->datajson_qa($buffer, 'federal-v1.1', $quality, $component);
            if (!empty($datajson_qa)) {
                $response['qa'] = array_merge_recursive($response['qa'], $datajson_qa);
            }
        }
    }

    fclose($lines);

    // Delete json lines file
    unlink($outfile);

    // Sum QA counts
    if (!empty($response['qa'])) {
        $response['qa'] = $this->sum_datajson_qa_counts($response['qa']);
    }

    $response['valid'] = empty($response['errors']);
    $response['valid_json'] = true;
    $response['total_records'] = $datajson_lines_count;

    if (!empty($content_length)) {
        $response['download_content_length'] = $content_length;
    }

    if (empty($response['errors'])) {
        $response['errors'] = false;
    }

    return $response;
}

/**
 * Collapse per-chunk QA results (combined with array_merge_recursive) into totals.
 *
 * 'bureauCodes' and 'programCodes' are reduced to their keys; each known count
 * field, and each entry of 'validation_counts', is summed when merging left it
 * as an array. Values that are already scalar are left untouched.
 *
 * @param array $qa Merged QA data.
 *
 * @return array The same data with counts summed.
 */
private function sum_datajson_qa_counts($qa) {

    foreach (array('bureauCodes', 'programCodes') as $code_field) {
        if (!empty($qa[$code_field])) {
            $qa[$code_field] = array_keys($qa[$code_field]);
        }
    }

    $sum_array_fields = array(
        'API_total',
        'downloadURL_present',
        'downloadURL_total',
        'accessURL_present',
        'accessURL_total',
        'accessLevel_public',
        'accessLevel_restricted',
        'accessLevel_nonpublic',
        'license_present',
        'redaction_present',
        'redaction_no_explanation'
    );

    foreach ($sum_array_fields as $array_field) {
        if (!empty($qa[$array_field]) && is_array($qa[$array_field])) {
            $qa[$array_field] = array_sum($qa[$array_field]);
        }
    }

    // Sum validation counts
    if (!empty($qa['validation_counts']) && is_array($qa['validation_counts'])) {
        foreach ($qa['validation_counts'] as $validation_key => $validation_count) {
            if (is_array($validation_count)) {
                $qa['validation_counts'][$validation_key] = array_sum($validation_count);
            }
        }
    }

    return $qa;
}

/**
 * Count the records in a decoded catalog without calling count() on a
 * value that is not an array.
 *
 * For the v1.1 schemas the records are the catalog's "dataset" member;
 * otherwise the catalog itself is the list. Non-array values follow the
 * historical count() results: null counts as 0, anything else as 1.
 *
 * @param mixed       $catalog Result of json_decode().
 * @param string|null $schema  Schema name the catalog is validated against.
 *
 * @return int
 */
private function count_datajson_records($catalog, $schema) {

    if ($schema === 'federal-v1.1' || $schema === 'non-federal-v1.1') {
        $records = isset($catalog->dataset) ? $catalog->dataset : null;
    } else {
        $records = $catalog;
    }

    if (is_array($records)) {
        return count($records);
    }

    return ($records === null) ? 0 : 1;
}
/**
 * TO DO - when agencies provide a valid url, we should validate that before
 * the download.
 *
 * Open the archived file that has been downloaded in the campaign status
 * method and validate it against the schema for the given component.
 *
 * The file is read once, cleaned with json_text_filter() if it is not valid
 * JSON as-is, decoded, and checked with JsonSchema\Validator. The method
 * returns early, with 'valid_json' set to false and a message appended to
 * 'errors', when the file cannot be opened, is empty, or is still not valid
 * JSON after cleaning.
 *
 * @param array  $status    Status record to extend; existing keys are kept.
 * @param string $file_path Path of the archived json file.
 * @param string $component Component name passed to $this->datajson_schema().
 * @param string $real_url  Not used by this method; kept for existing callers.
 *
 * @return array $status with 'total_records', 'download_content_length',
 *               'content_type', 'schema_version' and 'valid_json' set, plus
 *               'errors' and/or 'schema_errors' when problems were found.
 */
public function validate_archive_file_with_schema($status, $file_path, $component, $real_url) {

    $status['total_records'] = 0;
    $status['download_content_length'] = 0;
    $status['content_type'] = "application/json";
    $status['schema_version'] = "1.0";

    $fp = fopen($file_path, 'r');
    if (!$fp) {
        $status['errors'][] = "Unable to open archived json file";
        $status['valid_json'] = false;
        return $status;
    }

    // Read through the handle that was just opened and release it straight away
    $json = stream_get_contents($fp);
    fclose($fp);

    if (empty($json)) {
        $status['errors'][] = 'Archived json file is empty';
        $status['valid_json'] = false;
        return $status;
    }

    // See if we can clean up the file to make it valid
    $valid_json = is_json($json);
    if (!$valid_json) {
        $json = json_text_filter($json);
        $valid_json = is_json($json);
    }

    if (!$valid_json) {
        $status['errors'][] = 'Invalid archived json file';
        $status['valid_json'] = false;
        return $status;
    }

    // Length of the text that was actually decoded (after any clean-up)
    $status['download_content_length'] = strlen($json);
    $status['valid_json'] = true;

    $data = json_decode($json);

    // count() only accepts arrays here. A catalog object is counted by its
    // "dataset" list, the same way validate_datajson() counts v1.1 catalogs;
    // other values follow the historical count() results (null 0, else 1).
    if (is_array($data)) {
        $status['total_records'] = count($data);
    } elseif (is_object($data) && isset($data->dataset) && is_array($data->dataset)) {
        $status['total_records'] = count($data->dataset);
    } else {
        $status['total_records'] = ($data === null) ? 0 : 1;
    }

    if (!empty($data)) {
        $schema = $this->datajson_schema($component);

        $validator = new JsonSchema\Validator();
        $validator->check($data, $schema);

        if (!$validator->isValid()) {
            $status['schema_errors'] = $validator->getErrors();
        }
    }

    return $status;
}