/**
 * Adds a file to the search engine.
 *
 * Notes about Solr and Tika indexing. We do not send the mime type, only the filename.
 * Tika has much better content type detection than Moodle, and we will have many more doc failures
 * if we try to send mime types.
 *
 * Outcomes:
 *  - If the file is not considered indexable, only a reference document is stored, flagged with
 *    document::INDEXED_FILE_FALSE, and no extract request is made.
 *  - If the extract request succeeds (HTTP 200 and a Solr status of 0), nothing more is done.
 *  - On any failure (cURL error, unexpected HTTP code, unexpected or unparseable Solr response, or an
 *    exception), the base document is stored without file content, flagged with
 *    document::INDEXED_FILE_ERROR. Failures are reported for information only and are never rethrown.
 *
 * @param document $document
 * @param \stored_file $storedfile
 * @return void
 */
protected function add_stored_file($document, $storedfile) {
    $filedoc = $document->export_file_for_engine($storedfile);

    if (!$this->file_is_indexable($storedfile)) {
        // For files that we don't consider indexable, we will still place a reference in the search engine.
        $filedoc['solr_fileindexstatus'] = document::INDEXED_FILE_FALSE;
        $this->add_solr_document($filedoc);
        return;
    }

    $curl = $this->get_curl_object();

    $url = $this->get_connection_url('/update/extract');

    // The response handling below parses XML, so request XML explicitly rather than relying on the
    // server default (newer Solr releases default to JSON, which would make every successful
    // extract look like an unprocessable response).
    $url->param('wt', 'xml');

    // This will prevent solr from automatically making fields for every tika output.
    $url->param('uprefix', 'ignored_');

    // Control how content is captured. This will keep our file content clean of non-important metadata.
    $url->param('captureAttr', 'true');
    // Move the content to a field for indexing.
    $url->param('fmap.content', 'solr_filecontent');

    // These are common fields that matches the standard *_point dynamic field and causes an error.
    $url->param('fmap.media_white_point', 'ignored_mwp');
    $url->param('fmap.media_black_point', 'ignored_mbp');

    // Copy each key to the url with literal.
    // We place in a temp name then copy back to the true field, which prevents errors or Tika overwriting common field names.
    foreach ($filedoc as $key => $value) {
        // This will take any fields from tika that match our schema and discard them, so they don't overwrite ours.
        $url->param('fmap.' . $key, 'ignored_' . $key);
        // Place data in a tmp field.
        $url->param('literal.mdltmp_' . $key, $value);
        // Then move to the final field.
        $url->param('fmap.mdltmp_' . $key, $key);
    }

    // This sets the true filename for Tika.
    $url->param('resource.name', $storedfile->get_filename());

    // Used in every diagnostic message below.
    $docid = $filedoc['id'];

    // A giant block of code that is really just error checking around the curl request.
    try {
        // Now actually do the request.
        $result = $curl->post($url->out(false), array('myfile' => $storedfile));

        $code = $curl->get_errno();
        $info = $curl->get_info();

        // Now error handling. It is just informational, since we aren't tracking per file/doc results.
        if ($code != 0) {
            // This means an internal cURL error occurred error is in result.
            $message = 'Curl error ' . $code . ' while indexing file with document id ' . $docid . ': ' . $result . '.';
            debugging($message, DEBUG_DEVELOPER);
        } else if (isset($info['http_code']) && $info['http_code'] !== 200) {
            // Unexpected HTTP response code.
            $message = 'Error while indexing file with document id ' . $docid;
            // Try to get error message out of msg or title if it exists.
            if (preg_match('|<str [^>]*name="msg"[^>]*>(.*?)</str>|i', $result, $matches)) {
                $message .= ': ' . $matches[1];
            } else if (preg_match('|<title[^>]*>([^>]*)</title>|i', $result, $matches)) {
                $message .= ': ' . $matches[1];
            }
            // This is a common error, happening whenever a file fails to index for any reason, so we will make it quieter.
            if (CLI_SCRIPT && !PHPUNIT_TEST) {
                mtrace($message);
            }
        } else if (preg_match('|<int [^>]*name="status"[^>]*>(\\d*)</int>|i', $result, $matches)) {
            // We found the expected status field. Now check for the expected status of 0, if not, error.
            $status = (int) $matches[1];
            if ($status === 0) {
                // The document was successfully indexed.
                return;
            }
            $message = 'Unexpected Solr status code ' . $status;
            $message .= ' while indexing file with document id ' . $docid . '.';
            debugging($message, DEBUG_DEVELOPER);
        } else {
            // We received an unprocessable response. Only report its first line to keep the message short.
            $message = 'Unexpected Solr response while indexing file with document id ' . $docid . ': ';
            $message .= strtok($result, "\n");
            debugging($message, DEBUG_DEVELOPER);
        }
    } catch (\Exception $e) {
        // There was an error, but we are not tracking per-file success, so we just continue on.
        debugging('Unknown exception while indexing file "' . $storedfile->get_filename() . '".', DEBUG_DEVELOPER);
    }

    // If we get here, the document was not indexed due to an error. So we will index just the base info without the file.
    $filedoc['solr_fileindexstatus'] = document::INDEXED_FILE_ERROR;
    $this->add_solr_document($filedoc);
}