コード例 #1
0
ファイル: engine.php プロジェクト: janeklb/moodle
 /**
  * Adds a file to the search engine.
  *
  * Notes about Solr and Tika indexing. We do not send the mime type, only the filename.
  * Tika has much better content type detection than Moodle, and we will have many more doc failures
  * if we try to send mime types.
  *
  * @param document $document
  * @param \stored_file $storedfile
  * @return void
  */
 protected function add_stored_file($document, $storedfile)
 {
     $filedoc = $document->export_file_for_engine($storedfile);

     if (!$this->file_is_indexable($storedfile)) {
         // Files we do not consider indexable still get a reference document in the search engine,
         // flagged so that it is clear no file content was sent.
         $filedoc['solr_fileindexstatus'] = document::INDEXED_FILE_FALSE;
         $this->add_solr_document($filedoc);
         return;
     }

     $curl = $this->get_curl_object();
     $url = $this->get_connection_url('/update/extract');

     // Prevent Solr from automatically making fields for every Tika output.
     $url->param('uprefix', 'ignored_');
     // Control how content is captured. This keeps the file content clean of non-important metadata.
     $url->param('captureAttr', 'true');
     // Move the extracted content to the field we index.
     $url->param('fmap.content', 'solr_filecontent');
     // These common fields match the standard *_point dynamic field and cause an error, so divert them.
     $url->param('fmap.media_white_point', 'ignored_mwp');
     $url->param('fmap.media_black_point', 'ignored_mbp');

     // Copy each document key to the URL as a literal. Values go into a temporary name first and are then
     // mapped back to the true field, which prevents errors and stops Tika overwriting common field names.
     foreach ($filedoc as $key => $value) {
         // Discard any Tika field that matches one of our schema fields, so it cannot overwrite ours.
         $url->param('fmap.' . $key, 'ignored_' . $key);
         // Place our value in a temporary field...
         $url->param('literal.mdltmp_' . $key, $value);
         // ...then move it to the final field.
         $url->param('fmap.mdltmp_' . $key, $key);
     }

     // This sets the true filename for Tika.
     $url->param('resource.name', $storedfile->get_filename());

     // Everything below is error checking around the request. Failures are only reported, never thrown,
     // because per file/doc results are not tracked; any failure falls through to the fallback at the end.
     try {
         $result = $curl->post($url->out(false), array('myfile' => $storedfile));
         $code = $curl->get_errno();
         $info = $curl->get_info();

         if ($code != 0) {
             // An internal cURL error occurred; the error text is in $result.
             $message = 'Curl error ' . $code . ' while indexing file with document id ' . $filedoc['id'] . ': ' . $result . '.';
             debugging($message, DEBUG_DEVELOPER);
         } elseif (isset($info['http_code']) && $info['http_code'] !== 200) {
             // Unexpected HTTP response code.
             $message = 'Error while indexing file with document id ' . $filedoc['id'];
             // Try to get an error message out of the msg element, falling back to the page title.
             if (preg_match('|<str [^>]*name="msg"[^>]*>(.*?)</str>|i', $result, $matches)) {
                 $message .= ': ' . $matches[1];
             } elseif (preg_match('|<title[^>]*>([^>]*)</title>|i', $result, $matches)) {
                 $message .= ': ' . $matches[1];
             }
             // This is a common error, happening whenever a file fails to index for any reason,
             // so it is kept quieter: only traced when running from the CLI outside of unit tests.
             if (CLI_SCRIPT && !PHPUNIT_TEST) {
                 mtrace($message);
             }
         } elseif (preg_match('|<int [^>]*name="status"[^>]*>(\\d*)</int>|i', $result, $matches)) {
             // The expected status field is present; a status of 0 means success.
             if ((int) $matches[1] === 0) {
                 // The document was successfully indexed.
                 return;
             }
             $message = 'Unexpected Solr status code ' . (int) $matches[1];
             $message .= ' while indexing file with document id ' . $filedoc['id'] . '.';
             debugging($message, DEBUG_DEVELOPER);
         } else {
             // We received an unprocessable response; report only its first non-empty line.
             // This deliberately avoids strtok(), which keeps global tokenizer state and would
             // clobber any tokenization a caller has in progress.
             $lines = explode("\n", ltrim((string) $result, "\n"), 2);
             $message = 'Unexpected Solr response while indexing file with document id ' . $filedoc['id'] . ': ';
             $message .= $lines[0];
             debugging($message, DEBUG_DEVELOPER);
         }
     } catch (\Exception $e) {
         // There was an error, but we are not tracking per-file success, so we just continue on.
         // Include the exception details so the cause is not lost from the debugging output.
         $message = 'Unknown exception while indexing file "' . $storedfile->get_filename() . '": ';
         $message .= get_class($e) . ': ' . $e->getMessage();
         debugging($message, DEBUG_DEVELOPER);
     }

     // If we get here, the document was not indexed due to an error, so index just the base info without the file.
     $filedoc['solr_fileindexstatus'] = document::INDEXED_FILE_ERROR;
     $this->add_solr_document($filedoc);
 }