/**
 * Converts one OAI-PMH <record> element into a plain array.
 *
 * @param SimpleXMLElement $record
 *   The <record> element. Its header and metadata children are read.
 *
 * @return array
 *   An array with three keys:
 *   - header: array with '@status' (string, or FALSE when the header has no
 *     status attribute), 'identifier', 'datestamp' and 'setSpec' (a list of
 *     strings, possibly empty).
 *   - metadata: NULL when there is no usable metadata, otherwise an array with
 *     'namespaceURI' (one trailing slash removed) and 'childNode' (the first
 *     element child of <metadata> as a DOM node).
 *   - about: the value of $record->abouts when set, FALSE otherwise.
 */
function parseRecord($record) {
  $header = $record->header;
  $attributes = $header->attributes();
  $status = isset($attributes->status) ? (string) $attributes->status : FALSE;
  $identifier = (string) $header->identifier[0];

  $specs = array();
  foreach ($header->setSpec as $spec) {
    $specs[] = (string) $spec;
  }

  $metadata = null;
  if ($record->metadata) {
    $dom = dom_import_simplexml($record->metadata);
    $message = NULL;

    // Before PHP 8 a failed import yields NULL rather than FALSE, so test for
    // "not a DOM node" instead of comparing against FALSE only; otherwise the
    // hasChildNodes() call below would be made on a non-object.
    if (!($dom instanceof DOMNode)) {
      $message = sprintf('[id: %s, status: %s] Error while parsing XML "%s"', $identifier, $status, $record->metadata);
    }
    elseif (!$dom->hasChildNodes()) {
      $message = sprintf('No children in metadata: [id: %s, status: %s] xml: "%s"', $identifier, $status, htmlentities($record->metadata));
    }

    if ($message !== NULL) {
      $this->parserErrors[] = $message;
      xc_log_error('DOMParser', $message);
    }
    else {
      // Only the first element child is kept; text/comment nodes are skipped.
      foreach ($dom->childNodes as $childNode) {
        if ($childNode->nodeType == XML_ELEMENT_NODE) {
          $metadata = array(
            // namespaceURI is NULL for un-namespaced elements; cast it so the
            // regex always receives a string.
            'namespaceURI' => preg_replace('/\\/$/', '', (string) $childNode->namespaceURI),
            'childNode' => $childNode,
          );
          break;
        }
      }
    }
  }

  // NOTE(review): OAI-PMH names this element <about>; "abouts" looks like a
  // typo that makes this value always FALSE. Left unchanged because callers
  // are not visible from here - confirm before renaming.
  $about = isset($record->abouts) ? $record->abouts : FALSE;

  return array(
    'header' => array(
      '@status' => $status,
      'identifier' => $identifier,
      'datestamp' => (string) $header->datestamp[0],
      'setSpec' => $specs,
    ),
    'metadata' => $metadata,
    'about' => $about,
  );
}
/**
 * Splits the raw OAI-PMH response in $this->rawContent into its parts.
 *
 * The response is consumed from the front with anchored regular expressions:
 * XML declaration, <OAI-PMH>, <responseDate>, <request>, then either
 * <ListRecords> (records plus optional resumptionToken) or <error>, and
 * finally the closing tags. Each missing part is appended to
 * $this->parserErrors; most are also logged through xc_log_error().
 *
 * Writes: rawContent, parserErrors, hasListRecords, hasErrors, hasRecords,
 * records, recordCount. The callbacks setErrors, setRecords and setResumption
 * are defined outside this block; presumably they fill the error list,
 * $this->raw_records and the resumption token respectively.
 *
 * Note that a missing <OAI-PMH>, <responseDate> or <request> returns early,
 * before hasListRecords / hasErrors are reset.
 */
private function _processContent() {
  $xml = $this->rawContent;

  // Raw content is kept only for responses that contain '<error' without
  // '<ListRecords>', or when debugging is requested.
  $errorOnly = strstr($xml, '<error') && !strstr($xml, '<ListRecords>');
  if (!$errorOnly && !$this->need_debug) {
    $this->rawContent = '';
  }

  // XML declaration: its absence is reported, but parsing goes on.
  $hits = 0;
  $xml = preg_replace("/^<\\?xml[^<>]+\\?>/", "", $xml, -1, $hits);
  if ($hits !== 1) {
    xc_log_error('regex', 'Malformed XML: no sheabang — ' . htmlentities($xml));
    $this->parserErrors[] = 'no sheabang';
  }

  // Root element opener.
  $hits = 0;
  $xml = preg_replace("/^\\s*<OAI-PMH\\b[^<>]+>/s", "", $xml, -1, $hits);
  if ($hits !== 1) {
    xc_log_error('regex', 'Malformed XML: no <OAI-PMH> — ' . htmlentities($xml));
    $this->parserErrors[] = 'no OAI-PMH';
    return;
  }

  // <responseDate>.
  $hits = 0;
  $xml = preg_replace("/^\\s*<responseDate>(.*?)<\\/responseDate>/s", "", $xml, -1, $hits);
  if ($hits !== 1) {
    xc_log_error('regex', 'Malformed XML: no <responseDate> — ' . htmlentities($xml));
    $this->parserErrors[] = 'no responseDate';
    return;
  }

  // <request>; only the first 50 bytes of the remainder are logged here.
  $hits = 0;
  $xml = preg_replace("/^\\s*<request(?: [^<>]+)?>(.*?)<\\/request>/s", "", $xml, -1, $hits);
  if ($hits !== 1) {
    xc_log_error('regex', 'Malformed XML: no <request> element in XML response: ' . htmlentities(substr($xml, 0, 50)) . '…');
    $this->parserErrors[] = 'no request element';
    return;
  }

  $this->hasListRecords = FALSE;
  $this->hasErrors = FALSE;

  // Either <ListRecords> opens here, or an <error> element is expected.
  $hits = 0;
  $xml = preg_replace("/^\\s*<ListRecords>/", "", $xml, -1, $hits);
  if ($hits === 1) {
    $this->hasListRecords = TRUE;
  }
  else {
    // The error element counts as found when stripping it changed the length.
    $lengthBefore = drupal_strlen($xml);
    $xml = preg_replace_callback("/^\\s*<error(?: code=([\\'\"])(.*)([\\'\"]))?>(.*)<\\/error>/", array($this, 'setErrors'), $xml);
    if ($lengthBefore == drupal_strlen($xml)) {
      xc_log_error('regex', 'Malformed XML: no ListRecords and no error elements — ' . htmlentities($xml));
      $this->parserErrors[] = 'no ListRecords and error elements';
    }
    else {
      $this->hasErrors = TRUE;
    }
  }

  if (!$this->hasErrors && !$this->need_debug) {
    $this->rawContent = '';
  }

  if (!$this->hasListRecords) {
    $this->parserErrors[] = 'no ListRecords element';
  }
  else {
    // Consume records (and a following resumptionToken) until the closing tag
    // is next or an iteration makes no progress.
    $progressed = TRUE;
    while (!preg_match("/^\\s*<\\/ListRecords>/", $xml) && $progressed) {
      $hits = 0;
      $xml = preg_replace_callback("/^\\s*(<record>.*<\\/record>)/s", array($this, 'setRecords'), $xml, -1, $hits);
      $progressed = $hits > 0;
      if (!$progressed) {
        $this->parserErrors[] = 'no record elements';
        continue;
      }

      $this->hasRecords = TRUE;

      // raw_records is expected to have been set by the setRecords callback;
      // a marker before each <record> lets preg_split cut it per record.
      $marked = str_replace('<record>', '<#######><record>', $this->raw_records);
      $this->records = preg_split("/<#######>/", $marked, -1, PREG_SPLIT_NO_EMPTY);
      $this->recordCount = count($this->records);
      unset($marked);

      // A missing resumptionToken ends the loop but is not a parser error.
      $hits = 0;
      $xml = preg_replace_callback("/^\\s*(<resumptionToken([^<>]*)>(.*?)<\\/resumptionToken>|<resumptionToken([^<>]*)\\s*\\/\\s*>)/", array($this, 'setResumption'), $xml, -1, $hits);
      $progressed = $hits > 0;
    }

    $hits = 0;
    $xml = preg_replace('/^\\s*<\\/ListRecords>/', '', $xml, -1, $hits);
    if ($hits !== 1) {
      xc_log_error('regex', 'Malformed XML: no /ListRecords — ' . htmlentities($xml));
      $this->parserErrors[] = 'no ListRecords closer element';
    }
  }

  // The root closer is only checked when a body (records or error) was found.
  if (!$this->hasListRecords && !$this->hasErrors) {
    return;
  }

  $hits = 0;
  $xml = preg_replace("/^\\s*<\\/OAI-PMH>/", '', $xml, -1, $hits);
  if ($hits !== 1) {
    xc_log_error('regex', 'Malformed XML: no /OAI-PMH — ' . htmlentities($xml));
    $this->parserErrors[] = 'no /OAI-PMH closer element';
  }
}
/**
 * Loads the response for the current request, from cache or over HTTP.
 *
 * Builds $this->requestUrl from baseUrl, requestVerb and requestArguments.
 * When getCacheFile() names an existing file, its contents become
 * $this->content and httpCode is set to 200. Otherwise the URL is fetched
 * with cURL and httpHeader, content, requestHeader, httpCode, downloadSize,
 * curlInfo and (on a cURL error) fetchError are filled in.
 *
 * In both cases $this->statistics['fetch']['curl_init'] receives the time
 * spent before the transfer (or spent reading the cache file), and
 * $this->hasFetched is set to TRUE at the end.
 */
public function fetchHttpContent() {
  $t0 = microtime(TRUE);

  $this->requestUrl = $this->baseUrl . '?verb=' . $this->requestVerb;
  // NOTE(review): values are appended verbatim; if callers do not URL-encode
  // them already (e.g. resumption tokens), they should be encoded here.
  // empty() also tolerates an unset/NULL argument list, where count() fails.
  if (!empty($this->requestArguments)) {
    foreach ($this->requestArguments as $key => $value) {
      $this->requestUrl .= '&' . $key . '=' . $value;
    }
  }

  $cache_file = $this->getCacheFile();
  if (isset($cache_file) && file_exists($cache_file)) {
    try {
      $cached = file_get_contents($cache_file);
      // file_get_contents() reports failure by returning FALSE (plus a
      // warning), not by throwing, so it has to be checked explicitly.
      if ($cached === FALSE) {
        $message = sprintf('could not read cache file "%s"', $cache_file);
        xc_log_error('harvester', 'file get contents error: ' . $message);
        $this->error = $message;
      }
      $this->content = $cached;
    }
    catch (Exception $e) {
      // Reached only when an error handler turns the warning into an exception.
      xc_log_error('harvester', 'file get contents error: ' . $e->getMessage());
      $this->error = $e->getMessage();
    }
    $this->statistics['fetch']['curl_init'] = microtime(TRUE) - $t0;
    $this->httpCode = 200;
  }
  else {
    $ch = curl_init("");
    curl_setopt($ch, CURLINFO_HEADER_OUT, 1); // Get request header
    curl_setopt($ch, CURLOPT_HEADER, 1); // Get response header
    curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
    curl_setopt($ch, CURLOPT_NOPROGRESS, 1);
    curl_setopt($ch, CURLOPT_USERAGENT, 'Omeka-XC OAI harvester');
    curl_setopt($ch, CURLOPT_ENCODING, "gzip,deflate");
    curl_setopt($ch, CURLOPT_CONNECTTIMEOUT, 10);
    if ($this->useAuthentication) {
      curl_setopt($ch, CURLOPT_USERPWD, $this->username . ":" . $this->password);
    }
    if (substr($this->requestUrl, 0, 5) === 'https') {
      // NOTE(review): this disables TLS certificate verification for every
      // https repository. Kept as-is for compatibility; consider making it
      // configurable.
      curl_setopt($ch, CURLOPT_SSL_VERIFYPEER, 0);
    }
    curl_setopt($ch, CURLOPT_URL, $this->requestUrl);
    $this->statistics['fetch']['curl_init'] = microtime(TRUE) - $t0;

    // Get the content. The response holds the headers followed by the body.
    $response = curl_exec($ch);
    $headerSize = curl_getinfo($ch, CURLINFO_HEADER_SIZE);

    if ($response === FALSE) {
      // Transfer failed: there is nothing to split. The cause is recorded in
      // fetchError below.
      $this->httpHeader = '';
      $this->content = '';
    }
    else {
      $this->httpHeader = substr($response, 0, $headerSize);
      $this->content = substr($response, $headerSize);
    }

    $this->requestHeader = curl_getinfo($ch, CURLINFO_HEADER_OUT);
    $this->httpCode = (int) curl_getinfo($ch, CURLINFO_HTTP_CODE);
    $this->downloadSize = curl_getinfo($ch, CURLINFO_SIZE_DOWNLOAD);
    $this->curlInfo = curl_getinfo($ch);

    if (curl_errno($ch)) {
      $this->fetchError = array(
        'code' => curl_errno($ch),
        'text' => curl_error($ch),
      );
    }
    curl_close($ch);
  }

  $this->hasFetched = TRUE;
}