/**
 * Build the Solr field array for this record (an alternative to an XSL
 * transformation).
 *
 * Starts from the parent implementation's array and adds author fields,
 * publication date fields, online links taken from the relation and file
 * elements, a thumbnail, the permanent address and the source identifiers.
 *
 * @return string[]
 */
public function toSolrArray()
{
    $data = parent::toSolrArray();

    // Nonstandard author fields: the first value becomes the main author and
    // the rest are placed in front of any secondary authors already present
    $authorValues = $this->getValues('author');
    if ($authorValues) {
        $data['author'] = array_shift($authorValues);
        $data['author2'] = isset($data['author2'])
            ? array_merge($authorValues, $data['author2'])
            : $authorValues;
    }

    if (isset($data['publishDate'])) {
        $data['main_date_str'] = MetadataUtils::extractYear($data['publishDate']);
        $mainDate = $this->getPublicationYear() . '-01-01T00:00:00Z';
        $data['main_date'] = $this->validateDate($mainDate);
    }

    $dateRange = $this->getPublicationDateRange();
    if ($dateRange) {
        $data['search_sdaterange_mv'][] = $data['publication_sdaterange']
            = MetadataUtils::dateRangeToNumeric($dateRange);
        $data['search_daterange_mv'][] = $data['publication_daterange']
            = MetadataUtils::dateRangeToStr($dateRange);
    }

    // Relations that look like URLs become online links
    foreach ($this->doc->relation as $relation) {
        $relationUrl = (string) $relation;
        // Ignore too long fields. Require at least one dot surrounded by valid
        // characters or a familiar scheme
        $looksLikeUrl = strlen($relationUrl) <= 4096
            && (preg_match('/[A-Za-z0-9]\\.[A-Za-z0-9]/', $relationUrl)
            || preg_match('/^(http|ftp)s?:\\/\\//', $relationUrl));
        if (!$looksLikeUrl) {
            continue;
        }
        $data['online_boolean'] = true;
        $data['online_str_mv'] = $this->source;
        $data['online_urls_str_mv'][] = json_encode(
            ['url' => $relationUrl, 'text' => '', 'source' => $this->source]
        );
    }

    // Files are always online links; the href attribute wins over the element
    // content when it is non-empty
    foreach ($this->doc->file as $file) {
        $fileAttrs = $file->attributes();
        $href = (string) $fileAttrs->href;
        $fileUrl = $href ? $href : (string) $file;

        $data['online_boolean'] = true;
        $data['online_str_mv'] = $this->source;
        $data['online_urls_str_mv'][] = json_encode(
            [
                'url' => $fileUrl,
                'text' => (string) $fileAttrs->name,
                'source' => $this->source
            ]
        );

        // The first file in the THUMBNAIL bundle is used as the thumbnail
        if (strcasecmp($fileAttrs->bundle, 'THUMBNAIL') === 0
            && !isset($data['thumbnail'])
        ) {
            $data['thumbnail'] = $fileUrl;
        }
    }

    if ($this->doc->permaddress) {
        $data['url'] = (string) $this->doc->permaddress[0];
    }

    $data['source_str_mv'] = $this->source;
    $data['datasource_str_mv'] = $this->source;

    return $data;
}
/**
 * Return fields to be indexed in Solr (an alternative to an XSL transformation)
 *
 * Extends the parent implementation's array with publication date fields, the
 * first usable language code and the source identifiers.
 *
 * @return string[]
 */
public function toSolrArray()
{
    $data = parent::toSolrArray();

    if (isset($data['publishDate'])) {
        $data['main_date_str'] = MetadataUtils::extractYear($data['publishDate']);
        $data['main_date'] = $this->validateDate(
            $this->getPublicationYear() . '-01-01T00:00:00Z'
        );
    }

    // The class name is spelled with its declared casing: PHP resolves loaded
    // class names case-insensitively, but an autoloader that maps class names
    // to file names may not find a differently-cased name.
    if ($range = $this->getPublicationDateRange()) {
        $data['search_sdaterange_mv'][] = $data['publication_sdaterange']
            = MetadataUtils::dateRangeToNumeric($range);
        $data['search_daterange_mv'][] = $data['publication_daterange']
            = MetadataUtils::dateRangeToStr($range);
    }

    // Language: keep only two or three letter lowercase codes, drop the
    // 'zxx' and 'und' codes and take only the first remaining one. When
    // nothing remains, array_shift() yields null and the field is set to null.
    $languages = array_filter(
        explode(' ', (string) $this->doc->language),
        function ($value) {
            return preg_match('/^[a-z]{2,3}$/', $value)
                && $value != 'zxx'
                && $value != 'und';
        }
    );
    $data['language'] = array_shift($languages);

    $data['source_str_mv'] = $this->source;
    $data['datasource_str_mv'] = $this->source;

    return $data;
}
/**
 * Dedup: Return ISBNs in ISBN-13 format without dashes
 *
 * Each identifier element is stripped of dashes and searched for a run of
 * 10 to 13 characters (digits, optionally ending in x/X). Ten character
 * matches are converted with MetadataUtils::isbn10to13(); thirteen character
 * matches are used as they are. Matches of any other length cannot be an
 * ISBN-10 or ISBN-13 and are skipped.
 *
 * @return string[] Unique ISBNs, re-indexed from zero
 */
public function getISBNs()
{
    $isbns = [];
    foreach ($this->doc->identifier as $identifier) {
        $candidate = str_replace('-', '', (string) $identifier);
        if (!preg_match('{([0-9]{9,12}[0-9xX])}', $candidate, $matches)) {
            continue;
        }
        $isbn = $matches[1];
        $length = strlen($isbn);
        if ($length == 10) {
            $isbn = MetadataUtils::isbn10to13($isbn);
        } elseif ($length != 13) {
            // 11 or 12 characters: matched the pattern but is not an ISBN
            continue;
        }
        // A falsy conversion result is dropped
        if ($isbn) {
            $isbns[] = $isbn;
        }
    }

    return array_values(array_unique($isbns));
}
/**
 * Check a date string with MetadataUtils::validateISO8601Date().
 *
 * @param string $dateString Date string
 *
 * @return string The unchanged date string when the validator does not return
 * false, otherwise an empty string
 */
protected function validateDate($dateString)
{
    $isValid = MetadataUtils::validateISO8601Date($dateString) !== false;

    return $isValid ? $dateString : '';
}
/**
 * Get genre facet fields
 *
 * Collects subfield v of fields 600, 610, 611, 630, 648, 650 and 651 and
 * subfields a and v of field 655, then passes the result through
 * MetadataUtils::ucFirst().
 *
 * @return string[] Genres
 */
protected function getGenreFacets()
{
    $fieldSpecs = [];
    foreach (['600', '610', '611', '630', '648', '650', '651'] as $tag) {
        $fieldSpecs[] = [MarcRecord::GET_NORMAL, $tag, ['v' => 1]];
    }
    $fieldSpecs[] = [MarcRecord::GET_NORMAL, '655', ['a' => 1]];
    $fieldSpecs[] = [MarcRecord::GET_NORMAL, '655', ['v' => 1]];

    $genres = $this->getFieldsSubfields($fieldSpecs, false, true, true);

    return MetadataUtils::ucFirst($genres);
}
/**
 * Synchronise the database with a complete harvested record set (e.g. from
 * MetaLib).
 *
 * Non-deleted database records of the source that are missing from the
 * harvested set are stored as deleted, records whose serialized form differs
 * are updated, and harvested records not yet in the database are added.
 *
 * @param string   $source           Source ID
 * @param string[] $harvestedRecords Array of records
 *
 * @return void
 */
protected function processFullRecordSet($source, $harvestedRecords)
{
    $this->log->log(
        'processFullRecordSet', "[{$source}] Processing complete record set"
    );

    // Key the harvested records by "<source>.<record id>"
    $records = [];
    foreach ($harvestedRecords as $harvested) {
        $parsed = RecordFactory::createRecord('marc', $harvested, '', $source);
        $records[$source . '.' . $parsed->getID()] = $harvested;
    }

    $this->log->log(
        'processFullRecordSet',
        "[{$source}] Merging results with the records in database"
    );

    $deleted = 0;
    $unchanged = 0;
    $changed = 0;
    $added = 0;

    $existing = $this->db->record
        ->find(['deleted' => false, 'source_id' => $source])
        ->timeout($this->cursorTimeout);
    foreach ($existing as $dbRecord) {
        $id = $dbRecord['_id'];
        if (isset($records[$id])) {
            // Compare the serialized harvested record with the stored data
            $marc = RecordFactory::createRecord('marc', $records[$id], '', $source);
            if ($marc->serialize() == MetadataUtils::getRecordData($dbRecord, false)) {
                ++$unchanged;
            } else {
                // Record changed, update...
                $this->storeRecord($id, false, $records[$id]);
                ++$changed;
            }
        } else {
            // Record not in harvested records, mark deleted
            $this->storeRecord($id, true, '');
            ++$deleted;
        }
        // Whatever remains in $records after this loop is new
        unset($records[$id]);
    }

    $this->log->log('processFullRecordSet', "[{$source}] Adding new records");
    foreach ($records as $id => $newRecord) {
        $this->storeRecord($id, false, $newRecord);
        ++$added;
    }

    $this->log->log(
        'processFullRecordSet',
        "[{$source}] {$added} new, {$changed} changed, {$unchanged} unchanged and "
        . "{$deleted} deleted records processed"
    );
}
/**
 * Split title to main title and description. Tries to find the first sentence
 * break where the title can be split.
 *
 * A split is only considered at a word that ends in a period while no
 * parenthesis or bracket is open. Note that substr(), is_numeric() and
 * ctype_lower() work on single bytes here, so only an ASCII first character
 * of the following word is classified reliably.
 *
 * @param string $title Title to split
 *
 * @return null|string Null if title was not split, otherwise the initial
 * title part
 */
public static function splitTitle($title)
{
    $i = 0;
    $parenLevel = 0;
    $bracketLevel = 0;
    // Make sure the title has single spaces for whitespace
    $title = preg_replace('/\\s+/', ' ', $title);
    $titleWords = explode(' ', $title);
    foreach ($titleWords as $word) {
        ++$i;
        $parenLevel += substr_count($word, '(');
        $parenLevel -= substr_count($word, ')');
        $bracketLevel += substr_count($word, '[');
        $bracketLevel -= substr_count($word, ']');
        if ($parenLevel != 0 || $bracketLevel != 0) {
            continue;
        }
        // Try to avoid splitting at short words or the very beginning
        if (substr($word, -1) != '.'
            || strlen($word) <= 2
            || ($i <= 1 && strlen($word) <= 4)
        ) {
            continue;
        }
        // Verify that the word is strippable (not abbreviation etc.)
        $leadStripped = MetadataUtils::stripLeadingPunctuation($word);
        $stripped = MetadataUtils::stripTrailingPunctuation($leadStripped);
        // $i already points at the next word because it was incremented
        // above. The cast covers substr() returning false for an empty word
        // on PHP versions before 8.
        $nextFirst = isset($titleWords[$i])
            ? (string) substr($titleWords[$i], 0, 1)
            : '';
        // 1.) There has to be something following this word. Compared against
        //     '' explicitly because a plain truthiness test would treat a
        //     following word starting with "0" as missing.
        // 2.) The trailing period must be strippable or end with a year.
        // 3.) Next word has to start with a capital or digit
        // 4.) Not something like 12-p.
        // 5.) Not initials like A.N.
        if ($nextFirst !== ''
            && ($leadStripped != $stripped || preg_match('/^\\d{4}\\.$/', $word))
            && (is_numeric($nextFirst) || !ctype_lower($nextFirst))
            && !preg_match('/.+\\-\\w{1,2}\\.$/', $word)
            && !preg_match('/^\\w\\.\\w\\.$/', $word)
        ) {
            return MetadataUtils::stripTrailingPunctuation(
                implode(' ', array_splice($titleWords, 0, $i))
            );
        }
    }

    return null;
}
/**
 * Build a sort key from the parsed call number parts.
 *
 * The key consists of the upper-cased letters, the digit count followed by
 * the digits, the decimal part, the suffix and the parts obtained by
 * splitting the cutter, separated by spaces where the key is already
 * non-empty.
 *
 * @return string
 */
public function getSortKey()
{
    $sortKey = strtoupper($this->letters);

    if ($this->digits) {
        if ($sortKey) {
            $sortKey .= ' ';
        }
        // Number of digits (leading zeros ignored) followed by the digits
        $sortKey .= strlen((int) $this->digits) . $this->digits;
    }

    $sortKey .= $this->decimal;

    if ($this->suffix) {
        if ($sortKey) {
            // A suffix starting with a letter gets an underscore marker
            $sortKey .= ctype_alpha($this->suffix[0]) ? ' _' : ' ';
        }
        $sortKey .= MetadataUtils::createSortableString($this->suffix);
    }

    if ($this->cutter) {
        $cutterParts = preg_split('/[A-Za-z]\\d+/', $this->cutter);
        foreach ($cutterParts as $cutterPart) {
            if ($sortKey) {
                $sortKey .= ' ';
            }
            $sortKey .= MetadataUtils::createSortableString($cutterPart);
        }
    }

    return $sortKey;
}
/**
 * Collect the values used for the allfields search.
 *
 * Includes the ISBNs from field 020 (both as given and normalized), and the
 * subfields of fields 100-840 (except 336 and 337), 856, 880 and 979. All
 * values are stripped of leading and trailing punctuation and de-duplicated.
 *
 * @return string[]
 */
protected function getAllFields()
{
    // Subfield filter passed to getAllSubfields(): tag-specific where
    // defined, otherwise the default one
    $defaultFilter = ['0' => 1, '6' => 1, '8' => 1];
    $filterByTag = [
        '650' => ['0' => 1, '2' => 1, '6' => 1, '8' => 1],
        '773' => ['0' => 1, '6' => 1, '7' => 1, '8' => 1, 'w' => 1],
        '856' => ['0' => 1, '6' => 1, '8' => 1, 'q' => 1],
        '979' => ['0' => 1, 'a' => 1, 'f' => 1]
    ];

    $collected = [];

    // Include ISBNs, also normalized if possible
    foreach ($this->getFields('020') as $isbnField) {
        $rawIsbns = $this->getSubfieldsArray($isbnField, ['a' => 1, 'z' => 1]);
        foreach ($rawIsbns as $rawIsbn) {
            $collected[] = $rawIsbn;
            $normalized = MetadataUtils::normalizeISBN($rawIsbn);
            if ($normalized) {
                $collected[] = $normalized;
            }
        }
    }

    foreach ($this->fields as $tag => $tagFields) {
        $included = ($tag >= 100 && $tag < 841 && $tag != 336 && $tag != 337)
            || $tag == 856
            || $tag == 880
            || $tag == 979;
        if (!$included) {
            continue;
        }
        $filter = isset($filterByTag[$tag]) ? $filterByTag[$tag] : $defaultFilter;
        foreach ($tagFields as $tagField) {
            $subfields = $this->getAllSubfields($tagField, $filter);
            if ($subfields) {
                $collected = array_merge($collected, $subfields);
            }
        }
    }

    $cleaned = array_map(
        function ($str) {
            $withoutTrailing = MetadataUtils::stripTrailingPunctuation($str);
            return MetadataUtils::stripLeadingPunctuation($withoutTrailing);
        },
        $collected
    );

    return array_values(array_unique($cleaned));
}
/**
 * Attempt to parse a string (in finnish) into a normalized date range.
 *
 * Recognized forms, tried in this order:
 *  - "yyyy - yyyy" (a space is required after the dash)
 *  - "yyyy-m-d"
 *  - "d.m.yyyy" (any single character is accepted as the separator)
 *  - "yyyy?" (expanded to three years before and after)
 *  - a plain two to four digit year
 *
 * TODO: complicated normalizations like this should preferably reside within
 * their own, separate component which should allow modification of the
 * algorithm by methods other than hard-coding rules into source.
 *
 * @param string $input Date range
 *
 * @return string|null Two ISO 8601 dates separated with a comma on success,
 * and null on failure (unparseable input, a year after the current year or a
 * date rejected by MetadataUtils::validateISO8601Date())
 */
protected function parseDateRange($input)
{
    $input = trim(strtolower($input));

    // Set when the branch already produced complete timestamps
    $noprocess = false;

    if (preg_match('/(\\d\\d\\d\\d) ?- (\\d\\d\\d\\d)/', $input, $matches) > 0) {
        $startDate = $matches[1];
        $endDate = $matches[2];
    } elseif (preg_match('/(\\d\\d\\d\\d)-(\\d\\d?)-(\\d\\d?)/', $input, $matches) > 0) {
        $year = $matches[1];
        $month = sprintf('%02d', $matches[2]);
        $day = sprintf('%02d', $matches[3]);
        $startDate = $year . '-' . $month . '-' . $day . 'T00:00:00Z';
        $endDate = $year . '-' . $month . '-' . $day . 'T23:59:59Z';
        $noprocess = true;
    } elseif (preg_match('/(\\d\\d?)\\s*.\\s*(\\d\\d?)\\s*.\\s*(\\d\\d\\d\\d)/', $input, $matches) > 0) {
        $year = $matches[3];
        $month = sprintf('%02d', $matches[2]);
        $day = sprintf('%02d', $matches[1]);
        $startDate = $year . '-' . $month . '-' . $day . 'T00:00:00Z';
        $endDate = $year . '-' . $month . '-' . $day . 'T23:59:59Z';
        $noprocess = true;
    } elseif (preg_match('/(\\d?\\d?\\d\\d) ?\\?/', $input, $matches) > 0) {
        $year = $matches[1];
        $startDate = $year - 3;
        $endDate = $year + 3;
    } elseif (preg_match('/(\\d?\\d?\\d\\d)/', $input, $matches) > 0) {
        $year = $matches[1];
        $startDate = $year;
        $endDate = $year;
    } else {
        return null;
    }

    // Two-digit start years are placed in the 1900s; a two-digit end year
    // takes its century from the start year
    if (strlen($startDate) == 2) {
        $startDate = 1900 + (int) $startDate;
    }
    if (strlen($endDate) == 2) {
        $century = substr($startDate, 0, 2) . '00';
        $endDate = (int) $century + (int) $endDate;
    }

    if (!$noprocess) {
        $startDate = $startDate . '-01-01T00:00:00Z';
        $endDate = $endDate . '-12-31T23:59:59Z';
    }

    // Trying to index dates into the future? I don't think so...
    // Both values are full timestamps here, so compare only their leading
    // year numerically. Comparing the whole strings with the year string
    // would be a lexical comparison that rejects every date in the current
    // year, since "2024-..." sorts after "2024".
    $yearNow = (int) date('Y');
    if ((int) $startDate > $yearNow || (int) $endDate > $yearNow) {
        return null;
    }

    if (MetadataUtils::validateISO8601Date($startDate) === false
        || MetadataUtils::validateISO8601Date($endDate) === false
    ) {
        return null;
    }

    return "{$startDate},{$endDate}";
}
/**
 * Return fields to be indexed in Solr
 *
 * Builds the array from the record's XML: identifiers, description, authors,
 * geographic names, topics, format, institution, titles, languages, physical
 * description, thumbnail and hierarchy fields.
 *
 * @param boolean $prependTitleWithSubtitle If true and title_sub differs from
 * title_short, title is formed by combining title_sub and title_short
 *
 * @return string[]
 */
public function toSolrArray($prependTitleWithSubtitle)
{
    $data = [];
    $doc = $this->doc;

    $data['ctrlnum'] = (string) $this->doc->attributes()->{'id'};
    $data['fullrecord'] = MetadataUtils::trimXMLWhitespace($doc->asXML());
    $data['allfields'] = $this->getAllFields($doc);

    if ($doc->scopecontent) {
        if ($doc->scopecontent->p) {
            // Join all p-elements into a flat string.
            $desc = [];
            foreach ($doc->scopecontent->p as $p) {
                $desc[] = trim((string) $p);
            }
            $desc = implode(' / ', $desc);
        } else {
            $desc = (string) $doc->scopecontent;
        }
        $data['description'] = $desc;
    }

    // Authors: person names first (a lone "-" is skipped), then corporate
    // names. The first one is the main author, the rest secondary authors.
    $authors = [];
    if ($names = $doc->xpath('controlaccess/persname')) {
        foreach ($names as $name) {
            if (trim((string) $name) !== '-') {
                $authors[] = trim((string) $name);
            }
        }
    }
    if ($names = $doc->xpath('controlaccess/corpname')) {
        foreach ($names as $name) {
            $authors[] = trim((string) $name);
        }
    }
    if ($authors) {
        $data['author'] = array_shift($authors);
        $data['author-letter'] = $data['author'];
    }
    if ($authors) {
        $data['author2'] = $authors;
    }

    if ($doc->did->origination) {
        $data['author_additional'] = trim(
            (string) $doc->did->origination->corpname
        );
    }

    if ($geoNames = $doc->xpath('controlaccess/geogname')) {
        $names = [];
        foreach ($geoNames as $name) {
            if (trim((string) $name) !== '-') {
                $names[] = trim((string) $name);
            }
        }
        $data['geographic'] = $data['geographic_facet'] = $names;
    }

    if ($subjects = $doc->xpath('controlaccess/subject')) {
        $topics = [];
        foreach ($subjects as $subject) {
            if (trim((string) $subject) !== '-') {
                $topics[] = trim((string) $subject);
            }
        }
        $data['topic'] = $data['topic_facet'] = $topics;
    }

    // Format: the first genreform if there is one, otherwise the level
    // attribute of the record
    $genre = $doc->xpath('controlaccess/genreform');
    $data['format'] = (string) ($genre ? $genre[0] : $doc->attributes()->level);

    if (isset($doc->did->repository)) {
        // The parentheses matter: a cast binds tighter than the ternary
        // operator, so without them the isset() result would be cast and the
        // field would receive a SimpleXMLElement instead of a string.
        $data['institution'] = (string) (
            isset($doc->did->repository->corpname)
                ? $doc->did->repository->corpname
                : $doc->did->repository
        );
    }

    $data['title_sub'] = '';
    switch ($data['format']) {
    case 'fonds':
        break;
    case 'collection':
        break;
    case 'series':
    case 'subseries':
        $data['title_sub'] = (string) $doc->did->unitid;
        break;
    default:
        $data['title_sub'] = (string) $doc->did->unitid;
        if ($doc->{'add-data'}->parent) {
            $data['series']
                = (string) $doc->{'add-data'}->parent->attributes()->unittitle;
        }
        break;
    }

    $data['title_short'] = (string) $doc->did->unittitle;
    $data['title'] = '';
    if ($prependTitleWithSubtitle) {
        if ($data['title_sub'] && $data['title_sub'] != $data['title_short']) {
            $data['title'] = $data['title_sub'] . ' ';
        }
    }
    $data['title'] .= $data['title_short'];
    $data['title_full'] = $data['title_sort'] = $data['title'];
    $data['title_sort'] = mb_strtolower(
        MetadataUtils::stripLeadingPunctuation($data['title_sort']), 'UTF-8'
    );

    if ($languages = $doc->did->xpath('langmaterial/language')) {
        foreach ($languages as $lang) {
            if (isset($lang->attributes()->langcode)) {
                $langCode = trim((string) $lang->attributes()->langcode);
                if ($langCode != '') {
                    $data['language'][] = $langCode;
                }
            }
        }
    }

    if ($extents = $doc->did->xpath('physdesc/extent')) {
        foreach ($extents as $extent) {
            if (trim((string) $extent) !== '-') {
                $data['physical'][] = (string) $extent;
            }
        }
    }

    $nodes = $this->doc->did->daogrp->xpath('daoloc[@role="image_thumbnail"]');
    if ($nodes) {
        // store first thumbnail
        $node = $nodes[0];
        if (isset($node->attributes()->href)) {
            $data['thumbnail'] = (string) $node->attributes()->href;
        }
    }

    $data['hierarchytype'] = 'Default';
    if ($this->doc->{'add-data'}->archive) {
        $archiveAttr = $this->doc->{'add-data'}->archive->attributes();
        $data['hierarchy_top_id'] = (string) $archiveAttr->{'id'};
        $data['hierarchy_top_title'] = (string) $archiveAttr->title;
        if ($archiveAttr->subtitle) {
            $data['hierarchy_top_title'] .= ' : ' . (string) $archiveAttr->subtitle;
        }
        $data['allfields'][] = $data['hierarchy_top_title'];
        if ($archiveAttr->sequence) {
            $data['hierarchy_sequence'] = (string) $archiveAttr->sequence;
        }
    }

    if ($this->doc->{'add-data'}->{'parent'}) {
        $data['hierarchy_parent_id']
            = (string) $this->doc->{'add-data'}->{'parent'}->attributes()->{'id'};
        $data['allfields'][] = $data['hierarchy_parent_title']
            = (string) $this->doc->{'add-data'}->{'parent'}->attributes()->title;
    } else {
        // No parent: the record itself is the top of its hierarchy
        $data['is_hierarchy_id'] = $data['hierarchy_top_id'] = $this->getID();
        $data['is_hierarchy_title'] = $data['hierarchy_top_title']
            = (string) $doc->did->unittitle;
    }

    return $data;
}
/**
 * Fetch the component parts of a host record, ordered by the sort key that
 * MetadataUtils::createIdSortKey() derives from each record's _id.
 *
 * Records that produce the same sort key overwrite each other, so only the
 * last one fetched is kept.
 *
 * @param string $sourceId     Source ID
 * @param string $hostRecordId Host record ID (doesn't include source id)
 *
 * @return array Array of component parts
 */
protected function getComponentPartsSorted($sourceId, $hostRecordId)
{
    $criteria = ['source_id' => $sourceId, 'host_record_id' => $hostRecordId];
    $cursor = $this->db->record->find($criteria)->timeout($this->cursorTimeout);

    $partsBySortKey = [];
    foreach ($cursor as $part) {
        $sortKey = MetadataUtils::createIdSortKey($part['_id']);
        $partsBySortKey[$sortKey] = $part;
    }
    ksort($partsBySortKey);

    return array_values($partsBySortKey);
}
/**
 * Create Solr array for the given record
 *
 * Converts the stored record to a Solr field array (through the data
 * source's XSLT or the record driver), merges component parts where the data
 * source settings call for it, links component parts with their host record,
 * applies extra fields, mapping files, building and hierarchical facet
 * handling, and finally normalizes the values and removes empty ones.
 *
 * @param array   $record            Mongo record
 * @param integer $mergedComponents  Number of component parts merged to the
 * record (passed by reference and incremented)
 *
 * @return string[]|false Solr fields, or false when the record is a hidden
 * component part and the data source's indexMergedParts setting is falsy
 * @throws Exception If no settings exist for the record's data source
 */
protected function createSolrArray($record, &$mergedComponents)
{
    global $configArray;

    $metadataRecord = RecordFactory::createRecord(
        $record['format'],
        MetadataUtils::getRecordData($record, true),
        $record['oai_id'],
        $record['source_id']
    );

    $source = $record['source_id'];
    if (!isset($this->settings[$source])) {
        // Try to reload data source settings as they might have been updated
        // during a long run
        $this->loadDatasources();
        if (!isset($this->settings[$source])) {
            $this->log->log(
                'createSolrArray',
                "No settings found for data source '{$source}', record "
                . "{$record['_id']}: " . $this->prettyPrint($record, true),
                Logger::FATAL
            );
            throw new Exception("No settings found for data source '{$source}'");
        }
    }
    $settings = $this->settings[$source];

    // Decide whether this component part is hidden (merged into its host)
    $hiddenComponent = false;
    if (isset($record['host_record_id'])) {
        if ($settings['componentParts'] == 'merge_all') {
            $hiddenComponent = true;
        } elseif ($settings['componentParts'] == 'merge_non_articles'
            || $settings['componentParts'] == 'merge_non_earticles'
        ) {
            $format = $metadataRecord->getFormat();
            if (!in_array($format, $this->allArticleFormats)) {
                $hiddenComponent = true;
            } elseif (in_array($format, $this->articleFormats)) {
                $hiddenComponent = true;
            }
        }
    }

    if ($hiddenComponent && !$settings['indexMergedParts']) {
        return false;
    }

    $hasComponentParts = false;
    $components = null;
    if (!isset($record['host_record_id'])) {
        // Fetch info whether component parts exist and need to be merged
        if (!$record['linking_id']) {
            $this->log->log(
                'createSolrArray',
                "linking_id missing for record '{$record['_id']}'",
                Logger::ERROR
            );
        } else {
            $components = $this->db->record->find(
                [
                    'source_id' => $record['source_id'],
                    'host_record_id' => $record['linking_id'],
                    'deleted' => false
                ]
            )->timeout($this->cursorTimeout);
            $hasComponentParts = $components->hasNext();
            $format = $metadataRecord->getFormat();
            $merge = false;
            if ($settings['componentParts'] == 'merge_all') {
                $merge = true;
            } elseif (!in_array($format, $this->allJournalFormats)) {
                $merge = true;
            } elseif (in_array($format, $this->journalFormats)
                && $settings['componentParts'] == 'merge_non_earticles'
            ) {
                $merge = true;
            }
            if (!$merge) {
                unset($components);
            }
        }
    }

    if (isset($components)) {
        $mergedComponents += $metadataRecord->mergeComponentParts($components);
    }

    if (isset($settings['solrTransformationXSLT'])) {
        $params = [
            'source_id' => $source,
            'institution' => $settings['institution'],
            'format' => $settings['format'],
            'id_prefix' => $settings['idPrefix']
        ];
        $data = $settings['solrTransformationXSLT']->transformToSolrArray(
            $metadataRecord->toXML(), $params
        );
    } else {
        $prependTitleWithSubtitle
            = isset($settings['prepend_title_with_subtitle'])
            ? $settings['prepend_title_with_subtitle']
            : true;
        $data = $metadataRecord->toSolrArray($prependTitleWithSubtitle);
        $this->enrich($source, $settings, $metadataRecord, $data);
    }

    $data['id'] = $record['_id'];

    // Record links between host records and component parts
    if ($metadataRecord->getIsComponentPart()) {
        $hostRecord = null;
        if (isset($record['host_record_id']) && $this->db) {
            $hostRecord = $this->db->record->find(
                [
                    'source_id' => $record['source_id'],
                    'linking_id' => $record['host_record_id']
                ]
            )->limit(-1)->timeout($this->cursorTimeout)->getNext();
        }
        if (!$hostRecord) {
            if (isset($record['host_record_id'])) {
                $this->log->log(
                    'createSolrArray',
                    "Host record '" . $record['host_record_id']
                    . "' not found for record '" . $record['_id'] . "'",
                    Logger::WARNING
                );
            }
            $data['container_title'] = $metadataRecord->getContainerTitle();
        } else {
            $data['hierarchy_parent_id'] = $hostRecord['_id'];
            $hostMetadataRecord = RecordFactory::createRecord(
                $hostRecord['format'],
                MetadataUtils::getRecordData($hostRecord, true),
                $hostRecord['oai_id'],
                $hostRecord['source_id']
            );
            $data['container_title'] = $data['hierarchy_parent_title']
                = $hostMetadataRecord->getTitle();
        }
        $data['container_volume'] = $metadataRecord->getVolume();
        $data['container_issue'] = $metadataRecord->getIssue();
        $data['container_start_page'] = $metadataRecord->getStartPage();
        $data['container_reference'] = $metadataRecord->getContainerReference();
    } else {
        // Add prefixes to hierarchy linking fields
        $hierarchyFields = [
            'hierarchy_top_id', 'hierarchy_parent_id', 'is_hierarchy_id'
        ];
        foreach ($hierarchyFields as $field) {
            if (isset($data[$field]) && $data[$field]) {
                $data[$field] = $record['source_id'] . '.' . $data[$field];
            }
        }
    }

    if ($hasComponentParts) {
        $data['is_hierarchy_id'] = $record['_id'];
        $data['is_hierarchy_title'] = $metadataRecord->getTitle();
    }

    if (!isset($data['institution'])) {
        $data['institution'] = $settings['institution'];
    }

    // Extra fields: appended to an existing field, otherwise set as such
    foreach ($settings['extraFields'] as $extraField) {
        $fieldName = key($extraField);
        $fieldValue = current($extraField);
        if (isset($data[$fieldName])) {
            if (!is_array($data[$fieldName])) {
                $data[$fieldName] = [$data[$fieldName]];
            }
            $data[$fieldName][] = $fieldValue;
        } else {
            $data[$fieldName] = $fieldValue;
        }
    }

    // Map field values according to any mapping files
    foreach ($settings['mappingFiles'] as $field => $map) {
        if (isset($data[$field]) && !empty($data[$field])) {
            if (is_array($data[$field])) {
                // Map every value on its own and collect the results.
                // Overwriting a single result variable in the loop would keep
                // only the mapping of the last value. As in the scalar branch
                // below, a value with neither a mapping nor a ##default is
                // kept unchanged.
                $newValues = [];
                foreach ($data[$field] as $value) {
                    $replacement = $value;
                    if (isset($map[$value])) {
                        $replacement = $map[$value];
                    } elseif (isset($map['##default'])) {
                        $replacement = $map['##default'];
                    }
                    if (is_array($replacement)) {
                        $newValues = array_merge($newValues, $replacement);
                    } else {
                        $newValues[] = $replacement;
                    }
                }
                $data[$field] = array_values(array_unique($newValues));
            } else {
                if (isset($map[$data[$field]])) {
                    $data[$field] = $map[$data[$field]];
                } elseif (isset($map['##default'])) {
                    $data[$field] = $map['##default'];
                }
            }
        } elseif (isset($map['##empty'])) {
            $data[$field] = $map['##empty'];
        } elseif (isset($map['##emptyarray'])) {
            $data[$field] = [$map['##emptyarray']];
        }
    }

    // Special case: Special values for building (institution/location).
    // Used by default if building is set as a hierarchical facet.
    if ($this->buildingHierarchy || isset($settings['institutionInBuilding'])) {
        $useInstitution = isset($settings['institutionInBuilding'])
            ? $settings['institutionInBuilding']
            : 'institution';
        switch ($useInstitution) {
        case 'driver':
            $institutionCode = $data['institution'];
            break;
        case 'none':
            $institutionCode = '';
            break;
        case 'source':
            $institutionCode = $source;
            break;
        case 'institution/source':
            $institutionCode = $settings['institution'] . '/' . $source;
            break;
        default:
            $institutionCode = $settings['institution'];
            break;
        }
        if ($institutionCode) {
            if (isset($data['building']) && $data['building']) {
                if (is_array($data['building'])) {
                    foreach ($data['building'] as &$building) {
                        // Allow also empty values that might result from
                        // mapping tables
                        if ($building !== '') {
                            $building = "{$institutionCode}/{$building}";
                        }
                    }
                    // Break the reference left behind by the loop
                    unset($building);
                } else {
                    $data['building']
                        = $institutionCode . '/' . $data['building'];
                }
            } else {
                $data['building'] = [$institutionCode];
            }
        }
    }

    // Hierarchical facets: "a/b" becomes "0/a/" and "1/a/b/"
    if (isset($configArray['Solr']['hierarchical_facets'])) {
        foreach ($configArray['Solr']['hierarchical_facets'] as $facet) {
            if (!isset($data[$facet])) {
                continue;
            }
            $array = [];
            if (!is_array($data[$facet])) {
                $data[$facet] = [$data[$facet]];
            }
            foreach ($data[$facet] as $datavalue) {
                if ($datavalue === '') {
                    continue;
                }
                $values = explode('/', $datavalue);
                $hierarchyString = '';
                for ($i = 0; $i < count($values); $i++) {
                    $hierarchyString .= '/' . $values[$i];
                    $array[] = $i . $hierarchyString . '/';
                }
            }
            $data[$facet] = $array;
        }
    }

    // Build allfields from everything else if it was not provided
    if (!isset($data['allfields'])) {
        $all = [];
        $skippedKeys = ['fullrecord', 'thumbnail', 'id', 'recordtype', 'ctrlnum'];
        foreach ($data as $key => $field) {
            if (in_array($key, $skippedKeys)) {
                continue;
            }
            if (is_array($field)) {
                $all = array_merge($all, $field);
            } else {
                $all[] = $field;
            }
        }
        $data['allfields'] = MetadataUtils::array_iunique($all);
    }

    $data['first_indexed']
        = MetadataUtils::formatTimestamp($record['created']->sec);
    $data['last_indexed'] = MetadataUtils::formatTimestamp($record['date']->sec);
    $data['recordtype'] = $record['format'];
    if (!isset($data['fullrecord'])) {
        $data['fullrecord'] = $metadataRecord->toXML();
    }
    if (!is_array($data['format'])) {
        $data['format'] = [$data['format']];
    }

    if (isset($configArray['Solr']['format_in_allfields'])
        && $configArray['Solr']['format_in_allfields']
    ) {
        foreach ($data['format'] as $format) {
            // Replace numbers since they may be be considered word boundaries
            $data['allfields'][] = str_replace(
                ['0', '1', '2', '3', '4', '5', '6', '7', '8', '9'],
                ['ax', 'bx', 'cx', 'dx', 'ex', 'fx', 'gx', 'hx', 'ix', 'jx'],
                MetadataUtils::normalize($format)
            );
        }
    }

    if ($hiddenComponent) {
        $data['hidden_component_boolean'] = true;
    }

    // Normalize all values and drop the empty ones. The inner loop uses its
    // own key variable: reusing $key there would leave it pointing at an
    // element index, so the unset() below would miss the field and emptied
    // arrays would stay in the result.
    foreach ($data as $key => &$values) {
        if (is_array($values)) {
            foreach ($values as $valueKey => &$value) {
                $value = MetadataUtils::normalizeUnicode($value);
                if (empty($value)
                    || $value === 0
                    || $value === 0.0
                    || $value === '0'
                ) {
                    unset($values[$valueKey]);
                }
            }
            unset($value);
            $values = array_values(array_unique($values));
        } elseif ($key != 'fullrecord') {
            $values = MetadataUtils::normalizeUnicode($values);
        }
        if (empty($values)
            || $values === 0
            || $values === 0.0
            || $values === '0'
        ) {
            unset($data[$key]);
        }
    }
    unset($values);

    return $data;
}
/**
 * Attempt to parse a string (in finnish) into a normalized date range.
 *
 * The input is lower-cased, cut at the first comma and then matched against
 * a list of patterns in a fixed order; the first matching pattern decides the
 * range. The order matters because the later patterns are increasingly
 * lenient.
 *
 * TODO: complicated normalizations like this should preferably reside within
 * their own, separate component which should allow modification of the algorithm
 * by methods other than hard-coding rules into source.
 *
 * @param string $input Date range
 *
 * @return string[]|null Two ISO 8601 dates (start and end), or null when the
 * input cannot be parsed, is mapped to "no date", or lies after the current
 * year
 */
protected function parseDateRange($input)
{
    $input = trim(strtolower($input));

    // Named periods (Stone Age, Bronze Age, Iron Age, Middle Ages) and the
    // "undated" / "unknown" markers
    $dateMappings = [
        'kivikausi' => ['-8600-01-01T00:00:00Z', '-1501-12-31T23:59:59Z'],
        'pronssikausi' => ['-1500-01-01T00:00:00Z', '-0501-12-31T23:59:59Z'],
        'rautakausi' => ['-0500-01-01T00:00:00Z', '1299-12-31T23:59:59Z'],
        'keskiaika' => ['1300-01-01T00:00:00Z', '1550-12-31T23:59:59Z'],
        'ajoittamaton' => null,
        'tuntematon' => null
    ];
    foreach ($dateMappings as $str => $value) {
        if (strstr($input, $str)) {
            return $value;
        }
    }

    // Finnish month names
    $k = [
        'tammikuu' => '01',
        'helmikuu' => '02',
        'maaliskuu' => '03',
        'huhtikuu' => '04',
        'toukokuu' => '05',
        'kesäkuu' => '06',
        'heinäkuu' => '07',
        'elokuu' => '08',
        'syyskuu' => '09',
        'lokakuu' => '10',
        'marraskuu' => '11',
        'joulukuu' => '12'
    ];

    $imprecise = false;
    // Set when a branch already produced complete timestamps
    $noprocess = false;

    list($input) = explode(',', $input, 2);

    if (preg_match('/(\\d\\d?)\\s*.\\s*(\\d\\d?)\\s*.\\s*(\\d\\d\\d\\d)\\s*-\\s*(\\d\\d?)\\s*.\\s*(\\d\\d?)\\s*.\\s*(\\d\\d\\d\\d)/', $input, $matches) > 0) {
        // d.m.yyyy - d.m.yyyy
        $startDate = sprintf('%04d-%02d-%02dT00:00:00Z', $matches[3], $matches[2], $matches[1]);
        $endDate = sprintf('%04d-%02d-%02dT23:59:59Z', $matches[6], $matches[5], $matches[4]);
        $noprocess = true;
    } elseif (preg_match('/(\\d\\d\\d\\d)\\s*-\\s*(\\d\\d?)\\s*.\\s*(\\d\\d?)\\s*.\\s*(\\d\\d\\d\\d)/', $input, $matches) > 0) {
        // yyyy - d.m.yyyy
        $startDate = sprintf('%04d-01-01T00:00:00Z', $matches[1]);
        $endDate = sprintf('%04d-%02d-%02dT23:59:59Z', $matches[4], $matches[3], $matches[2]);
        $noprocess = true;
    } elseif (preg_match('/(\\d\\d?)\\s*.\\s*(\\d\\d?)\\s*.\\s*(\\d\\d\\d\\d)\\s*-\\s*(\\d\\d\\d\\d)/', $input, $matches) > 0) {
        // d.m.yyyy - yyyy
        $startDate = sprintf('%04d-%02d-%02dT00:00:00Z', $matches[3], $matches[2], $matches[1]);
        $endDate = sprintf('%04d-12-31T23:59:59Z', $matches[4]);
        $noprocess = true;
    } elseif (preg_match('/(\\d\\d\\d\\d)\\s*.\\s*(\\d\\d?)\\s*.\\s*(\\d\\d?)\\s*-\\s*(\\d\\d\\d\\d)\\s*.\\s*(\\d\\d?)\\s*.\\s*(\\d\\d?)/', $input, $matches) > 0) {
        // yyyy.m.d - yyyy.m.d
        $startDate = sprintf('%04d-%02d-%02dT00:00:00Z', $matches[1], $matches[2], $matches[3]);
        $endDate = sprintf('%04d-%02d-%02dT23:59:59Z', $matches[4], $matches[5], $matches[6]);
        $noprocess = true;
    } elseif (preg_match('/(\\d\\d\\d\\d)(\\d\\d?)(\\d\\d?)\\s*-\\s*(\\d\\d\\d\\d)(\\d\\d?)(\\d\\d?)/', $input, $matches) > 0) {
        // yyyymmdd - yyyymmdd
        $startDate = sprintf('%04d-%02d-%02dT00:00:00Z', $matches[1], $matches[2], $matches[3]);
        $endDate = sprintf('%04d-%02d-%02dT23:59:59Z', $matches[4], $matches[5], $matches[6]);
        $noprocess = true;
    } elseif (preg_match('/(\\d\\d\\d\\d)(\\d\\d?)\\s*-\\s*(\\d\\d\\d\\d)(\\d\\d?)/', $input, $matches) > 0) {
        // yyyymm - yyyymm; the end is the last day of the end month
        $startDate = sprintf('%04d-%02d-01T00:00:00Z', $matches[1], $matches[2]);
        $endDate = sprintf('%04d-%02d-01', $matches[3], $matches[4]);
        try {
            $d = new DateTime($endDate);
        } catch (Exception $e) {
            global $logger;
            $logger->log('NdlLidoRecord', "Failed to parse date {$endDate}, record {$this->source}." . $this->getID(), Logger::ERROR);
            return null;
        }
        $endDate = $d->format('Y-m-t') . 'T23:59:59Z';
        $noprocess = true;
    } elseif (preg_match('/(\\d\\d\\d\\d)-(\\d\\d?)-(\\d\\d?)/', $input, $matches) > 0) {
        // This one needs to be before the lazy matcher below
        $year = $matches[1];
        $month = sprintf('%02d', $matches[2]);
        $day = sprintf('%02d', $matches[3]);
        $startDate = $year . '-' . $month . '-' . $day . 'T00:00:00Z';
        $endDate = $year . '-' . $month . '-' . $day . 'T23:59:59Z';
        $noprocess = true;
    } elseif (preg_match('/(\\d\\d\\d\\d)\\s*-\\s*(\\d\\d\\d\\d)\\s*(-luvun|-l)\\s+(loppupuoli|loppu)/', $input, $matches) > 0) {
        $startDate = $matches[1];
        $endDate = $matches[2];
        if ($endDate % 100 == 0) {
            // Century
            $endDate += 99;
        } elseif ($endDate % 10 == 0) {
            // Decade
            $endDate += 9;
        }
    } elseif (preg_match('/(\\d?\\d?\\d\\d)\\s*(-|~)\\s*(\\d?\\d?\\d\\d)\\s*(-luku|-l)?\\s*(\\(?\\?\\)?)?/', $input, $matches) > 0) {
        // 1940-1960-luku
        // 1940-1960-l
        // 1940-60-l
        // 1930 - 1970-luku
        // 30-40-luku
        $startDate = $matches[1];
        $endDate = $matches[3];
        // preg_match() reports an unmatched group as an empty string when a
        // later group matched, so test for a non-empty value rather than
        // existence; otherwise "1940-1960?" would be expanded like a decade.
        if (!empty($matches[4])) {
            if ($endDate % 10 == 0) {
                $endDate += 9;
            }
        }
        $imprecise = isset($matches[5]);
    } elseif (preg_match('/(\\d?\\d?\\d\\d)\\s+(tammikuu|helmikuu|maaliskuu|huhtikuu|toukokuu|kesäkuu|heinäkuu|elokuu|syyskuu|lokakuu|marraskuu|joulukuu)/', $input, $matches) > 0) {
        // Year followed by a month name
        $year = $matches[1];
        $month = $k[$matches[2]];
        $startDate = $year . '-' . $month . '-01T00:00:00Z';
        $endDate = $year . '-' . $month . '-01';
        try {
            $d = new DateTime($endDate);
            $endDate = $d->format('Y-m-t') . 'T23:59:59Z';
        } catch (Exception $e) {
            global $logger;
            $logger->log('NdlLidoRecord', "Failed to parse date {$endDate}, record {$this->source}." . $this->getID(), Logger::ERROR);
            return null;
        }
        $noprocess = true;
    } elseif (preg_match('/(\\d\\d\\d\\d)(\\d\\d)(\\d\\d)/', $input, $matches) > 0) {
        // yyyymmdd
        $year = $matches[1];
        $month = sprintf('%02d', $matches[2]);
        $day = sprintf('%02d', $matches[3]);
        $startDate = $year . '-' . $month . '-' . $day . 'T00:00:00Z';
        $endDate = $year . '-' . $month . '-' . $day . 'T23:59:59Z';
        $noprocess = true;
    } elseif (preg_match('/(\\d\\d\\d\\d)(\\d\\d)/', $input, $matches) > 0) {
        // yyyymm
        $year = $matches[1];
        $month = sprintf('%02d', $matches[2]);
        $startDate = $year . '-' . $month . '-01T00:00:00Z';
        $endDate = $year . '-' . $month . '-01';
        try {
            $d = new DateTime($endDate);
        } catch (Exception $e) {
            global $logger;
            $logger->log('NdlLidoRecord', "Failed to parse date {$endDate}, record {$this->source}." . $this->getID(), Logger::ERROR);
            return null;
        }
        $endDate = $d->format('Y-m-t') . 'T23:59:59Z';
        $noprocess = true;
    } elseif (preg_match('/(\\d\\d?)\\s*\\.\\s*(\\d\\d?)\\s*\\.\\s*(\\d\\d\\d\\d)/', $input, $matches) > 0) {
        // d.m.yyyy
        $year = $matches[3];
        $month = sprintf('%02d', $matches[2]);
        $day = sprintf('%02d', $matches[1]);
        $startDate = $year . '-' . $month . '-' . $day . 'T00:00:00Z';
        $endDate = $year . '-' . $month . '-' . $day . 'T23:59:59Z';
        $noprocess = true;
    } elseif (preg_match('/(\\d\\d?)\\s*\\.\\s*(\\d\\d\\d\\d)/', $input, $matches) > 0) {
        // m.yyyy
        $year = $matches[2];
        $month = sprintf('%02d', $matches[1]);
        $startDate = $year . '-' . $month . '-01' . 'T00:00:00Z';
        $endDate = $year . '-' . $month . '-01';
        try {
            $d = new DateTime($endDate);
            $endDate = $d->format('Y-m-t') . 'T23:59:59Z';
        } catch (Exception $e) {
            global $logger;
            $logger->log('NdlLidoRecord', "Failed to parse date {$endDate}, record {$this->source}." . $this->getID(), Logger::ERROR);
            return null;
        }
        $noprocess = true;
    } elseif (preg_match('/(\\d?\\d?\\d\\d)\\s*-(luvun|luku)\\s+(alkupuolelta|alkupuoli|alku|alusta)/', $input, $matches) > 0) {
        // Beginning of a century or decade
        $year = $matches[1];
        if ($year % 100 == 0) {
            // Century
            $startDate = $year;
            $endDate = $year + 29;
        } elseif ($year % 10 == 0) {
            // Decade
            $startDate = $year;
            $endDate = $year + 3;
        } else {
            // Uhh?
            $startDate = $year;
            $endDate = $year;
        }
    } elseif (preg_match('/(\\d?\\d?\\d\\d)\\s*-(luvun|luku)\\s+(puoliväli)/', $input, $matches) > 0) {
        // Middle of a century or decade
        $year = $matches[1];
        if ($year % 100 == 0) {
            // Century
            $startDate = $year + 29;
            $endDate = $year + 70;
        } elseif ($year % 10 == 0) {
            // Decade
            $startDate = $year + 3;
            $endDate = $year + 7;
        } else {
            // Uhh?
            $startDate = $year;
            $endDate = $year;
        }
    } elseif (preg_match('/(\\d?\\d?\\d\\d)\\s*(-luvun|-l)\\s+(loppupuoli|loppu|lopulta|loppupuolelta)/', $input, $matches) > 0) {
        // End of a century or decade
        $year = $matches[1];
        if ($year % 100 == 0) {
            // Century
            $startDate = $year + 70;
            $endDate = $year + 99;
        } elseif ($year % 10 == 0) {
            // Decade
            $startDate = $year + 7;
            $endDate = $year + 9;
        } else {
            $startDate = $year;
            $endDate = $year;
        }
    } elseif (preg_match('/(-?\\d?\\d?\\d\\d)\\s*-(luku|luvulta|l)/', $input, $matches) > 0) {
        // Whole century or decade
        $year = $matches[1];
        $startDate = $year;
        if ($year % 100 == 0) {
            $endDate = $year + 99;
        } elseif ($year % 10 == 0) {
            $endDate = $year + 9;
        } else {
            $endDate = $year;
        }
    } elseif (preg_match('/(\\d?\\d?\\d\\d)\\s*ekr.?\\s*\\-\\s*(\\d?\\d?\\d\\d)\\s*ekr.?/', $input, $matches) > 0) {
        // Both years marked "ekr": negated
        $startDate = -$matches[1];
        $endDate = -$matches[2];
    } elseif (preg_match('/(\\d?\\d?\\d\\d)\\s*ekr.?\\s*\\-\\s*(\\d?\\d?\\d\\d)\\s*jkr.?/', $input, $matches) > 0) {
        // Start marked "ekr" (negated), end marked "jkr"
        $startDate = -$matches[1];
        $endDate = $matches[2];
    } elseif (preg_match('/(-?\\d?\\d?\\d\\d) jälkeen/', $input, $matches) > 0) {
        // "After" a year: that year and the nine following ones
        $year = $matches[1];
        $startDate = $year;
        $endDate = $year + 9;
    } elseif (preg_match('/(-?\\d\\d\\d\\d)\\s*-\\s*(-?\\d\\d\\d\\d)/', $input, $matches) > 0) {
        $startDate = $matches[1];
        $endDate = $matches[2];
    } elseif (preg_match('/(-?\\d{1,4})\\s+-\\s+(-?\\d{1,4})/', $input, $matches) > 0) {
        // One to four digit years, possibly negative. The quantifier must be
        // written {1,4}; {1-4} is not a quantifier and is matched literally.
        $startDate = $matches[1];
        $endDate = $matches[2];
    } elseif (preg_match('/(-?\\d?\\d?\\d\\d)\\s*\\?/', $input, $matches) > 0) {
        $year = $matches[1];
        $startDate = $year;
        $endDate = $year;
        $imprecise = true;
    } elseif (preg_match('/(-?\\d?\\d?\\d\\d)/', $input, $matches) > 0) {
        $year = $matches[1];
        $startDate = $year;
        $endDate = $year;
    } else {
        return null;
    }

    // Zero-pad negative years to four digits after the sign
    if ($startDate < 0) {
        $startDate = '-' . substr('0000', 0, 5 - strlen($startDate)) . substr($startDate, 1);
    } elseif ($startDate == 0) {
        $startDate = '0000';
    }

    if ($endDate < 0) {
        $endDate = '-' . substr('0000', 0, 5 - strlen($endDate)) . substr($endDate, 1);
    } elseif ($endDate == 0) {
        $endDate = '0000';
    }

    // Pad short years; a two-digit start year is placed in the 1900s
    switch (strlen($startDate)) {
    case 1:
        $startDate = "000{$startDate}";
        break;
    case 2:
        $startDate = "19{$startDate}";
        break;
    case 3:
        $startDate = "0{$startDate}";
        break;
    }
    switch (strlen($endDate)) {
    case 1:
        $endDate = "000{$endDate}";
        break;
    case 2:
        // Take into account possible negative sign
        $endDate = substr($startDate, 0, -2) . $endDate;
        break;
    case 3:
        $endDate = "0{$endDate}";
        break;
    }

    if ($imprecise) {
        // This is way arbitrary, so disabled for now..
        //$startDate -= 2;
        //$endDate += 2;
    }

    if (!$noprocess) {
        $startDate = $startDate . '-01-01T00:00:00Z';
        $endDate = $endDate . '-12-31T23:59:59Z';
    }

    // Trying to index dates into the future? I don't think so...
    // Both values are full timestamps here, so compare only their leading
    // (possibly negative) year numerically. Comparing the whole strings with
    // the year string would be a lexical comparison that rejects every date
    // in the current year.
    $yearNow = (int) date('Y');
    if ((int) $startDate > $yearNow || (int) $endDate > $yearNow) {
        return null;
    }

    $start = MetadataUtils::validateISO8601Date($startDate);
    $end = MetadataUtils::validateISO8601Date($endDate);
    if ($start === false || $end === false) {
        global $logger;
        $logger->log(
            'NdlLidoRecord',
            "Invalid date range {$startDate} - {$endDate} parsed from "
            . "'{$input}', record {$this->source}." . $this->getID(),
            Logger::WARNING
        );
        // Salvage a one-year range from whichever end is valid
        if ($start !== false) {
            $endDate = substr($startDate, 0, 4) . '-12-31T23:59:59Z';
        } elseif ($end !== false) {
            $startDate = substr($endDate, 0, 4) . '-01-01T00:00:00Z';
        } else {
            return null;
        }
    } elseif ($start > $end) {
        global $logger;
        $logger->log(
            'NdlLidoRecord',
            "Invalid date range {$startDate} - {$endDate} parsed from '{$input}', "
            . "record {$this->source}." . $this->getID(),
            Logger::WARNING
        );
        $endDate = substr($startDate, 0, 4) . '-12-31T23:59:59Z';
    }

    return [$startDate, $endDate];
}
/**
 * Tests for createSortableString
 *
 * Each input is run through MetadataUtils::createSortableString and compared
 * with its expected sortable form.
 *
 * @return void
 */
public function testCreateSortableString()
{
    // Input string => expected sortable form
    $expectations = [
        'A 123' => 'A 3123',
        'A 123 8 abc' => 'A 3123 18 ABC',
        'A 1 2' => 'A 11 12',
    ];

    foreach ($expectations as $input => $expected) {
        $actual = MetadataUtils::createSortableString($input);
        $this->assertEquals($expected, $actual);
    }
}
/**
 * Collect the values of every occurrence of a tag in the record document
 *
 * Each occurrence is cast to a string and passed through
 * MetadataUtils::stripTrailingPunctuation before being collected.
 *
 * @param string $tag XML tag to get
 *
 * @return array
 */
protected function getValues($tag)
{
    $collected = [];
    $elements = $this->doc->{$tag};
    foreach ($elements as $element) {
        $text = (string) $element;
        $collected[] = MetadataUtils::stripTrailingPunctuation($text);
    }
    return $collected;
}
/**
 * Return fields to be indexed in Solr (an alternative to an XSL transformation)
 *
 * Starts from the parent's field array and adds:
 * - unit date range fields parsed from did/unitdate,
 * - main date fields and a "(start-end)" year suffix on the title fields,
 * - a single-valued copy of hierarchy_sequence for sorting,
 * - source and datasource fields,
 * - a "digitized_" format prefix and online flags when did/daogrp is present,
 * - identifier, measurements, material, rights and usage rights.
 *
 * @param boolean $prependTitleWithSubtitle If true and title_sub differs from
 * title_short, title is formed by combining title_sub and title_short
 *
 * @return string[]
 */
public function toSolrArray($prependTitleWithSubtitle)
{
    $data = parent::toSolrArray($prependTitleWithSubtitle);
    $doc = $this->doc;

    $unitDateRange = $this->parseDateRange((string) $doc->did->unitdate);
    $data['search_sdaterange_mv'] = $data['unit_sdaterange']
        = MetadataUtils::dateRangeToNumeric($unitDateRange);
    $data['search_daterange_mv'] = $data['unit_daterange']
        = MetadataUtils::dateRangeToStr($unitDateRange);

    if ($unitDateRange) {
        $data['main_date_str'] = MetadataUtils::extractYear($unitDateRange[0]);
        $data['main_date'] = $this->validateDate($unitDateRange[0]);

        // Append year range to title (only years, not the full dates)
        $startYear = MetadataUtils::extractYear($unitDateRange[0]);
        $endYear = MetadataUtils::extractYear($unitDateRange[1]);
        $yearRange = '';
        // -9999 / 9999 are treated as open-ended bounds and left out
        if ($startYear != '-9999') {
            $yearRange = $startYear;
        }
        if ($endYear != $startYear) {
            $yearRange .= '-';
            if ($endYear != '9999') {
                $yearRange .= $endYear;
            }
        }
        if ($yearRange) {
            $yearRange = (string) $yearRange;
            $suffix = "({$yearRange})";
            $len = strlen($yearRange);
            $titleFields = ['title_full', 'title_sort', 'title', 'title_short'];
            foreach ($titleFields as $field) {
                // Do not fabricate a title consisting only of the year range
                // when the parent did not provide the field
                if (!isset($data[$field])) {
                    continue;
                }
                // Strict comparison: with loose comparison two numeric-looking
                // strings are compared as numbers, so a title ending in " 100"
                // would be taken to already carry the year range "0100"
                $endsWithRange = substr($data[$field], -$len) === $yearRange;
                $endsWithSuffix = substr($data[$field], -$len - 2) === $suffix;
                if (!$endsWithRange && !$endsWithSuffix) {
                    $data[$field] .= " {$suffix}";
                }
            }
        }
    }

    // Single-valued sequence for sorting
    if (isset($data['hierarchy_sequence'])) {
        $data['hierarchy_sequence_str'] = $data['hierarchy_sequence'];
    }

    $data['source_str_mv'] = isset($data['institution'])
        ? $data['institution'] : $this->source;
    $data['datasource_str_mv'] = $this->source;

    // Digitized?
    if ($doc->did->daogrp) {
        $digitizableFormats = ['collection', 'series', 'fonds', 'item'];
        // NOTE(review): comparison left non-strict because the type of
        // $data['format'] set by the parent is not visible here
        if (isset($data['format'])
            && in_array($data['format'], $digitizableFormats)
        ) {
            $data['format'] = 'digitized_' . $data['format'];
        }
        if ($doc->did->daogrp->daoloc) {
            foreach ($doc->did->daogrp->daoloc as $daoloc) {
                if ($daoloc->attributes()->{'href'}) {
                    $data['online_boolean'] = true;
                    // This is sort of special. Make sure to use source instead
                    // of datasource.
                    $data['online_str_mv'] = $data['source_str_mv'];
                    break;
                }
            }
        }
    }

    if (isset($doc->did->unitid)) {
        $data['identifier'] = (string) $doc->did->unitid;
    }
    if (isset($doc->did->dimensions)) {
        // display measurements
        $data['measurements'] = (string) $doc->did->dimensions;
    }
    if (isset($doc->did->physdesc)) {
        $data['material'] = (string) $doc->did->physdesc;
    }
    if (isset($doc->did->accessrestrict->p)) {
        $data['rights'] = (string) $doc->did->accessrestrict->p;
    }

    // Usage rights
    if ($rights = $this->getUsageRights()) {
        $data['usage_rights_str_mv'] = $rights;
    }

    return $data;
}