/**
 * Converts a string to a normalized form suitable for use as a key or slug
 *
 * The string is transliterated to ASCII, lower-cased, stripped of ampersands
 * and apostrophes, and every remaining non-word character (anything the
 * regex class \W matches, so spaces, plus signs and punctuation) becomes
 * a dash. Runs of dashes are collapsed to a single dash.
 *
 * @param string $region  original string
 * @return string         normalized string
 */
public static function normalize($region)
{
    // this is influenced by the setlocale() call with category LC_CTYPE; see PopulateDatabases.php
    $ascii = iconv('UTF-8', 'ASCII//TRANSLIT', $region);

    // iconv() returns false when it cannot convert the input; carry on with
    // the untransliterated string rather than normalizing a boolean, the
    // non-word replacement below still dashes out whatever is left over
    if ($ascii === false) {
        $ascii = $region;
    }

    $normalized = Parser::strtolower($ascii);

    // these are dropped entirely rather than turned into dashes
    $normalized = str_replace(array("&", "'"), "", $normalized);

    // these become dashes
    $normalized = str_replace(array("+", " "), "-", $normalized);

    // as does any other non-word character
    $normalized = Parser::preg_replace('/\\W/', "-", $normalized);

    // pare multiple dashes down to one
    return preg_replace('/-{2,}/', '-', $normalized);
}
/**
 * Create a normalized category name from the supplied name and store it
 * in the normalized property
 *
 * The result is lower-case ASCII in which ampersands and apostrophes are
 * removed and every other non-word character (anything \W matches) is a
 * dash, with runs of dashes collapsed to one.
 *
 * @param string $name  [optional] name to normalize; when null (or loosely
 *                      equal to null, e.g. an empty string) the name
 *                      property is used instead
 * @return $this
 */
public function setNormalizedFromName($name = null)
{
    if ($name == null) {
        $name = $this->name;
    }

    // convert accented characters and the like to just the ascii equivalent
    // this is influenced by the setlocale() call with category LC_CTYPE
    $ascii = iconv('UTF-8', 'ASCII//TRANSLIT', $name);

    // iconv() returns false when it cannot convert the input; fall back to
    // the untransliterated name instead of normalizing a boolean
    if ($ascii === false) {
        $ascii = $name;
    }

    $normalized = Parser::strtolower($ascii);

    // strip out weird characters
    $normalized = str_replace(array("&", "'"), "", $normalized);

    // convert these to dashes
    $normalized = str_replace(array("+", " "), "-", $normalized);

    // now any other non-word character to a dash
    $normalized = Parser::preg_replace('/\\W/', "-", $normalized);

    // pare multiple dashes down to one
    $this->normalized = preg_replace('/-{2,}/', '-', $normalized);

    return $this;
}
/**
 * Replace the phrase with its lower-case form
 *
 * @return $this
 */
public function toLower()
{
    $lowered = Parser::strtolower($this->phrase);
    $this->phrase = $lowered;

    return $this;
}
/**
 * Work out a format from free-text data field(s)
 *
 * The fields are lower-cased and checked against an ordered list of
 * keywords; the first rule with a matching keyword decides the format.
 *
 * @param string|array $data_fields  a single field, or an array of fields
 *                                   (joined with spaces before matching)
 * @return mixed  one of this class's format constants; self::Unknown when
 *                no keyword matches
 */
public function extractFormat($data_fields)
{
    if (is_array($data_fields)) {
        // combine them into a string
        $data_fields = implode(" ", $data_fields);
    }

    // the trailing space lets the 'review ' keyword also match a field that
    // ends in "review"; no other keyword ends in a space, so the padding
    // cannot change whether any of them match
    $haystack = Parser::strtolower($data_fields) . " ";

    // order matters: the more specific keywords have to be tried first
    $rules = array(
        array(self::Thesis, array('dissertation')),
        array(self::ConferenceProceeding, array('proceeding')),
        array(self::ConferencePaper, array('conference')),
        array(self::Hearing, array('hearing')),
        array(self::UnpublishedWork, array('working')),
        array(self::BookReview, array('book review', 'review-book')),
        array(self::Review, array('film review', 'film-book', 'review ')),
        array(self::BookSection, array('book art', 'book ch', 'chapter')),
        array(self::Article, array('journal', 'periodical', 'serial')),
        array(self::Book, array('book')),
        array(self::Pamphlet, array('pamphlet')),
        array(self::Article, array('essay', 'article')),
    );

    foreach ($rules as $rule) {
        foreach ($rule[1] as $keyword) {
            if (strpos($haystack, $keyword) !== false) {
                return $rule[0];
            }
        }
    }

    // if we got this far, just return unknown
    return self::Unknown;
}
/**
 * Add global array as xml to request xml document
 *
 * Each key becomes an element named after a sanitized, lower-cased version
 * of the key; the untouched key is kept in the 'original_key' attribute.
 * For an array value, one element is created per entry, carrying the entry's
 * own key in the 'key' attribute; entries that are themselves arrays are
 * added recursively beneath that element.
 *
 * @param DOMDocument $xml        [by reference] request xml document
 * @param DOMNode     $objAppend  [by reference] node to append values to
 * @param array       $arrValues  global array
 */
private function addElement(&$xml, &$objAppend, $arrValues)
{
    foreach ($arrValues as $key => $value) {
        // @todo: change this to 'data' element and fix xslt

        // need to make sure the xml element has a valid name
        // and not something crazy with spaces or commas, etc.
        $strSafeKey = Parser::strtolower(preg_replace('/\\W/', '_', (string) $key));

        // an xml name can be neither empty nor start with a digit, which is
        // what numeric array indexes produce; createElement() throws a
        // DOMException for such names, so give them a leading underscore
        if (!preg_match('/^[A-Za-z_]/', $strSafeKey)) {
            $strSafeKey = '_' . $strSafeKey;
        }

        if (!is_array($value)) {
            $objElement = $xml->createElement($strSafeKey, Parser::escapeXml($value));
            $objElement->setAttribute('original_key', $key);
            $objAppend->appendChild($objElement);
            continue;
        }

        foreach ($value as $strKey => $strValue) {
            $objElement = $xml->createElement($strSafeKey);
            $objElement->setAttribute('original_key', $key);
            $objElement->setAttribute("key", $strKey);
            $objAppend->appendChild($objElement);

            if (is_array($strValue)) {
                // multi-dimensional arrays will be recursively added
                $this->addElement($xml, $objElement, $strValue);
            } else {
                $objElement->nodeValue = Parser::escapeXml($strValue);
            }
        }
    }
}
/**
 * Best-guess extraction of volume, issue and pagination from a free-text
 * journal statement, broken out here for clarity
 *
 * Each piece that is recognised is removed from the working text before the
 * next piece is looked for, so digits already claimed are not matched again.
 *
 * @param string $strJournalInfo  any journal info, usually from 773
 * @return array  any of the keys 'volume', 'issue', 'spage', 'epage' that
 *                could be found; empty when nothing matched
 */
private function extractJournalData($strJournalInfo)
{
    $parsed = array();

    // drop the whole thing to lower case and pad it with spaces, since the
    // labelled patterns below all expect a leading space
    $text = " " . Parser::strtolower($strJournalInfo) . " ";

    // try a pattern against the working text; on a hit, strip the matched
    // text out of it and hand back the captures, otherwise null
    $consume = function ($pattern) use (&$text) {
        $captures = array();

        if (preg_match($pattern, $text, $captures) != 1) {
            return null;
        }

        $text = str_replace($captures[0], "", $text);

        return $captures;
    };

    // volume
    $volume = $consume('/ v[a-z]{0,5}[\\.]{0,1}[ ]{0,3}([0-9]{1,})/');

    if ($volume !== null) {
        $parsed["volume"] = $volume[1];
    }

    // issue: labelled 'i...' first, then 'n...' (as in "no.")
    $issue = $consume('/ i[a-z]{0,4}[\\.]{0,1}[ ]{0,3}([0-9]{1,})/');

    if ($issue === null) {
        $issue = $consume('/ n[a-z]{0,5}[\\.]{0,1}[ ]{0,3}([0-9]{1,})/');
    }

    if ($issue !== null) {
        $parsed["issue"] = $issue[1];
    }

    // pages: a start-end range wins, otherwise a lone labelled start page
    $range = $consume("/([0-9]{1,})-([0-9]{1,})/");

    if ($range !== null) {
        $parsed["spage"] = $range[1];
        $parsed["epage"] = $range[2];
    } else {
        $single = $consume('/ p[a-z]{0,3}[\\.]{0,1}[ ]{0,3}([0-9]{1,})/');

        if ($single !== null) {
            $parsed["spage"] = $single[1];
        }
    }

    return $parsed;
}
/**
 * Convert a title to title case
 *
 * Every word is capitalized except a fixed list of small words (of, the,
 * and, ...), which are lower-cased unless they come first. The first letter
 * of a subtitle (after ": ") and of a quoted word is capitalized as well.
 * Titles longer than 10 bytes that contain no lower-case ASCII letter are
 * treated as ALL CAPS and lower-cased before anything else is done.
 *
 * @param string $strInput  title to convert
 * @return string           title-cased string
 */
protected function toTitleCase($strInput)
{
    // if there are no lowercase letters (and it's sufficiently long a title to
    // not just be an acronym or something) then this is likely a title
    // entered into a database in ALL CAPS, so drop it entirely to
    // lower-case first
    if (!preg_match("/[a-z]/", $strInput) && strlen($strInput) > 10) {
        $strInput = Parser::strtolower($strInput);
    }

    // words that shouldn't be capitalized if they aren't the first word
    $arrSmallWords = array(
        'of', 'a', 'the', 'and', 'an', 'or', 'nor', 'but', 'is', 'if', 'then',
        'else', 'when', 'at', 'from', 'by', 'on', 'off', 'for', 'in', 'out',
        'over', 'to', 'into', 'with', 'as'
    );

    // split the string into separate words
    $arrWords = explode(' ', $strInput);

    foreach ($arrWords as $key => $word) {
        // small words are lower-cased, except in first position
        if ($key != 0) {
            $strLower = Parser::strtolower($word);

            if (in_array($strLower, $arrSmallWords, true)) {
                $arrWords[$key] = $strLower;
                continue;
            }
        }

        // everything else is capitalized; when the first character is a
        // quote or something, leave it alone and capitalize what follows
        if (preg_match("/^[^a-zA-Z0-9]/", $word)) {
            $arrWords[$key] = substr($word, 0, 1) . ucwords(substr($word, 1));
        } else {
            $arrWords[$key] = ucwords($word);
        }
    }

    // join the words back into a string
    $strFinal = implode(' ', $arrWords);

    // the fix-ups below go through a callback so that every occurrence gets
    // its own letter upper-cased; substituting one fixed replacement string
    // would stamp the first occurrence's letter over all the others

    // catch subtitles
    $strFinal = preg_replace_callback(
        '/: ([a-z])/',
        function ($arrMatches) {
            return ': ' . strtoupper($arrMatches[1]);
        },
        $strFinal
    );

    // catch words that start with double quotes
    $strFinal = preg_replace_callback(
        '/"([a-z])/',
        function ($arrMatches) {
            return '"' . strtoupper($arrMatches[1]);
        },
        $strFinal
    );

    // catch words that start with a single quote
    // need to be a little more cautious here: inside the title the quote
    // must follow a space (or open the title) to ensure this isn't a quote
    // for a contraction or for a possessive
    $strFinal = preg_replace_callback(
        "/(^| )'([a-z])/",
        function ($arrMatches) {
            return $arrMatches[1] . "'" . strtoupper($arrMatches[2]);
        },
        $strFinal
    );

    return $strFinal;
}