Example #1
0
 /**
  * Converts a string to a normalized string of word characters and dashes
  *
  * The input is transliterated from UTF-8 to ASCII, passed through
  * Parser::strtolower(), stripped of ampersands and apostrophes, and every
  * other non-word character (spaces and plus signs included) is turned into
  * a dash. Runs of dashes are then collapsed into a single dash.
  *
  * @param string $region	original string
  * @return string			normalized string
  */
 public static function normalize($region)
 {
     // the transliteration is influenced by the setlocale() call with category LC_CTYPE; see PopulateDatabases.php
     $ascii = iconv('UTF-8', 'ASCII//TRANSLIT', $region);
     $slug = Parser::strtolower($ascii);

     // these characters are dropped outright
     $slug = str_replace(array("&", "'"), "", $slug);

     // these become dashes, as does any other non-word character
     $slug = str_replace(array("+", " "), "-", $slug);
     $slug = Parser::preg_replace('/\\W/', "-", $slug);

     // squeeze runs of dashes down to a single dash
     while (strpos($slug, "--") !== false) {
         $slug = str_replace("--", "-", $slug);
     }

     return $slug;
 }
Example #2
0
 /**
  * Build the normalized category name (lowercase, word characters and
  * dashes) from the supplied name and store it in the normalized property
  *
  * @param string $name  [optional] will otherwise use name property
  * @return $this
  */
 public function setNormalizedFromName($name = null)
 {
     // note the loose comparison: an empty string also falls back to the name property
     $source = ($name == null) ? $this->name : $name;

     // reduce accented characters and the like to their ascii equivalent;
     // this is influenced by the setlocale() call with category LC_CTYPE
     $slug = iconv('UTF-8', 'ASCII//TRANSLIT', $source);
     $slug = Parser::strtolower($slug);

     // ampersands and apostrophes are removed entirely
     $slug = str_replace(array("&", "'"), "", $slug);

     // plus signs and spaces become dashes, then so does every other non-word character
     $slug = str_replace(array("+", " "), "-", $slug);
     $slug = Parser::preg_replace('/\\W/', "-", $slug);

     // collapse runs of dashes into one
     while (strpos($slug, "--") !== false) {
         $slug = str_replace("--", "-", $slug);
     }

     $this->normalized = $slug;

     return $this;
 }
Example #3
0
 /**
  * Replace the phrase with the result of Parser::strtolower() on it
  *
  * @return $this
  */
 public function toLower()
 {
     $lowered = Parser::strtolower($this->phrase);
     $this->phrase = $lowered;

     return $this;
 }
Example #4
0
 /**
  * Guess a format from free-text data fields by keyword matching
  *
  * The fields are joined (when given as an array), passed through
  * Parser::strtolower(), and checked against an ordered list of keywords;
  * the first keyword found decides the format.
  *
  * @param string|array $data_fields	one field, or an array of fields
  * @return mixed						one of this class's format constants; self::Unknown when nothing matches
  */
 public function extractFormat($data_fields)
 {
     // combine multiple fields into a single string
     if (is_array($data_fields)) {
         $data_fields = implode(" ", $data_fields);
     }

     // a trailing space is appended so that the 'review ' keyword can also
     // match at the very end of the text; no other keyword ends in a space,
     // so the padding does not affect any other check
     $haystack = Parser::strtolower($data_fields) . " ";

     // order matters: more specific keywords come before the generic ones
     $formatsByKeyword = array(
         'dissertation' => self::Thesis,
         'proceeding' => self::ConferenceProceeding,
         'conference' => self::ConferencePaper,
         'hearing' => self::Hearing,
         'working' => self::UnpublishedWork,
         'book review' => self::BookReview,
         'review-book' => self::BookReview,
         'film review' => self::Review,
         'film-book' => self::Review,
         'review ' => self::Review,
         'book art' => self::BookSection,
         'book ch' => self::BookSection,
         'chapter' => self::BookSection,
         'journal' => self::Article,
         'periodical' => self::Article,
         'serial' => self::Article,
         'book' => self::Book,
         'pamphlet' => self::Pamphlet,
         'essay' => self::Article,
         'article' => self::Article,
     );

     foreach ($formatsByKeyword as $keyword => $format) {
         if (strpos($haystack, $keyword) !== false) {
             return $format;
         }
     }

     // nothing recognised
     return self::Unknown;
 }
Example #5
0
 /**
  * Add global array as xml to request xml document
  *
  * Each key becomes an element whose name is the key with every non-word
  * character replaced by an underscore, passed through Parser::strtolower().
  * The untouched key is kept in the 'original_key' attribute. For array
  * values, one element is written per entry with the entry's key in the
  * 'key' attribute, and nested arrays are added recursively beneath it.
  *
  * @param DOMDocument $xml			[by reference] request xml document
  * @param DOMNode $objAppend		[by reference] node to append values to
  * @param array $arrValues			global array
  */
 private function addElement(&$xml, &$objAppend, $arrValues)
 {
     foreach ($arrValues as $key => $value) {
         $strOriginalKey = (string) $key;

         // @todo: change this to 'data' element and fix xslt
         // need to make sure the xml element has a valid name
         // and not something crazy with spaces or commas, etc.
         $strSafeKey = Parser::strtolower(preg_replace('/\\W/', '_', $strOriginalKey));

         // an xml name may not be empty or start with a digit; createElement()
         // throws a DOMException for those, which is what numeric array keys
         // (including those reached through the recursion below) would
         // trigger, so such names get an underscore in front
         if (! preg_match('/^[a-z_]/i', $strSafeKey)) {
             $strSafeKey = '_' . $strSafeKey;
         }

         if (is_array($value)) {
             foreach ($value as $strKey => $strValue) {
                 $objElement = $xml->createElement($strSafeKey);
                 $objElement->setAttribute('original_key', $strOriginalKey);
                 $objElement->setAttribute("key", (string) $strKey);
                 $objAppend->appendChild($objElement);
                 if (is_array($strValue)) {
                     // multi-dimensional arrays will be recursively added
                     $this->addElement($xml, $objElement, $strValue);
                 } else {
                     $objElement->nodeValue = Parser::escapeXml($strValue);
                 }
             }
         } else {
             $objElement = $xml->createElement($strSafeKey, Parser::escapeXml($value));
             $objElement->setAttribute('original_key', $strOriginalKey);
             $objAppend->appendChild($objElement);
         }
     }
 }
Example #6
0
 /**
  * Best-guess regular expressions for pulling volume, issue and pagination
  * out of a free-text journal description, broken out here for clarity
  *
  * @param string $strJournalInfo		any journal info, usually from 773
  * @return array						may contain the keys 'volume', 'issue', 'spage' and 'epage'
  */
 private function extractJournalData($strJournalInfo)
 {
     $parsed = array();
     $match = array();

     // run everything through Parser::strtolower() and pad with a space on
     // each side, so a label at the very start is still preceded by a space
     $haystack = " " . Parser::strtolower($strJournalInfo) . " ";

     // volume; the matched text is removed so its digits are not picked up again
     if (preg_match('/ v[a-z]{0,5}[\\.]{0,1}[ ]{0,3}([0-9]{1,})/', $haystack, $match)) {
         $parsed["volume"] = $match[1];
         $haystack = str_replace($match[0], "", $haystack);
     }

     // issue: labels starting with 'i' are tried first, then those starting
     // with 'n'; the first pattern that matches wins
     $issuePatterns = array(
         '/ i[a-z]{0,4}[\\.]{0,1}[ ]{0,3}([0-9]{1,})/',
         '/ n[a-z]{0,5}[\\.]{0,1}[ ]{0,3}([0-9]{1,})/',
     );

     foreach ($issuePatterns as $pattern) {
         if (preg_match($pattern, $haystack, $match)) {
             $parsed["issue"] = $match[1];
             $haystack = str_replace($match[0], "", $haystack);
             break;
         }
     }

     // pages: a numeric range gives both start and end page, otherwise
     // fall back to a single number following a label starting with 'p'
     if (preg_match("/([0-9]{1,})-([0-9]{1,})/", $haystack, $match)) {
         $parsed["spage"] = $match[1];
         $parsed["epage"] = $match[2];
     } elseif (preg_match('/ p[a-z]{0,3}[\\.]{0,1}[ ]{0,3}([0-9]{1,})/', $haystack, $match)) {
         $parsed["spage"] = $match[1];
     }

     return $parsed;
 }
Example #7
0
 /**
  * Convert a title to title case
  *
  * Every word is capitalized except a fixed list of small words (unless the
  * small word comes first). The first letter after ': ', after a double
  * quote, and after a single quote that opens a word is capitalized too.
  *
  * @param string $strInput	title to convert
  * @return string			title-cased string
  */
 protected function toTitleCase($strInput)
 {
     // if there are no lowercase letters (and it's sufficiently long a title to
     // not just be an acronym or something) then this is likely a title
     // entered into a database in ALL CAPS, so drop it entirely to
     // lower-case first
     if (preg_match("/[a-z]/", $strInput) == 0 && strlen($strInput) > 10) {
         $strInput = Parser::strtolower($strInput);
     }

     // words that shouldn't be capitalized if they aren't the first word
     $arrSmallWords = array('of', 'a', 'the', 'and', 'an', 'or', 'nor', 'but', 'is', 'if', 'then', 'else', 'when', 'at', 'from', 'by', 'on', 'off', 'for', 'in', 'out', 'over', 'to', 'into', 'with', 'as');

     // split the string into separate words
     $arrWords = explode(' ', $strInput);

     foreach ($arrWords as $key => $word) {
         $strLower = Parser::strtolower($word);

         if ($key != 0 && in_array($strLower, $arrSmallWords, true)) {
             // small word that isn't first: force it to lower case
             $arrWords[$key] = $strLower;
         } elseif (preg_match("/^[^a-zA-Z0-9]/", $word)) {
             // first character is a quote or something, so capitalize what follows it
             $arrWords[$key] = substr($word, 0, 1) . ucwords(substr($word, 1));
         } else {
             $arrWords[$key] = ucwords($word);
         }
     }

     // join the words back into a string
     $strFinal = implode(' ', $arrWords);

     // each match is upper-cased with its OWN letter; replacing every match
     // with the letter captured from the first one would corrupt titles that
     // have more than one subtitle or quoted word
     $fnCapitalize = function ($arrMatch) {
         return $arrMatch[1] . strtoupper($arrMatch[2]);
     };

     $arrPatterns = array(
         // subtitles
         '/(: )([a-z])/',
         // words that start with double quotes
         '/(")([a-z])/',
         // words that start with a single quote inside the title; a space is
         // required before the quote to make sure this isn't a quote for a
         // contraction or for a possessive
         "/( ')([a-z])/",
         // single quote opening the very first word
         "/(^')([a-z])/",
     );

     foreach ($arrPatterns as $strPattern) {
         $strFinal = preg_replace_callback($strPattern, $fnCapitalize, $strFinal);
     }

     return $strFinal;
 }