/**
 * Assigns a part-of-speech tag to every token found in an English text.
 *
 * A token is a run of word characters, digits, dots, apostrophes, hyphens
 * and underscores. Each token starts as a common noun ('NN'), is replaced
 * by the in-memory lexicon value when an entry exists, and then goes
 * through a fixed series of rewrite rules. Later rules read the tag
 * produced by earlier ones, so the order of the rules must not change.
 *
 * @param string $text text to tokenize and tag
 * @return array list of array('token' => ..., 'tag' => ...) in text order;
 *               'token' keeps any trailing full stops of the matched text
 */
public function tag($text)
{
    // KARIM: added '-_ to the characters allowed inside a token
    preg_match_all("/[\\w\\d\\.'\\-_]+/", $text, $matches);

    $nounTags = array('NN', 'NNS');
    $tagged = array();

    foreach ($matches[0] as $position => $rawToken) {
        $previous = ($position > 0) ? $tagged[$position - 1] : null;

        // the rules work on the token without its trailing full stops
        $word = $rawToken;
        if (substr($word, -1) == '.') {
            $word = preg_replace('/\\.+$/', '', $word);
        }

        // common noun by default, lexicon value when the word is known
        $tag = 'NN';
        if (modelEntryExistsInMemory("EN", "PHPIR_LEXICON", "POS_ENTRY", $word)) {
            $tag = getLexiconItemFromMemory($word);
        }

        // a verb right after a determiner is really a noun
        if ($previous !== null && $previous['tag'] == 'DT' && in_array($tag, array('VBD', 'VBP', 'VB'))) {
            $tag = 'NN';
        }

        // a noun that still contains a dot is a number
        if ($tag[0] == 'N' && strpos($word, '.') !== false) {
            $tag = 'CD';
        }

        // a noun ending with 'ed' is a past participle
        if ($tag[0] == 'N' && substr($word, -2) == 'ed') {
            $tag = 'VBN';
        }

        // whatever the current tag, a word ending with 'ly' is an adverb
        if (substr($word, -2) == 'ly') {
            $tag = 'RB';
        }

        // a common noun ending with 'al' is an adjective
        if (in_array($tag, $nounTags) && substr($word, -2) == 'al') {
            $tag = 'JJ';
        }

        // a noun that follows 'would' is a verb
        if ($previous !== null && $tag == 'NN' && strtolower($previous['token']) == 'would') {
            $tag = 'VB';
        }

        // a singular noun ending with 's' is a plural noun
        if ($tag == 'NN' && substr($word, -1) == 's') {
            $tag = 'NNS';
        }

        // a common noun ending with 'ing' is a gerund
        if (in_array($tag, $nounTags) && substr($word, -3) == 'ing') {
            $tag = 'VBG';
        }

        // noun followed by noun: the second one becomes a verb when the
        // lexicon lists a verb reading for it (VBN is preferred over VBZ)
        if ($previous !== null
            && in_array($tag, $nounTags)
            && in_array($previous['tag'], $nounTags)
            && modelEntryExistsInMemory("EN", "PHPIR_LEXICON", "POS_ENTRY", $word)
        ) {
            if (in_array('VBN', getLexiconItemFromMemory($word))) {
                $tag = 'VBN';
            } elseif (in_array('VBZ', getLexiconItemFromMemory($word))) {
                $tag = 'VBZ';
            }
        }

        $tagged[$position] = array('token' => $rawToken, 'tag' => $tag);
    }

    return $tagged;
}
/**
 * Collects frequency, morphology and verse information for one word.
 *
 * The word is resolved to its simple and Uthmani spellings, looked up in the
 * Arabic inverted index, and every occurrence found there is matched against
 * the QAC master table to gather its segments' tags, lemmas, root, features
 * and the verses it appears in.
 *
 * @param string $word          word in simple or Uthmani script (trimmed here)
 * @param mixed  $MODEL_CORE    not read by this function; kept for existing callers
 * @param mixed  $MODEL_SEARCH  not read by this function; kept for existing callers
 * @param mixed  $MODEL_QAC     not read by this function; kept for existing callers
 * @param bool   $fast          stop after the first occurrence that yielded a verse
 * @param bool   $exactWord     only accept occurrences whose word in the verse is
 *                              identical to the simple spelling
 * @return array|null null when the word has no simple spelling or is not in the
 *                    inverted index; otherwise an array with the keys WORD_SIMPLE,
 *                    WORD_UTHMANI, TF, TFIDF, BUCKWALTER, ROOT, LEM, POS, VERSES,
 *                    VERSES_POS_TAGS and FEATURES
 */
function getWordInfo($word, $MODEL_CORE, $MODEL_SEARCH, $MODEL_QAC, $fast = FALSE, $exactWord = FALSE)
{
    $word = trim($word);

    if (isSimpleQuranWord($word)) {
        $wordUthmani = getItemFromUthmaniToSimpleMappingTable($word);
        $wordSimple = $word;
    } else {
        $wordUthmani = $word;
        // tashkeel of last char is significant, ex: lemmas will probably not be in the MAP because of that
        $wordSimple = getItemFromUthmaniToSimpleMappingTable($wordUthmani);
    }

    // Bail out before touching any per-word table: reading the frequency
    // table first raised undefined-index warnings for unknown words.
    if (empty($wordSimple) || !modelEntryExistsInMemory("AR", "MODEL_SEARCH", "INVERTED_INDEX", $wordSimple)) {
        return null;
    }

    $WORDS_FREQUENCY = getModelEntryFromMemory("AR", "MODEL_CORE", "WORDS_FREQUENCY", "");
    $freqArr = isset($WORDS_FREQUENCY['WORDS_TFIDF'][$wordSimple])
        ? $WORDS_FREQUENCY['WORDS_TFIDF'][$wordSimple]
        : array('TF' => null, 'TFIDF' => 0);

    $wordInfoArr = array();
    $wordInfoArr['WORD_SIMPLE'] = $wordSimple;
    $wordInfoArr['WORD_UTHMANI'] = $wordUthmani;
    $wordInfoArr['TF'] = $freqArr['TF'];
    $wordInfoArr['TFIDF'] = round($freqArr['TFIDF'], 2);

    $buckwalterTransliteration = "";
    $wordRoot = "";
    $posTagsArr = array();
    $lemmasArr = array();
    $featuresArr = array();
    $versesArr = array();
    $versesTagsArr = array();

    // Last lemma seen; segments without a LEM feature reuse it. Starts as
    // null so that a leading segment without a lemma does not register an
    // empty lemma (it used to read an undefined variable).
    $lemma = null;

    $invertedIndexEntry = getModelEntryFromMemory("AR", "MODEL_SEARCH", "INVERTED_INDEX", $wordSimple);
    $QURAN_TEXT = getModelEntryFromMemory("AR", "MODEL_CORE", "QURAN_TEXT", "");
    $TOTALS = getModelEntryFromMemory("AR", "MODEL_CORE", "TOTALS", "");
    $PAUSEMARKS = $TOTALS['PAUSEMARKS'];

    foreach ($invertedIndexEntry as $documentArrInIndex) {
        $WORD_TYPE = $documentArrInIndex['WORD_TYPE'];

        // ignore root sources and pronouns, we only need the normal corresponding word
        if ($WORD_TYPE == "PRONOUN_ANTECEDENT" || $WORD_TYPE == "ROOT") {
            continue;
        }

        // the index stores zero-based sura/aya numbers, QAC locations are one-based
        $qacLocation = getQACLocationStr(
            $documentArrInIndex['SURA'] + 1,
            $documentArrInIndex['AYA'] + 1,
            $documentArrInIndex['INDEX_IN_AYA_UTHMANI']
        );

        $qacMasterTableEntryArr = getModelEntryFromMemory("AR", "MODEL_QAC", "QAC_MASTERTABLE", $qacLocation);
        if (empty($qacMasterTableEntryArr) || !is_array($qacMasterTableEntryArr)) {
            // nothing to learn from a location without segments
            continue;
        }

        // Everything below depends on the location only, not on the segment,
        // so it is computed once per occurrence.
        $verseText = getVerseByQACLocation($QURAN_TEXT, $qacLocation);

        if ($exactWord == TRUE) {
            $wordId = getWordIndexFromQACLocation($qacLocation);
            $wordFromVerseAtLocation = getWordFromVerseByIndex($PAUSEMARKS, $verseText, $wordId);

            // Rejecting the occurrence here (instead of inside the segment
            // loop) keeps a non-matching word from overwriting the
            // transliteration, features and lemma collected so far.
            if ($wordSimple !== $wordFromVerseAtLocation) {
                continue;
            }
        }

        // verse part of the location: everything before the last ':'
        $qacVerseLocation = substr($qacLocation, 0, strrpos($qacLocation, ":"));

        // search QAC for roots and lemmas for this word
        foreach ($qacMasterTableEntryArr as $segmentDataArr) {
            $tag = $segmentDataArr['TAG'];

            $buckwalterTransliteration = $segmentDataArr['FORM_EN'];
            $featuresArr = array_merge($segmentDataArr['FEATURES']);

            if (isset($segmentDataArr['FEATURES']['LEM'])) {
                $lemma = $segmentDataArr['FEATURES']['LEM'];
            }

            if (isset($segmentDataArr['FEATURES']['ROOT']) && $segmentDataArr['FEATURES']['ROOT'] != -1) {
                $wordRoot = $segmentDataArr['FEATURES']['ROOT'];
            }

            $posTagsArr[$tag] = 1;
            if ($lemma !== null) {
                $lemmasArr[$lemma] = 1;
            }

            if (!isset($versesArr[$qacVerseLocation])) {
                $versesArr[$qacVerseLocation] = $verseText;
            }

            if (!isset($versesTagsArr[$qacVerseLocation])) {
                $versesTagsArr[$qacVerseLocation] = "";
            }
            $versesTagsArr[$qacVerseLocation] = $versesTagsArr[$qacVerseLocation] . " " . $tag;
        }

        // we don't need all inverted index list except for verses, only break if we found at least one word
        if ($fast == true && !empty($versesArr)) {
            break;
        }
    }

    $wordInfoArr['BUCKWALTER'] = $buckwalterTransliteration;
    $wordInfoArr['ROOT'] = $wordRoot;
    $wordInfoArr['LEM'] = $lemmasArr;
    $wordInfoArr['POS'] = $posTagsArr;
    $wordInfoArr['VERSES'] = $versesArr;
    $wordInfoArr['VERSES_POS_TAGS'] = $versesTagsArr;
    $wordInfoArr['FEATURES'] = $featuresArr;

    return $wordInfoArr;
}
/**
 * Returns the ontology concepts mentioned in a text.
 *
 * The text is split on single spaces and every word is converted to a
 * concept ID: English words are cleaned, lower-cased and translated through
 * the English-to-Arabic concept name map, other words go through
 * convertWordToConceptID(). Words whose ID exists in the ontology's concept
 * list produce one concept object each; the generic "thing" class is skipped.
 *
 * @param string $text text to scan
 * @param string $lang "EN" selects the English lookup and English labels;
 *                     any other value uses the Arabic path
 * @return array concept objects built by createNewConceptObj(), keyed by concept ID
 */
function getConceptsFoundInText($text, $lang)
{
    global $thing_class_name_ar;

    $conceptsInTextArr = array();

    // createNewConceptObj() is handed a node serial number and a graph
    // position. These variables were never defined in this function, which
    // raised undefined-variable warnings. The serial number now starts at
    // zero, as in ontologyTextToD3Graph(); there is no layout to compute
    // here, so the position stays null, the value that was effectively
    // passed before.
    $nodeSerialNumber = 0;
    $randomXLocation = null;
    $randomYLocation = null;

    foreach (explode(" ", $text) as $word) {
        if ($lang == "EN") {
            $word = strtolower(cleanAndTrim($word));

            // translate English name to arabic concept name/id
            $wordConveretedToConceptID = getModelEntryFromMemory("ALL", "MODEL_QA_ONTOLOGY", "CONCEPTS_EN_AR_NAME_MAP", $word);
        } else {
            $wordConveretedToConceptID = convertWordToConceptID($word);
        }

        if (!modelEntryExistsInMemory("ALL", "MODEL_QA_ONTOLOGY", "CONCEPTS", $wordConveretedToConceptID)) {
            continue;
        }

        // the root "thing" class is never reported as a concept of the text
        if ($wordConveretedToConceptID == $thing_class_name_ar) {
            continue;
        }

        $mainConceptArr = getModelEntryFromMemory("ALL", "MODEL_QA_ONTOLOGY", "CONCEPTS", $wordConveretedToConceptID);

        $finalNodeLabel = ($lang == "EN") ? $mainConceptArr['label_en'] : $mainConceptArr['label_ar'];

        $conceptsInTextArr[$wordConveretedToConceptID] = createNewConceptObj($nodeSerialNumber, $lang, $finalNodeLabel, $mainConceptArr, $randomXLocation, $randomYLocation, 1);
    }

    return $conceptsInTextArr;
}
$ssPoSAggregationCorrespondingSent = array(); //echoN("Word:$word"); $targetType = "POS"; if (isArabicString($word)) { $targetType = "WORD"; if (isSimpleQuranWord($word)) { $poTaggedSubsentences = getPoSTaggedSubsentences("SIMPLE"); } else { $poTaggedSubsentences = getPoSTaggedSubsentences(); } } else { $poTaggedSubsentences = getPoSTaggedSubsentences(); } $targetPOSorWord = trim($word); if ($targetType == "POS") { if (!modelEntryExistsInMemory("AR", "MODEL_QAC", "QAC_POS", $targetPOSorWord)) { showTechnicalError("Not a valid PoS tag !"); exit; } } else { if (empty($targetPOSorWord)) { showTechnicalError("Word not valid !"); exit; } } /////////// PREPARE CONTEXT ARRAY //////////////// $contextMaxLevel = 3; $contextArr = array(); $contextArr[$targetPOSorWord] = array(); for ($i = $contextMaxLevel; $i >= 1; $i--) { $contextArr[$targetPOSorWord]["+{$i}"] = array();
/**
 * Builds the node/link structure of a D3 graph from the ontology concepts
 * found in a set of texts or query words.
 *
 * Phase 1 turns every input word that maps to an ontology concept into a
 * level-1 node. Phase 2 walks the outgoing and incoming relations of those
 * nodes, adds the related concepts as level-2 nodes where allowed, and
 * creates the links. Several relations between the same pair of nodes are
 * merged into one link whose verb is a comma-separated list.
 *
 * @param mixed  $MODEL_QA_ONTOLOGY   not read by this function; kept for existing callers
 * @param string $inputType           "SEARCH_RESULTS_TEXT_ARRAY": every element of
 *                                    $searchResultTextArr is a text that is split into
 *                                    words, and no concept outside the texts is added.
 *                                    Any other value: the keys of $searchResultTextArr
 *                                    are the words (or $query when $isPhraseSearch)
 * @param array  $searchResultTextArr texts, or an array keyed by query words
 * @param int    $minFreq             concepts with a lower 'frequency' are ignored in phase 1
 * @param array  $widthHeigthArr      array(width, height) of the drawing area; new nodes
 *                                    are placed randomly within 100 units of its centre
 * @param string $lang                "EN" for English labels and verbs, otherwise Arabic
 * @param bool   $mainConceptsOnly    only link concepts that are already nodes
 * @param bool   $isPhraseSearch      treat $query as one phrase (non-text input only)
 * @param bool   $isQuestion          not read by this function; kept for existing callers
 * @param string $query               phrase used when $isPhraseSearch is set
 * @return array array("nodes" => list of node objects, "links" => list of link arrays)
 */
function ontologyTextToD3Graph($MODEL_QA_ONTOLOGY, $inputType, $searchResultTextArr, $minFreq = 0, $widthHeigthArr, $lang, $mainConceptsOnly = false, $isPhraseSearch = false, $isQuestion = false, $query = "")
{
    global $thing_class_name_ar, $is_a_relation_name_ar;

    $graphNodes = array();
    $graphLinks = array();

    ////// calculate start points
    $width = $widthHeigthArr[0];
    $height = $widthHeigthArr[1];

    // rand() works on integers; truncate explicitly instead of relying on
    // the implicit float conversion when a dimension is odd
    $startLocationXMin = (int) ($width / 2 - 100);
    $startLocationXMax = (int) ($width / 2 + 100);
    $startLocationYMin = (int) ($height / 2 - 100);
    $startLocationYMax = (int) ($height / 2 + 100);

    /** SHOULD BE ZERO BASED FOR D3 TO WORK - o.target.weight = NULL**/
    $nodeSerialNumber = 0;

    $isSearchResultsText = ($inputType == "SEARCH_RESULTS_TEXT_ARRAY");

    ////// phase 1: one level-1 node per concept found in the input
    foreach ($searchResultTextArr as $text) {
        if ($isSearchResultsText) {
            $textWordsArr = explode(" ", $text);
        } elseif (!$isPhraseSearch) {
            // extendedQueryParam: the words are the keys of the input array
            $textWordsArr = array_keys($searchResultTextArr);
        } else {
            // phrase should be checked as is
            $textWordsArr = array($query);
        }

        foreach ($textWordsArr as $word) {
            if ($lang == "EN") {
                $word = strtolower(cleanAndTrim($word));

                // translate English name to arabic concept name/id
                $wordConveretedToConceptID = getModelEntryFromMemory("ALL", "MODEL_QA_ONTOLOGY", "CONCEPTS_EN_AR_NAME_MAP", $word);
            } else {
                $wordConveretedToConceptID = convertWordToConceptID($word);
            }

            if (!modelEntryExistsInMemory("ALL", "MODEL_QA_ONTOLOGY", "CONCEPTS", $wordConveretedToConceptID)) {
                continue;
            }

            $mainConceptArr = getModelEntryFromMemory("ALL", "MODEL_QA_ONTOLOGY", "CONCEPTS", $wordConveretedToConceptID);

            if ($mainConceptArr['frequency'] < $minFreq) {
                continue;
            }

            if ($wordConveretedToConceptID == $thing_class_name_ar) {
                continue;
            }

            if (isset($graphNodes[$wordConveretedToConceptID])) {
                continue;
            }

            $finalNodeLabel = ($lang == "EN") ? $mainConceptArr['label_en'] : $mainConceptArr['label_ar'];

            $randomXLocation = rand($startLocationXMin, $startLocationXMax);
            $randomYLocation = rand($startLocationYMin, $startLocationYMax);

            $graphNodes[$wordConveretedToConceptID] = createNewConceptObj($nodeSerialNumber, $lang, $finalNodeLabel, $mainConceptArr, $randomXLocation, $randomYLocation, 1);
        }

        if (!$isSearchResultsText) {
            // The word list above does not depend on $text, so further
            // iterations would only revisit the same words.
            break;
        }
    }

    $tooManyConcepts = count($graphNodes) > 200;

    $ONTOLOGY_RELATIONS = getModelEntryFromMemory("ALL", "MODEL_QA_ONTOLOGY", "RELATIONS", "");

    // hash of (source id, target id) => index of the link in $graphLinks
    $linksHashLookupTable = array();

    ////// phase 2: relations of every phase-1 node
    // foreach iterates over a copy, so nodes added below are not expanded themselves
    foreach ($graphNodes as $concept => $ignoredNodeArr) {
        $conceptID = convertWordToConceptID($concept);

        $relationsOfConceptAsSource = getModelEntryFromMemory("ALL", "MODEL_QA_ONTOLOGY", "GRAPH_INDEX_SOURCES", $conceptID);
        if (!is_array($relationsOfConceptAsSource)) {
            $relationsOfConceptAsSource = array();
        }

        $relationsOfConceptAsTarget = getModelEntryFromMemory("ALL", "MODEL_QA_ONTOLOGY", "GRAPH_INDEX_TARGETS", $conceptID);
        if (!is_array($relationsOfConceptAsTarget)) {
            $relationsOfConceptAsTarget = array();
        }

        //// outgoing relations: $concept --verb--> $object
        foreach ($relationsOfConceptAsSource as $relArr) {
            $verb = $relArr["link_verb"];
            $object = $relArr["target"];

            // ignore is-a thing relations
            if ($verb == $is_a_relation_name_ar && $object == $thing_class_name_ar) {
                continue;
            }

            if ($tooManyConcepts && $verb == $is_a_relation_name_ar) {
                continue;
            }

            // IF SHOWING MAIN CONCEPTS ONLY, IGNORE CONCEPTS NOT IN MAIN CONCEPTS LIST
            if ($mainConceptsOnly && !isset($graphNodes[$object])) {
                continue;
            }

            // NO extending by relations in case of search result text
            // to reduce number of concepts we only add relations with other concepts
            // found in the text
            if ($isSearchResultsText && !isset($graphNodes[$object])) {
                continue;
            }

            $randomXLocation = rand($startLocationXMin, $startLocationXMax);
            $randomYLocation = rand($startLocationYMin, $startLocationYMax);

            $relHashID = buildRelationHashID($conceptID, $verb, $object);
            $fullRelationArr = $ONTOLOGY_RELATIONS[$relHashID];

            $relatedConceptArr = getModelEntryFromMemory("ALL", "MODEL_QA_ONTOLOGY", "CONCEPTS", $object);

            $finalNodeLabel = $relatedConceptArr['label_ar'];
            if ($lang == "EN") {
                $finalNodeLabel = formatEnglishConcept($relatedConceptArr['label_en']);
                $verb = $fullRelationArr['VERB_TRANSLATION_EN'];
            }

            if (!isset($graphNodes[$object])) {
                $graphNodes[$object] = createNewConceptObj($nodeSerialNumber, $lang, $finalNodeLabel, $relatedConceptArr, $randomXLocation, $randomYLocation, 2);
            }

            $linkArr = array(
                "source" => $graphNodes[$concept]["id"],
                "target" => $graphNodes[$object]["id"],
                "link_verb" => $verb,
                "link_frequency" => $fullRelationArr['FREQUENCY']
            );

            //////// HANDLING MULTIPLE LINKS BETWEEN SAME NODES BEFORE ASSIGNING LINK
            $arrHash = getArrayHashForFields($linkArr, array('source', 'target'));

            if (!isset($linksHashLookupTable[$arrHash])) {
                $graphLinks[] = $linkArr;
                $linksHashLookupTable[$arrHash] = count($graphLinks) - 1;
            } else {
                $linkIndex = $linksHashLookupTable[$arrHash];

                if (strpos($graphLinks[$linkIndex]['link_verb'], "{$verb}") === false) {
                    $graphLinks[$linkIndex]['link_verb'] .= "," . $verb;
                }
            }
        }

        //// incoming relations: $subject --verb--> $concept
        foreach ($relationsOfConceptAsTarget as $relArr) {
            $verb = $relArr["link_verb"];
            $subject = $relArr["source"];

            // IF SHOWING MAIN CONCEPTS ONLY, IGNORE CONCEPTS NOT IN MAIN CONCEPTS LIST
            if ($mainConceptsOnly && !isset($graphNodes[$subject])) {
                continue;
            }

            if ($tooManyConcepts && $verb == $is_a_relation_name_ar) {
                continue;
            }

            // NO extending by relations in case of search result text
            // to reduce number of concepts we only add relations with other concepts
            // found in the text.
            // This used to test $object, a leftover of the outgoing loop
            // (or undefined), instead of the subject of this relation.
            if ($isSearchResultsText && !isset($graphNodes[$subject])) {
                continue;
            }

            $relHashID = buildRelationHashID($subject, $verb, $concept);
            $fullRelationArr = $ONTOLOGY_RELATIONS[$relHashID];

            $randomXLocation = rand($startLocationXMin, $startLocationXMax);
            $randomYLocation = rand($startLocationYMin, $startLocationYMax);

            $relatedConceptArr = getModelEntryFromMemory("ALL", "MODEL_QA_ONTOLOGY", "CONCEPTS", $subject);

            $finalNodeLabel = $relatedConceptArr['label_ar'];
            if ($lang == "EN") {
                $finalNodeLabel = formatEnglishConcept($relatedConceptArr['label_en']);
                $verb = $fullRelationArr['VERB_TRANSLATION_EN'];
            }

            if (!isset($graphNodes[$subject])) {
                $graphNodes[$subject] = createNewConceptObj($nodeSerialNumber, $lang, $finalNodeLabel, $relatedConceptArr, $randomXLocation, $randomYLocation, 2);
            }

            // The outgoing loop reads 'FREQUENCY' while this loop read
            // 'frequency'; prefer the upper-case key used by the other
            // relation fields and fall back to the old spelling.
            // NOTE(review): confirm the key name against the relation builder.
            if (isset($fullRelationArr['FREQUENCY'])) {
                $linkFrequency = $fullRelationArr['FREQUENCY'];
            } elseif (isset($fullRelationArr['frequency'])) {
                $linkFrequency = $fullRelationArr['frequency'];
            } else {
                $linkFrequency = null;
            }

            $linkArr = array(
                "source" => $graphNodes[$subject]["id"],
                "target" => $graphNodes[$concept]["id"],
                "link_verb" => $verb,
                "link_frequency" => $linkFrequency
            );

            //////// HANDLING MULTIPLE LINKS BETWEEN SAME NODES BEFORE ASSIGNING LINK
            $arrHash = getArrayHashForFields($linkArr, array('source', 'target'));

            if (!isset($linksHashLookupTable[$arrHash])) {
                $graphLinks[] = $linkArr;
                $linksHashLookupTable[$arrHash] = count($graphLinks) - 1;
            } else {
                $linkIndex = $linksHashLookupTable[$arrHash];

                if (strpos($graphLinks[$linkIndex]['link_verb'], "{$verb}") === false) {
                    $graphLinks[$linkIndex]['link_verb'] .= "," . $verb;
                }
            }
        }
    }

    // D3 expects plain lists, so drop the concept-ID keys of the node map
    $graphObj = array();
    $graphObj["nodes"] = array_values($graphNodes);
    $graphObj["links"] = $graphLinks;

    return $graphObj;
}
/**
 * Tells whether at least one word of a phrase is in the inverted index.
 *
 * The input is split on single spaces; the lookup stops at the first part
 * that has an entry in the inverted index of the given language.
 *
 * @param string $lang         language code passed to the in-memory model lookup
 * @param string $wordOrPhrase single word or space-separated phrase
 * @return bool true as soon as one part is indexed, false when none is
 */
function wordOrPhraseIsInIndex($lang, $wordOrPhrase)
{
    global $MODEL_SEARCH;

    $parts = explode(" ", $wordOrPhrase);
    $partsCount = count($parts);

    $isIndexed = false;

    for ($position = 0; $position < $partsCount && !$isIndexed; $position++) {
        if (modelEntryExistsInMemory($lang, "MODEL_SEARCH", "INVERTED_INDEX", $parts[$position])) {
            $isIndexed = true;
        }
    }

    return $isIndexed;
}