function loadModels($modelsToBeLoaded, $lang) {
    global $modelSources, $serializedModelFile, $quranMetaDataFile, $META_DATA, $MODEL_CORE, $MODEL_SEARCH, $MODEL_QAC, $MODEL_QURANA;
    global $UTHMANI_TO_SIMPLE_WORD_MAP, $numberOfSuras, $pauseMarksFile;
    global $TRANSLATION_MAP_EN_TO_AR, $TRANSLATION_MAP_AR_TO_EN, $TRANSLITERATION_WORDS_MAP, $TRANSLITERATION_VERSES_MAP;
    global $wordByWordTranslationFile, $transliterationFile;
    global $MODEL_WORDNET, $qaOntologyNamespace, $qaOntologyFile, $is_a_relation_name_ar, $is_a_relation_name_en;
    global $thing_class_name_ar, $thing_class_name_en;
    global $MODEL_QA_ONTOLOGY, $arabicStopWordsFileL2;
    global $TRANSLITERATION_WORDS_LOCATION_MAP, $TRANSLITERATION_WORDS_INDEX;

    // not working
    gc_enable();

    if (!function_exists("apc_exists")) {
        throw new Exception("APC not found!");
    }

    //echoN("MODEL EXISTS IN CACHE?:".apc_exists("EN/MODEL_CORE/TOTALS/"));

    ##### CHECK MODEL IN CACHE ##### #####
    if (TRUE && apc_exists("EN/MODEL_CORE/TOTALS/") !== false) {
        // split list by comma
        $modelListArr = preg_split("/,/", trim($modelsToBeLoaded));

        /**
         * TODO: CHANGE THE CODE TO REFERENCE APC MEMORY DIRECTLY INSTEAD OF LOADING DATA IN EACH SCRIPT
         */
        foreach ($modelListArr as $modelName) {
            //echoN("$modelName $lang ".time());
            // echoN(memory_get_peak_usage());
            //echoN($modelName);

            if ($modelName == "ontology") {
                /*$MODEL_QA_ONTOLOGY = apc_fetch("MODEL_QA_ONTOLOGY");
                if ($MODEL_QA_ONTOLOGY===false ) { echo "$MODEL_QA_ONTOLOGY NOT CACHED";exit; } */
            }

            if ($modelName == "wordnet") {
            }

            if ($modelName == "core") {
                //$MODEL_CORE = json_decode((file_get_contents("$serializedModelFile.core")),TRUE);
                /*$MODEL_CORE = apc_fetch("MODEL_CORE[$lang]");
                if ($MODEL_CORE===false ) { echo "CORE MODEL [$lang] NOT CACHED";exit; }*/
            } elseif ($modelName == "search") {
                //$MODEL_SEARCH = json_decode((file_get_contents("$serializedModelFile.search")),TRUE);
                //$MODEL_SEARCH = apc_fetch("MODEL_SEARCH[$lang]");
                /*if ($MODEL_SEARCH===false ) { echo "SEARCH MODEL [$lang] NOT CACHED";exit; }*/
            } elseif ($modelName == "qac") {
                //$MODEL_QAC = json_decode((file_get_contents("$serializedModelFile.qac")),TRUE);
                /*$MODEL_QAC = apc_fetch("MODEL_QAC");
                if ($MODEL_QAC===false ) { echo "QAC MODEL NOT CACHED";exit; } */
            }
        }

        $MODEL_WORDNET['INDEX'] = apc_fetch("WORDNET_INDEX");
        if ($MODEL_WORDNET['INDEX'] === false) {
            echo "MODEL_WORDNET['INDEX'] NOT CACHED";
            exit;
        }

        $MODEL_WORDNET['LEXICO_SEMANTIC_CATEGORIES'] = apc_fetch("WORDNET_LEXICO_SEMANTIC_CATEGORIES");
        if ($MODEL_WORDNET['LEXICO_SEMANTIC_CATEGORIES'] === false) {
            echo "MODEL MODEL_WORDNET['LEXICO_SEMANTIC_CATEGORIES'] NOT CACHED";
            exit;
        }

        $MODEL_WORDNET['DATA'] = apc_fetch("WORDNET_DATA");
        if ($MODEL_WORDNET['DATA'] === false) {
            echo "MODEL MODEL_WORDNET['DATA'] NOT CACHED";
            exit;
        }

        //else if ( ($modelName=="qurana"))
        //{
        //$MODEL_QURANA = json_decode((file_get_contents("$serializedModelFile.qurana")),TRUE);
        $MODEL_QURANA = apc_fetch("MODEL_QURANA");
        if ($MODEL_QURANA === false) {
            echo "QURANA MODEL NOT CACHED";
            exit;
        }
        //}

        return;
    }
    ########## ##### ##### ##### ##### #####

    //$quran = file($quranMetaDataFile);
    $quranMetaDataXMLObj = simplexml_load_file($quranMetaDataFile);

    ###### CONVERT META XML STRUCTURE TO OUR STRUCTURE
    foreach ($quranMetaDataXMLObj->suras as $index => $surasArr) {
        foreach ($surasArr->sura as $suraMetaArr) {
            $tempArr = array();
            $tempArr['index'] = (string) $suraMetaArr['index'];
            $tempArr['ayas'] = (string) $suraMetaArr['ayas'];
            $tempArr['name_ar'] = (string) $suraMetaArr['name'];
            $tempArr['name_trans'] = (string) $suraMetaArr['tname'];
            $tempArr['name_en'] = (string) $suraMetaArr['ename'];
            $tempArr['type'] = (string) $suraMetaArr['type'];
            $tempArr['order'] = (string) $suraMetaArr['order'];

            $META_DATA['SURAS'][] = $tempArr;
        }
    }
    ##############################################

    /////////// LOAD ONTOLOGY
    $reader = new OWLReader();
    $ontology = new OWLMemoryOntology();

    $thingClassName = "{$thing_class_name_ar}";

    $ontology->setNamespace($qaOntologyNamespace);
    $reader->readFromFile($qaOntologyFile, $ontology);

    //preprint_r($ontology->{'owl_data'}['classes']);
    //preprint_r($ontology->{'owl_data'}['properties']);
    //preprint_r($ontology->{'owl_data'}['labels']);
    //preprint_r($ontology->{'owl_data'}['annotations']);
    //preprint_r($ontology->{'owl_data'}['instances']);

    $classes = $ontology->{'owl_data'}['classes'];
    $instances = $ontology->{'owl_data'}['instances'];

    $qaOntologyConceptsArr = array();
    $qaOntologyRelationsArr = array();

    $relationsCount = 0;

    foreach ($classes as $className => $infoArr) {
        $className = stripOntologyNamespace($className);

        $qaOntologyConceptsArr[$className] = array("type" => "class");

        //echoN($className);
        //preprint_r($infoArr);

        foreach ($infoArr[0]['properties'] as $index => $propertiesArr) {
            /** IN CASE THIS CLASS HAS MULTIPLE PROPERTIES WITH THE SAME VERB **/
            foreach ($propertiesArr as $index2 => $onePropertyArr) {
                if (empty($onePropertyArr)) {
                    continue;
                }

                $verb = key($onePropertyArr);
                $objectClassArr = current($onePropertyArr);
                $objectConceptName = stripOntologyNamespace($objectClassArr[0]);

                //echoN("CLASS:***** $className => $verb -> $objectConceptName");

                $attributedArr = next($onePropertyArr);
                $freq = $attributedArr['frequency'];
                $engTranslation = $attributedArr['verb_translation_en'];
                $verbUthmani = $attributedArr['verb_uthmani'];

                $relHashID = buildRelationHashID($className, $verb, $objectConceptName);

                $qaOntologyRelationsArr[$relHashID] = array(
                    "SUBJECT" => $className,
                    "VERB" => $verb,
                    "OBJECT" => $objectConceptName,
                    "FREQUENCY" => $freq,
                    "VERB_TRANSLATION_EN" => $engTranslation,
                    "VERB_UTHMANI" => $verbUthmani
                );

                //preprint_r($qaOntologyRelationsArr[$relHashID]);

                $relationsCount++;
            }
        }
    }
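    /*
     * A minimal sketch (not executed) of how one property entry is read by the loops above and
     * below, using key()/current()/next() exactly as in the code. The sample array is
     * hypothetical and only mirrors the shape the OWLReader output is assumed to have here.
     *
     *   $onePropertyArr = array(
     *       "يرث" => array("http://example/ontology#أرض"),   // verb => object class (namespace stripped later)
     *       1     => array("frequency" => 2,
     *                      "verb_translation_en" => "inherits",
     *                      "verb_uthmani" => "يَرِثُ"),
     *   );
     *
     *   $verb           = key($onePropertyArr);       // the verb ("يرث")
     *   $objectClassArr = current($onePropertyArr);   // array of object class names
     *   $attributedArr  = next($onePropertyArr);      // attribute array: frequency, translations
     */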
    foreach ($instances as $instanceName => $intancesArr) {
        foreach ($intancesArr as $index => $infoArr) {
            $subjectConceptName = stripOntologyNamespace($instanceName);
            $parent = stripOntologyNamespace($infoArr['class']);

            //echoN("$subjectConceptName $parent");

            $relHashID = buildRelationHashID($subjectConceptName, $is_a_relation_name_ar, $parent);

            $qaOntologyRelationsArr[$relHashID] = array(
                "SUBJECT" => $subjectConceptName,
                "VERB" => "{$is_a_relation_name_ar}",
                "OBJECT" => $parent,
                "VERB_TRANSLATION_EN" => "{$is_a_relation_name_en}"
            );

            if ($parent != $thing_class_name_ar) {
                $relationsCount++;
            }

            $propertiesArr = $infoArr['properties'];

            //echoN($instanceName);
            //echoN("$instanceName:@@@");
            //preprint_r($propertiesArr);

            /** IN CASE THIS INSTANCE HAS MULTIPLE PROPERTIES WITH THE SAME VERB **/
            foreach ($propertiesArr as $index2 => $onePropertyArr) {
                if (empty($onePropertyArr)) {
                    continue;
                }

                $verb = key($onePropertyArr);
                $objectClassArr = current($onePropertyArr);
                $objectConceptName = stripOntologyNamespace($objectClassArr[0]);

                //echoN("***** $verb -> $objectConceptName");

                $attributedArr = next($onePropertyArr);
                $freq = $attributedArr['frequency'];
                $engTranslation = $attributedArr['verb_translation_en'];
                $verbUthmani = $attributedArr['verb_uthmani'];

                $relHashID = buildRelationHashID($subjectConceptName, $verb, $objectConceptName);

                $qaOntologyRelationsArr[$relHashID] = array(
                    "SUBJECT" => $subjectConceptName,
                    "VERB" => $verb,
                    "OBJECT" => $objectConceptName,
                    "FREQUENCY" => $freq,
                    "VERB_TRANSLATION_EN" => $engTranslation,
                    "VERB_UTHMANI" => $verbUthmani
                );

                $relationsCount++;
            }

            // if it is a class, don't make it an instance even if it is a subject (subclass of another class)
            // BUG: "animal" was not appearing on the ontology graph page since it was an instance
            if (empty($qaOntologyConceptsArr[$subjectConceptName]) || $qaOntologyConceptsArr[$subjectConceptName]['type'] != 'class') {
                $qaOntologyConceptsArr[$subjectConceptName] = array("type" => "instance");
            }
        }
    }

    foreach ($qaOntologyConceptsArr as $conceptName => $infoArr) {
        $fullConceptName = $qaOntologyNamespace . $conceptName;

        $labelsArr = $ontology->{'owl_data'}['labels'][$fullConceptName];

        foreach ($labelsArr as $labelLang => $label) {
            /*if ( mb_strlen($label)==1)
            {
                echon($fullConceptName);
                preprint_r($ontology->{'owl_data'}['labels'][$fullConceptName]);
            }*/
            $qaOntologyConceptsArr[$conceptName]['label_' . strtolower($labelLang)] = $label;
        }

        // "Thing" does not have annotations
        if (isset($ontology->{'owl_data'}['annotations'][$fullConceptName])) {
            $annotationsArr = $ontology->{'owl_data'}['annotations'][$fullConceptName];

            foreach ($annotationsArr as $index => $annotArr) {
                $key = $annotArr['KEY'];
                $val = $annotArr['VAL'];

                $qaOntologyConceptsArr[$conceptName][$key] = $val;
                //echoN("[$conceptName][$key] = $val");
            }
        }
    }

    ////////// OUTPUT STATS
    /*echoN("INSTANCES COUNT:".count($ontology->{'owl_data'}['instances']));
    echoN("CLASSES COUNT:".count($ontology->{'owl_data'}['classes']));
    echoN("PROPERTIES COUNT - DECLARATIONS ONLY:".count($ontology->{'owl_data'}['properties']));
    echoN("CONCEPTS COUNT:".count($qaOntologyConceptsArr));
    echoN("RELATIONS COUNT:".$relationsCount);
    preprint_r($qaOntologyRelationsArr);*/
    //////////////////

    ///////////// QUALITY CHECK CONCEPTS
    $qaOntologyConceptsArr2 = array();
    foreach ($qaOntologyConceptsArr as $key => $val) {
        $newKey = strtr($key, "_", " ");
        $qaOntologyConceptsArr2[$newKey] = $val;
    }

    $ONTOLOGY_EXTRACTION_FOLDER = "../data/ontology/extraction/";

    $finalConcepts = unserialize(file_get_contents("{$ONTOLOGY_EXTRACTION_FOLDER}/temp.final.concepts.final"));

    $diffArr = array_diff(array_keys($qaOntologyConceptsArr2), array_keys($finalConcepts));

    $conceptsDiffCount = count($diffArr);

    if ($conceptsDiffCount > 0) {
        echoN("<b>### OWL-PROPRIETARY-CONCEPTS-DIFF-COUNT:</b>" . $conceptsDiffCount);
    }
    //preprint_r($diffArr);
    //////////////////////////////////////////////////////////////

    //////// QUALITY CHECK RELATIONS
    $relationsArr = unserialize(file_get_contents("{$ONTOLOGY_EXTRACTION_FOLDER}/temp.final.relations"));

    $matchingTable = array();

    foreach ($qaOntologyRelationsArr as $index => $relArr) {
        $trippleStr = $relArr['SUBJECT'] . "->" . $relArr['VERB'] . "->" . $relArr['OBJECT'];

        // since Thing relations are not in the list we are comparing with
        if ($relArr['OBJECT'] == $thing_class_name_ar) {
            continue;
        }

        //echoN($trippleStr);
        $trippleStr = trim($trippleStr);
        $matchingTable[$trippleStr]++;
    }

    foreach ($relationsArr as $index => $relArr) {
        $relArr['SUBJECT'] = strtr($relArr['SUBJECT'], " ", "_");
        $relArr['VERB'] = strtr($relArr['VERB'], " ", "_");
        $relArr['OBJECT'] = strtr($relArr['OBJECT'], " ", "_");

        $trippleStr = $relArr['SUBJECT'] . "->" . $relArr['VERB'] . "->" . $relArr['OBJECT'];
        $trippleStr = trim($trippleStr);

        $matchingTable[$trippleStr]++;
    }

    // keep only triples that appear in one of the two sets but not both
    $matchingTable = array_filter($matchingTable, function ($v) {
        return $v <= 1;
    });

    $relationsDiffCount = count($matchingTable);

    if ($relationsDiffCount > 0) {
        echoN("<b>### OWL-PROPRIETARY-RELATIONS-DIFF-COUNT:</b>" . $relationsDiffCount);
        preprint_r($matchingTable);
    }
    //////////////////////////////////////////////

    //echoN( join("<br>",array_keys($qaOntologyConceptsArr)));

    $qaOntologyVerbIndex = array();
    $qaOntologyGraphSourcesIndex = array();
    $qaOntologyGraphTargetsIndex = array();

    //preprint_r($qaOntologyRelationsArr);
    //exit;

    foreach ($qaOntologyRelationsArr as $index => $relArr) {
        $subject = $relArr['SUBJECT'];
        $verb = $relArr['VERB'];
        $verb_translation_en = $relArr['VERB_TRANSLATION_EN'];
        $object = $relArr['OBJECT'];

        //$qaOntologyVerbIndex[$verb][]=array("SUBJECT"=>$subject,"OBJECT"=>$object);
        //$qaOntologyVerbIndex[$verb_translation_en][]=array("SUBJECT"=>$subject,"OBJECT"=>$object);
        addValueToMemoryModel("ALL", "MODEL_QA_ONTOLOGY", "VERB_INDEX", $verb, array("SUBJECT" => $subject, "OBJECT" => $object));
        addValueToMemoryModel("ALL", "MODEL_QA_ONTOLOGY", "VERB_INDEX", $verb_translation_en, array("SUBJECT" => $subject, "OBJECT" => $object));

        //$qaOntologyGraphSourcesIndex[$subject][]=array("link_verb"=>$verb,"target"=>$object,"relation_index"=>$index);
        //$qaOntologyGraphTargetsIndex[$object][]=array("source"=>$subject,"link_verb"=>$verb,"relation_index"=>$index);
        addToMemoryModelList("ALL", "MODEL_QA_ONTOLOGY", "GRAPH_INDEX_SOURCES", $subject, array("link_verb" => $verb, "target" => $object, "relation_index" => $index));
        addToMemoryModelList("ALL", "MODEL_QA_ONTOLOGY", "GRAPH_INDEX_TARGETS", $object, array("source" => $subject, "link_verb" => $verb, "relation_index" => $index));
    }

    $qaOntologyConceptsENtoARMapArr = array();
    foreach ($qaOntologyConceptsArr as $arName => $conceptArr) {
        $enLabel = trim(strtolower($conceptArr['label_en']));

        //$qaOntologyConceptsENtoARMapArr[$enLabel]=$arName;
        addValueToMemoryModel("ALL", "MODEL_QA_ONTOLOGY", "CONCEPTS_EN_AR_NAME_MAP", $enLabel, $arName);
    }
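    /*
     * A hedged reading sketch: addValueToMemoryModel()/getAPCIterator() appear to use APC keys of
     * the form <LANG>/<MODEL>/<ENTRY>/<KEY> (see the iterator patterns used elsewhere in this file
     * for ALL/MODEL_QA_ONTOLOGY/CONCEPTS and AR/MODEL_SEARCH/INVERTED_INDEX, and the
     * "EN/MODEL_CORE/TOTALS/" probe above). Assuming that convention, a single verb-index entry
     * could be fetched back with a plain APC call:
     *
     *   $verb  = "خلق";                                                    // hypothetical verb
     *   $entry = apc_fetch("ALL/MODEL_QA_ONTOLOGY/VERB_INDEX/" . $verb);
     *   if ($entry !== false) {
     *       // $entry is expected to hold the stored SUBJECT/OBJECT pair(s) for that verb
     *   }
     */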
    $qaSynonymsIndex = array();
    foreach ($qaOntologyConceptsArr as $arName => $conceptArr) {
        addValueToMemoryModel("ALL", "MODEL_QA_ONTOLOGY", "CONCEPTS", $arName, $conceptArr);

        $i = 1;
        while (isset($conceptArr['synonym_' . $i])) {
            if (empty($conceptArr['synonym_' . $i])) {
                $i++;
                continue;
            }

            $synonymLabel = trim(strtolower($conceptArr['synonym_' . $i]));

            $qaSynonymsIndex[$synonymLabel] = $arName;
            addValueToMemoryModel("ALL", "MODEL_QA_ONTOLOGY", "SYNONYMS_INDEX", $synonymLabel, $arName);

            $i++;
        }
    }
    //preprint_r($qaOntologyConceptsArr);exit;

    //$MODEL_QA_ONTOLOGY['CONCEPTS'] = $qaOntologyConceptsArr;
    //$MODEL_QA_ONTOLOGY['RELATIONS'] = $qaOntologyRelationsArr;
    addValueToMemoryModel("ALL", "MODEL_QA_ONTOLOGY", "RELATIONS", "", $qaOntologyRelationsArr);

    //$MODEL_QA_ONTOLOGY['GRAPH_INDEX_SOURCES'] = $qaOntologyGraphSourcesIndex;
    //$MODEL_QA_ONTOLOGY['GRAPH_INDEX_TARGETS'] = $qaOntologyGraphTargetsIndex;
    //$MODEL_QA_ONTOLOGY['CONCEPTS_EN_AR_NAME_MAP'] = $qaOntologyConceptsENtoARMapArr;
    //$MODEL_QA_ONTOLOGY['VERB_INDEX'] = $qaOntologyVerbIndex;
    //$MODEL_QA_ONTOLOGY['SYNONYMS_INDEX'] = $qaSynonymsIndex;

    //$res = apc_store("MODEL_QA_ONTOLOGY",$MODEL_QA_ONTOLOGY);
    //if ( $res===false){ throw new Exception("Can't cache MODEL_QA_ONTOLOGY"); }

    //preprint_r($MODEL_QA_ONTOLOGY);exit;
    //////// END ONTOLOGY LOADING ////////////////////////////

    /// WORDNET
    loadWordnet($MODEL_WORDNET);
    /////////////

    // free resources
    $quranMetaDataXMLObj = null;
    unset($quranMetaDataXMLObj);

    foreach ($modelSources as $supportedLang => $modelSourceArr) {
        $type = $modelSourceArr['type'];
        $file = $modelSourceArr['file'];

        //echoN("$lang $type $file");

        loadModel($supportedLang, $type, $file);

        // not working
        $gced = gc_collect_cycles();
        //echoN($gced);
    }

    //echoN(json_encode($MODEL));

    ############ Uthmani/Simple mapping table #################
    ############ AND WORD-BY-WORD TRANSLATION AND TRANSLITERATION #################

    $pauseMarksArr = getPauseMarksArrByFile($pauseMarksFile);

    $wordByWordFileArr = file($wordByWordTranslationFile, FILE_SKIP_EMPTY_LINES | FILE_IGNORE_NEW_LINES);
    $translitertationArr = file($transliterationFile, FILE_SKIP_EMPTY_LINES | FILE_IGNORE_NEW_LINES);

    $WORD_SENSES_EN = array();
    $WORD_SENSES_AR = array();

    $quranTextEntryFromAPC_AR = getModelEntryFromMemory("AR", "MODEL_CORE", "QURAN_TEXT", "");
    $quranTextEntryFromAPC_UTH = getModelEntryFromMemory("AR_UTH", "MODEL_CORE", "QURAN_TEXT", "");

    /* SURAS LOOP */
    for ($s = 0; $s < $numberOfSuras; $s++) {
        $suraSize = count($quranTextEntryFromAPC_AR[$s]);

        /* VERSES LOOP */
        for ($a = 0; $a < $suraSize; $a++) {
            $i++;

            $verseTextSimple = $quranTextEntryFromAPC_AR[$s][$a];
            $simpleWordsArr = preg_split("/ /", $verseTextSimple);

            $verseTextUthmani = $quranTextEntryFromAPC_UTH[$s][$a];
            $uthmaniWordsArr = preg_split("/ /", $verseTextUthmani);

            $simpleWordsArr = removePauseMarksFromArr($pauseMarksArr, $simpleWordsArr);
            $uthmaniWordsArr = removePauseMarksFromArr($pauseMarksArr, $uthmaniWordsArr);

            $verseLocation = $s + 1 . ":" . ($a + 1);

            $UTHMANI_TO_SIMPLE_LOCATION_MAP[$verseLocation] = array();

            ///////// Transliteration /////////////
            $transliterationLine = current($translitertationArr);
            next($translitertationArr);

            $lineParts = preg_split("/\\|/", $transliterationLine);
            $verseTransliteration = $lineParts[2];

            //echoN($transliterationLine);

            $TRANSLITERATION_VERSES_MAP[$verseLocation] = $verseTransliteration;

            $wordsTransliterationArr = preg_split("/ /", $verseTransliteration);
            // preprint_r($wordsTransliterationArr);exit;
            /////////////////////////////////////////////////

            $wtwIndex = 0;

            foreach ($uthmaniWordsArr as $index => $wordUthmani) {
                $qacMasterID = $s + 1 . ":" . ($a + 1) . ":" . ($index + 1);

                $qacMasterTableEntry = getModelEntryFromMemory("AR", "MODEL_QAC", "QAC_MASTERTABLE", $qacMasterID);

                $lemma = $qacMasterTableEntry[0]['FEATURES']['LEM'];

                // to handle multi-segment words such as الدنيا
                if (empty($lemma)) {
                    $lemma = $qacMasterTableEntry[1]['FEATURES']['LEM'];
                }

                //echoN("|$lemma|$wordUthmani");

                // $wtwIndex (INDEX_IN_AYA_EMLA2Y) needs to be 1 based ( UTHMANI=IMLA2Y )
                $UTHMANI_TO_SIMPLE_LOCATION_MAP[$s + 1 . ":" . ($a + 1)][$index + 1] = $wtwIndex + 1;

                $wordSimple = $simpleWordsArr[$wtwIndex++];

                //$UTHMANI_TO_SIMPLE_LOCATION_MAP[($s+1).":".($a+1)][($index+1)."-".$wordUthmani]=($wtwIndex)."-".$wordSimple;

                /* for ayas which differ in word count, do the following:
                 * if the current word is ويا or ها or يا
                 * then join it with the next word and make them one word
                 */
                if (count($uthmaniWordsArr) != count($simpleWordsArr) && ($wordSimple == "يا" || $wordSimple == "ها" || $wordSimple == "ويا" || $wordUthmani == "وَأَلَّوِ")) {
                    if ($wordUthmani == "يَبْنَؤُمَّ") {
                        // example 0 => 1
                        $UTHMANI_TO_SIMPLE_LOCATION_MAP[$s + 1 . ":" . ($a + 1)][$index + 1] = $wtwIndex + 1;
                        //[($index+1)."-".$wordUthmani]=($wtwIndex+1)."-".$wordSimple;

                        $wordSimple = $wordSimple . " " . $simpleWordsArr[$wtwIndex++] . " " . $simpleWordsArr[$wtwIndex++];
                    } else {
                        // example 0 => 1
                        $UTHMANI_TO_SIMPLE_LOCATION_MAP[$s + 1 . ":" . ($a + 1)][$index + 1] = $wtwIndex + 1;
                        //[($index+1)."-".$wordUthmani]=($wtwIndex+1)."-".$wordSimple;

                        $wordSimple = $wordSimple . " " . $simpleWordsArr[$wtwIndex++];
                    }

                    //echoN("$wordUthmani:$wordSimple");
                }

                // printHTMLPageHeader();
                // echoN("$wordSimple|$wordUthmani");

                ///////// english translation ////////
                $wordByWordTranslationLine = current($wordByWordFileArr);
                next($wordByWordFileArr);

                $linePartsArr = preg_split("/\\|/", $wordByWordTranslationLine);
                $englishTranslationForCurrentWord = $linePartsArr[5];
                /////////////////////////////////////////////////

                $WORD_SENSES_EN[$englishTranslationForCurrentWord][$wordUthmani]++;
                $WORD_SENSES_AR[$wordUthmani][$englishTranslationForCurrentWord]++;

                $TRANSLATION_MAP_EN_TO_AR[$englishTranslationForCurrentWord] = $wordUthmani;
                $TRANSLATION_MAP_AR_TO_EN[$wordUthmani] = $englishTranslationForCurrentWord;

                $TRANSLITERATION_WORDS_MAP[$wordUthmani] = $wordsTransliterationArr[$index];

                $clenaedTranliteration = cleanTransliteratedText($wordsTransliterationArr[$index]);
                $TRANSLITERATION_WORDS_INDEX[$clenaedTranliteration] = 1;

                $TRANSLITERATION_WORDS_LOCATION_MAP["{$s}:{$a}:{$index}"] = $wordsTransliterationArr[$index];

                //preprint_r($TRANSLITERATION_WORDS_LOCATION_MAP);
                // preprint_r($TRANSLATION_MAP_AR_TO_EN);
                // preprint_r($TRANSLITERATION_WORDS_MAP);

                $UTHMANI_TO_SIMPLE_WORD_MAP[$wordUthmani] = $wordSimple;
                addValueToMemoryModel("AR", "OTHERS", "UTHMANI_TO_SIMPLE_WORD_MAP", $wordUthmani, $wordSimple);

                $UTHMANI_TO_SIMPLE_WORD_MAP[$wordSimple] = $wordUthmani;
                addValueToMemoryModel("AR", "OTHERS", "UTHMANI_TO_SIMPLE_WORD_MAP", $wordSimple, $wordUthmani);

                if (!empty($lemma)) {
                    if (!isset($LEMMA_TO_SIMPLE_WORD_MAP[$lemma])) {
                        $LEMMA_TO_SIMPLE_WORD_MAP[$lemma] = $wordSimple;
                    } else {
                        $oldSimple = $LEMMA_TO_SIMPLE_WORD_MAP[$lemma];

                        if (myLevensteinEditDistance($oldSimple, $lemma) > myLevensteinEditDistance($wordSimple, $lemma)) {
                            $LEMMA_TO_SIMPLE_WORD_MAP[$lemma] = $wordSimple;
                        }
                    }
                }
            }
        }
    }
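    /*
     * Shape of $UTHMANI_TO_SIMPLE_LOCATION_MAP built above (illustrative values, not real data):
     * the outer key is "<sura>:<aya>" (both 1-based) and each inner entry maps a 1-based word
     * index in the Uthmani-script verse to the 1-based index of the corresponding word in the
     * simple (Imla'i) script verse, e.g.:
     *
     *   $UTHMANI_TO_SIMPLE_LOCATION_MAP["2:4"] = array(1 => 1, 2 => 2, 3 => 3);
     *   $imla2yIndex = $UTHMANI_TO_SIMPLE_LOCATION_MAP["2:4"][3];   // 3
     *
     * When one Uthmani word corresponds to two simple-script words (the يا / ها cases handled
     * above), the following Uthmani index skips over the consumed simple-script position.
     */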
    /////// ADD UTHMANI TO SIMPLE LOCATION MAP TO MEMORY
    foreach ($UTHMANI_TO_SIMPLE_LOCATION_MAP as $verseLocation => $verseMappingArr) {
        /*foreach($mappingArr as $uhtmaniIndex=>$imal2yIndex)
        {
        }*/
        addValueToMemoryModel("AR", "OTHERS", "UTHMANI_TO_SIMPLE_LOCATION_MAP", $verseLocation, $verseMappingArr);
    }
    ///////////////////////////////////////////////////////

    //preprint_r($TRANSLATION_MAP_EN_TO_AR);exit;
    //preprint_r($WORD_SENSES_AR);exit;

    // CAN'T BE ADDED IN THE CORE_MODEL since the mapping happens after loadModel
    //$res = apc_store("UTHMANI_TO_SIMPLE_WORD_MAP",$UTHMANI_TO_SIMPLE_WORD_MAP);
    //if ( $res===false){ throw new Exception("Can't cache UTHMANI_TO_SIMPLE_WORD_MAP"); }

    //$res = apc_store("UTHMANI_TO_SIMPLE_LOCATION_MAP",$UTHMANI_TO_SIMPLE_LOCATION_MAP);
    //if ( $res===false){ throw new Exception("Can't cache UTHMANI_TO_SIMPLE_LOCATION_MAP"); }

    $res = apc_store("LEMMA_TO_SIMPLE_WORD_MAP", $LEMMA_TO_SIMPLE_WORD_MAP);
    if ($res === false) {
        throw new Exception("Can't cache LEMMA_TO_SIMPLE_WORD_MAP");
    }

    $res = apc_store("WORDS_TRANSLATIONS_EN_AR", $TRANSLATION_MAP_EN_TO_AR);
    if ($res === false) {
        throw new Exception("Can't cache WORDS_TRANSLATIONS_EN_AR");
    }

    $res = apc_store("WORDS_TRANSLATIONS_AR_EN", $TRANSLATION_MAP_AR_TO_EN);
    if ($res === false) {
        throw new Exception("Can't cache WORDS_TRANSLATIONS_AR_EN");
    }

    $res = apc_store("WORDS_TRANSLITERATION", $TRANSLITERATION_WORDS_MAP);
    if ($res === false) {
        throw new Exception("Can't cache WORDS_TRANSLITERATION");
    }

    $res = apc_store("TRANSLITERATION_WORDS_LOCATION_MAP", $TRANSLITERATION_WORDS_LOCATION_MAP);
    if ($res === false) {
        throw new Exception("Can't cache TRANSLITERATION_WORDS_LOCATION_MAP");
    }

    $res = apc_store("TRANSLITERATION_VERSES_MAP", $TRANSLITERATION_VERSES_MAP);
    if ($res === false) {
        throw new Exception("Can't cache TRANSLITERATION_VERSES_MAP");
    }

    $res = apc_store("TRANSLITERATION_WORDS_INDEX", $TRANSLITERATION_WORDS_INDEX);
    if ($res === false) {
        throw new Exception("Can't cache TRANSLITERATION_WORDS_INDEX");
    }

    $res = apc_store("WORD_SENSES_EN", $WORD_SENSES_EN);
    if ($res === false) {
        throw new Exception("Can't cache WORD_SENSES_EN");
    }

    $res = apc_store("WORD_SENSES_AR", $WORD_SENSES_AR);
    if ($res === false) {
        throw new Exception("Can't cache WORD_SENSES_AR");
    }

    //// ENRICH INVERTED INDEX BY UTHMANI-EMLA2Y INDEXES
    //echoN(count($MODEL_SEARCH['AR']['INVERTED_INDEX']));

    foreach (getAPCIterator("AR\\/MODEL_SEARCH\\/INVERTED_INDEX\\/.*") as $invertedIndexCursor) {
        $wordDataArr = $invertedIndexCursor['value'];
        $key = $invertedIndexCursor['key'];
        $word = getEntryKeyFromAPCKey($key);

        foreach ($wordDataArr as $index => $documentArrInIndex) {
            $WORD_TYPE = $documentArrInIndex['WORD_TYPE'];
            $SURA = $documentArrInIndex['SURA'];
            $AYA = $documentArrInIndex['AYA'];

            //echoN($word." ".$WORD_TYPE);

            if ($WORD_TYPE == "NORMAL_WORD") {
                $INDEX_IN_AYA_EMLA2Y = $documentArrInIndex['INDEX_IN_AYA_EMLA2Y'];

                foreach ($UTHMANI_TO_SIMPLE_LOCATION_MAP[$SURA + 1 . ":" . ($AYA + 1)] as $uhtmaniIndex => $imal2yIndex) {
                    if ($imal2yIndex == $INDEX_IN_AYA_EMLA2Y) {
                        $INDEX_IN_AYA_UTHMANI = $uhtmaniIndex;
                        break;
                    }
                }

                //echoN($INDEX_IN_AYA_UTHMANI);

                $wordDataArr[$index]['INDEX_IN_AYA_UTHMANI'] = $INDEX_IN_AYA_UTHMANI;
            } else {
                // needed for highlighting pronoun characters in search
                $INDEX_IN_AYA_UTHMANI = $documentArrInIndex['INDEX_IN_AYA_UTHMANI'];

                $INDEX_IN_AYA_EMLA2Y = getSimpleWordIndexByUthmaniWordIndex($SURA + 1 . ":" . ($AYA + 1), $INDEX_IN_AYA_UTHMANI);

                $wordDataArr[$index]['INDEX_IN_AYA_EMLA2Y'] = $INDEX_IN_AYA_EMLA2Y;
            }
        }

        // UPDATE
        updateModelData($key, $wordDataArr);
    }

    //$res = apc_store("MODEL_SEARCH[AR]",$MODEL_SEARCH['AR']);
    //if ( $res===false){ throw new Exception("Can't cache MODEL_SEARCH[AR]"); }

    //preprint_r($TRANSLITERATION_WORDS_LOCATION_MAP);

    /// ADD TRANSLITERATION TO INVERTED INDEX WITH ENGLISH WORDS
    if ($lang == "EN") {
        $invertedIndexBatchApcArr = array();

        foreach ($TRANSLITERATION_WORDS_LOCATION_MAP as $location => $transliteratedWord) {
            $locationArr = explode(":", $location);
            $s = $locationArr[0];
            $a = $locationArr[1];
            $wordIndex = $locationArr[2];

            //echoN("$transliteratedWord,$s,$a,$wordIndex");

            $transliteratedWord = strtolower(strip_tags($transliteratedWord));

            //$MODEL_SEARCH['EN']['INVERTED_INDEX'][$word]
            addToInvertedIndex($invertedIndexBatchApcArr, $lang, $transliteratedWord, $s, $a, $wordIndex, "NORMAL_WORD");
        }

        addToMemoryModelBatch($invertedIndexBatchApcArr);

        //$res = apc_store("MODEL_SEARCH[EN]",$MODEL_SEARCH['EN']);
        //if ( $res===false){ throw new Exception("Can't cache MODEL_SEARCH[EN]"); }
    }
    /////////////////////////////////////////////////////////

    //preprint_r($UTHMANI_TO_SIMPLE_WORD_MAP);
    //preprint_r($MODEL_CORE["AR_UTH"]['QURAN_TEXT']);exit;
    ##############################################################

    // get memory usage
    $debug = memory_get_usage(true) / 1024 / 1024 . "/" . memory_get_peak_usage(true) / 1024 / 1024 . "Memory <br>";
    //echoN($debug);

    // Needed here after both languages have been loaded.
    /* Reload all generated models from memory to set all variables (WORDNET) after model generation.
     * This matters especially for MODEL_CORE since it has 3 languages; without this reload,
     * all 3 languages stay loaded although only one language was requested, and it also caused
     * a bug in getPoSTaggedSubsentences.
     */
    //loadModels($modelsToBeLoaded,$lang);
}
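/*
 * A minimal usage sketch, assuming the globals referenced by loadModels() (file paths, model
 * sources, APC availability) have already been configured by the including script:
 *
 *   // first call: models are generated and the derived maps are stored in APC
 *   loadModels("core,search,qac,ontology,wordnet", "EN");
 *
 *   // later calls detect the cached "EN/MODEL_CORE/TOTALS/" entry and return early,
 *   // only re-fetching the WordNet and Qurana models from APC
 *   loadModels("core,search", "AR");
 */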
function getDistanceBetweenWords($word1, $word2) {
    $distance = 0;

    $word1Length = mb_strlen($word1);
    $word2Length = mb_strlen($word2);

    if ($word1Length == $word2Length) {
        //echoN("EQUAL");
        return getHammingDistance($word1, $word2);
    } else {
        return myLevensteinEditDistance($word1, $word2);
    }
}
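/*
 * Behaviour sketch for getDistanceBetweenWords(): equal-length inputs are routed to the
 * project-provided getHammingDistance(), anything else to myLevensteinEditDistance().
 *
 *   getDistanceBetweenWords("كتب", "كتب");   // same length (3/3)  -> Hamming distance
 *   getDistanceBetweenWords("كتاب", "كتب");  // 4 vs 3 characters  -> Levenshtein distance
 *
 * The concrete return values depend on those helper implementations.
 */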
function extendQueryWordsByDerivations($taggedSignificantWords, $lang) {
    foreach ($taggedSignificantWords as $word => $posTag) {
        // avoid small words; they lead to too many irrelevant derivations
        if (mb_strlen($word) <= 2) {
            continue;
        }

        if ($lang == "EN") {
            if ($posTag == "NN") {
                $plural = $word . "s";
                $taggedSignificantWords[$plural] = "{$posTag}" . "S";
            } elseif ($posTag == "NNS") {
                $single = substr($word, 0, -1);
                $taggedSignificantWords[$single] = substr($posTag, 0, -1);
            }
        } else {
            $simmlarWords = array();

            $qaOntologyConceptsIterator = getAPCIterator("ALL\\/MODEL_QA_ONTOLOGY\\/CONCEPTS\\/.*");

            foreach ($qaOntologyConceptsIterator as $conceptsCursor) {
                $conceptID = getEntryKeyFromAPCKey($conceptsCursor['key']);
                $mainConceptArr = $conceptsCursor['value'];

                $conceptLabelAR = $mainConceptArr['label_ar'];

                $dist = myLevensteinEditDistance($word, $conceptLabelAR);
                if ($dist <= 5) {
                    $dist = getDistanceByCommonUniqueChars($word, $conceptLabelAR);
                    $simmlarWords[$conceptLabelAR] = $dist;
                }

                $i = 1;
                while (isset($mainConceptArr['synonym_' . $i]) && isArabicString($mainConceptArr['synonym_' . $i])) {
                    $synonym = $mainConceptArr['synonym_' . $i];

                    $dist = myLevensteinEditDistance($word, $synonym);
                    if ($dist <= 5) {
                        $dist = getDistanceByCommonUniqueChars($word, $synonym);
                        $simmlarWords[$synonym] = $dist;
                    }

                    $i++;
                }
            }

            foreach ($simmlarWords as $conceptWord => $distBySimChars) {
                $diff = mb_strlen($conceptWord) - mb_strlen($word);
                $absDiffSize = abs($diff);

                // $word is bigger
                if ($diff < 0) {
                    $absDiffSize = abs($diff);

                    $diffStr = getStringDiff($conceptWord, $word);
                    //echoN($diffStr);

                    // حيوان => الحيوانات
                    // the bigger word should not contain a space: الله => سبيل الله
                    // $diffStr=="الات" for حيوان = الحياوانات
                    if (mb_strpos($word, $conceptWord) !== false && strpos($word, " ") === false && ($diffStr == "ات" || $diffStr == "ال" || $diffStr == "الات")) {
                        //echoN("$word, $conceptWord");

                        // convert word to plural noun
                        $taggedSignificantWords[$word] = "NNS";
                        // concept word is singular
                        $taggedSignificantWords[$conceptWord] = "NN";
                    } elseif ($diff == 1) {
                        $wordLastCharTrimmed = mb_substr($conceptWord, 0, -1);

                        if ($wordLastCharTrimmed . "ات" == $word) {
                            // convert word to plural noun
                            $taggedSignificantWords[$word] = "NNS";
                            // concept word is singular
                            $taggedSignificantWords[$conceptWord] = "NN";
                        }
                    }
                } else {
                    $diffStr = getStringDiff($conceptWord, $word);

                    // الحيوانات => حيوان
                    // the bigger word should not contain a space: الله => سبيل الله
                    if ($diff != 0 && mb_strpos($conceptWord, $word) !== false && strpos($conceptWord, " ") === false && ($diffStr == "ات" || $diffStr == "ال" || $diffStr == "الات")) {
                        //echoN("$word,$conceptWord");

                        // convert concept word to plural noun
                        $taggedSignificantWords[$conceptWord] = "NNS";
                    } elseif ($diff == 1) {
                        $wordLastCharTrimmed = mb_substr($word, 0, -1);

                        if ($wordLastCharTrimmed . "ات" == $conceptWord) {
                            // convert concept word to plural noun
                            $taggedSignificantWords[$conceptWord] = "NNS";
                        }
                    }
                }
            }

            // limit the number of derivations + original terms to 10
            $taggedSignificantWords = array_slice($taggedSignificantWords, 0, 10);

            //arsort($simmlarWords);
            //preprint_r($taggedSignificantWords);
            //preprint_r($simmlarWords);
            //exit;
        }
    }

    //preprint_r($taggedSignificantWords);

    return $taggedSignificantWords;
}
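/*
 * English-branch sketch for extendQueryWordsByDerivations(): singular nouns gain a naive
 * "+s" plural (tagged NNS) and plural nouns gain a last-character-trimmed singular (tagged NN).
 * Illustrative input/output, assuming the Arabic branch (which needs the APC concept index)
 * is not hit:
 *
 *   $tagged = extendQueryWordsByDerivations(array("prophet" => "NN"), "EN");
 *   // $tagged == array("prophet" => "NN", "prophets" => "NNS")
 *
 *   $tagged = extendQueryWordsByDerivations(array("books" => "NNS"), "EN");
 *   // $tagged == array("books" => "NNS", "book" => "NN")
 */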