<th> Uthmani </th> <th> Simple </th> </tr> <?php $uthmaniCounter = 0; $simpleCounter = 0; $qaOntologyConceptsIterator = getAPCIterator("AR\\/OTHERS\\/UTHMANI_TO_SIMPLE_WORD_MAP\\/.*"); foreach ($qaOntologyConceptsIterator as $conceptsCursor) { $mapTermKey = getEntryKeyFromAPCKey($conceptsCursor['key']); $mapTermVal = $conceptsCursor['value']; if (isSimpleQuranWord($mapTermKey)) { $simpleCounter++; //echoN("##".$mapTermKey); continue; } $uthmaniCounter++; ?> <tr> <td><?php echo $mapTermKey; ?> </td> <td><?php echo $mapTermVal; ?>
$noOntologyExtentionConstraint = true; } } } } } //preprint_r($columnSearchKeyValParams);exit; //echoN("IS QUESTION:$isQuestion"); //echoN("noOntologyExtentionConstraint:$noOntologyExtentionConstraint"); //echoN("noDerivationsConstraint:$noDerivationsConstraint"); /// CLEANING $query = cleanAndTrim($query); //$query = removeTashkeel($query); // remove tashkeel - convert from uthmani to simple // didn't use remove tashkeel since it leaves "hamzet el wasl" which is not in the simple text if (!isSimpleQuranWord($query)) { $query = convertUthamniQueryToSimple($query); } // CASE HANDLING if ($lang == "EN") { $query = strtolower($query); $query = removeSpecialCharactersFromMidQuery($query); } else { $query = removeNonArabicAndSpaceChars($query); } $originalQueryWordsArr = preg_split("/ /", $query); //for faster access $originalQueryWordsArrSwaped = swapAssocArrayKeyValues($originalQueryWordsArr); //echoN(memory_get_peak_usage()); // CHECK IF TRANSLITERATION if ($lang == "EN" && !$isConceptSearch && !$isPhraseSearch && !$isQuestion) {
/**
 * Collect frequency, morphology and verse information for one Quran word.
 *
 * The word may be given in simple or Uthmani script; the other form is looked
 * up with getItemFromUthmaniToSimpleMappingTable(). Every occurrence listed for
 * the simple form in the inverted index (except PRONOUN_ANTECEDENT and ROOT
 * entries) is resolved to its QAC master-table segments, from which PoS tags,
 * lemmas, root, features and the containing verses are gathered.
 *
 * Returned keys: WORD_SIMPLE, WORD_UTHMANI, TF, TFIDF (rounded to 2 decimals),
 * BUCKWALTER, ROOT, LEM (lemma => 1), POS (tag => 1), VERSES (verse location
 * => verse text), VERSES_POS_TAGS (verse location => space-joined tags) and
 * FEATURES. BUCKWALTER, ROOT and FEATURES hold the value of the last segment
 * that supplied them.
 *
 * @param string $word         Word to describe (surrounding whitespace is trimmed).
 * @param mixed  $MODEL_CORE   Unused; kept so existing callers keep working.
 * @param mixed  $MODEL_SEARCH Unused; kept so existing callers keep working.
 * @param mixed  $MODEL_QAC    Unused; kept so existing callers keep working.
 * @param bool   $fast         Stop after the first occurrence that produced a verse.
 * @param bool   $exactWord    Only accept occurrences whose verse word is identical
 *                             to the simple form of $word.
 *
 * @return array|null The info array, or null when the word has no simple form
 *                    or is not present in the inverted index.
 */
function getWordInfo($word, $MODEL_CORE, $MODEL_SEARCH, $MODEL_QAC, $fast = FALSE, $exactWord = FALSE)
{
    $word = trim($word);

    if (isSimpleQuranWord($word)) {
        $wordSimple = $word;
        $wordUthmani = getItemFromUthmaniToSimpleMappingTable($word);
    } else {
        // tashkeel of the last char is significant, ex: lemmas will probably
        // not be in the map because of that
        $wordUthmani = $word;
        $wordSimple = getItemFromUthmaniToSimpleMappingTable($wordUthmani);
    }

    // Bail out before touching any other model: nothing below can succeed
    // for a word that is not indexed.
    if (empty($wordSimple) || !modelEntryExistsInMemory("AR", "MODEL_SEARCH", "INVERTED_INDEX", $wordSimple)) {
        return null;
    }

    $WORDS_FREQUENCY = getModelEntryFromMemory("AR", "MODEL_CORE", "WORDS_FREQUENCY", "");
    $freqArr = isset($WORDS_FREQUENCY['WORDS_TFIDF'][$wordSimple])
        ? $WORDS_FREQUENCY['WORDS_TFIDF'][$wordSimple]
        : array();

    $wordInfoArr = array();
    $wordInfoArr['WORD_SIMPLE'] = $wordSimple;
    $wordInfoArr['WORD_UTHMANI'] = $wordUthmani;
    $wordInfoArr['TF'] = isset($freqArr['TF']) ? $freqArr['TF'] : null;
    $wordInfoArr['TFIDF'] = isset($freqArr['TFIDF']) ? round($freqArr['TFIDF'], 2) : 0.0;

    $buckwalterTransliteration = "";
    $wordRoot = "";
    $posTagsArr = array();
    $lemmasArr = array();
    $featuresArr = array();
    $versesArr = array();
    $versesTagsArr = array();

    $invertedIndexEntry = getModelEntryFromMemory("AR", "MODEL_SEARCH", "INVERTED_INDEX", $wordSimple);
    $QURAN_TEXT = getModelEntryFromMemory("AR", "MODEL_CORE", "QURAN_TEXT", "");
    $TOTALS = getModelEntryFromMemory("AR", "MODEL_CORE", "TOTALS", "");
    $PAUSEMARKS = $TOTALS['PAUSEMARKS'];

    foreach ($invertedIndexEntry as $documentArrInIndex) {
        $WORD_TYPE = $documentArrInIndex['WORD_TYPE'];

        // ignore root sources and pronouns, we only need the normal corresponding word
        if ($WORD_TYPE == "PRONOUN_ANTECEDENT" || $WORD_TYPE == "ROOT") {
            continue;
        }

        // SURA/AYA are incremented by one before building the QAC location
        $qacLocation = getQACLocationStr(
            $documentArrInIndex['SURA'] + 1,
            $documentArrInIndex['AYA'] + 1,
            $documentArrInIndex['INDEX_IN_AYA_UTHMANI']
        );

        $qacMasterTableEntryArr = getModelEntryFromMemory("AR", "MODEL_QAC", "QAC_MASTERTABLE", $qacLocation);
        if (!is_array($qacMasterTableEntryArr) && !($qacMasterTableEntryArr instanceof Traversable)) {
            // no QAC segments for this location, nothing to collect
            continue;
        }

        // Everything below depends on the occurrence only, not on the segment,
        // so it is resolved once per occurrence.
        $verseText = getVerseByQACLocation($QURAN_TEXT, $qacLocation);

        if ($exactWord == TRUE) {
            $wordId = getWordIndexFromQACLocation($qacLocation);
            $wordFromVerseAtLocation = getWordFromVerseByIndex($PAUSEMARKS, $verseText, $wordId);

            // Reject the occurrence before reading any of its segments so a
            // non-matching word cannot overwrite BUCKWALTER / FEATURES / LEM.
            if ($wordSimple !== $wordFromVerseAtLocation) {
                continue;
            }
        }

        // verse location = QAC location without its trailing ":<word index>"
        $qacVerseLocation = substr($qacLocation, 0, strrpos($qacLocation, ":"));

        // search QAC for roots and lemmas for this word
        foreach ($qacMasterTableEntryArr as $segmentDataArr) {
            $tag = $segmentDataArr['TAG'];
            $segmentFeatures = (isset($segmentDataArr['FEATURES']) && is_array($segmentDataArr['FEATURES']))
                ? $segmentDataArr['FEATURES']
                : array();

            $buckwalterTransliteration = $segmentDataArr['FORM_EN'];
            $featuresArr = array_merge($segmentFeatures);

            if (isset($segmentFeatures['ROOT']) && $segmentFeatures['ROOT'] != -1) {
                $wordRoot = $segmentFeatures['ROOT'];
            }

            $posTagsArr[$tag] = 1;

            // Only segments that actually carry a lemma contribute one;
            // segments without LEM must not re-register a previous lemma.
            if (isset($segmentFeatures['LEM'])) {
                $lemmasArr[$segmentFeatures['LEM']] = 1;
            }

            if (!isset($versesArr[$qacVerseLocation])) {
                $versesArr[$qacVerseLocation] = $verseText;
            }

            if (!isset($versesTagsArr[$qacVerseLocation])) {
                $versesTagsArr[$qacVerseLocation] = "";
            }

            $versesTagsArr[$qacVerseLocation] = $versesTagsArr[$qacVerseLocation] . " " . $tag;
        }

        // we don't need the whole inverted index list except for verses,
        // only break if we found at least one word
        if ($fast == true && !empty($versesArr)) {
            break;
        }
    }

    $wordInfoArr['BUCKWALTER'] = $buckwalterTransliteration;
    $wordInfoArr['ROOT'] = $wordRoot;
    $wordInfoArr['LEM'] = $lemmasArr;
    $wordInfoArr['POS'] = $posTagsArr;
    $wordInfoArr['VERSES'] = $versesArr;
    $wordInfoArr['VERSES_POS_TAGS'] = $versesTagsArr;
    $wordInfoArr['FEATURES'] = $featuresArr;

    return $wordInfoArr;
}
/**
 * Normalise a subject / verb / object triple and register it as a relation.
 *
 * Subject and object are converted to simple script. When no English verb
 * translation is supplied, one is resolved word by word (static translation
 * table, then the custom translation table, then translateText()). The simple
 * form of the verb is derived from the mapping table, falling back to
 * removeTashkeel(shallowUthmaniToSimpleConversion()) on the original verb.
 * The result is handed to addNewRelation().
 *
 * @param array  $relationsArr       Relations collection, passed by reference to addNewRelation().
 * @param mixed  $type               Relation type, forwarded unchanged.
 * @param string $subject            Subject term (simple or Uthmani script).
 * @param string $verb               Verb word or space-separated verb phrase.
 * @param string $object             Object term (simple or Uthmani script).
 * @param mixed  $joinedPattern      Forwarded unchanged.
 * @param string $verbEngTranslation English translation of the verb; resolved here when empty.
 * @param string $fullVerbQuranWord  Word used to translate a single-word verb;
 *                                   when empty the verb itself is used.
 *
 * @return mixed false when subject or object is empty, otherwise the return
 *               value of addNewRelation().
 */
function addRelation(&$relationsArr, $type, $subject, $verb, $object, $joinedPattern, $verbEngTranslation = "", $fullVerbQuranWord = "")
{
    if (empty($subject) || empty($object)) {
        return false;
    }

    $subjectSimple = relationTermToSimple($subject);
    $objectSimple = relationTermToSimple($object);

    $verbUthmani = $verb;
    $verbSimple = "";

    ///////// VERB TRANSLATION
    if (empty($verbEngTranslation)) {
        if (!isMultiWordStr($verb)) {
            // SINGLE WORD VERB
            $verb = trim($verb);

            // Without a full Quran word there is nothing but the verb itself
            // to translate; looking up the empty string would only pollute
            // the translation table.
            $sourceWord = empty($fullVerbQuranWord) ? $verb : $fullVerbQuranWord;
            $translatableVerb = $sourceWord;

            if (isSimpleQuranWord($verb)) {
                $translatableVerb = getItemFromUthmaniToSimpleMappingTable($sourceWord);
            } else {
                $verbSimple = getItemFromUthmaniToSimpleMappingTable($verb);
            }

            $verbEngTranslation = relationVerbTranslation($translatableVerb, $translatableVerb);
        } else {
            // VERB PHRASE: handle each word on its own. Empty parts produced
            // by repeated spaces are dropped instead of being translated.
            $simpleParts = array();
            $translatedParts = array();

            foreach (preg_split("/ /", $verb, -1, PREG_SPLIT_NO_EMPTY) as $verbPart) {
                if (isSimpleQuranWord($verbPart)) {
                    // get the mapped word to be able to translate
                    $translatableVerb = getItemFromUthmaniToSimpleMappingTable($verbPart);
                } else {
                    // simple word to be added in the relation meta
                    $simplePart = getItemFromUthmaniToSimpleMappingTable($verbPart);

                    // if not in the mapping table, use shallow conversion
                    if (empty($simplePart)) {
                        $simplePart = shallowUthmaniToSimpleConversion($verbPart);
                    }

                    $simpleParts[] = $simplePart;
                    $translatableVerb = $simplePart;
                }

                $translatedParts[] = relationVerbTranslation($translatableVerb, $verbPart);
            }

            // implode() joins without the stray leading space that manual
            // " " . $part accumulation produces.
            $verbSimple = implode(" ", $simpleParts);
            $verbEngTranslation = implode(" ", $translatedParts);
        }
    }

    $verbSimple = trim($verbSimple);

    if (empty($verbSimple)) {
        $verbSimple = removeTashkeel(shallowUthmaniToSimpleConversion($verbUthmani));
    }

    return addNewRelation($relationsArr, $type, $subjectSimple, $verbSimple, $objectSimple, $joinedPattern, $verbEngTranslation, $verbUthmani);
}

/**
 * Convert a relation subject/object to simple script.
 *
 * Shallow conversion is the last resort, since it spoils words and leads to
 * duplicate concepts; the mapping table is always tried first.
 *
 * @param string $term Term in simple or Uthmani script.
 *
 * @return string The term in simple script.
 */
function relationTermToSimple($term)
{
    if (isSimpleQuranWord($term)) {
        return $term;
    }

    $simple = getItemFromUthmaniToSimpleMappingTable($term);

    if (empty($simple)) {
        $simple = shallowUthmaniToSimpleConversion($term);
    }

    return $simple;
}

/**
 * Resolve the English translation of one verb word.
 *
 * Lookup order: the static $WORDS_TRANSLATIONS_AR_EN table (by $tableKey),
 * then the custom translation table (by $customKey), then translateText(),
 * whose result is stored with addTranslationEntry().
 *
 * @param string $tableKey  Key for the static translation table.
 * @param string $customKey Key for the custom table and for translateText().
 *
 * @return string The translation, or "" when there is nothing to translate.
 */
function relationVerbTranslation($tableKey, $customKey)
{
    global $WORDS_TRANSLATIONS_AR_EN;

    // A miss in the static table is an expected case (ex: one of the segments
    // was trimmed), so probe with isset() instead of reading a missing index.
    if (!empty($tableKey) && isset($WORDS_TRANSLATIONS_AR_EN[$tableKey])) {
        $translation = cleanEnglishTranslation($WORDS_TRANSLATIONS_AR_EN[$tableKey]);

        if (!empty($translation)) {
            return $translation;
        }
    }

    if (empty($customKey)) {
        // never translate or store an entry for an empty word
        return "";
    }

    if (!isFoundInTranslationTable($customKey, "VERB")) {
        // not known anywhere yet: translate and remember it in the custom table
        $translation = translateText($customKey, "ar", "en");
        addTranslationEntry($translation, "VERB", $customKey, "AR");

        return $translation;
    }

    $customTranslationEntryArr = getTranlationEntryByEntryKeyword($customKey);

    return isset($customTranslationEntryArr['EN_TEXT']) ? $customTranslationEntryArr['EN_TEXT'] : "";
}
if ($lang == "EN") { showTechnicalError("Only Arabic is supported here, you chose English !"); } $lang = "AR"; loadModels("core,search,qac", $lang); $word = trim($_GET['word']); //preprint_r($poTaggedSubsentences); //echoN("SubSentences Count:".addCommasToNumber(count($poTaggedSubsentences))); $topPoSAggregation = array(); $ssPoSAggregation = array(); $ssPoSAggregationCorrespondingSent = array(); //echoN("Word:$word"); $targetType = "POS"; if (isArabicString($word)) { $targetType = "WORD"; if (isSimpleQuranWord($word)) { $poTaggedSubsentences = getPoSTaggedSubsentences("SIMPLE"); } else { $poTaggedSubsentences = getPoSTaggedSubsentences(); } } else { $poTaggedSubsentences = getPoSTaggedSubsentences(); } $targetPOSorWord = trim($word); if ($targetType == "POS") { if (!modelEntryExistsInMemory("AR", "MODEL_QAC", "QAC_POS", $targetPOSorWord)) { showTechnicalError("Not a valid PoS tag !"); exit; } } else { if (empty($targetPOSorWord)) {
/**
 * Extend a set of query words with their QAC roots/lemmas and with the verse
 * words found for the non-simple entries.
 *
 * Pass 1: for every original query word, each NORMAL_WORD occurrence in the
 * inverted index is resolved to its QAC master-table segments; the ROOT and
 * LEM of every segment that has a STEM feature are added (value 1). Words
 * listed in the custom roots table get their custom root added as well.
 *
 * Pass 2: for every entry of the extended set that is not a simple Quran word,
 * each inverted-index occurrence (PRONOUN_ANTECEDENT excluded) contributes the
 * verse word at its INDEX_IN_AYA_EMLA2Y position, stored with the QAC location
 * as value unless the word is already present.
 *
 * @param array $extendedQueryWordsArr Query words as array keys.
 *
 * @return array The input array plus all added words (also as keys).
 */
function extendQueryByExtractingQACDerviations($extendedQueryWordsArr)
{
    ////////// CUSTOM ROOT TABLE ///////////
    //TODO: move to a shared table
    $CUSTOM_ROOTS_TABLE = array('الزواج' => "زوج");

    /** GET ROOT/STEM FOR EACH QUERY WORD **/
    // array_keys() takes a snapshot: words added below are not revisited in this pass
    foreach (array_keys($extendedQueryWordsArr) as $word) {
        $invertedIndexEntry = getModelEntryFromMemory("AR", "MODEL_SEARCH", "INVERTED_INDEX", $word);

        if (is_array($invertedIndexEntry) || $invertedIndexEntry instanceof Traversable) {
            foreach ($invertedIndexEntry as $documentArrInIndex) {
                // only normal words contribute roots/lemmas, so skip the QAC lookup for the rest
                if ($documentArrInIndex['WORD_TYPE'] != "NORMAL_WORD") {
                    continue;
                }

                $qacLocation = getQACLocationStr(
                    $documentArrInIndex['SURA'] + 1,
                    $documentArrInIndex['AYA'] + 1,
                    $documentArrInIndex['INDEX_IN_AYA_UTHMANI']
                );

                $qacMasterTableEntryArr = getModelEntryFromMemory("AR", "MODEL_QAC", "QAC_MASTERTABLE", $qacLocation);

                if (!is_array($qacMasterTableEntryArr) && !($qacMasterTableEntryArr instanceof Traversable)) {
                    continue;
                }

                // search QAC for roots and lemmas for this word
                foreach ($qacMasterTableEntryArr as $segmentDataArr) {
                    if (!isset($segmentDataArr['FEATURES']['STEM'])) {
                        continue;
                    }

                    $segmentFeatures = $segmentDataArr['FEATURES'];
                    $rootOfQueryWord = isset($segmentFeatures['ROOT']) ? $segmentFeatures['ROOT'] : null;
                    $lemmaOfQueryWord = isset($segmentFeatures['LEM']) ? $segmentFeatures['LEM'] : null;

                    // -1 is filtered the same way getWordInfo() filters ROOT,
                    // so it never ends up as a query word
                    if (!empty($rootOfQueryWord) && $rootOfQueryWord != -1 && !isset($extendedQueryWordsArr[$rootOfQueryWord])) {
                        $extendedQueryWordsArr[$rootOfQueryWord] = 1;
                    }

                    // a segment without a lemma must not add an empty query word
                    if (!empty($lemmaOfQueryWord) && !isset($extendedQueryWordsArr[$lemmaOfQueryWord])) {
                        $extendedQueryWordsArr[$lemmaOfQueryWord] = 1;
                    }
                }
            }
        }

        if (isset($CUSTOM_ROOTS_TABLE[$word])) {
            $extendedQueryWordsArr[$CUSTOM_ROOTS_TABLE[$word]] = 1;
        }
    }

    $QURAN_TEXT = getModelEntryFromMemory("AR", "MODEL_CORE", "QURAN_TEXT", "");
    $TOTALS = getModelEntryFromMemory("AR", "MODEL_CORE", "TOTALS", "");
    $PAUSEMARKS = $TOTALS['PAUSEMARKS'];

    /** GET EMLA2Y (SIMPLE) WORDS CORRESPONDING TO ANY QAC SEGMENT CONTAINING THE
     * ROOT/STEMS IN THE EXTENDED QUERY WORDS FROM THE INVERTED INDEX AND
     * ADD THEM TO THE EXTENDED QUERY WORDS
     * TODO: recheck to remove this whole loop
     **/
    foreach (array_keys($extendedQueryWordsArr) as $word) {
        // ONLY UTHMANI SHOULD BE HANDLED
        if (isSimpleQuranWord($word)) {
            continue;
        }

        $invertedIndexEntry = getModelEntryFromMemory("AR", "MODEL_SEARCH", "INVERTED_INDEX", $word);

        // roots/lemmas added in the first pass are not necessarily index keys
        if (!is_array($invertedIndexEntry) && !($invertedIndexEntry instanceof Traversable)) {
            continue;
        }

        foreach ($invertedIndexEntry as $documentArrInIndex) {
            // PRONOUNS SHOULD NOT BE ADDED TO THE QUERY BECAUSE THEY CAN REFER
            // TO MANY THINGS OTHER THAN THE ORIGINAL QUERY
            if ($documentArrInIndex['WORD_TYPE'] == "PRONOUN_ANTECEDENT") {
                continue;
            }

            $qacLocation = getQACLocationStr(
                $documentArrInIndex['SURA'] + 1,
                $documentArrInIndex['AYA'] + 1,
                $documentArrInIndex['INDEX_IN_AYA_UTHMANI']
            );

            $verseText = getVerseByQACLocation($QURAN_TEXT, $qacLocation);
            $wordFromVerse = getWordFromVerseByIndex($PAUSEMARKS, $verseText, $documentArrInIndex['INDEX_IN_AYA_EMLA2Y']);

            if (empty($wordFromVerse)) {
                continue;
            }

            if (!isset($extendedQueryWordsArr[$wordFromVerse])) {
                $extendedQueryWordsArr[$wordFromVerse] = $qacLocation;
            }
        }
    }

    return $extendedQueryWordsArr;
}