/**
 * Gathers frequency, morphology and verse information for a single Quran word.
 *
 * The word is resolved to its simple and Uthmani spellings, then every
 * inverted-index occurrence of the simple spelling (except entries typed
 * "PRONOUN_ANTECEDENT" or "ROOT") is looked up in the QAC master table and
 * its segments are folded into the result.
 *
 * @param string $word         Word to look up; surrounding whitespace is trimmed.
 * @param mixed  $MODEL_CORE   Unused; kept so existing callers keep working.
 * @param mixed  $MODEL_SEARCH Unused; kept so existing callers keep working.
 * @param mixed  $MODEL_QAC    Unused; kept so existing callers keep working.
 * @param bool   $fast         When true, stop after the first occurrence that yields a verse.
 * @param bool   $exactWord    When true, only occurrences whose word in the verse text is
 *                             identical to the simple spelling contribute tags, lemmas,
 *                             root and verses.
 *
 * @return array|null Null when the simple spelling is empty or is not present in the
 *                    inverted index; otherwise an array with the keys WORD_SIMPLE,
 *                    WORD_UTHMANI, TF, TFIDF, BUCKWALTER, ROOT, LEM, POS, VERSES,
 *                    VERSES_POS_TAGS and FEATURES. LEM and POS are sets (value => 1);
 *                    VERSES and VERSES_POS_TAGS are keyed by the QAC location with its
 *                    last ":"-separated component removed.
 */
function getWordInfo($word, $MODEL_CORE, $MODEL_SEARCH, $MODEL_QAC, $fast = FALSE, $exactWord = FALSE)
{
    $word = trim($word);

    if (isSimpleQuranWord($word)) {
        $wordUthmani = getItemFromUthmaniToSimpleMappingTable($word);
        $wordSimple = $word;
    } else {
        $wordUthmani = $word;
        // tashkeel of last char is significant, ex: lemmas will probably not be in the MAP because of that
        $wordSimple = getItemFromUthmaniToSimpleMappingTable($wordUthmani);
    }

    // Bail out before touching any table keyed by the word, so unknown words
    // do not trigger undefined-index notices on the frequency table.
    if (empty($wordSimple) || !modelEntryExistsInMemory("AR", "MODEL_SEARCH", "INVERTED_INDEX", $wordSimple)) {
        return null;
    }

    $WORDS_FREQUENCY = getModelEntryFromMemory("AR", "MODEL_CORE", "WORDS_FREQUENCY", "");
    $freqArr = isset($WORDS_FREQUENCY['WORDS_TFIDF'][$wordSimple])
        ? $WORDS_FREQUENCY['WORDS_TFIDF'][$wordSimple]
        : array();

    $wordInfoArr = array();
    $wordInfoArr['WORD_SIMPLE'] = $wordSimple;
    $wordInfoArr['WORD_UTHMANI'] = $wordUthmani;
    $wordInfoArr['TF'] = isset($freqArr['TF']) ? $freqArr['TF'] : null;
    $wordInfoArr['TFIDF'] = round(isset($freqArr['TFIDF']) ? $freqArr['TFIDF'] : 0, 2);

    $buckwalterTransliteration = "";
    $wordRoot = "";
    $posTagsArr = array();
    $lemmasArr = array();
    $featuresArr = array();
    $versesArr = array();
    $versesTagsArr = array();

    $invertedIndexEntry = getModelEntryFromMemory("AR", "MODEL_SEARCH", "INVERTED_INDEX", $wordSimple);
    if (!is_array($invertedIndexEntry)) {
        $invertedIndexEntry = array();
    }

    $QURAN_TEXT = getModelEntryFromMemory("AR", "MODEL_CORE", "QURAN_TEXT", "");
    $TOTALS = getModelEntryFromMemory("AR", "MODEL_CORE", "TOTALS", "");
    $PAUSEMARKS = $TOTALS['PAUSEMARKS'];

    foreach ($invertedIndexEntry as $documentArrInIndex) {
        $wordType = $documentArrInIndex['WORD_TYPE'];

        // IGNORE ROOT SOURCES AND PRONOUNS, WE ONLY NEED THE NORMAL CORRESPONDING WORD
        if ($wordType == "PRONOUN_ANTECEDENT" || $wordType == "ROOT") {
            continue;
        }

        // SURA/AYA are incremented by one before building the QAC location string.
        $qacLocation = getQACLocationStr(
            $documentArrInIndex['SURA'] + 1,
            $documentArrInIndex['AYA'] + 1,
            $documentArrInIndex['INDEX_IN_AYA_UTHMANI']
        );

        $qacMasterTableEntryArr = getModelEntryFromMemory("AR", "MODEL_QAC", "QAC_MASTERTABLE", $qacLocation);

        // A missing or empty master-table entry has no segments to inspect.
        if (empty($qacMasterTableEntryArr) || !is_array($qacMasterTableEntryArr)) {
            continue;
        }

        // Everything below depends only on the word location, not on the
        // individual segment, so it is computed once per occurrence.
        $verseText = getVerseByQACLocation($QURAN_TEXT, $qacLocation);
        $qacVerseLocation = substr($qacLocation, 0, strrpos($qacLocation, ":"));

        $isRequestedWord = true;
        if ($exactWord == TRUE) {
            // NOTE(review): the index taken from the QAC location is used to pick the word
            // out of the verse text; confirm both use the same word numbering.
            $wordId = getWordIndexFromQACLocation($qacLocation);
            $isRequestedWord = ($wordSimple === getWordFromVerseByIndex($PAUSEMARKS, $verseText, $wordId));
        }

        // search QAC for roots and LEMMAS for this word
        foreach ($qacMasterTableEntryArr as $segmentDataArr) {
            // NOTE(review): these two are overwritten by every segment, including segments
            // of occurrences rejected by $exactWord, so the result carries the values of
            // the last segment visited. Kept as-is for backward compatibility.
            $buckwalterTransliteration = $segmentDataArr['FORM_EN'];
            $featuresArr = array_merge($segmentDataArr['FEATURES']);

            if (!$isRequestedWord) {
                continue;
            }

            $segmentFeatures = $segmentDataArr['FEATURES'];
            $tag = $segmentDataArr['TAG'];

            // -1 marks a root that should not be reported.
            if (isset($segmentFeatures['ROOT']) && $segmentFeatures['ROOT'] != -1) {
                $wordRoot = $segmentFeatures['ROOT'];
            }

            $posTagsArr[$tag] = 1;

            // Record only the lemma carried by this very segment; segments without a
            // LEM feature must not re-use a lemma left over from an earlier segment.
            if (isset($segmentFeatures['LEM'])) {
                $lemmasArr[$segmentFeatures['LEM']] = 1;
            }

            if (!isset($versesArr[$qacVerseLocation])) {
                $versesArr[$qacVerseLocation] = $verseText;
            }

            if (!isset($versesTagsArr[$qacVerseLocation])) {
                $versesTagsArr[$qacVerseLocation] = "";
            }
            $versesTagsArr[$qacVerseLocation] .= " " . $tag;
        }

        // we don't need all inverted index list except for verses, only break if we found at least one word
        if ($fast == true && !empty($versesArr)) {
            break;
        }
    }

    $wordInfoArr['BUCKWALTER'] = $buckwalterTransliteration;
    $wordInfoArr['ROOT'] = $wordRoot;
    $wordInfoArr['LEM'] = $lemmasArr;
    $wordInfoArr['POS'] = $posTagsArr;
    $wordInfoArr['VERSES'] = $versesArr;
    $wordInfoArr['VERSES_POS_TAGS'] = $versesTagsArr;
    $wordInfoArr['FEATURES'] = $featuresArr;

    return $wordInfoArr;
}
//$segmentWord = removeTashkeel($segmentWord); if ($POS == "DET") { // second segment PoS $segmentPoStag = $qacMasterTableEntry[$segmentId]['TAG']; //number of segments $numberOfSegmentsInWord = count($qacMasterTableEntry); if (($segmentPoStag == "N" || $segmentPoStag == "ADJ") && $numberOfSegmentsInWord == 2) { continue; } } // get word index in verse $wordIndex = getWordIndexFromQACLocation($location); //echoN($segmentFormARimla2y); // get simple version of the word index $imla2yWordIndex = getImla2yWordIndexByUthmaniLocation($location); // get verse text $verseText = getVerseByQACLocation($QURAN_TEXT, $location); $imla2yWord = getWordFromVerseByIndex($PAUSEMARKS, $verseText, $imla2yWordIndex); //echoN($imla2yWord); $stopWordsFromQuran[$imla2yWord] = 1; echoN($imla2yWord); } } $ya = "يا"; //add يا $stopWordsFromQuran[$ya] = 1; echoN(count($stopWordsFromQuran)); //preprint_r($stopWordsFromQuran); //exit; file_put_contents(dirname(__FILE__) . "/../data/quran-stop-words.strict.l2.ar", implode("\n", array_keys($stopWordsFromQuran))); exit;
/**
 * Extends a set of query words with QAC-derived roots, lemmas and verse words.
 *
 * Pass 1: for every query word, each inverted-index occurrence typed
 * "NORMAL_WORD" is looked up in the QAC master table; for segments carrying a
 * STEM feature, the segment's ROOT and LEM values are added as new query
 * words. A small hard-coded custom root table is applied as well.
 *
 * Pass 2: for every word of the extended set that isSimpleQuranWord() does not
 * accept, each inverted-index occurrence (except "PRONOUN_ANTECEDENT" entries)
 * contributes the word found at that position in the verse text.
 *
 * @param array $extendedQueryWordsArr Query words as array keys; the values are not read.
 *
 * @return array The input array plus the added words. Words added in pass 1 map to 1,
 *               words added in pass 2 map to the QAC location they were found at.
 */
function extendQueryByExtractingQACDerviations($extendedQueryWordsArr)
{
    ////////// CUSTOM ROOT TABLE ///////////
    //TODO: hard-coded for now
    $zawaga = "زوج";
    $CUSTOM_ROOTS_TABLE = array();
    $CUSTOM_ROOTS_TABLE['الزواج'] = $zawaga;
    ////////////////////////////////////////

    /** GET ROOT/STEM FOR EACH QUERY WORD **/
    foreach ($extendedQueryWordsArr as $word => $index) {
        $invertedIndexEntryArr1 = getModelEntryFromMemory("AR", "MODEL_SEARCH", "INVERTED_INDEX", $word);

        // A query word that is not in the inverted index has nothing to derive from.
        if (is_array($invertedIndexEntryArr1)) {
            foreach ($invertedIndexEntryArr1 as $documentArrInIndex) {
                // Only normal words contribute roots/lemmas, so skip the QAC lookup otherwise.
                if ($documentArrInIndex['WORD_TYPE'] != "NORMAL_WORD") {
                    continue;
                }

                $qacLocation = getQACLocationStr(
                    $documentArrInIndex['SURA'] + 1,
                    $documentArrInIndex['AYA'] + 1,
                    $documentArrInIndex['INDEX_IN_AYA_UTHMANI']
                );

                $qacMasterTableEntryArr2 = getModelEntryFromMemory("AR", "MODEL_QAC", "QAC_MASTERTABLE", $qacLocation);
                if (!is_array($qacMasterTableEntryArr2)) {
                    continue;
                }

                // search QAC for roots and LEMMAS for this word
                foreach ($qacMasterTableEntryArr2 as $segmentDataArr) {
                    // only segments carrying a STEM feature are considered
                    if (!isset($segmentDataArr['FEATURES']['STEM'])) {
                        continue;
                    }

                    $segmentFeatures = $segmentDataArr['FEATURES'];
                    $rootOfQueryWord = isset($segmentFeatures['ROOT']) ? $segmentFeatures['ROOT'] : null;
                    $stemOfQueryWord = isset($segmentFeatures['LEM']) ? $segmentFeatures['LEM'] : null;

                    // add the ROOT to our extended query words; -1 is the "no root" marker
                    // that getWordInfo() also filters out
                    if (!empty($rootOfQueryWord) && (string) $rootOfQueryWord !== "-1"
                        && !isset($extendedQueryWordsArr[$rootOfQueryWord])) {
                        $extendedQueryWordsArr[$rootOfQueryWord] = 1;
                    }

                    // add the LEM to our extended query words; a missing lemma must not
                    // turn into an empty-string query word
                    if (!empty($stemOfQueryWord) && !isset($extendedQueryWordsArr[$stemOfQueryWord])) {
                        $extendedQueryWordsArr[$stemOfQueryWord] = 1;
                    }
                }
            }
        }

        if (isset($CUSTOM_ROOTS_TABLE[$word])) {
            $extendedQueryWordsArr[$CUSTOM_ROOTS_TABLE[$word]] = 1;
        }
    }

    $QURAN_TEXT = getModelEntryFromMemory("AR", "MODEL_CORE", "QURAN_TEXT", "");
    $TOTALS = getModelEntryFromMemory("AR", "MODEL_CORE", "TOTALS", "");
    $PAUSEMARKS = $TOTALS['PAUSEMARKS'];

    /** GET EMLA2Y (SIMPLE) WORDS CORRESPONDING TO ANY QAC SEGMENT CONTAINING THE ROOT/STEMS
     * IN THE EXTENDED QUERY WORD FROM INVERTED INDEX
     * ADD TO EXTENDED QUERY WORDS
     * TODO: recheck to remove this whole loop
     **/
    foreach ($extendedQueryWordsArr as $word => $dummy) {
        // ONLY UTHMANI SHOULD BE HANDLED
        if (isSimpleQuranWord($word)) {
            continue;
        }

        $invertedIndexEntry = getModelEntryFromMemory("AR", "MODEL_SEARCH", "INVERTED_INDEX", $word);

        // Derived roots/lemmas are not guaranteed to have an inverted-index entry.
        if (!is_array($invertedIndexEntry)) {
            continue;
        }

        foreach ($invertedIndexEntry as $documentArrInIndex) {
            // PRONOUNS SHOULD NOT BE ADDED TO THE QUERY BECAUSE THEY CAN REFER TO MANY THINGS
            // OTHER THAN THE ORIGINAL QUERY
            if ($documentArrInIndex['WORD_TYPE'] == "PRONOUN_ANTECEDENT") {
                continue;
            }

            $qacLocation = getQACLocationStr(
                $documentArrInIndex['SURA'] + 1,
                $documentArrInIndex['AYA'] + 1,
                $documentArrInIndex['INDEX_IN_AYA_UTHMANI']
            );

            $verseText = getVerseByQACLocation($QURAN_TEXT, $qacLocation);
            $wordFromVerse = getWordFromVerseByIndex($PAUSEMARKS, $verseText, $documentArrInIndex['INDEX_IN_AYA_EMLA2Y']);

            if (empty($wordFromVerse)) {
                continue;
            }

            if (!isset($extendedQueryWordsArr[$wordFromVerse])) {
                $extendedQueryWordsArr[$wordFromVerse] = $qacLocation;
            }
        }
    }

    return $extendedQueryWordsArr;
}