/**
 * Collects frequency, morphology and verse information for one Arabic word.
 *
 * The word is resolved to its simple and Uthmani spellings, looked up in the
 * "AR" inverted index, and every indexed occurrence (except entries of type
 * PRONOUN_ANTECEDENT and ROOT) is expanded through the QAC master table to
 * gather POS tags, lemmas, root, features and the verses containing it.
 *
 * Returned keys: WORD_SIMPLE, WORD_UTHMANI, TF, TFIDF (rounded to 2 decimals),
 * BUCKWALTER and FEATURES (taken from the last processed segment), ROOT (last
 * root seen that is not -1), LEM and POS (sets stored as value => 1), VERSES
 * (verse location => verse text) and VERSES_POS_TAGS (verse location =>
 * space-prefixed tag list).
 *
 * @param string $word         Word in simple or Uthmani script; surrounding whitespace is trimmed.
 * @param mixed  $MODEL_CORE   Unused; kept so existing call sites keep working.
 * @param mixed  $MODEL_SEARCH Unused; kept so existing call sites keep working.
 * @param mixed  $MODEL_QAC    Unused; kept so existing call sites keep working.
 * @param bool   $fast         When true, stop scanning the index as soon as one verse has been collected.
 * @param bool   $exactWord    When true, only occurrences whose word in the verse is identical to the
 *                             simple form contribute to the result.
 *
 * @return array|null Null when the word has no simple form or is not in the inverted index.
 */
function getWordInfo($word, $MODEL_CORE, $MODEL_SEARCH, $MODEL_QAC, $fast = FALSE, $exactWord = FALSE)
{
    $word = trim((string) $word);

    if (isSimpleQuranWord($word)) {
        $wordSimple = $word;
        $wordUthmani = getItemFromUthmaniToSimpleMappingTable($word);
    } else {
        // tashkeel of the last char is significant, ex: lemmas will probably
        // not be in the map because of that
        $wordUthmani = $word;
        $wordSimple = getItemFromUthmaniToSimpleMappingTable($wordUthmani);
    }

    // Bail out before touching the frequency table: an unknown word has no
    // entry there and reading it only produced undefined-offset warnings.
    if (empty($wordSimple) || !modelEntryExistsInMemory("AR", "MODEL_SEARCH", "INVERTED_INDEX", $wordSimple)) {
        return null;
    }

    $WORDS_FREQUENCY = getModelEntryFromMemory("AR", "MODEL_CORE", "WORDS_FREQUENCY", "");
    $freqArr = isset($WORDS_FREQUENCY['WORDS_TFIDF'][$wordSimple])
        ? $WORDS_FREQUENCY['WORDS_TFIDF'][$wordSimple]
        : array();

    $wordInfoArr = array();
    $wordInfoArr['WORD_SIMPLE'] = $wordSimple;
    $wordInfoArr['WORD_UTHMANI'] = $wordUthmani;
    $wordInfoArr['TF'] = isset($freqArr['TF']) ? $freqArr['TF'] : null;
    $wordInfoArr['TFIDF'] = isset($freqArr['TFIDF']) ? round($freqArr['TFIDF'], 2) : 0.0;

    $buckwalterTransliteration = "";
    $wordRoot = "";
    $posTagsArr = array();
    $lemmasArr = array();
    $featuresArr = array();
    $versesArr = array();
    $versesTagsArr = array();

    $invertedIndexEntry = getModelEntryFromMemory("AR", "MODEL_SEARCH", "INVERTED_INDEX", $wordSimple);
    if (!is_array($invertedIndexEntry)) {
        $invertedIndexEntry = array();
    }

    $QURAN_TEXT = getModelEntryFromMemory("AR", "MODEL_CORE", "QURAN_TEXT", "");
    $TOTALS = getModelEntryFromMemory("AR", "MODEL_CORE", "TOTALS", "");
    $PAUSEMARKS = $TOTALS['PAUSEMARKS'];

    foreach ($invertedIndexEntry as $documentArrInIndex) {
        $WORD_TYPE = $documentArrInIndex['WORD_TYPE'];

        // ignore root sources and pronouns, we only need the normal corresponding word
        if ($WORD_TYPE == "PRONOUN_ANTECEDENT" || $WORD_TYPE == "ROOT") {
            continue;
        }

        $qacLocation = getQACLocationStr(
            $documentArrInIndex['SURA'] + 1,
            $documentArrInIndex['AYA'] + 1,
            $documentArrInIndex['INDEX_IN_AYA_UTHMANI']
        );

        $qacMasterTableEntryArr = getModelEntryFromMemory("AR", "MODEL_QAC", "QAC_MASTERTABLE", $qacLocation);
        if (!is_array($qacMasterTableEntryArr)) {
            // no morphology recorded for this location
            continue;
        }

        // Everything below depends only on the location, not on the segment,
        // so it is computed once per occurrence.
        $verseText = getVerseByQACLocation($QURAN_TEXT, $qacLocation);

        if ($exactWord == TRUE) {
            $wordId = getWordIndexFromQACLocation($qacLocation);
            $wordFromVerseAtLocation = getWordFromVerseByIndex($PAUSEMARKS, $verseText, $wordId);

            // Skip the whole occurrence up front so that none of its segment
            // data (transliteration, features, lemma) leaks into the result.
            if ($wordSimple !== $wordFromVerseAtLocation) {
                continue;
            }
        }

        // "sura:aya:word" -> "sura:aya"
        $qacVerseLocation = substr($qacLocation, 0, strrpos($qacLocation, ":"));

        // search QAC for roots and lemmas for this word
        foreach ($qacMasterTableEntryArr as $segmentDataArr) {
            $tag = $segmentDataArr['TAG'];
            $segmentFeatures = (isset($segmentDataArr['FEATURES']) && is_array($segmentDataArr['FEATURES']))
                ? $segmentDataArr['FEATURES']
                : array();

            $buckwalterTransliteration = $segmentDataArr['FORM_EN'];
            $featuresArr = array_merge($segmentFeatures);

            // Only segments that actually carry a lemma contribute one;
            // otherwise a prefix segment would record an empty or stale lemma.
            if (isset($segmentFeatures['LEM'])) {
                $lemmasArr[$segmentFeatures['LEM']] = 1;
            }

            if (isset($segmentFeatures['ROOT']) && $segmentFeatures['ROOT'] != -1) {
                $wordRoot = $segmentFeatures['ROOT'];
            }

            $posTagsArr[$tag] = 1;

            if (!isset($versesArr[$qacVerseLocation])) {
                $versesArr[$qacVerseLocation] = $verseText;
            }

            if (!isset($versesTagsArr[$qacVerseLocation])) {
                $versesTagsArr[$qacVerseLocation] = "";
            }

            $versesTagsArr[$qacVerseLocation] = $versesTagsArr[$qacVerseLocation] . " " . $tag;
        }

        // we don't need the whole inverted index list except for verses,
        // only break if we found at least one word
        if ($fast == true && !empty($versesArr)) {
            break;
        }
    }

    $wordInfoArr['BUCKWALTER'] = $buckwalterTransliteration;
    $wordInfoArr['ROOT'] = $wordRoot;
    $wordInfoArr['LEM'] = $lemmasArr;
    $wordInfoArr['POS'] = $posTagsArr;
    $wordInfoArr['VERSES'] = $versesArr;
    $wordInfoArr['VERSES_POS_TAGS'] = $versesTagsArr;
    $wordInfoArr['FEATURES'] = $featuresArr;

    return $wordInfoArr;
}
/**
 * Builds the verse scoring table for a search request.
 *
 * Column search ($isColumnSearch): returns the requested verse, or every verse
 * of the requested chapter when VAL is "ALL", each with SCORE 1. The table is
 * returned unsorted. An unknown chapter or verse yields an empty array.
 *
 * Otherwise every word of $extendedQueryWordsArr is looked up in the inverted
 * index of $lang and each hit is folded into one row per verse, keyed
 * "SURA:AYA" (both zero-based, as stored in the index). A row's score is
 *
 *   FREQ / 2 + DISTANCE + QUERY_WORDS_IN_VERSE * 10 + count(PRONOUNS)
 *     + WORD_OCCURENCES_COUNT + IS_FULL_QUERY_IN_VERSE * 20
 *
 * and the table is handed to rsortBy() on SCORE before it is returned.
 *
 * @param array  $extendedQueryWordsArr     Query words (original plus expansions) as array keys; values are not used here.
 * @param string $query                     The query text; matched literally as a whole phrase against each verse.
 * @param bool   $isPhraseSearch            When true, hits (other than pronoun antecedents) count only if the whole query occurs in the verse.
 * @param bool   $isQuestion                Unused; kept for interface compatibility.
 * @param bool   $isColumnSearch            When true, run the chapter/verse lookup described above.
 * @param array  $columnSearchKeyValParams  'KEY' => one-based chapter number, 'VAL' => one-based verse number or "ALL".
 * @param bool   $isConceptSearch           Unused; kept for interface compatibility.
 * @param string $lang                      Model language; "AR" selects pause-mark removal, anything else special-character removal.
 * @param bool   $isTransliterationSearch   Unused; both modes mark the query word as highlightable.
 *
 * @return array Scoring table; empty array when nothing matched.
 */
function getScoredDocumentsFromInveretdIndex($extendedQueryWordsArr, $query, $isPhraseSearch, $isQuestion, $isColumnSearch, $columnSearchKeyValParams, $isConceptSearch, $lang, $isTransliterationSearch)
{
    // Always start from an array so that "no hits" is returned as array()
    // instead of an undefined variable.
    $scoringTable = array();

    $QURAN_TEXT = getModelEntryFromMemory($lang, "MODEL_CORE", "QURAN_TEXT", "");

    if ($isColumnSearch) {
        $chapterParam = isset($columnSearchKeyValParams['KEY']) ? $columnSearchKeyValParams['KEY'] : null;
        $verseParam = isset($columnSearchKeyValParams['VAL']) ? $columnSearchKeyValParams['VAL'] : null;

        if (!is_numeric($chapterParam)) {
            return array();
        }

        $SURA = $chapterParam - 1;

        // strict comparison: a numeric 0 must not be mistaken for "ALL"
        if ($verseParam === "ALL") {
            // CHAPTER VALIDITY CHECK
            if (!isset($QURAN_TEXT[$SURA]) || !is_array($QURAN_TEXT[$SURA])) {
                return array();
            }

            $suraSize = count($QURAN_TEXT[$SURA]);

            for ($AYA = 0; $AYA < $suraSize; $AYA++) {
                $scoringTable[$SURA . ":" . $AYA] = array(
                    'SCORE' => 1,
                    'SURA' => $SURA,
                    'AYA' => $AYA,
                );
            }

            return $scoringTable;
        }

        if (!is_numeric($verseParam)) {
            return array();
        }

        $AYA = $verseParam - 1;

        // VERSE VALIDITY CHECK
        $qacLocation = getQACLocationStr($SURA + 1, $AYA + 1, 0);
        $verseText = getVerseByQACLocation($QURAN_TEXT, $qacLocation);

        if (empty($verseText)) {
            return array();
        }

        $scoringTable[$SURA . ":" . $AYA] = array(
            'SCORE' => 1,
            'SURA' => $SURA,
            'AYA' => $AYA,
        );

        return $scoringTable;
    }

    // The query is user text, not a pattern: quote it so regex metacharacters
    // (and the "/" delimiter) are matched literally. Loop-invariant, built once.
    $fullQueryPattern = '/(^|[ ])' . preg_quote((string) $query, '/') . '([ ]|$)/umi';

    /**
     * GET ALL RESULTS FROM INDEX USING EXTENDED QUERY WORDS
     * (WHICH INCLUDE ALL VARIATIONS AND PRONOUNS)
     */
    foreach ($extendedQueryWordsArr as $word => $unusedTargetLocation) {
        $invertedIndexEntry = getModelEntryFromMemory($lang, "MODEL_SEARCH", "INVERTED_INDEX", $word);

        // expanded words are not guaranteed to exist in the index
        if (!is_array($invertedIndexEntry)) {
            continue;
        }

        $wordPattern = '/' . preg_quote((string) $word, '/') . '/um';

        foreach ($invertedIndexEntry as $documentArrInIndex) {
            $SURA = $documentArrInIndex['SURA'];
            $AYA = $documentArrInIndex['AYA'];
            $INDEX_IN_AYA_EMLA2Y = $documentArrInIndex['INDEX_IN_AYA_EMLA2Y'];
            $INDEX_IN_AYA_UTHMANI = $documentArrInIndex['INDEX_IN_AYA_UTHMANI'];
            $WORD_TYPE = $documentArrInIndex['WORD_TYPE'];
            $EXTRA_INFO = $documentArrInIndex['EXTRA_INFO'];

            $verseKey = $SURA . ":" . $AYA;

            $qacLocation = getQACLocationStr($SURA + 1, $AYA + 1, $INDEX_IN_AYA_UTHMANI);
            $verseText = (string) getVerseByQACLocation($QURAN_TEXT, $qacLocation);

            /*
             * NOTE: A DECISION SHOULD BE TAKEN TO SEARCH AROUND AND REMOVE PAUSE MARKS OR NOT
             */
            if ($lang == "AR") {
                $verseTextWithoutPauseMarks = removePauseMarkFromVerse($verseText);
            } else {
                $verseTextWithoutPauseMarks = removeSpecialCharactersFromMidQuery($verseText);
            }

            // preg_match_all() returns false on failure; treat that as "no match"
            $fullQueryIsFoundInVerseCount = (int) preg_match_all($fullQueryPattern, (string) $verseTextWithoutPauseMarks);

            if ($isPhraseSearch && $WORD_TYPE != "PRONOUN_ANTECEDENT") {
                $numberOfOccurencesForWord = $fullQueryIsFoundInVerseCount;

                if ($numberOfOccurencesForWord == 0) {
                    continue;
                }
            } else {
                $numberOfOccurencesForWord = (int) preg_match_all($wordPattern, $verseText);
            }

            if (!isset($scoringTable[$verseKey])) {
                $scoringTable[$verseKey] = array(
                    'SCORE' => 0,
                    'FREQ' => 0,
                    'DISTANCE' => 0,
                    'WORD_OCCURENCES_COUNT' => 0,
                    'QUERY_WORDS_IN_VERSE' => 0,
                    'IS_FULL_QUERY_IN_VERSE' => 0,
                    'SURA' => $SURA,
                    'AYA' => $AYA,
                    'POSSIBLE_HIGHLIGHTABLE_WORDS' => array(),
                    'WORD_TYPE' => $WORD_TYPE,
                    'EXTRA_INFO' => $EXTRA_INFO,
                    'INDEX_IN_AYA_EMLA2Y' => $INDEX_IN_AYA_EMLA2Y,
                    'INDEX_IN_AYA_UTHMANI' => $INDEX_IN_AYA_UTHMANI,
                    'PRONOUNS' => array(),
                );
            }

            // holds the count for the most recent hit on this verse (not a running total)
            $scoringTable[$verseKey]['WORD_OCCURENCES_COUNT'] = $numberOfOccurencesForWord;

            // Every index hit on the verse raises its frequency by one.
            // TODO: seems duplicate of WORD_OCCURENCES_COUNT
            $scoringTable[$verseKey]['FREQ']++;

            // STEM or PRONOUN
            if ($WORD_TYPE == "PRONOUN_ANTECEDENT") {
                $scoringTable[$verseKey]['PRONOUNS'][$EXTRA_INFO] = $INDEX_IN_AYA_EMLA2Y;
            } else {
                // the query word itself is always a highlight candidate
                $scoringTable[$verseKey]['POSSIBLE_HIGHLIGHTABLE_WORDS'][$word] = $WORD_TYPE;

                if ($WORD_TYPE == "ROOT" || $WORD_TYPE == "LEM") {
                    // for non-normal words EXTRA_INFO holds the whole segment
                    $extraInfoNoTashkeel = removeTashkeel($EXTRA_INFO);

                    // needed to fix roots that are sometimes converted by the uthmani/simple map below
                    $scoringTable[$verseKey]['POSSIBLE_HIGHLIGHTABLE_WORDS'][$extraInfoNoTashkeel] = $WORD_TYPE;

                    // try to convert the QAC uthmani word to simple imla2y using
                    // the map table, with and without tashkeel
                    $wordInAya = getItemFromUthmaniToSimpleMappingTable($EXTRA_INFO);

                    if (empty($wordInAya)) {
                        $wordInAya = getItemFromUthmaniToSimpleMappingTable($extraInfoNoTashkeel);
                    }

                    if (empty($wordInAya)) {
                        $wordInAya = $extraInfoNoTashkeel;
                    }

                    $scoringTable[$verseKey]['POSSIBLE_HIGHLIGHTABLE_WORDS'][$wordInAya] = $WORD_TYPE;
                }
            }

            $scoringTable[$verseKey]['IS_FULL_QUERY_IN_VERSE'] = $fullQueryIsFoundInVerseCount;
            $scoringTable[$verseKey]['QUERY_WORDS_IN_VERSE'] = count($scoringTable[$verseKey]['POSSIBLE_HIGHLIGHTABLE_WORDS']);

            $scoringTable[$verseKey]['SCORE'] = $scoringTable[$verseKey]['FREQ'] / 2
                + $scoringTable[$verseKey]['DISTANCE'] * 1
                + $scoringTable[$verseKey]['QUERY_WORDS_IN_VERSE'] * 10
                + count($scoringTable[$verseKey]['PRONOUNS']) * 1
                + $scoringTable[$verseKey]['WORD_OCCURENCES_COUNT'] * 1
                + $scoringTable[$verseKey]['IS_FULL_QUERY_IN_VERSE'] * 20;
        }
    }

    if (empty($scoringTable)) {
        return array();
    }

    rsortBy($scoringTable, 'SCORE');

    return $scoringTable;
}