Example #1
0
/**
 * Collects frequency and QAC (morphology) information about a single Quran word.
 *
 * The word is trimmed and classified with isSimpleQuranWord(); its counterpart in the
 * other script is looked up through getItemFromUthmaniToSimpleMappingTable(). Every
 * occurrence listed for the simple form in the "AR" inverted index is then matched
 * against the QAC master table to gather POS tags, lemmas, root and verses.
 *
 * @param string $word         Word to look up, in simple or Uthmani script.
 * @param mixed  $MODEL_CORE   Unused here; kept so existing callers keep working.
 * @param mixed  $MODEL_SEARCH Unused here; kept so existing callers keep working.
 * @param mixed  $MODEL_QAC    Unused here; kept so existing callers keep working.
 * @param bool   $fast         When true, stop scanning the inverted index as soon as
 *                             at least one verse has been collected.
 * @param bool   $exactWord    When true, an occurrence only contributes POS/lemma/root/
 *                             verse data if the verse word at its location is identical
 *                             to the simple form of $word.
 *
 * @return array|null Null when the word has no simple form or is not in the inverted
 *                    index. Otherwise an array with the keys WORD_SIMPLE, WORD_UTHMANI,
 *                    TF, TFIDF, BUCKWALTER, ROOT, LEM, POS, VERSES, VERSES_POS_TAGS
 *                    and FEATURES.
 */
function getWordInfo($word, $MODEL_CORE, $MODEL_SEARCH, $MODEL_QAC, $fast = FALSE, $exactWord = FALSE)
{
    $word = trim($word);
    if (isSimpleQuranWord($word)) {
        $wordSimple = $word;
        $wordUthmani = getItemFromUthmaniToSimpleMappingTable($word);
    } else {
        // Tashkeel of the last char is significant, e.g. lemmas will probably
        // not be in the map because of that.
        $wordUthmani = $word;
        $wordSimple = getItemFromUthmaniToSimpleMappingTable($wordUthmani);
    }

    // Bail out before touching any table keyed by the simple form: an unknown word
    // must not trigger undefined-index notices on the frequency table.
    if (empty($wordSimple) || !modelEntryExistsInMemory("AR", "MODEL_SEARCH", "INVERTED_INDEX", $wordSimple)) {
        return null;
    }

    $WORDS_FREQUENCY = getModelEntryFromMemory("AR", "MODEL_CORE", "WORDS_FREQUENCY", "");
    // An indexed word may still be missing from the TF-IDF table.
    $freqArr = isset($WORDS_FREQUENCY['WORDS_TFIDF'][$wordSimple])
        ? $WORDS_FREQUENCY['WORDS_TFIDF'][$wordSimple]
        : array();

    $wordInfoArr = array();
    $wordInfoArr['WORD_SIMPLE'] = $wordSimple;
    $wordInfoArr['WORD_UTHMANI'] = $wordUthmani;
    $wordInfoArr['TF'] = isset($freqArr['TF']) ? $freqArr['TF'] : null;
    $wordInfoArr['TFIDF'] = round(isset($freqArr['TFIDF']) ? $freqArr['TFIDF'] : 0, 2);

    $buckwalterTransliteration = "";
    $wordRoot = "";
    $posTagsArr = array();
    $lemmasArr = array();
    $featuresArr = array();
    $versesArr = array();
    $versesTagsArr = array();

    $invertedIndexEntry = getModelEntryFromMemory("AR", "MODEL_SEARCH", "INVERTED_INDEX", $wordSimple);
    $QURAN_TEXT = getModelEntryFromMemory("AR", "MODEL_CORE", "QURAN_TEXT", "");
    $TOTALS = getModelEntryFromMemory("AR", "MODEL_CORE", "TOTALS", "");
    $PAUSEMARKS = $TOTALS['PAUSEMARKS'];

    foreach ($invertedIndexEntry as $documentArrInIndex) {
        $WORD_TYPE = $documentArrInIndex['WORD_TYPE'];
        // Ignore root sources and pronouns, we only need the normal corresponding word.
        if ($WORD_TYPE == "PRONOUN_ANTECEDENT" || $WORD_TYPE == "ROOT") {
            continue;
        }

        // The index stores zero-based sura/aya; getQACLocationStr() is given them plus one.
        $qacLocation = getQACLocationStr(
            $documentArrInIndex['SURA'] + 1,
            $documentArrInIndex['AYA'] + 1,
            $documentArrInIndex['INDEX_IN_AYA_UTHMANI']
        );
        $qacMasterTableEntryArr = getModelEntryFromMemory("AR", "MODEL_QAC", "QAC_MASTERTABLE", $qacLocation);
        if (empty($qacMasterTableEntryArr)) {
            // No morphology recorded for this location: nothing to iterate over.
            continue;
        }

        // Everything below depends only on the word location, not on the segment,
        // so compute it once per occurrence instead of once per segment.
        $verseText = getVerseByQACLocation($QURAN_TEXT, $qacLocation);
        $isWantedOccurrence = true;
        if ($exactWord == TRUE) {
            $wordId = getWordIndexFromQACLocation($qacLocation);
            $isWantedOccurrence = ($wordSimple === getWordFromVerseByIndex($PAUSEMARKS, $verseText, $wordId));
        }
        // Verse location = word location without its last ":"-separated part.
        $qacVerseLocation = substr($qacLocation, 0, strrpos($qacLocation, ":"));

        // Search QAC for roots and lemmas for this word.
        foreach ($qacMasterTableEntryArr as $segmentDataArr) {
            // These two are overwritten by every segment (even of occurrences rejected
            // by $exactWord), so the returned values come from the last segment visited.
            $buckwalterTransliteration = $segmentDataArr['FORM_EN'];
            // Single-argument array_merge() copies the features, renumbering integer keys.
            $featuresArr = array_merge($segmentDataArr['FEATURES']);

            if (!$isWantedOccurrence) {
                continue;
            }

            $segmentFeatures = $segmentDataArr['FEATURES'];
            if (isset($segmentFeatures['ROOT']) && $segmentFeatures['ROOT'] != -1) {
                $wordRoot = $segmentFeatures['ROOT'];
            }

            $tag = $segmentDataArr['TAG'];
            $posTagsArr[$tag] = 1;

            // Only record a lemma the current segment actually carries; a segment
            // without one must not add an empty key or reuse an earlier segment's lemma.
            if (isset($segmentFeatures['LEM'])) {
                $lemmasArr[$segmentFeatures['LEM']] = 1;
            }

            if (!isset($versesArr[$qacVerseLocation])) {
                $versesArr[$qacVerseLocation] = $verseText;
            }
            if (!isset($versesTagsArr[$qacVerseLocation])) {
                $versesTagsArr[$qacVerseLocation] = "";
            }
            $versesTagsArr[$qacVerseLocation] = $versesTagsArr[$qacVerseLocation] . " " . $tag;
        }

        // We don't need the whole inverted index list except for verses, so in fast
        // mode stop as soon as at least one word was found.
        if ($fast == true && !empty($versesArr)) {
            break;
        }
    }

    $wordInfoArr['BUCKWALTER'] = $buckwalterTransliteration;
    $wordInfoArr['ROOT'] = $wordRoot;
    $wordInfoArr['LEM'] = $lemmasArr;
    $wordInfoArr['POS'] = $posTagsArr;
    $wordInfoArr['VERSES'] = $versesArr;
    $wordInfoArr['VERSES_POS_TAGS'] = $versesTagsArr;
    $wordInfoArr['FEATURES'] = $featuresArr;
    return $wordInfoArr;
}
/**
 * Builds the table of candidate verses, with a relevance score, for a search.
 *
 * Column search ($isColumnSearch) short-circuits the index: it returns either every
 * verse of the requested chapter (VAL === "ALL") or the single requested verse, each
 * with SCORE 1. Otherwise every word of the extended query is looked up in the
 * inverted index of $lang and each verse it occurs in is scored as
 *
 *   FREQ / 2 + DISTANCE + QUERY_WORDS_IN_VERSE * 10 + count(PRONOUNS)
 *     + WORD_OCCURENCES_COUNT + IS_FULL_QUERY_IN_VERSE * 20
 *
 * and the table is finally ordered with rsortBy() on SCORE.
 *
 * @param array  $extendedQueryWordsArr     Map whose keys are the words to look up
 *                                          (the values are not used here).
 * @param string $query                     The search text; matched literally as a
 *                                          space-delimited phrase inside each verse.
 * @param bool   $isPhraseSearch            When true, verses that do not contain the
 *                                          whole query are skipped (except for
 *                                          PRONOUN_ANTECEDENT index entries).
 * @param bool   $isQuestion                Unused here; kept for caller compatibility.
 * @param bool   $isColumnSearch            When true, perform the chapter/verse lookup.
 * @param array  $columnSearchKeyValParams  'KEY' = one-based chapter number,
 *                                          'VAL' = "ALL" or a one-based verse number.
 * @param bool   $isConceptSearch           Unused here; kept for caller compatibility.
 * @param string $lang                      Model language; "AR" selects pause-mark
 *                                          removal, anything else special-character
 *                                          removal, before phrase matching.
 * @param bool   $isTransliterationSearch   Accepted for caller compatibility; the
 *                                          highlightable word is recorded the same
 *                                          way whether or not it is set.
 *
 * @return array Scoring table keyed by "SURA:AYA" (zero-based); empty when nothing matched.
 */
function getScoredDocumentsFromInveretdIndex($extendedQueryWordsArr, $query, $isPhraseSearch, $isQuestion, $isColumnSearch, $columnSearchKeyValParams, $isConceptSearch, $lang, $isTransliterationSearch)
{
    // Always start from an array so that "no results" is an empty table rather than
    // an undefined variable handed to rsortBy().
    $scoringTable = array();
    $QURAN_TEXT = getModelEntryFromMemory($lang, "MODEL_CORE", "QURAN_TEXT", "");

    if ($isColumnSearch) {
        $SURA = $columnSearchKeyValParams['KEY'] - 1;
        // Strict comparison: with "==" an integer 0 equals "ALL" on PHP < 8.
        $isFullChapter = $columnSearchKeyValParams['VAL'] === "ALL";
        if ($isFullChapter) {
            // Chapter validity check: never count() a chapter that does not exist.
            if (!isset($QURAN_TEXT[$SURA]) || !is_array($QURAN_TEXT[$SURA])) {
                return array();
            }
            $suraSize = count($QURAN_TEXT[$SURA]);
            for ($AYA = 0; $AYA < $suraSize; $AYA++) {
                $scoringTable[$SURA . ":" . $AYA] = array(
                    'SCORE' => 1,
                    'SURA' => $SURA,
                    'AYA' => $AYA,
                );
            }
        } else {
            $AYA = $columnSearchKeyValParams['VAL'] - 1;
            // Verse validity check.
            $qacLocation = getQACLocationStr($SURA + 1, $AYA + 1, 0);
            $verseText = getVerseByQACLocation($QURAN_TEXT, $qacLocation);
            if (empty($verseText)) {
                return array();
            }
            $scoringTable[$SURA . ":" . $AYA] = array(
                'SCORE' => 1,
                'SURA' => $SURA,
                'AYA' => $AYA,
            );
        }
        return $scoringTable;
    }

    // The query is search text, not a regular expression: escape it (including the
    // "/" delimiter) so characters such as "?", "(" or "/" cannot break the pattern.
    // The pattern does not depend on the verse, so build it once.
    $fullQueryPattern = '/(^|[ ])' . preg_quote($query, '/') . '([ ]|$)/umi';

    /**
     * Get all results from the index using the extended query words
     * (which include all variations and pronouns).
     */
    foreach ($extendedQueryWordsArr as $word => $unusedTargetQACLocation) {
        $invertedIndexEntry = getModelEntryFromMemory($lang, "MODEL_SEARCH", "INVERTED_INDEX", $word);
        if (empty($invertedIndexEntry)) {
            // Word is not indexed: nothing to score.
            continue;
        }

        // PHP turns numeric-string array keys into integers, hence the cast.
        $wordPattern = '/' . preg_quote((string) $word, '/') . '/um';

        foreach ($invertedIndexEntry as $documentArrInIndex) {
            $SURA = $documentArrInIndex['SURA'];
            $AYA = $documentArrInIndex['AYA'];
            $INDEX_IN_AYA_EMLA2Y = $documentArrInIndex['INDEX_IN_AYA_EMLA2Y'];
            $INDEX_IN_AYA_UTHMANI = $documentArrInIndex['INDEX_IN_AYA_UTHMANI'];
            $WORD_TYPE = $documentArrInIndex['WORD_TYPE'];
            $EXTRA_INFO = $documentArrInIndex['EXTRA_INFO'];

            $qacLocation = getQACLocationStr($SURA + 1, $AYA + 1, $INDEX_IN_AYA_UTHMANI);
            $verseText = getVerseByQACLocation($QURAN_TEXT, $qacLocation);

            // NOTE: a decision should be taken on whether to search around and
            // remove pause marks or not.
            if ($lang == "AR") {
                $verseTextWithoutPauseMarks = removePauseMarkFromVerse($verseText);
            } else {
                $verseTextWithoutPauseMarks = removeSpecialCharactersFromMidQuery($verseText);
            }

            // preg_match_all() returns false on failure; treat that as "no match".
            $fullQueryIsFoundInVerseCount = (int) preg_match_all($fullQueryPattern, $verseTextWithoutPauseMarks);

            if ($isPhraseSearch && $WORD_TYPE != "PRONOUN_ANTECEDENT") {
                $numberOfOccurencesForWord = $fullQueryIsFoundInVerseCount;
                if ($numberOfOccurencesForWord == 0) {
                    continue;
                }
            } else {
                $numberOfOccurencesForWord = (int) preg_match_all($wordPattern, $verseText);
            }

            $verseKey = $SURA . ":" . $AYA;
            if (isset($scoringTable[$verseKey])) {
                $verseRow = $scoringTable[$verseKey];
            } else {
                // First hit for this verse: WORD_TYPE, EXTRA_INFO and the word indexes
                // are those of the first index entry seen and are not updated later.
                $verseRow = array(
                    'SCORE' => 0,
                    'FREQ' => 0,
                    'DISTANCE' => 0,
                    'WORD_OCCURENCES_COUNT' => 0,
                    'QUERY_WORDS_IN_VERSE' => 0,
                    'IS_FULL_QUERY_IN_VERSE' => 0,
                    'SURA' => $SURA,
                    'AYA' => $AYA,
                    'POSSIBLE_HIGHLIGHTABLE_WORDS' => array(),
                    'WORD_TYPE' => $WORD_TYPE,
                    'EXTRA_INFO' => $EXTRA_INFO,
                    'INDEX_IN_AYA_EMLA2Y' => $INDEX_IN_AYA_EMLA2Y,
                    'INDEX_IN_AYA_UTHMANI' => $INDEX_IN_AYA_UTHMANI,
                    'PRONOUNS' => array(),
                );
            }

            // Holds the count for the most recent index entry only (not accumulated).
            $verseRow['WORD_OCCURENCES_COUNT'] = $numberOfOccurencesForWord;

            // Every index entry that reaches this point raises the verse frequency.
            // TODO: seems duplicate of WORD_OCCURENCES_COUNT.
            $verseRow['FREQ']++;

            if ($WORD_TYPE == "PRONOUN_ANTECEDENT") {
                $verseRow['PRONOUNS'][$EXTRA_INFO] = $INDEX_IN_AYA_EMLA2Y;
            } elseif ($WORD_TYPE == "ROOT" || $WORD_TYPE == "LEM") {
                $extraInfoWithoutTashkeel = removeTashkeel($EXTRA_INFO);

                // For non-normal words this will get the whole segment.
                $verseRow['POSSIBLE_HIGHLIGHTABLE_WORDS'][$word] = $WORD_TYPE;
                // Needed to fix roots that are sometimes converted by the
                // uthmani/simple map below.
                $verseRow['POSSIBLE_HIGHLIGHTABLE_WORDS'][$extraInfoWithoutTashkeel] = $WORD_TYPE;

                // Try to convert the QAC uthmani word to simple imla2y using the map
                // table, with and then without tashkeel; fall back to the bare form.
                $wordInAya = getItemFromUthmaniToSimpleMappingTable($EXTRA_INFO);
                if (empty($wordInAya)) {
                    $wordInAya = getItemFromUthmaniToSimpleMappingTable($extraInfoWithoutTashkeel);
                }
                if (empty($wordInAya)) {
                    $wordInAya = $extraInfoWithoutTashkeel;
                }
                $verseRow['POSSIBLE_HIGHLIGHTABLE_WORDS'][$wordInAya] = $WORD_TYPE;
            } else {
                // Normal word: recorded identically for transliteration and
                // regular searches.
                $verseRow['POSSIBLE_HIGHLIGHTABLE_WORDS'][$word] = $WORD_TYPE;
            }

            $verseRow['IS_FULL_QUERY_IN_VERSE'] = $fullQueryIsFoundInVerseCount;
            $verseRow['QUERY_WORDS_IN_VERSE'] = count($verseRow['POSSIBLE_HIGHLIGHTABLE_WORDS']);
            $verseRow['SCORE'] = $verseRow['FREQ'] / 2
                + $verseRow['DISTANCE'] * 1
                + $verseRow['QUERY_WORDS_IN_VERSE'] * 10
                + count($verseRow['PRONOUNS']) * 1
                + $verseRow['WORD_OCCURENCES_COUNT'] * 1
                + $verseRow['IS_FULL_QUERY_IN_VERSE'] * 20;

            $scoringTable[$verseKey] = $verseRow;
        }
    }

    rsortBy($scoringTable, 'SCORE');
    return $scoringTable;
}