$segmentWord = $qacMasterTableEntry[$segmentId - 1]['FORM_AR'];
        $segmentWordLema = $qacMasterTableEntry[$segmentId - 1]['FEATURES']['LEM'];
        $segmentWordRoot = $qacMasterTableEntry[$segmentId - 1]['FEATURES']['ROOT'];
        $verseLocation = substr($location, 0, strlen($location) - 2);
        //$segmentWord = removeTashkeel($segmentWord);
        if ($POS == "DET") {
            // second segment PoS
            $segmentPoStag = $qacMasterTableEntry[$segmentId]['TAG'];
            //number of segments
            $numberOfSegmentsInWord = count($qacMasterTableEntry);
            if (($segmentPoStag == "N" || $segmentPoStag == "ADJ") && $numberOfSegmentsInWord == 2) {
                continue;
            }
        }
        // get word index in verse
        $wordIndex = getWordIndexFromQACLocation($location);
        //echoN($segmentFormARimla2y);
        // get simple version of the word index
        $imla2yWordIndex = getImla2yWordIndexByUthmaniLocation($location);
        // get verse text
        $verseText = getVerseByQACLocation($QURAN_TEXT, $location);
        $imla2yWord = getWordFromVerseByIndex($PAUSEMARKS, $verseText, $imla2yWordIndex);
        //echoN($imla2yWord);
        $stopWordsFromQuran[$imla2yWord] = 1;
        echoN($imla2yWord);
    }
}
// Add the word "يا" to $stopWordsFromQuran by hand. As with the entries set
// earlier, the word itself is the array key and the value is the flag 1.
$ya = "يا";
$stopWordsFromQuran[$ya] = 1;
// Hand the resulting number of entries in $stopWordsFromQuran to echoN.
echoN(count($stopWordsFromQuran));
コード例 #2
0
/**
 * Returns words from QAC by PoS tag, grouped by lemma.
 *
 * Every segment listed under $POS in the QAC_POS model entry is looked up in
 * the QAC master table. The segment's lemma is the key into $finalTerms; the
 * first time a lemma is seen an entry is created from
 * generateEmptyConceptMetadata() and filled with LEM, POS, SIMPLE_WORD, ROOT
 * and WEIGHT. Every occurrence then increments FREQ, records the segment form
 * under SEG and flags the tag under POSES.
 *
 * @param array  $finalTerms Accumulator keyed by lemma; passed by reference and modified in place.
 * @param string $POS        PoS tag used as the QAC_POS lookup key.
 *
 * @return array The (updated) $finalTerms accumulator.
 */
function getWordsByPos(&$finalTerms, $POS)
{
    global $LEMMA_TO_SIMPLE_WORD_MAP;

    $qacPosEntryArr = getModelEntryFromMemory("AR", "MODEL_QAC", "QAC_POS", $POS);
    $WORDS_FREQUENCY = getModelEntryFromMemory("AR", "MODEL_CORE", "WORDS_FREQUENCY", "");

    // Nothing is listed for this PoS tag: leave the accumulator untouched.
    if (empty($qacPosEntryArr)) {
        return $finalTerms;
    }

    // Loop-invariant marker characters, built once instead of once per segment.
    $superscriptAlef = json_decode('"\\u0670"');
    $alefWasla = "ٱ";
    //U+0671

    // Get all segments in QAC for that PoS
    foreach ($qacPosEntryArr as $location => $segmentId) {
        $qacMasterTableEntry = getModelEntryFromMemory("AR", "MODEL_QAC", "QAC_MASTERTABLE", $location);

        // The segment id is offset by one relative to the entry's array index.
        $segment = $qacMasterTableEntry[$segmentId - 1];

        // get Word, Lemma and root (the features are read defensively: a missing one yields null)
        $segmentWord = $segment['FORM_AR'];
        $segmentWordLema = isset($segment['FEATURES']['LEM']) ? $segment['FEATURES']['LEM'] : null;
        $segmentWordRoot = isset($segment['FEATURES']['ROOT']) ? $segment['FEATURES']['ROOT'] : null;

        // Lemma-table translation; used as a fallback for both the simple word and the weight.
        $lemmaSimpleWord = isset($LEMMA_TO_SIMPLE_WORD_MAP[$segmentWordLema])
            ? $LEMMA_TO_SIMPLE_WORD_MAP[$segmentWordLema]
            : null;

        // The Uthmani-to-simple mapping table is always tried first. Only the fallback differs:
        // $LEMMA_TO_SIMPLE_WORD_MAP is not good for words without superscript alef / alef wasla
        // (ex زيت lemma is converted to زيتها which spoiled the ontology concept list results),
        // so for those words a shallow conversion of the lemma is used instead.
        $imla2yWord = getItemFromUthmaniToSimpleMappingTable($segmentWordLema);
        if (empty($imla2yWord)) {
            $hasSpecialAlef = mb_strpos($segmentWordLema, $superscriptAlef) !== false
                || mb_strpos($segmentWordLema, $alefWasla) !== false;

            if ($hasSpecialAlef) {
                $imla2yWord = $lemmaSimpleWord;
            } else {
                $imla2yWord = shallowUthmaniToSimpleConversion($segmentWordLema);
            }
        }

        // TF-IDF entry of the simple word. This must read the locally loaded $WORDS_FREQUENCY:
        // $MODEL_CORE is not in scope inside this function, so reading it always produced an
        // empty result and silently forced the lemma fallback below.
        $termWeightArr = isset($WORDS_FREQUENCY['WORDS_TFIDF'][$imla2yWord])
            ? $WORDS_FREQUENCY['WORDS_TFIDF'][$imla2yWord]
            : null;

        // Lemmas are usually not in the simple words list themselves
        // (ex و الصابيئن =>صَّٰبِـِٔين), hence the lemma table is consulted for the weight.
        if (empty($termWeightArr)) {
            //only for weight since the lemma table decreases qurana matching
            $termWeightArr = isset($WORDS_FREQUENCY['WORDS_TFIDF'][$lemmaSimpleWord])
                ? $WORDS_FREQUENCY['WORDS_TFIDF'][$lemmaSimpleWord]
                : null;
        }
        $termWeight = isset($termWeightArr['TFIDF']) ? $termWeightArr['TFIDF'] : null;

        ////////////////////////////////////////////
        // Terms are grouped by lemma.
        $termWord = $segmentWordLema;

        if (!isset($finalTerms[$termWord])) {
            $finalTerms[$termWord] = generateEmptyConceptMetadata();
            $finalTerms[$termWord]['LEM'] = $segmentWordLema;
            $finalTerms[$termWord]['POS'] = $POS;
            $finalTerms[$termWord]['SIMPLE_WORD'] = $imla2yWord;
            $finalTerms[$termWord]['ROOT'] = $segmentWordRoot;
            $finalTerms[$termWord]['WEIGHT'] = $termWeight;
        }

        $finalTerms[$termWord]["FREQ"] += 1;

        // Remember each distinct surface form once, mapped to its simple word.
        if (!isset($finalTerms[$termWord]["SEG"][$segmentWord])) {
            $finalTerms[$termWord]["SEG"][$segmentWord] = $imla2yWord;
        }

        // Flag every PoS tag this lemma has been collected under.
        if (!isset($finalTerms[$termWord]["POSES"][$POS])) {
            $finalTerms[$termWord]["POSES"][$POS] = 1;
        }
    }

    return $finalTerms;
}
コード例 #3
0
/**
 * Collects frequency, morphology and verse information for one word.
 *
 * The word is resolved to a simple/Uthmani pair through isSimpleQuranWord()
 * and getItemFromUthmaniToSimpleMappingTable(). Every inverted-index entry of
 * the simple word (except PRONOUN_ANTECEDENT and ROOT entries) is then mapped
 * to its QAC location, and the segments found there contribute PoS tags,
 * lemmas, root, transliteration, features and the verse text.
 *
 * The three $MODEL_* parameters are not read by this function (all model data
 * is fetched through getModelEntryFromMemory()); they are kept so existing
 * call sites keep working.
 *
 * @param string $word         Word to look up; surrounding whitespace is trimmed.
 * @param mixed  $MODEL_CORE   Unused.
 * @param mixed  $MODEL_SEARCH Unused.
 * @param mixed  $MODEL_QAC    Unused.
 * @param bool   $fast         When true, stop after the first index entry that yields a verse.
 * @param bool   $exactWord    When true, only collect from locations whose word in the verse
 *                             text is identical to the simple word.
 *
 * @return array|null Array with the keys WORD_SIMPLE, WORD_UTHMANI, TF, TFIDF, BUCKWALTER,
 *                    ROOT, LEM, POS, VERSES, VERSES_POS_TAGS and FEATURES, or null when the
 *                    simple word is empty or has no inverted-index entry.
 */
function getWordInfo($word, $MODEL_CORE, $MODEL_SEARCH, $MODEL_QAC, $fast = FALSE, $exactWord = FALSE)
{
    $word = trim($word);

    if (isSimpleQuranWord($word)) {
        $wordSimple = $word;
        $wordUthmani = getItemFromUthmaniToSimpleMappingTable($word);
    } else {
        $wordUthmani = $word;
        // tashkeel of last char is significant, ex: lemmas will probably not be in the MAP because of that
        $wordSimple = getItemFromUthmaniToSimpleMappingTable($wordUthmani);
    }

    // Unknown word: bail out before any frequency lookup is attempted with an unusable key.
    if (empty($wordSimple) || !modelEntryExistsInMemory("AR", "MODEL_SEARCH", "INVERTED_INDEX", $wordSimple)) {
        return null;
    }

    $WORDS_FREQUENCY = getModelEntryFromMemory("AR", "MODEL_CORE", "WORDS_FREQUENCY", "");
    $freqArr = isset($WORDS_FREQUENCY['WORDS_TFIDF'][$wordSimple])
        ? $WORDS_FREQUENCY['WORDS_TFIDF'][$wordSimple]
        : array();

    $wordInfoArr = array();
    $wordInfoArr['WORD_SIMPLE'] = $wordSimple;
    $wordInfoArr['WORD_UTHMANI'] = $wordUthmani;
    // A word without a frequency entry reports TF null and TFIDF 0.0.
    $wordInfoArr['TF'] = isset($freqArr['TF']) ? $freqArr['TF'] : null;
    $wordInfoArr['TFIDF'] = round(isset($freqArr['TFIDF']) ? $freqArr['TFIDF'] : 0, 2);

    $buckwalterTransliteration = "";
    $wordRoot = "";
    $posTagsArr = array();
    $lemmasArr = array();
    $featuresArr = array();
    $versesArr = array();
    $versesTagsArr = array();

    $invertedIndexEntry = getModelEntryFromMemory("AR", "MODEL_SEARCH", "INVERTED_INDEX", $wordSimple);
    $QURAN_TEXT = getModelEntryFromMemory("AR", "MODEL_CORE", "QURAN_TEXT", "");
    $TOTALS = getModelEntryFromMemory("AR", "MODEL_CORE", "TOTALS", "");
    $PAUSEMARKS = $TOTALS['PAUSEMARKS'];

    foreach ($invertedIndexEntry as $documentArrInIndex) {
        $WORD_TYPE = $documentArrInIndex['WORD_TYPE'];

        // IGNORE ROOT SOURCES AND PRONOUNS, WE ONLY NEED THE NORMAL CORRESPONDING WORD
        if ($WORD_TYPE == "PRONOUN_ANTECEDENT" || $WORD_TYPE == "ROOT") {
            continue;
        }

        // The index stores SURA/AYA one lower than the numbers the QAC location string uses.
        $qacLocation = getQACLocationStr(
            $documentArrInIndex['SURA'] + 1,
            $documentArrInIndex['AYA'] + 1,
            $documentArrInIndex['INDEX_IN_AYA_UTHMANI']
        );

        $qacMasterTableEntryArr = getModelEntryFromMemory("AR", "MODEL_QAC", "QAC_MASTERTABLE", $qacLocation);
        if (empty($qacMasterTableEntryArr)) {
            // No segments recorded for this location, nothing to collect.
            continue;
        }

        // Values that depend only on the location are computed once, not once per segment.
        $verseText = getVerseByQACLocation($QURAN_TEXT, $qacLocation);
        // Verse-level location: everything before the last ":" of the word location.
        $qacVerseLocation = substr($qacLocation, 0, strrpos($qacLocation, ":"));

        $rejectedByExactMatch = false;
        if ($exactWord == TRUE) {
            $wordId = getWordIndexFromQACLocation($qacLocation);
            $wordFromVerseAtLocation = getWordFromVerseByIndex($PAUSEMARKS, $verseText, $wordId);
            $rejectedByExactMatch = ($wordSimple !== $wordFromVerseAtLocation);
        }

        // search QAC for roots and LEMMAS for this word
        foreach ($qacMasterTableEntryArr as $segmentDataArr) {
            $tag = $segmentDataArr['TAG'];

            // Both values are overwritten by every visited segment, so the result carries those
            // of the last segment seen. array_merge() with a single argument returns a copy.
            // NOTE(review): this also happens for locations rejected by the exact-word check
            // below; behavior kept as-is, confirm whether that is intended.
            $buckwalterTransliteration = $segmentDataArr['FORM_EN'];
            $featuresArr = array_merge($segmentDataArr['FEATURES']);

            if ($rejectedByExactMatch) {
                continue;
            }

            if (isset($segmentDataArr['FEATURES']['ROOT']) && $segmentDataArr['FEATURES']['ROOT'] != -1) {
                $wordRoot = $segmentDataArr['FEATURES']['ROOT'];
            }

            $posTagsArr[$tag] = 1;

            // Record a lemma only when THIS segment has one. Segments without a LEM feature
            // must not reuse the lemma of a previously visited segment, nor add an
            // empty-string key when no lemma has been seen yet.
            if (isset($segmentDataArr['FEATURES']['LEM'])) {
                $lemmasArr[$segmentDataArr['FEATURES']['LEM']] = 1;
            }

            if (!isset($versesArr[$qacVerseLocation])) {
                $versesArr[$qacVerseLocation] = $verseText;
            }

            // Space-separated list of the tags seen in this verse (each prefixed by a space).
            if (!isset($versesTagsArr[$qacVerseLocation])) {
                $versesTagsArr[$qacVerseLocation] = "";
            }
            $versesTagsArr[$qacVerseLocation] .= " " . $tag;
        }

        // we don't need all inverted index list except for verses, only break if we found at least one word
        if ($fast == true && !empty($versesArr)) {
            break;
        }
    }

    $wordInfoArr['BUCKWALTER'] = $buckwalterTransliteration;
    $wordInfoArr['ROOT'] = $wordRoot;
    $wordInfoArr['LEM'] = $lemmasArr;
    $wordInfoArr['POS'] = $posTagsArr;
    $wordInfoArr['VERSES'] = $versesArr;
    $wordInfoArr['VERSES_POS_TAGS'] = $versesTagsArr;
    $wordInfoArr['FEATURES'] = $featuresArr;

    return $wordInfoArr;
}
コード例 #4
0
        foreach ($featuresArr as $oneFeature) {
            $isFeatureFound = isset($qacMasterTableEntry[$segmentId - 1]['FEATURES'][$features]);
        }
        if ($isFeatureFound == false) {
            continue;
        }
    }
    if (!isset($unrepeatedWords[$segmentWord])) {
        $unrepeatedWords[$segmentWord] = 1;
    }
    if (!isset($markedVerses[$verseLocation])) {
        $verseText = getVerseByQACLocation($QURAN_TEXT, $location);
    } else {
        $verseText = $markedVerses[$verseLocation];
    }
    $wordId = getWordIndexFromQACLocation($location) - 1;
    $verseText = markSpecificWordInText($verseText, $wordId, $segmentWord, "marked_fg");
    $markedVerses[$verseLocation] = $verseText;
}
?>
					<div id='pos-words-verses-statistics'> 
					
					
					<b><?php 
echo addCommasToNumber($allOccurencesCount);
?>
</b> All Segments - 
					<b><?php 
echo addCommasToNumber(count($unrepeatedWords));
?>
</b> Distinct Words -