Esempio n. 1
0
/**
 * Splits every verse into sub-sentences and attaches PoS tags to each word.
 *
 * Each verse text is split on single spaces. A token for which isPauseMark()
 * returns true closes the current sub-sentence (the token itself is dropped);
 * every other token is stored together with the tags found for it in the
 * "QAC_MASTERTABLE" entry at "<sura>:<verse>:<wordIndex>", where wordIndex is
 * 1-based and counts only non-pause-mark tokens of the verse.
 *
 * Result shape, keyed by "<sura>:<verse>-<subsentence>" (all numbers 1-based):
 *   array(
 *     "WORDS"            => list of word tokens in the sub-sentence,
 *     "POS_TAGS"         => list of space-joined segment tags, one string per word,
 *     "QAC_WORD_INDEXES" => list of the per-verse word indexes used for the lookup,
 *   )
 * The first sub-sentence of every verse always gets an entry, even when empty;
 * later sub-sentences only appear once they receive a word.
 *
 * Reads the globals $numberOfSuras, $saktaLatifaMark and $sajdahMark.
 *
 * @param string $coreModelUsed "UTH" reads QURAN_TEXT from the "AR_UTH" model,
 *                              any other value reads it from the "AR" model.
 *
 * @return array Map of sub-sentence location => WORDS / POS_TAGS / QAC_WORD_INDEXES.
 */
function getPoSTaggedSubsentences($coreModelUsed = "UTH")
{
    global $numberOfSuras;
    global $saktaLatifaMark, $sajdahMark;

    $posTaggedSubSentencesArr = array();

    // Loose comparison kept on purpose so existing callers behave as before.
    $textModelName = ($coreModelUsed == "UTH") ? "AR_UTH" : "AR";
    $QURAN_TEXT = getModelEntryFromMemory($textModelName, "MODEL_CORE", "QURAN_TEXT", "");

    $TOTALS = getModelEntryFromMemory("AR", "MODEL_CORE", "TOTALS", "");
    $PAUSEMARKS = $TOTALS['PAUSEMARKS'];

    for ($s = 0; $s < $numberOfSuras; $s++) {
        // A missing sura would make count() throw a TypeError on PHP 8; skip it instead.
        if (!isset($QURAN_TEXT[$s]) || !is_array($QURAN_TEXT[$s])) {
            continue;
        }

        $suraSize = count($QURAN_TEXT[$s]);

        for ($a = 0; $a < $suraSize; $a++) {
            // "<sura>:<verse>" with 1-based numbers, shared by both location formats below.
            $versePrefix = ($s + 1) . ":" . ($a + 1);
            $uthmaniWordsArr = explode(" ", $QURAN_TEXT[$s][$a]);

            $subsentenceIndex = 1;
            $verseLocation = $versePrefix . "-" . $subsentenceIndex;

            // The first sub-sentence of a verse is always present in the result.
            if (!isset($posTaggedSubSentencesArr[$verseLocation])) {
                $posTaggedSubSentencesArr[$verseLocation] = array(
                    "WORDS" => array(),
                    "POS_TAGS" => array(),
                    "QAC_WORD_INDEXES" => array(),
                );
            }

            $wordsInSubSentence = 0;
            // Word index inside the verse, not counting pause marks.
            $verseNonPauseWordsIndex = 1;

            foreach ($uthmaniWordsArr as $uthmaniWord) {
                if (isPauseMark($uthmaniWord, $PAUSEMARKS, $saktaLatifaMark, $sajdahMark)) {
                    // A pause mark starts a new sub-sentence and is not stored itself.
                    $subsentenceIndex++;
                    $wordsInSubSentence = 0;
                    $verseLocation = $versePrefix . "-" . $subsentenceIndex;
                    continue;
                }

                $qacLocation = $versePrefix . ":" . $verseNonPauseWordsIndex;
                $qacWordSegmentsArr = getModelEntryFromMemory("AR", "MODEL_QAC", "QAC_MASTERTABLE", $qacLocation);

                // Sub-sentences after the first are created lazily on their first word.
                if (!isset($posTaggedSubSentencesArr[$verseLocation])) {
                    $posTaggedSubSentencesArr[$verseLocation] = array(
                        "WORDS" => array(),
                        "POS_TAGS" => array(),
                        "QAC_WORD_INDEXES" => array(),
                    );
                }

                // Collect one tag per segment; a failed lookup simply yields no tags.
                $wordTags = array();
                if (is_array($qacWordSegmentsArr) || $qacWordSegmentsArr instanceof Traversable) {
                    foreach ($qacWordSegmentsArr as $segmentArr) {
                        $wordTags[] = isset($segmentArr['TAG']) ? $segmentArr['TAG'] : "";
                    }
                }

                $posTaggedSubSentencesArr[$verseLocation]['WORDS'][$wordsInSubSentence] = $uthmaniWord;
                $posTaggedSubSentencesArr[$verseLocation]['POS_TAGS'][$wordsInSubSentence] = trim(implode(" ", $wordTags));
                $posTaggedSubSentencesArr[$verseLocation]['QAC_WORD_INDEXES'][$wordsInSubSentence] = $verseNonPauseWordsIndex;

                $wordsInSubSentence++;
                $verseNonPauseWordsIndex++;
            }
        }
    }

    return $posTaggedSubSentencesArr;
}
Esempio n. 2
0
/**
 * Counts the non-query words that appear directly next to query words in the
 * scored verses and returns the ten most frequent ones.
 *
 * For every entry of $scoringTable the verse text is taken from
 * $MODEL_CORE['QURAN_TEXT'][SURA][AYA], split on single spaces, and each token
 * is passed through cleanAndTrim() and strtolower(). Empty tokens, stop words
 * ($MODEL_CORE['STOP_WORDS']) and pause marks are skipped and do not break
 * adjacency. Whenever exactly one word of an adjacent pair is a key of
 * $extendedQueryWordsArr, the counter of the other word is incremented.
 *
 * Reads the globals $MODEL_CORE, $saktaLatifaMark and $sajdahMark.
 *
 * @param array $extendedQueryWordsArr Query words as array keys (values are not read).
 * @param array $scoringTable          Documents; each value must provide 'SURA' and 'AYA'
 *                                     keys that index $MODEL_CORE['QURAN_TEXT'].
 *
 * @return array Up to ten entries of word => neighbour count, highest count first.
 */
function getStatisticallySginificantWords($extendedQueryWordsArr, $scoringTable)
{
    global $MODEL_CORE;
    global $saktaLatifaMark, $sajdahMark;

    $queryTermsCollocation = array();

    foreach ($scoringTable as $documentScoreArr) {
        $SURA = $documentScoreArr['SURA'];
        $AYA = $documentScoreArr['AYA'];

        $wordsArr = explode(" ", $MODEL_CORE['QURAN_TEXT'][$SURA][$AYA]);

        // Previous accepted word of this verse; adjacency never crosses verses.
        $lastWord = null;

        foreach ($wordsArr as $word) {
            $word = cleanAndTrim($word);
            if (empty($word)) {
                continue;
            }

            $word = strtolower($word);

            if (isset($MODEL_CORE['STOP_WORDS'][$word])) {
                continue;
            }

            if (isPauseMark($word, $MODEL_CORE['TOTALS']['PAUSEMARKS'], $saktaLatifaMark, $sajdahMark)) {
                continue;
            }

            if (!empty($lastWord)) {
                $wordInQuery = isset($extendedQueryWordsArr[$word]);
                $lastWordInQuery = isset($extendedQueryWordsArr[$lastWord]);

                // Credit the non-query side of a (query word, other word) pair.
                $neighbour = null;
                if ($wordInQuery && !$lastWordInQuery) {
                    $neighbour = $lastWord;
                } elseif ($lastWordInQuery && !$wordInQuery) {
                    $neighbour = $word;
                }

                if ($neighbour !== null) {
                    // Start counters at 0 explicitly instead of incrementing an undefined key.
                    if (!isset($queryTermsCollocation[$neighbour])) {
                        $queryTermsCollocation[$neighbour] = 0;
                    }
                    $queryTermsCollocation[$neighbour]++;
                }
            }

            $lastWord = $word;
        }
    }

    arsort($queryTermsCollocation);

    // preserve_keys = true: numeric words (e.g. "7") are stored under integer
    // keys, which array_slice() would otherwise renumber and thereby lose.
    return array_slice($queryTermsCollocation, 0, 10, true);
}