/**
 * Splits every verse into sub-sentences at pause marks and attaches the QAC
 * part-of-speech tags to each remaining word.
 *
 * The verse text is split on single spaces. Every token for which
 * isPauseMark() returns true closes the current sub-sentence and is itself
 * dropped; every other token is stored together with the tags of its QAC
 * segments and its 1-based position among the non-pause words of the verse.
 *
 * Relies on the globals $numberOfSuras, $saktaLatifaMark and $sajdahMark.
 *
 * @param string $coreModelUsed "UTH" reads QURAN_TEXT from the "AR_UTH" model;
 *                              any other value reads it from the "AR" model.
 *
 * @return array Map keyed by "<sura>:<verse>-<subsentence>" (all 1-based).
 *               Each value has three parallel, 0-based lists:
 *               - "WORDS":            the words of the sub-sentence,
 *               - "POS_TAGS":         space-separated segment tags per word,
 *               - "QAC_WORD_INDEXES": index of the word within the verse,
 *                                     counted without pause marks.
 */
function getPoSTaggedSubsentences($coreModelUsed = "UTH")
{
    global $numberOfSuras;
    global $saktaLatifaMark, $sajdahMark;

    $posTaggedSubSentencesArr = array();

    $coreModelName = ($coreModelUsed == "UTH") ? "AR_UTH" : "AR";
    $QURAN_TEXT = getModelEntryFromMemory($coreModelName, "MODEL_CORE", "QURAN_TEXT", "");

    // Pause marks are always taken from the "AR" model, whichever text model is used.
    $TOTALS = getModelEntryFromMemory("AR", "MODEL_CORE", "TOTALS", "");
    $PAUSEMARKS = $TOTALS['PAUSEMARKS'];

    /* SURA'S LOOP **/
    for ($s = 0; $s < $numberOfSuras; $s++) {
        $suraSize = count($QURAN_TEXT[$s]);

        /* VERSES LOOP **/
        for ($a = 0; $a < $suraSize; $a++) {
            $verseTextUthmani = $QURAN_TEXT[$s][$a];
            $uthmaniWordsArr = preg_split("/ /", $verseTextUthmani);

            // "<sura>:<verse>" prefix shared by the sub-sentence keys and the QAC keys.
            $versePrefix = ($s + 1) . ":" . ($a + 1);

            $subsentenceIndex = 1;
            $verseLocation = $versePrefix . "-" . $subsentenceIndex;

            // The first sub-sentence of a verse always gets an entry, even if it stays empty.
            if (!isset($posTaggedSubSentencesArr[$verseLocation])) {
                $posTaggedSubSentencesArr[$verseLocation] = array(
                    "WORDS" => array(),
                    "POS_TAGS" => array(),
                    "QAC_WORD_INDEXES" => array(),
                );
            }

            $wordsInSubSentence = 0;
            $verseNonPauseWordsIndex = 1;

            // LOOP ON WORDS
            foreach ($uthmaniWordsArr as $uthmaniWord) {
                // A pause mark starts a new sub-sentence and is not stored as a word.
                if (isPauseMark($uthmaniWord, $PAUSEMARKS, $saktaLatifaMark, $sajdahMark)) {
                    $subsentenceIndex++;
                    $wordsInSubSentence = 0;
                    $verseLocation = $versePrefix . "-" . $subsentenceIndex;
                    continue;
                }

                // QAC locations count words without pause marks, hence the separate counter.
                $qacLocation = $versePrefix . ":" . $verseNonPauseWordsIndex;
                $qacWordSegmentsArr = getModelEntryFromMemory("AR", "MODEL_QAC", "QAC_MASTERTABLE", $qacLocation);

                // Entries for later sub-sentences are created lazily, so a trailing
                // pause mark does not leave an empty sub-sentence behind.
                if (!isset($posTaggedSubSentencesArr[$verseLocation])) {
                    $posTaggedSubSentencesArr[$verseLocation] = array(
                        "WORDS" => array(),
                        "POS_TAGS" => array(),
                        "QAC_WORD_INDEXES" => array(),
                    );
                }

                $posTaggedSubSentencesArr[$verseLocation]['WORDS'][$wordsInSubSentence] = $uthmaniWord;

                // Collect the tag of every segment of the word. A lookup that yields
                // nothing iterable simply produces an empty tag string instead of a
                // foreach warning.
                $segmentTags = array();
                if (is_array($qacWordSegmentsArr) || $qacWordSegmentsArr instanceof Traversable) {
                    foreach ($qacWordSegmentsArr as $segmentArr) {
                        $segmentTags[] = $segmentArr['TAG'];
                    }
                }

                $posTaggedSubSentencesArr[$verseLocation]['POS_TAGS'][$wordsInSubSentence] = trim(implode(" ", $segmentTags));
                $posTaggedSubSentencesArr[$verseLocation]['QAC_WORD_INDEXES'][$wordsInSubSentence] = $verseNonPauseWordsIndex;

                $wordsInSubSentence++;
                $verseNonPauseWordsIndex++;
            }
        }
    }

    return $posTaggedSubSentencesArr;
}
/**
 * Counts which non-query words appear next to query words in the scored
 * verses and returns the ten most frequent ones.
 *
 * Each verse referenced by $scoringTable is read from
 * $MODEL_CORE['QURAN_TEXT'], split on spaces, passed through cleanAndTrim()
 * and lower-cased. Empty tokens, stop words and pause marks are skipped, so
 * "next to" means adjacent among the remaining words of the same verse. A
 * pair is counted only when exactly one of its two words is a key of
 * $extendedQueryWordsArr; the count goes to the word that is NOT a query term.
 *
 * Relies on the globals $MODEL_CORE, $saktaLatifaMark and $sajdahMark.
 *
 * @param array $extendedQueryWordsArr Query terms, looked up by key with isset().
 * @param array $scoringTable          Scored documents; each entry must provide
 *                                     'SURA' and 'AYA' indexes into QURAN_TEXT.
 *
 * @return array Up to 10 entries of word => collocation count, sorted by
 *               count in descending order.
 */
function getStatisticallySginificantWords($extendedQueryWordsArr, $scoringTable)
{
    global $MODEL_CORE;
    global $saktaLatifaMark, $sajdahMark;

    $queryTermsCollocation = array();

    foreach ($scoringTable as $documentID => $documentScoreArr) {
        $SURA = $documentScoreArr['SURA'];
        $AYA = $documentScoreArr['AYA'];
        $TEXT = $MODEL_CORE['QURAN_TEXT'][$SURA][$AYA];

        $wordsArr = explode(" ", $TEXT);

        // Reset per verse so pairs never span two verses.
        $lastWord = null;

        foreach ($wordsArr as $word) {
            $word = cleanAndTrim($word);

            if (empty($word)) {
                continue;
            }

            $word = strtolower($word);

            if (isset($MODEL_CORE['STOP_WORDS'][$word])) {
                continue;
            }

            // ignore pause marks
            if (isPauseMark($word, $MODEL_CORE['TOTALS']['PAUSEMARKS'], $saktaLatifaMark, $sajdahMark)) {
                continue;
            }

            if (!empty($lastWord)) {
                $wordIsQueryTerm = isset($extendedQueryWordsArr[$word]);
                $lastIsQueryTerm = isset($extendedQueryWordsArr[$lastWord]);

                // Credit the non-query neighbour of a query term; pairs where both
                // or neither word is a query term are ignored.
                $neighbour = null;
                if ($wordIsQueryTerm && !$lastIsQueryTerm) {
                    $neighbour = $lastWord;
                } elseif ($lastIsQueryTerm && !$wordIsQueryTerm) {
                    $neighbour = $word;
                }

                if ($neighbour !== null) {
                    // Initialise explicitly: incrementing a missing key raises a warning.
                    if (!isset($queryTermsCollocation[$neighbour])) {
                        $queryTermsCollocation[$neighbour] = 0;
                    }
                    $queryTermsCollocation[$neighbour]++;
                }
            }

            $lastWord = $word;
        }
    }

    arsort($queryTermsCollocation);

    // preserve_keys = true: without it array_slice renumbers integer-like keys,
    // which would replace a numeric word (e.g. "10") by its position in the slice.
    $queryTermsCollocation = array_slice($queryTermsCollocation, 0, 10, true);

    return $queryTermsCollocation;
}