Пример #1
0
/**
 * Collects the QAC segments tagged with a given PoS and groups them by lemma.
 *
 * Every segment listed under $POS in the QAC_POS index is looked up in the
 * QAC master table. The segment's lemma is the key into $finalTerms: a new
 * entry is seeded from generateEmptyConceptMetadata() and filled with LEM,
 * POS, SIMPLE_WORD, ROOT and WEIGHT. Each occurrence then increments FREQ and
 * records the segment's surface form under SEG and the tag under POSES.
 *
 * @param array $finalTerms Accumulator keyed by lemma; modified in place.
 * @param mixed $POS        PoS tag; used as the QAC_POS lookup key and as an
 *                          array key under POSES.
 *
 * @return array The accumulator that was passed in as $finalTerms.
 */
function getWordsByPos(&$finalTerms, $POS)
{
    global $LEMMA_TO_SIMPLE_WORD_MAP;

    $qacPosEntryArr = getModelEntryFromMemory("AR", "MODEL_QAC", "QAC_POS", $POS);

    // Nothing is indexed under this tag: leave the accumulator untouched.
    if (empty($qacPosEntryArr)) {
        return $finalTerms;
    }

    $WORDS_FREQUENCY = getModelEntryFromMemory("AR", "MODEL_CORE", "WORDS_FREQUENCY", "");
    $tfidfTable = isset($WORDS_FREQUENCY['WORDS_TFIDF']) ? $WORDS_FREQUENCY['WORDS_TFIDF'] : array();

    // Loop-invariant markers used to decide which fallback conversion applies.
    $superscriptAlef = json_decode('"\\u0670"');
    // U+0671
    $alefWasla = "ٱ";

    // Walk all segments in QAC for that PoS.
    foreach ($qacPosEntryArr as $location => $segmentId) {
        $qacMasterTableEntry = getModelEntryFromMemory("AR", "MODEL_QAC", "QAC_MASTERTABLE", $location);

        // The master table row for a segment is addressed by segment id minus one.
        $segmentIndex = $segmentId - 1;
        if (!isset($qacMasterTableEntry[$segmentIndex])) {
            // No such segment at this location; there is nothing to group.
            continue;
        }
        $segment = $qacMasterTableEntry[$segmentIndex];

        // Word form, lemma and root of the segment (lemma/root may be absent).
        $segmentWord = isset($segment['FORM_AR']) ? $segment['FORM_AR'] : null;
        $segmentWordLema = isset($segment['FEATURES']['LEM']) ? $segment['FEATURES']['LEM'] : null;
        $segmentWordRoot = isset($segment['FEATURES']['ROOT']) ? $segment['FEATURES']['ROOT'] : null;

        // Resolve the simple (imla2y) spelling of the lemma. The Uthmani-to-simple
        // mapping table is always tried first; only the fallback differs.
        $imla2yWord = getItemFromUthmaniToSimpleMappingTable($segmentWordLema);
        if (empty($imla2yWord)) {
            $hasSpecialAlef = mb_strpos($segmentWordLema, $superscriptAlef) !== false
                || mb_strpos($segmentWordLema, $alefWasla) !== false;

            if ($hasSpecialAlef) {
                // $LEMMA_TO_SIMPLE_WORD_MAP is only trusted for lemmas containing a
                // superscript alef or alef wasla. For other lemmas it is not good:
                // ex. the lemma زيت is converted to زيتها, which spoiled the
                // ontology concept list results.
                $imla2yWord = isset($LEMMA_TO_SIMPLE_WORD_MAP[$segmentWordLema])
                    ? $LEMMA_TO_SIMPLE_WORD_MAP[$segmentWordLema]
                    : null;
            } else {
                $imla2yWord = shallowUthmaniToSimpleConversion($segmentWordLema);
            }
        }

        // Term weight: look up the simple word in the TF-IDF table that was loaded
        // above (a function-local variable, so no global model array is needed).
        $termWeightArr = ($imla2yWord !== null && isset($tfidfTable[$imla2yWord]))
            ? $tfidfTable[$imla2yWord]
            : null;

        if (empty($termWeightArr)) {
            // Fall back to the lemma table for the weight only, since using the
            // lemma table for the word itself decreases matching.
            $imla2yWordForWeight = isset($LEMMA_TO_SIMPLE_WORD_MAP[$segmentWordLema])
                ? $LEMMA_TO_SIMPLE_WORD_MAP[$segmentWordLema]
                : null;
            $termWeightArr = ($imla2yWordForWeight !== null && isset($tfidfTable[$imla2yWordForWeight]))
                ? $tfidfTable[$imla2yWordForWeight]
                : null;
        }
        $termWeight = isset($termWeightArr['TFIDF']) ? $termWeightArr['TFIDF'] : null;

        // Terms are grouped by lemma.
        $termWord = $segmentWordLema;

        if (!isset($finalTerms[$termWord])) {
            $finalTerms[$termWord] = generateEmptyConceptMetadata();
            $finalTerms[$termWord]['LEM'] = $segmentWordLema;
            $finalTerms[$termWord]['POS'] = $POS;
            $finalTerms[$termWord]['SIMPLE_WORD'] = $imla2yWord;
            $finalTerms[$termWord]['ROOT'] = $segmentWordRoot;
            $finalTerms[$termWord]['WEIGHT'] = $termWeight;
        }

        // Count this occurrence; start from zero if the entry carries no FREQ yet.
        $previousFreq = isset($finalTerms[$termWord]["FREQ"]) ? $finalTerms[$termWord]["FREQ"] : 0;
        $finalTerms[$termWord]["FREQ"] = $previousFreq + 1;

        // Remember each distinct surface form once, mapped to the simple word.
        if (!isset($finalTerms[$termWord]["SEG"][$segmentWord])) {
            $finalTerms[$termWord]["SEG"][$segmentWord] = $imla2yWord;
        }

        // Remember every PoS tag this lemma was seen with.
        if (!isset($finalTerms[$termWord]["POSES"][$POS])) {
            $finalTerms[$termWord]["POSES"][$POS] = 1;
        }
    }

    return $finalTerms;
}
         //SPLIT PHRASE ON SPACE
         $biGramWords = preg_split("/ /", $concept);
         // FIRST WORD IS PARENT
         $parentConcept = $biGramWords[0];
         // GET ALL INFO ABOUT THIS WORD - INCLUDING POS TAG
         $wordInfoArr = $wordsInfoArr[$parentConcept];
         //getWordInfo($parentConcept, $MODEL_CORE, $MODEL_SEARCH, $MODEL_QAC,true);
         $parentPosArr = $wordInfoArr['POS'];
         //echoN("%%2:$parentConcept:".preprint_r($parentPosArr,true));
         // if this is a Quranic word, it has to be PN, N or ADJ
         if (!empty($parentPosArr) && !isset($parentPosArr['PN']) && !isset($parentPosArr['N']) && !isset($parentPosArr['ADJ'])) {
             continue;
         }
         $subclassConcept = $concept;
         if (!isset($finalConcepts[$parentConcept])) {
             $finalConcepts[$parentConcept] = array("CONCEPT_TYPE" => "T-BOX", "EXTRACTION_PHASE" => "TAX-RELATIONS", "FREQ" => 1, "EXTRA" => generateEmptyConceptMetadata());
             $finalConcepts[$parentConcept]['EXTRA']['ENG_TRANSLATION'] = cleanEnglishTranslation($WORDS_TRANSLATIONS_AR_EN[$parentConcept]);
         } else {
             // SHOULD SWITCH TO T-BOX SINCE IT IS A PARENT CLASS NOW - FOR OWL SERIALIZATION BUGS
             $finalConcepts[$parentConcept]['CONCEPT_TYPE'] = 'T-BOX';
         }
         $hasType = "{$is_a_relation_name_ar}";
         $type = "TAXONOMIC";
         addRelation($relationsArr, $type, $subclassConcept, $hasType, $parentConcept, "{$pos}", "{$is_a_relation_name_en}");
     }
 }
 echoN("TAXONOMIC RELATIONS - BIGRAM PARENT :" . (count($relationsArr) - $countOfRelationsBefore));
 ///////////////////////////////////////////////////////////////////
 echoN("FINAL TAXONOMIC RELATIONS :" . (count($relationsArr) - $countOfRelationsFirst));
 echoN("BA-A:" . count($finalConcepts));
 //preprint_r($finalConcepts);exit;