/**
 * Collects the QAC segments tagged with a given part of speech and groups
 * them by lemma inside $finalTerms.
 *
 * For every location => segment id pair stored under QAC_POS for $POS, the
 * segment's Arabic form, lemma and root are read from QAC_MASTERTABLE. The
 * first time a lemma is seen, an entry is created from
 * generateEmptyConceptMetadata() and filled with LEM, POS, SIMPLE_WORD, ROOT
 * and WEIGHT. Every occurrence then increments FREQ, records the segment form
 * under SEG (mapped to its simple word) and flags the tag under POSES.
 *
 * WEIGHT is the TFIDF value stored in WORDS_FREQUENCY['WORDS_TFIDF'] for the
 * simple word; when the simple word has no entry there, the word that
 * $LEMMA_TO_SIMPLE_WORD_MAP gives for the lemma is tried instead. It is null
 * when neither lookup succeeds.
 *
 * @param array  $finalTerms Accumulator keyed by lemma; modified in place.
 * @param string $POS        PoS tag (assumed to be a string tag); used as the
 *                           QAC_POS lookup key and as the POSES array key.
 *
 * @return array The same $finalTerms array after the update.
 */
function getWordsByPos(&$finalTerms, $POS)
{
    global $LEMMA_TO_SIMPLE_WORD_MAP;

    $qacPosEntryArr = getModelEntryFromMemory("AR", "MODEL_QAC", "QAC_POS", $POS);

    // Nothing is tagged with this PoS: leave the accumulator untouched.
    if (empty($qacPosEntryArr)) {
        return $finalTerms;
    }

    $WORDS_FREQUENCY = getModelEntryFromMemory("AR", "MODEL_CORE", "WORDS_FREQUENCY", "");

    // NOTE(review): per-segment lookups of the verse text, the word indexes,
    // the pause marks and a tashkeel-free lemma are intentionally not done
    // here because nothing in this function reads them. This assumes
    // getVerseByQACLocation(), getWordIndexFromQACLocation(),
    // getImla2yWordIndexByUthmaniLocation() and removeTashkeel() have no side
    // effects - confirm.

    // Letters whose presence in a lemma selects the lemma-map fallback below.
    $superscriptAlef = json_decode('"\\u0670"'); // U+0670
    $alefWasla = "ٱ"; // U+0671

    // Walk every segment in QAC that carries this PoS.
    foreach ($qacPosEntryArr as $location => $segmentId) {
        $qacMasterTableEntry = getModelEntryFromMemory("AR", "MODEL_QAC", "QAC_MASTERTABLE", $location);

        // The master-table row for a segment sits at index (segment id - 1).
        $segment = $qacMasterTableEntry[$segmentId - 1];

        $segmentWord = $segment['FORM_AR'];
        $segmentWordLema = $segment['FEATURES']['LEM'];
        // ROOT is treated as optional so a segment without one yields null
        // instead of an undefined-index notice.
        $segmentWordRoot = isset($segment['FEATURES']['ROOT']) ? $segment['FEATURES']['ROOT'] : null;

        // Resolve the simple (imla2y) spelling of the lemma. The Uthmani to
        // simple mapping table is always tried first.
        $imla2yWord = getItemFromUthmaniToSimpleMappingTable($segmentWordLema);

        if (empty($imla2yWord)) {
            $hasSpecialAlef = mb_strpos($segmentWordLema, $superscriptAlef) !== false
                || mb_strpos($segmentWordLema, $alefWasla) !== false;

            if ($hasSpecialAlef) {
                // The lemma map is only used for lemmas containing a
                // superscript alef or alef wasla. For other lemmas it is not
                // reliable (e.g. the lemma زيت is converted to زيتها), which
                // spoiled the ontology concept list results.
                $imla2yWord = isset($LEMMA_TO_SIMPLE_WORD_MAP[$segmentWordLema])
                    ? $LEMMA_TO_SIMPLE_WORD_MAP[$segmentWordLema]
                    : null;
            } else {
                $imla2yWord = shallowUthmaniToSimpleConversion($segmentWordLema);
            }
        }

        // Terms are grouped by lemma.
        $termWord = $segmentWordLema;

        if (!isset($finalTerms[$termWord])) {
            // The weight is only stored when the entry is created, so it is
            // only looked up here.
            $termWeightArr = null;

            if ($imla2yWord !== null && isset($WORDS_FREQUENCY['WORDS_TFIDF'][$imla2yWord])) {
                $termWeightArr = $WORDS_FREQUENCY['WORDS_TFIDF'][$imla2yWord];
            }

            if (empty($termWeightArr) && isset($LEMMA_TO_SIMPLE_WORD_MAP[$segmentWordLema])) {
                // Only the weight falls back to the lemma map; SIMPLE_WORD
                // keeps the spelling resolved above because the lemma map
                // decreases matching quality.
                $imla2yWordForWeight = $LEMMA_TO_SIMPLE_WORD_MAP[$segmentWordLema];

                if (isset($WORDS_FREQUENCY['WORDS_TFIDF'][$imla2yWordForWeight])) {
                    $termWeightArr = $WORDS_FREQUENCY['WORDS_TFIDF'][$imla2yWordForWeight];
                }
            }

            $termWeight = isset($termWeightArr['TFIDF']) ? $termWeightArr['TFIDF'] : null;

            $finalTerms[$termWord] = generateEmptyConceptMetadata();
            $finalTerms[$termWord]['LEM'] = $segmentWordLema;
            $finalTerms[$termWord]['POS'] = $POS;
            $finalTerms[$termWord]['SIMPLE_WORD'] = $imla2yWord;
            $finalTerms[$termWord]['ROOT'] = $segmentWordRoot;
            $finalTerms[$termWord]['WEIGHT'] = $termWeight;
        }

        $finalTerms[$termWord]["FREQ"] = $finalTerms[$termWord]["FREQ"] + 1;

        // Remember each distinct surface form once, mapped to its simple word.
        if (!isset($finalTerms[$termWord]["SEG"][$segmentWord])) {
            $finalTerms[$termWord]["SEG"][$segmentWord] = $imla2yWord;
        }

        if (!isset($finalTerms[$termWord]["POSES"][$POS])) {
            $finalTerms[$termWord]["POSES"][$POS] = 1;
        }
    }

    return $finalTerms;
}
//SPLIT PHRASE ON SPACE $biGramWords = preg_split("/ /", $concept); // FIRST WORD IS PARENT $parentConcept = $biGramWords[0]; // GET ALL INFO ABOUT THIS WORD - INCLUDING POS TAG $wordInfoArr = $wordsInfoArr[$parentConcept]; //getWordInfo($parentConcept, $MODEL_CORE, $MODEL_SEARCH, $MODEL_QAC,true); $parentPosArr = $wordInfoArr['POS']; //echoN("%%2:$parentConcept:".preprint_r($parentPosArr,true)); // if the is a quanic word it has to be PN, N or ADJ if (!empty($parentPosArr) && !isset($parentPosArr['PN']) && !isset($parentPosArr['N']) && !isset($parentPosArr['ADJ'])) { continue; } $subclassConcept = $concept; if (!isset($finalConcepts[$parentConcept])) { $finalConcepts[$parentConcept] = array("CONCEPT_TYPE" => "T-BOX", "EXTRACTION_PHASE" => "TAX-RELATIONS", "FREQ" => 1, "EXTRA" => generateEmptyConceptMetadata()); $finalConcepts[$parentConcept]['EXTRA']['ENG_TRANSLATION'] = cleanEnglishTranslation($WORDS_TRANSLATIONS_AR_EN[$parentConcept]); } else { // SHOULD SWITCH TO T-BOX SINCE IT IS A PARENT CLASS NOW - FOR OWL SERIALIZATION BUGS $finalConcepts[$parentConcept]['CONCEPT_TYPE'] = 'T-BOX'; } $hasType = "{$is_a_relation_name_ar}"; $type = "TAXONOMIC"; addRelation($relationsArr, $type, $subclassConcept, $hasType, $parentConcept, "{$pos}", "{$is_a_relation_name_en}"); } } echoN("TAXONOMIC RELATIONS - BIGRAM PARENT :" . (count($relationsArr) - $countOfRelationsBefore)); /////////////////////////////////////////////////////////////////// echoN("FINAL TAXONOMIC RELATIONS :" . (count($relationsArr) - $countOfRelationsFirst)); echoN("BA-A:" . count($finalConcepts)); //preprint_r($finalConcepts);exit;