/**
 * Re-scores candidate verses against a user question and returns the best answers.
 *
 * Every verse in $scoringTable gets its incoming SCORE boosted by:
 *  - concepts shared between the question text and the verse text,
 *  - verse concepts that are "is-a" instances of the question-type concept,
 *  - (EN only) question nouns/verbs that are substrings of verse words or vice versa,
 *  - (non-EN) roots shared between question nouns and verse words.
 * The three highest-scoring verses are kept, and any of them that has none of the
 * above signals is dropped.
 *
 * @param string $query                  The question text as typed by the user.
 * @param array  $queryWordsArr          Not used by this function; kept for interface compatibility.
 * @param array  $taggedSignificantWords Map of question word => POS tag (e.g. "NN", "VBD").
 * @param array  $scoringTable           Map of document id => array holding at least 'SURA', 'AYA' and 'SCORE'.
 * @param string $lang                   Language code; "EN" selects the substring-based similarity,
 *                                       anything else selects the root-based similarity.
 *
 * @return array array("ANSWER_CONCEPTS" => result of extendQueryWordsByConceptTaxRelations(),
 *                     "ANSWER_VERSES"   => up to three re-scored entries of $scoringTable)
 */
function answerUserQuestion($query, $queryWordsArr, $taggedSignificantWords, $scoringTable, $lang)
{
    global $is_a_relation_name_ar;

    $QURAN_TEXT = getModelEntryFromMemory($lang, "MODEL_CORE", "QURAN_TEXT", "");

    // answering by relevance and similarity
    $conceptsFromTaxRelations = extendQueryWordsByConceptTaxRelations($taggedSignificantWords, $lang, true);

    $COMMON_CONCEPTS_FACTOR = 10;
    $COMMON_QUESTION_TYPE_CONCEPTS_FACTOR = 10;
    $COMMON_ROOTS_FACTOR = 10;
    $COMMON_DERIVATIONS_FACTOR = 10;

    // POS tags of question words that take part in the EN derivation matching
    $derivationPosTags = array("VBN", "VBD", "VBG", "NN", "NNS");

    $scoredAnswerVersesArr = array();

    $questionType = containsQuestionWords($query, $lang);

    ////////// COMMON CONCEPTS IN QUESTION
    $conceptsInQuestionTextArr = getConceptsFoundInText($query, $lang);
    // the question does not change per verse, so extract its concept names once
    $questionConceptNamesArr = array_keys($conceptsInQuestionTextArr);

    /////////// GET CONCEPTS FOR THE QUESTION TYPE
    /// GET INSTANCE CONCEPTS FROM QUESTION TYPE CLASS
    $questionType = cleanAndTrim(strtolower($questionType));
    $conceptID = getModelEntryFromMemory("ALL", "MODEL_QA_ONTOLOGY", "CONCEPTS_EN_AR_NAME_MAP", $questionType);
    $relationsOfConceptAsTarget = getModelEntryFromMemory("ALL", "MODEL_QA_ONTOLOGY", "GRAPH_INDEX_TARGETS", $conceptID);

    $questionTypeConceptsArr = array();

    // the lookup may yield nothing for an unknown question type; only iterate real arrays
    if (is_array($relationsOfConceptAsTarget)) {
        foreach ($relationsOfConceptAsTarget as $relArr) {
            if (!is_array($relArr) || !isset($relArr["link_verb"], $relArr["source"])) {
                continue;
            }

            // keep only the sources linked to the question-type concept by the "is-a" relation
            if ($relArr["link_verb"] != $is_a_relation_name_ar) {
                continue;
            }

            $questionTypeConceptsArr[] = $relArr["source"];
        }
    }

    // per-verse similarity signals, used after sorting to drop unrelated verses
    $debugArr = array();

    // roots of the question nouns; computed lazily once since they do not depend on the verse
    $questionWordsRootsArr = null;

    //// Answering by similarity and relevance
    foreach ($scoringTable as $documentID => $documentScoreArr) {
        $SURA = $documentScoreArr['SURA'];
        $AYA = $documentScoreArr['AYA'];
        $TEXT = $QURAN_TEXT[$SURA][$AYA];
        $score = $documentScoreArr['SCORE'];

        if ($lang == "EN") {
            $TEXT = strtolower($TEXT);
        }

        // every signal starts at zero so the final filter never reads a missing key
        $debugArr[$documentID] = array(
            'COMMON_CONCEPTS' => 0,
            'COMMON_CONCEPTS_LIST' => "",
            'COMMON_QUESTION_TYPE_CONCEPTS' => 0,
            'COMMON_QUESTION_TYPE_CONCEPTS_LIST' => "",
            'COMMON_DERIVATIONS' => 0,
            'COMMON_ROOTS' => 0
        );

        $conceptsInTextArr = getConceptsFoundInText($TEXT, $lang);
        $verseConceptNamesArr = array_keys($conceptsInTextArr);

        /////////// COMMON CONCEPTS BETWEEN QUESTION AND VERSE TEXT
        $commonQuestionVerseConceptsCount = getIntersectionCountOfTwoArrays($questionConceptNamesArr, $verseConceptNamesArr);

        $debugArr[$documentID]['COMMON_CONCEPTS'] = $commonQuestionVerseConceptsCount;
        $debugArr[$documentID]['COMMON_CONCEPTS_LIST'] = join(" ", array_intersect($questionConceptNamesArr, $verseConceptNamesArr));

        $score += $commonQuestionVerseConceptsCount * $COMMON_CONCEPTS_FACTOR;

        /////////// CONCEPTS SHARED WITH THE QUESTION TYPE
        $numberOfSharedConceptsForThisQuestionType = getIntersectionCountOfTwoArrays($questionTypeConceptsArr, $verseConceptNamesArr);

        $score += $numberOfSharedConceptsForThisQuestionType * $COMMON_QUESTION_TYPE_CONCEPTS_FACTOR;

        $debugArr[$documentID]['COMMON_QUESTION_TYPE_CONCEPTS'] = $numberOfSharedConceptsForThisQuestionType;
        $debugArr[$documentID]['COMMON_QUESTION_TYPE_CONCEPTS_LIST'] = join(" ", array_intersect($questionTypeConceptsArr, $verseConceptNamesArr));

        //// QUESTION-VERSE SIMILARITY MEASUREMENT (WITH DERIVATIONS CONSIDERED)
        $wordsInVerseTextArr = explode(" ", $TEXT);

        $commonDerivations = 0;
        $commonRootsCount = 0;

        if ($lang == "EN") {
            // each verse word is counted at most once, whatever number of question words match it
            $derivationHandledB4 = array();

            foreach ($taggedSignificantWords as $wordInQuestion => $pos) {
                // numeric array keys come back as integers; strpos() needs a string needle
                $wordInQuestion = (string) $wordInQuestion;

                // for words like "i" (NOUN in the lexicon for some reason)
                if (mb_strlen($wordInQuestion) <= 2) {
                    continue;
                }

                if (!in_array($pos, $derivationPosTags, true)) {
                    continue;
                }

                foreach ($wordsInVerseTextArr as $wordInArray) {
                    $wordInArray = cleanAndTrim($wordInArray);

                    if (mb_strlen($wordInArray) <= 2) {
                        continue;
                    }

                    if (isset($derivationHandledB4[$wordInArray])) {
                        continue;
                    }

                    // a match is either word being a substring of the other
                    if (strpos($wordInArray, $wordInQuestion) !== false || strpos($wordInQuestion, $wordInArray) !== false) {
                        $commonDerivations++;
                        $derivationHandledB4[$wordInArray] = 1;
                    }
                }
            }
        } else {
            if ($questionWordsRootsArr === null) {
                $questionWordsRootsArr = array();

                foreach ($taggedSignificantWords as $wordInQuestion => $pos) {
                    if (mb_strlen($wordInQuestion) <= 2) {
                        continue;
                    }

                    if ($pos == "NN" || $pos == "NNS") {
                        $root = getRootOfSimpleWord($wordInQuestion, array("N", "V"));

                        if (!empty($root)) {
                            $questionWordsRootsArr[] = $root;
                        }
                    }
                }
            }

            $verseWordsRootsArr = array();

            foreach ($wordsInVerseTextArr as $wordInArray) {
                if (mb_strlen($wordInArray) <= 2) {
                    continue;
                }

                $root = getRootOfSimpleWord($wordInArray, array("N", "V"));

                if (!empty($root)) {
                    $verseWordsRootsArr[] = $root;
                }
            }

            $commonRootsCount = getIntersectionCountOfTwoArrays($verseWordsRootsArr, $questionWordsRootsArr);
        }

        $score += $commonDerivations * $COMMON_DERIVATIONS_FACTOR;
        $debugArr[$documentID]['COMMON_DERIVATIONS'] = $commonDerivations;

        $score += $commonRootsCount * $COMMON_ROOTS_FACTOR;
        $debugArr[$documentID]['COMMON_ROOTS'] = $commonRootsCount;

        $scoringTable[$documentID]['SCORE'] = $score;
        $scoredAnswerVersesArr[$documentID] = $scoringTable[$documentID];
    }

    // NOTE(review): rsortBy() is defined elsewhere; presumably it sorts in place, descending
    // by the given field, and keeps the document-id keys (the filter below relies on them) - confirm.
    rsortBy($scoredAnswerVersesArr, "SCORE");

    // keep the three best candidates only
    $scoredAnswerVersesArr = array_slice($scoredAnswerVersesArr, 0, 3);

    //// REMOVE ANY VERSE FROM THE FINAL LIST WHICH HAS NO OBVIOUS SIMILARITY WITH THE QUESTION
    foreach ($scoredAnswerVersesArr as $documentID => $verseArr) {
        $signals = isset($debugArr[$documentID]) ? $debugArr[$documentID] : array();

        if (empty($signals['COMMON_ROOTS'])
            && empty($signals['COMMON_CONCEPTS'])
            && empty($signals['COMMON_QUESTION_TYPE_CONCEPTS'])
            && empty($signals['COMMON_DERIVATIONS'])
        ) {
            unset($scoredAnswerVersesArr[$documentID]);
        }
    }

    return array("ANSWER_CONCEPTS" => $conceptsFromTaxRelations, "ANSWER_VERSES" => $scoredAnswerVersesArr);
}
/**
 * Builds the scoring table (one entry per matching verse) for a search request.
 *
 * Two modes:
 *  - Column search: the request names a sura ('KEY', 1-based) and either "ALL" or a
 *    verse number ('VAL', 1-based). The named verses are returned with SCORE 1,
 *    unsorted. An unknown sura or verse yields an empty array.
 *  - Index search: every word of $extendedQueryWordsArr is looked up in the inverted
 *    index and each verse it occurs in is scored as
 *    FREQ/2 + DISTANCE + 10*QUERY_WORDS_IN_VERSE + count(PRONOUNS)
 *    + WORD_OCCURENCES_COUNT + 20*IS_FULL_QUERY_IN_VERSE.
 *    The table is then passed to rsortBy() on 'SCORE'.
 *
 * @param array  $extendedQueryWordsArr    Map whose keys are the query words (including variations);
 *                                         the values are not used here.
 * @param string $query                    The original query text; matched literally against verses.
 * @param bool   $isPhraseSearch           When true, verses not containing the whole query are skipped
 *                                         (except for PRONOUN_ANTECEDENT index entries).
 * @param bool   $isQuestion               Not used by this function; kept for interface compatibility.
 * @param bool   $isColumnSearch           Selects the column-search mode described above.
 * @param array  $columnSearchKeyValParams array('KEY' => sura number, 'VAL' => "ALL" or verse number).
 * @param bool   $isConceptSearch          Not used by this function; kept for interface compatibility.
 * @param string $lang                     Language code; "AR" verses get pause marks removed before
 *                                         the full-query match, other languages get special characters removed.
 * @param bool   $isTransliterationSearch  Does not affect the result: both settings record the word the same way.
 *
 * @return array Map of "SURA:AYA" (0-based) => scoring entry; empty array when nothing matches.
 */
function getScoredDocumentsFromInveretdIndex($extendedQueryWordsArr, $query, $isPhraseSearch, $isQuestion, $isColumnSearch, $columnSearchKeyValParams, $isConceptSearch, $lang, $isTransliterationSearch)
{
    $QURAN_TEXT = getModelEntryFromMemory($lang, "MODEL_CORE", "QURAN_TEXT", "");

    // always start from an array so an empty result is array(), never an undefined variable
    $scoringTable = array();

    if ($isColumnSearch) {
        $SURA = $columnSearchKeyValParams['KEY'] - 1;
        $isFullChapter = $columnSearchKeyValParams['VAL'] == "ALL";

        if ($isFullChapter) {
            // SURA VALIDITY CHECK - mirrors the verse validity check of the single-verse branch
            if (!isset($QURAN_TEXT[$SURA]) || !is_array($QURAN_TEXT[$SURA])) {
                return array();
            }

            $suraSize = count($QURAN_TEXT[$SURA]);

            for ($AYA = 0; $AYA < $suraSize; $AYA++) {
                $scoringTable[$SURA . ":" . $AYA] = array('SCORE' => 1, 'SURA' => $SURA, 'AYA' => $AYA);
            }
        } else {
            $AYA = $columnSearchKeyValParams['VAL'] - 1;

            // VERSE VALIDITY CHECK
            $qacLocation = getQACLocationStr($SURA + 1, $AYA + 1, 0);
            $verseText = getVerseByQACLocation($QURAN_TEXT, $qacLocation);

            if (empty($verseText)) {
                return array();
            }

            $scoringTable[$SURA . ":" . $AYA] = array('SCORE' => 1, 'SURA' => $SURA, 'AYA' => $AYA);
        }

        return $scoringTable;
    }

    // The query is user text, not a regular expression: quote it so characters such as
    // "/", "(" or "?" are matched literally instead of breaking or altering the pattern.
    // The pattern does not depend on the word or verse, so it is built once.
    $fullQueryPattern = "/(^|[ ])" . preg_quote($query, "/") . "([ ]|\$)/umi";

    /**
     * GET ALL RESULTS FROM INDEX USING EXTENDED QUERY WORDS (WHICH INCLUDE ALL VARIATIONS AND PRONOUNS)
     */
    foreach ($extendedQueryWordsArr as $word => $targetQACLocation) {
        $invertedIndexEntry = getModelEntryFromMemory($lang, "MODEL_SEARCH", "INVERTED_INDEX", $word);

        // a word that is not in the index has nothing to contribute
        if (empty($invertedIndexEntry) || !is_array($invertedIndexEntry)) {
            continue;
        }

        $wordPattern = "/" . preg_quote((string) $word, "/") . "/um";

        foreach ($invertedIndexEntry as $documentArrInIndex) {
            $SURA = $documentArrInIndex['SURA'];
            $AYA = $documentArrInIndex['AYA'];
            $INDEX_IN_AYA_EMLA2Y = $documentArrInIndex['INDEX_IN_AYA_EMLA2Y'];
            $INDEX_IN_AYA_UTHMANI = $documentArrInIndex['INDEX_IN_AYA_UTHMANI'];
            $WORD_TYPE = $documentArrInIndex['WORD_TYPE'];
            $EXTRA_INFO = $documentArrInIndex['EXTRA_INFO'];

            $docKey = $SURA . ":" . $AYA;

            $qacLocation = getQACLocationStr($SURA + 1, $AYA + 1, $INDEX_IN_AYA_UTHMANI);
            $verseText = getVerseByQACLocation($QURAN_TEXT, $qacLocation);

            /*
             * NOTE: A DECISION SHOULD BE TAKEN TO SEARCH AROUND AND REMOVE PAUSE MARKS OR NOT
             */
            if ($lang == "AR") {
                $verseTextWithoutPauseMarks = removePauseMarkFromVerse($verseText);
            } else {
                $verseTextWithoutPauseMarks = removeSpecialCharactersFromMidQuery($verseText);
            }

            // preg_match_all() returns false on failure; treat that as "no match"
            $fullQueryIsFoundInVerseCount = (int) preg_match_all($fullQueryPattern, $verseTextWithoutPauseMarks);

            if ($isPhraseSearch && $WORD_TYPE != "PRONOUN_ANTECEDENT") {
                $numberOfOccurencesForWord = $fullQueryIsFoundInVerseCount;

                // phrase search: the verse must contain the whole query
                if ($numberOfOccurencesForWord == 0) {
                    continue;
                }
            } else {
                $numberOfOccurencesForWord = (int) preg_match_all($wordPattern, $verseText);
            }

            if (!isset($scoringTable[$docKey])) {
                // first hit for this verse: the word type, extra info and indexes of that hit are kept
                $scoringTable[$docKey] = array(
                    'SCORE' => 0,
                    'FREQ' => 0,
                    'DISTANCE' => 0,
                    'WORD_OCCURENCES_COUNT' => 0,
                    'QUERY_WORDS_IN_VERSE' => 0,
                    'IS_FULL_QUERY_IN_VERSE' => 0,
                    'SURA' => $SURA,
                    'AYA' => $AYA,
                    'POSSIBLE_HIGHLIGHTABLE_WORDS' => array(),
                    'WORD_TYPE' => $WORD_TYPE,
                    'EXTRA_INFO' => $EXTRA_INFO,
                    'INDEX_IN_AYA_EMLA2Y' => $INDEX_IN_AYA_EMLA2Y,
                    'INDEX_IN_AYA_UTHMANI' => $INDEX_IN_AYA_UTHMANI,
                    'PRONOUNS' => array()
                );
            }

            // holds the count of the most recently processed index entry for this verse
            $scoringTable[$docKey]['WORD_OCCURENCES_COUNT'] = $numberOfOccurencesForWord;

            //TODO: seems duplicate of WORD_OCCURENCES_COUNT
            // one increment per index hit; the two branches that used to live here were identical
            $scoringTable[$docKey]['FREQ']++;

            // STEM or PRONOUN
            if ($WORD_TYPE == "PRONOUN_ANTECEDENT") {
                $scoringTable[$docKey]['PRONOUNS'][$EXTRA_INFO] = $INDEX_IN_AYA_EMLA2Y;
            } elseif ($WORD_TYPE == "ROOT" || $WORD_TYPE == "LEM") {
                // for non-normal words this will get the whole segment
                $scoringTable[$docKey]['POSSIBLE_HIGHLIGHTABLE_WORDS'][$word] = $WORD_TYPE;

                // needed to fix roots that are sometimes converted by the uthmani/simple map below
                $scoringTable[$docKey]['POSSIBLE_HIGHLIGHTABLE_WORDS'][removeTashkeel($EXTRA_INFO)] = $WORD_TYPE;

                // try to convert the QAC uthmani word to simple imla2y using the map table,
                // with and without tashkeel, falling back to the word stripped of tashkeel
                $wordInAya = getItemFromUthmaniToSimpleMappingTable($EXTRA_INFO);

                if (empty($wordInAya)) {
                    $wordInAya = getItemFromUthmaniToSimpleMappingTable(removeTashkeel($EXTRA_INFO));
                }

                if (empty($wordInAya)) {
                    $wordInAya = removeTashkeel($EXTRA_INFO);
                }

                $scoringTable[$docKey]['POSSIBLE_HIGHLIGHTABLE_WORDS'][$wordInAya] = $WORD_TYPE;
            } else {
                // normal word: recorded the same way for transliteration and regular searches
                $scoringTable[$docKey]['POSSIBLE_HIGHLIGHTABLE_WORDS'][$word] = $WORD_TYPE;
            }

            $scoringTable[$docKey]['IS_FULL_QUERY_IN_VERSE'] = $fullQueryIsFoundInVerseCount;
            $scoringTable[$docKey]['QUERY_WORDS_IN_VERSE'] = count($scoringTable[$docKey]['POSSIBLE_HIGHLIGHTABLE_WORDS']);

            $scoringTable[$docKey]['SCORE'] = $scoringTable[$docKey]['FREQ'] / 2
                + $scoringTable[$docKey]['DISTANCE'] * 1
                + $scoringTable[$docKey]['QUERY_WORDS_IN_VERSE'] * 10
                + count($scoringTable[$docKey]['PRONOUNS']) * 1
                + $scoringTable[$docKey]['WORD_OCCURENCES_COUNT'] * 1
                + $scoringTable[$docKey]['IS_FULL_QUERY_IN_VERSE'] * 20;
        }
    }

    // NOTE(review): rsortBy() is defined elsewhere; presumably it sorts in place, descending by SCORE - confirm.
    rsortBy($scoringTable, 'SCORE');

    return $scoringTable;
}
function loadModel($lang, $type, $file) { global $WORDS_FREQUENCY_ARR, $TOTALS_ARR, $MODEL_CORE, $MODEL_SEARCH, $MODEL_QAC, $MODEL_QURANA; global $sajdahMark, $saktaLatifaMark, $pauseMarksFile, $serializedModelFile, $basmalaTextUthmani; global $numberOfSuras, $numberOfVerses, $quranMetaDataFile, $arabicStopWordsFile, $englishStopWordsFile; global $META_DATA, $basmalaText, $englishResourceFile, $arabicResourceFile, $quranCorpusMorphologyFile; global $quranaPronounResolutionConceptsFile, $quranaPronounResolutionDataFileTemplate, $quranFileUthmaniAR; global $TRANSLATION_MAP_EN_TO_AR, $TRANSLATION_MAP_AR_TO_EN, $TRANSLITERATION_WORDS_MAP, $TRANSLITERATION_VERSES_MAP; global $basmalaTextUthmani2, $arabicStopWordsFileL2; global $TRANSLITERATION_WORDS_LOCATION_MAP; $QURAN_TEXT = array(); $invertedIndexBatchApcArr = array(); $qacMasterTableBatchApcArr = array(); $qacPOSTableBatchApcArr = array(); $qacFeatureTableBatchApcArr = array(); $TOTALS_ARR = array(); $TOTALS_ARR['CHARS'] = 0; $TOTALS_ARR['WORDS'] = 0; $TOTALS_ARR['NRWORDS'] = 0; $TOTALS_ARR['VERSES'] = 0; $TOTALS_ARR['SURAS'] = $numberOfSuras; $TOTALS_ARR['CHAPTERS'] = 30; $TOTALS_ARR['TOTAL_PER_SURA'] = array(); $TOTALS_ARR['SAJDAT_TELAWA'] = array(); $TOTALS_ARR['PAUSEMARKS'] = array(); $TOTALS_ARR['MIN_WORD_LENGTH'] = 0; $TOTALS_ARR['AVG_WORD_LENGTH'] = 0; $TOTALS_ARR['MAX_WORD_LENGTH'] = 0; $TOTALS_ARR['MIN_WORD'] = null; $TOTALS_ARR['MAX_WORD'] = null; $TOTALS_ARR['MIN_VERSE_LENGTH'] = 0; $TOTALS_ARR['AVG_VERSE_LENGTH'] = 0; $TOTALS_ARR['MAX_VERSE_LENGTH'] = 0; $TOTALS_ARR['MIN_VERSE'] = null; $TOTALS_ARR['MAX_VERSE'] = null; $TOTALS_ARR['SAJDAT_TELAWA']['COUNT'] = 0; $TOTALS_ARR['SAJDAT_TELAWA']['VERSES'] = array(); $TOTALS_ARR['SAKTA_LATIFA']['COUNT'] = 0; $TOTALS_ARR['SAKTA_LATIFA']['VERSES'] = array(); $INVERTED_INDEX = array(); $WORDS_FREQUENCY_ARR = array(); $WORDS_FREQUENCY_ARR['WORDS'] = array(); $WORDS_FREQUENCY_ARR['WORDS_PER_SURA'] = array(); $WORDS_FREQUENCY_ARR['TOTAL_PER_VERSE'] = array(); 
$WORDS_FREQUENCY_ARR['WORDS_TFIDF'] = array(); $WORDS_FREQUENCY_ARR['VERSE_ENDINGS'] = array(); $WORDS_FREQUENCY_ARR['VERSE_BEGINNINGS'] = array(); /** WORD LENGTH **/ $minWordLength = 1000; $minWord = null; $maxWordLength = -1; $maxWord = null; $avgWordLength = 0; /** VERSE LENGTH **/ $minVerseLength = 1000; $minVerse = null; $maxVerseLength = -1; $maxVerse = null; $avgVerseLength = 0; /** QAC Model **/ // Master model, contains all QAC data $qacMasterSegmentTable = array(); //pinters/indexes on the master table for POS and features $qacPOSTable = array(); $qacFeaturesTable = array(); //$qacWordsTable = array(); $qacSegmentToWordTable = array(); /** QURANA Corpus **/ $quranaConcecpts = array(); $quranaResolvedPronouns = array(); ########### LOAD DATA ACCORDING TO MODEL SOURCE TYPE if ($type == "XML") { $sourceContent = simplexml_load_file($file); } else { $sourceContent = file($file, FILE_SKIP_EMPTY_LINES | FILE_IGNORE_NEW_LINES); } if ($type == "TXT") { for ($s = 0; $s < $numberOfVerses; $s++) { $line = $sourceContent[$s]; $lineArr = preg_split("/\\|/", $line); $suraIndex = $lineArr[0]; $ayaIndex = $lineArr[1]; $text = $lineArr[2]; //strip "besm allah alrahman al raheem" from furst aya of all suras except the first one if (strpos($lang, "AR") !== false && $ayaIndex == 1 && $s != 0) { if ($lang == "AR") { $text = trim(str_replace($basmalaText, "", $text)); } else { if ($lang == "AR_UTH") { $text = trim(str_replace($basmalaTextUthmani, "", $text)); $text = trim(str_replace($basmalaTextUthmani2, "", $text)); } } } if (!isset($QURAN_TEXT[$suraIndex - 1])) { $QURAN_TEXT[$suraIndex - 1] = array(); } $QURAN_TEXT[$suraIndex - 1][$ayaIndex - 1] = $text; } } else { if ($type == "XML") { for ($s = 0; $s < $numberOfSuras; $s++) { $suraSize = $META_DATA['SURAS'][$s]['ayas']; for ($a = 0; $a < $suraSize; $a++) { $QURAN_TEXT[$s][$a] = (string) $sourceContent->sura[$s]->aya[$a]['text']; } } } else { throw new Exception("Invalid Source Type ({$type})"); } } 
############################################################## // free resources $sourceContent = null; unset($sourceContent); if ($lang == "AR") { ############ LOAD QAC (Quranic Arabic Corpus) FILE ################################### //dont skip new lines here (FILE_SKIP_EMPTY_LINES) for the skipping "57" condition below to work $qacFileLinesArr = file($quranCorpusMorphologyFile, FILE_IGNORE_NEW_LINES); $rootsLookupArray = array(); $headerIndex = 0; $segmentIndex = 1; foreach ($qacFileLinesArr as $line) { $headerIndex++; //ignore header sections if ($headerIndex <= 57) { continue; } //if ( $segmentIndex >= 2) exit; //echoN($line); // convert columns to array $lineArr = preg_split("/\t/", $line); $location = $lineArr[0]; $formOrSegment = $lineArr[1]; $posTAG = $lineArr[2]; $featuresList = $lineArr[3]; //preprint_r($lineArr); // remove brackets from location and keep it only SURA/AYA/WORDINDEX/SEGMENTINDEX $masterID = preg_replace("/\\(|\\)|/", "", $location); $locationArr = preg_split("/\\:/", $masterID); $wordSegmentID = $locationArr[count($locationArr) - 1]; $wordIndex = $locationArr[count($locationArr) - 2]; $verseID = $locationArr[count($locationArr) - 3]; $suraID = $locationArr[count($locationArr) - 4]; // Remove segment index from location ( will be added as new array below ) $masterID = substr($masterID, 0, strlen($masterID) - 2); // get the reversed buackwalter transliteration for the segment $formOrSegmentReverseTransliterated = buckwalterReverseTransliteration($formOrSegment); //echoN($formOrSegmentReverseTransliterated); // separate features $featuresTempArr = preg_split("/\\|/", $featuresList); //preprint_r($featuresTempArr); $featuresArr = array(); foreach ($featuresTempArr as $oneFeature) { // feature is a key/value set if (strpos($oneFeature, ":") !== false) { $oneFeatureKeyValueArr = preg_split("/\\:/", $oneFeature); $featureName = $oneFeatureKeyValueArr[0]; $featureValue = $oneFeatureKeyValueArr[1]; if ($featureName == "LEM" || $featureName == 
"ROOT") { //echoN($featureValue); $featureValue = buckwalterReverseTransliteration($featureValue); } } else { $featureName = $oneFeature; // 1 here just a dummy value $featureValue = -1; } $featureValue = trim($featureValue); // fill Features Index table //$qacFeaturesTable[$featureName][$masterID]= $featureValue; $apcMemoryEntryKey = "{$lang}/MODEL_QAC/QAC_FEATURES/{$featureName}"; $qacFeatureTableBatchApcArr[$apcMemoryEntryKey][$masterID] = $featureValue; $featuresArr[$featureName] = $featureValue; // non-word features should not be included if ($featureName == "LEM" || $featureName == "ROOT") { addToInvertedIndex($invertedIndexBatchApcArr, $lang, trim($featureValue), $suraID - 1, $verseID - 1, $wordIndex, trim($featureName), $formOrSegmentReverseTransliterated); if ($featureName == "ROOT") { //$rootsLookupArray[$formOrSegmentReverseTransliterated]=$featureValue; addValueToMemoryModel($lang, "MODEL_QAC", "QAC_ROOTS_LOOKUP", $formOrSegmentReverseTransliterated, $featureValue); } } } //location significant before increment below $qacSegmentToWordTable[$segmentIndex] = $wordIndex; // Fill master table //$qacMasterSegmentTable[$masterID][] $qacMasterTableEntry = array("FORM_EN" => $formOrSegment, "FORM_AR" => $formOrSegmentReverseTransliterated, "TAG" => $posTAG, "SEGMENT_INDEX" => $segmentIndex++, "FEATURES" => $featuresArr); $apcMemoryEntryKey = "{$lang}/MODEL_QAC/QAC_MASTERTABLE/{$masterID}"; $qacMasterTableBatchApcArr[$apcMemoryEntryKey][] = $qacMasterTableEntry; // Fill Part of Speech tagging table $qacPOSTable[$posTAG][$masterID] = $wordSegmentID; $apcMemoryEntryKey = "{$lang}/MODEL_QAC/QAC_POS/{$posTAG}"; $qacPOSTableBatchApcArr[$apcMemoryEntryKey][$masterID] = $wordSegmentID; } //preprint_r($qacMasterSegmentTable); //preprint_r($qacFeaturesTable); //preprint_r($qacPOSTable); ############################################################## // free resources $qacFileLinesArr = null; unset($qacFileLinesArr); // need to fluch tabel in memory since it is needed by 
Qurana - in segment function addToMemoryModelBatch($qacMasterTableBatchApcArr); } ######### Qurana Pronomial Anaphone Corpus ################### //echoN($quranaPronounResolutionConceptsFile); // GET XML FILE CONTENT $xmlContent = file_get_contents($quranaPronounResolutionConceptsFile); // LOAD XML OBJECT - trim used to avoid first line empty error $concepts = simplexml_load_string(trim(stripHTMLComments($xmlContent))); // LOAD CONCEPTS foreach ($concepts->con as $index => $conceptObj) { $conceptID = (string) $conceptObj['id']; $conceptNameEN = (string) $conceptObj->english; $conceptNameAR = (string) $conceptObj->arabic; $quranaConcecpts[$conceptID] = array("EN" => trim($conceptNameEN), "AR" => trim($conceptNameAR), "FREQ" => 0); } $pronounsCount = 0; $segmentsCount = 0; //preprint_r($quranaConcecpts); // LOAD PRONOUNS // load & parse the file of each SURA and load it in the model for ($s = 0; $s < $numberOfSuras; $s++) { $suraID = $s + 1; $pronounDataFileName = preg_replace("/%s/", $suraID, $quranaPronounResolutionDataFileTemplate); //echoN($pronounDataFileName); // GET XML FILE CONTENT of the current SURA by customizing file name $xmlContent = file_get_contents($pronounDataFileName); // LOAD XML OBJECT - trim used to avoid first line empty error $chapter = simplexml_load_string(trim(stripHTMLComments($xmlContent))); // LOAD CONCEPTS foreach ($chapter->verse as $index => $verseObj) { $verseLocalSegmentIndex = 0; $versesCount++; // Loop on all children foreach ($verseObj->children() as $index => $childObj) { // get tag name $tagName = $childObj->getName(); $verseLocalSegmentIndex++; $segmentsCount++; // we got a prounoun tag if ($tagName == "pron") { $pronounsCount++; // get the verse including this pronoun $verseID = (string) $verseObj['id']; // get pronoun concept ID and antecendent $conceptID = (string) $childObj['con']; $pronounAntecedent = (string) $childObj['ant']; // get segment ID and word form $quranaSegmentID = (string) $childObj->seg['id']; 
$quranaSegmentForm = (string) $childObj->seg->__toString(); $quranaSegmentForm = trim($quranaSegmentForm); // convert Qurana Segment ID to QAC segment for cross referenceing $qacSegment = getQACSegmentByQuranaSeqment($suraID, $verseID, $verseLocalSegmentIndex, $quranaSegmentForm); //echo("$qacSegment,$quranaSegmentID\n"); // get the id of the word where the segment is $wordId = $qacSegmentToWordTable[$qacSegment]; $quranaConcecpts[$conceptID]["FREQ"]++; // fill pronouns array $quranaResolvedPronouns["{$suraID}:{$verseID}:{$wordId}"][] = array("CONCEPT_ID" => $conceptID, "SEGMENT_INDEX" => $qacSegment, "ANTECEDENT_SEGMENTS" => preg_split("/ /", $pronounAntecedent)); if ($lang == "EN") { addToInvertedIndex($invertedIndexBatchApcArr, $lang, strtolower($quranaConcecpts[$conceptID]['EN']), $suraID - 1, $verseID - 1, $wordId, "PRONOUN_ANTECEDENT", $quranaSegmentForm); } else { addToInvertedIndex($invertedIndexBatchApcArr, $lang, $quranaConcecpts[$conceptID]['AR'], $suraID - 1, $verseID - 1, $wordId, "PRONOUN_ANTECEDENT", $quranaSegmentForm); } } } } } //echoN("SEG:$segmentsCount PRON:$pronounsCount"); //preprint_r($quranaResolvedPronouns); //preprint_r($INVERTED_INDEX);exit; ############################################################## // free resources $xmlContent = null; $concepts = null; unset($xmlContent); unset($concepts); //echo preprint_r($QURAN_TEXT);; if (strpos($lang, "AR") !== false) { $stopWordsArr = getStopWordsArrByFile($arabicStopWordsFile); $stopWordsStrictL2Arr = getStopWordsArrByFile($arabicStopWordsFileL2); $pauseMarksArr = getPauseMarksArrByFile($pauseMarksFile); } else { $stopWordsArr = getStopWordsArrByFile($englishStopWordsFile); $pauseMarksArr = array(); } //preprint_r($stopWordsArr); //preprint_r($pauseMarksArr); if (strpos($lang, "AR") !== false) { // SETTING PAUSE MARKS COUNTER ARRAY foreach ($pauseMarksArr as $pauseMark => $constant) { $TOTALS_ARR['PAUSEMARKS'][$pauseMark] = 0; } } /* SURA'S LOOP **/ for ($s = 0; $s < $numberOfSuras; $s++) { 
$TOTALS_ARR['TOTAL_PER_SURA'][$s] = array(); $suraNameLang = $lang; if ($suraNameLang == "AR_UTH") { $suraNameLang = "AR"; } $suraNameLang = strtolower($lang); $TOTALS_ARR['TOTAL_PER_SURA'][$s]['NAME'] = $META_DATA['SURAS'][$s]['name_' . $suraNameLang]; $TOTALS_ARR['TOTAL_PER_SURA'][$s]['CHARS'] = 0; $TOTALS_ARR['TOTAL_PER_SURA'][$s]['NRWORDS'] = 0; $TOTALS_ARR['TOTAL_PER_SURA'][$s]['WORDS'] = 0; $TOTALS_ARR['TOTAL_PER_SURA'][$s]['VERSES'] = 0; $WORDS_FREQUENCY_ARR['WORDS_PER_SURA'][$s] = array(); } /* SURA'S LOOP **/ for ($s = 0; $s < $numberOfSuras; $s++) { //echoN($quranXMLObj->sura[$s]['name']); $suraSize = $META_DATA['SURAS'][$s]['ayas']; /* VERSES LOOP **/ for ($a = 0; $a < $suraSize; $a++) { //$verseText = $verseText = $QURAN_TEXT[$s][$a]; //echoN("- ".$verseText); $wordsArr = preg_split("/ /", $verseText); /** CALCULATE VERSE LENGTH **/ $wordsInVerseIncludingPauses = count($wordsArr); $wordsInVerse = $wordsInVerseIncludingPauses - count(array_intersect($wordsArr, array_keys($pauseMarksArr))); if ($wordsInVerse >= $maxVerseLength) { $maxVerseLength = $wordsInVerse; $maxVerse = $verseText; } if ($wordsInVerse <= $minWordLength) { if ($wordsInVerse == $minWordLength) { if (mb_strlen($verseText) < mb_strlen($minVerse)) { $minVerseLength = $wordsInVerse; $minVerse = $verseText; } } else { $minVerseLength = $wordsInVerse; $minVerse = $verseText; } } $avgVerseLength += $wordsInVerse; /** END CALCULATE VERSE LENGTH **/ $wordIndex = 0; /* WORDS IN VERSE LOOP **/ foreach ($wordsArr as $word) { $word = trim($word); // PAUSE MARK if (strpos($lang, "AR") !== false && isset($pauseMarksArr[$word])) { $TOTALS_ARR['PAUSEMARKS'][$word]++; continue; } else { // SAJDAH MARK if ($word == $sajdahMark) { $TOTALS_ARR['SAJDAT_TELAWA']['COUNT']++; $TOTALS_ARR['SAJDAT_TELAWA']['VERSES'][] = array($s, $a, $verseText); continue; } else { // SAKTA LATIFA if ($word == $saktaLatifaMark) { $TOTALS_ARR['SAKTA_LATIFA']['COUNT']++; $TOTALS_ARR['SAKTA_LATIFA']['VERSES'][] = array($s, $a, 
$verseText); continue; } } } // Mainly for english translations if ($lang == "EN") { $word = strtolower(cleanAndTrim($word)); } // ignore empty words - result of trimming if (empty($word)) { // the case of " - " in english translations continue; } $wordIndex++; if ($wordIndex == 1) { if (!isset($WORDS_FREQUENCY_ARR['VERSE_BEGINNINGS'][$word])) { $WORDS_FREQUENCY_ARR['VERSE_BEGINNINGS'][$word] = 0; } $WORDS_FREQUENCY_ARR['VERSE_BEGINNINGS'][$word]++; } else { if ($wordIndex == count($wordsArr)) { if (!isset($WORDS_FREQUENCY_ARR['VERSE_ENDINGS'][$word])) { $WORDS_FREQUENCY_ARR['VERSE_ENDINGS'][$word] = 0; } $WORDS_FREQUENCY_ARR['VERSE_ENDINGS'][$word]++; } } $TOTALS_ARR['WORDS']++; if (!isset($WORDS_FREQUENCY_ARR['TOTAL_PER_VERSE'][$s])) { $WORDS_FREQUENCY_ARR['TOTAL_PER_VERSE'][$s] = array(); } if (!isset($WORDS_FREQUENCY_ARR['TOTAL_PER_VERSE'][$s][$a])) { $WORDS_FREQUENCY_ARR['TOTAL_PER_VERSE'][$s][$a] = array(); } if (!isset($WORDS_FREQUENCY_ARR['TOTAL_PER_VERSE'][$s][$a][$word])) { $WORDS_FREQUENCY_ARR['TOTAL_PER_VERSE'][$s][$a][$word] = 0; } $WORDS_FREQUENCY_ARR['TOTAL_PER_VERSE'][$s][$a][$word]++; if (!isset($WORDS_FREQUENCY_ARR['WORDS'][$word])) { $WORDS_FREQUENCY_ARR['WORDS'][$word] = 0; } $WORDS_FREQUENCY_ARR['WORDS'][$word]++; $TOTALS_ARR['TOTAL_PER_SURA'][$s]['WORDS']++; if (!isset($WORDS_FREQUENCY_ARR['WORDS_PER_SURA'][$s][$word])) { $WORDS_FREQUENCY_ARR['WORDS_PER_SURA'][$s][$word] = 0; } $WORDS_FREQUENCY_ARR['WORDS_PER_SURA'][$s][$word]++; //if (!isset($INVERTED_INDEX[$word]) ) $INVERTED_INDEX[$word] = array(); //$INVERTED_INDEX[$word][] = array("SURA"=>$s,"AYA"=>$a,"INDEX_IN_AYA_EMLA2Y"=>$wordIndex,"WORD_TYPE"=>"NORMAL_WORD"); addToInvertedIndex($invertedIndexBatchApcArr, $lang, $word, $s, $a, $wordIndex, "NORMAL_WORD"); /** CALCULATE WORD LENGTHG **/ $wordLength = mb_strlen($word); if ($wordLength >= $maxWordLength) { $maxWordLength = $wordLength; $maxWord = $word; } if ($wordLength <= $minWordLength) { $minWordLength = $wordLength; $minWord = $word; 
} $avgWordLength += $wordLength; /** END CALCULATE WORD LENGTHG **/ $charsInWordArr = preg_split("//u", $word, -1, PREG_SPLIT_NO_EMPTY); /* CHARS IN EACH WORD LOOP **/ foreach ($charsInWordArr as $char) { //echoN($char." ".in_array($char,$pauseMarksArrTemp)); // SPACE if ($char == " ") { continue; } $TOTALS_ARR['CHARS']++; $TOTALS_ARR['TOTAL_PER_SURA'][$s]['CHARS']++; } } $TOTALS_ARR['VERSES']++; $TOTALS_ARR['TOTAL_PER_SURA'][$s]['VERSES']++; // if ( $TOTALS_ARR['VERSES']>30) // exit; } /** END AYA's LOOP **/ } /** END SURA's LOOP **/ /* SURA'S LOOP **/ for ($s = 0; $s < $numberOfSuras; $s++) { $TOTALS_ARR['TOTAL_PER_SURA'][$s]['NRWORDS'] = count($WORDS_FREQUENCY_ARR['WORDS_PER_SURA'][$s]); arsort($WORDS_FREQUENCY_ARR['WORDS_PER_SURA'][$s]); } $TOTALS_ARR['NRWORDS'] = count($WORDS_FREQUENCY_ARR['WORDS']); $TOTALS_ARR['PAUSEMARKS_AGGREGATION'] = 0; // AGGREGATE PAUSE MARKS foreach ($TOTALS_ARR['PAUSEMARKS'] as $pmLabel => $pmCount) { //echo $pmLabel.$pmCount; $TOTALS_ARR['PAUSEMARKS_AGGREGATION'] += $pmCount; } /** * CALCULATING TF-IDF TABLE */ foreach ($WORDS_FREQUENCY_ARR['WORDS'] as $wordLabel => $wordFreq) { $termFrequency = $wordFreq; $termFrequencyPercentage = $termFrequency / $TOTALS_ARR['WORDS'] * 100; // DOCUMENT = VERSE $documentFrequency = 0; $inverseDocumentFrequency = 0; //CHECKING VERSES for ($s = 0; $s < $numberOfSuras; $s++) { //$versesPerSura = $TOTALS_ARR['TOTAL_PER_SURA'][$s]['VERSES']; //for ($a=0;$a<$versesPerSura;$a++) //{ if (isset($WORDS_FREQUENCY_ARR['WORDS_PER_SURA'][$s][$wordLabel])) { //= $WORDS_FREQUENCY_ARR['TOTAL_PER_VERSE'][$s][$a][$wordLabel] $documentFrequency++; } //} } $inverseDocumentFrequency = log($numberOfSuras / $documentFrequency, 10); $TFIDF = $termFrequency * $inverseDocumentFrequency; //echoN("WORD:$wordLabel PRCG:$termFrequencyPercentage TF:$termFrequency DF:$documentFrequency IDF:$inverseDocumentFrequency TFIDF:$TFIDF "); $WORDS_FREQUENCY_ARR['WORDS_TFIDF'][$wordLabel] = array("TF" => $termFrequency, "TPC" => 
$termFrequencyPercentage, "DF" => $documentFrequency, "IDF" => $inverseDocumentFrequency, "TFIDF" => $TFIDF); } /** END OF TFIDF TABLE **/ rsortBy($WORDS_FREQUENCY_ARR['WORDS_TFIDF'], 'TF'); //preprint_r($WORDS_FREQUENCY_ARR['WORDS_TFIDF']); /** Continuing WORD/VERSE LENGTH CALCULATE **/ $avgWordLength = $avgWordLength / $TOTALS_ARR['WORDS']; $avgVerseLength = $avgVerseLength / $TOTALS_ARR['VERSES']; /* echoN($minWordLength." - ".$minWord); echoN($maxWordLength." - ".$maxWord); echoN($avgWordLength); echoN($minVerseLength." - ".$minVerse); echoN($maxVerseLength." - ".$maxVerse); echoN($avgVerseLength); */ $TOTALS_ARR['MIN_WORD_LENGTH'] = $minWordLength; $TOTALS_ARR['AVG_WORD_LENGTH'] = round($avgWordLength, 2); $TOTALS_ARR['MAX_WORD_LENGTH'] = $maxWordLength; $TOTALS_ARR['MIN_WORD'] = $minWord; $TOTALS_ARR['MAX_WORD'] = $maxWord; $TOTALS_ARR['MIN_VERSE_LENGTH'] = $minVerseLength; $TOTALS_ARR['AVG_VERSE_LENGTH'] = round($avgVerseLength, 2); $TOTALS_ARR['MAX_VERSE_LENGTH'] = $maxVerseLength; $TOTALS_ARR['MIN_VERSE'] = $minVerse; $TOTALS_ARR['MAX_VERSE'] = $maxVerse; /** end CALCULATE WORD/VERSE LENGTH **/ //exit;; arsort($WORDS_FREQUENCY_ARR['WORDS']); arsort($WORDS_FREQUENCY_ARR['VERSE_BEGINNINGS']); arsort($WORDS_FREQUENCY_ARR['VERSE_ENDINGS']); //preprint_r($WORDS_FREQUENCY_ARR); /////// LOADING LANGUAGE RESOURCE FILES $resourceFile = $englishResourceFile; if (strpos($lang, "AR") !== false) { $resourceFile = $arabicResourceFile; } $languageResourcesArr = file($resourceFile, FILE_SKIP_EMPTY_LINES | FILE_IGNORE_NEW_LINES); $RESOURCES = array(); foreach ($languageResourcesArr as $index => $resourceLine) { $resourcePairsArr = preg_split("/\\|/", $resourceLine); $resourceID = $resourcePairsArr[0]; $resourceValue = $resourcePairsArr[1]; $RESOURCES[$resourceID] = $resourceValue; } //$MODEL_CORE['LOADED']=1; //$MODEL_CORE[$lang]['META_DATA'] = $META_DATA; addValueToMemoryModel($lang, "MODEL_CORE", "META_DATA", "", $META_DATA); //$MODEL_CORE[$lang]['TOTALS'] = $TOTALS_ARR; 
addValueToMemoryModel($lang, "MODEL_CORE", "TOTALS", "", $TOTALS_ARR); //$MODEL_CORE[$lang]['WORDS_FREQUENCY'] = $WORDS_FREQUENCY_ARR; addValueToMemoryModel($lang, "MODEL_CORE", "WORDS_FREQUENCY", "", $WORDS_FREQUENCY_ARR); addValueToMemoryModel($lang, "MODEL_CORE", "WORDS_FREQUENCY", "WORDS", $WORDS_FREQUENCY_ARR['WORDS']); //$MODEL_CORE[$lang]['QURAN_TEXT'] = $QURAN_TEXT; addValueToMemoryModel($lang, "MODEL_CORE", "QURAN_TEXT", "", $QURAN_TEXT); //$MODEL_CORE[$lang]['RESOURCES']=$RESOURCES; addValueToMemoryModel($lang, "MODEL_CORE", "RESOURCES", "", $RESOURCES); //$MODEL_CORE[$lang]['STOP_WORDS']= $stopWordsArr; addValueToMemoryModel($lang, "MODEL_CORE", "STOP_WORDS", "", $stopWordsArr); //$MODEL_CORE[$lang]['STOP_WORDS_STRICT_L2']= $stopWordsStrictL2Arr; addValueToMemoryModel($lang, "MODEL_CORE", "STOP_WORDS_STRICT_L2", "", $stopWordsStrictL2Arr); //file_put_contents("$serializedModelFile.core", (json_encode($MODEL_CORE))); //$MODEL_SEARCH[$lang]['INVERTED_INDEX'] = $INVERTED_INDEX; /*$invertedIndexIterator = getAPCIterator("MODEL_SEARCH.*"); foreach($invertedIndexIterator as $cursor) { preprint_r($cursor); }*/ addToMemoryModelBatch($invertedIndexBatchApcArr); //$res = apc_store("MODEL_CORE[$lang]",$MODEL_CORE[$lang]); //if ( $res===false){ throw new Exception("Can't cache MODEL_CORE[$lang]"); } //$res = apc_store("MODEL_SEARCH[$lang]",$MODEL_SEARCH[$lang]); //if ( $res===false){ throw new Exception("Can't cache MODEL_SEARCH[$lang]"); } //file_put_contents("$serializedModelFile.search", (json_encode($MODEL_SEARCH))); if ($lang == "AR") { //$MODEL_QAC['QAC_MASTERTABLE'] = $qacMasterSegmentTable; //$MODEL_QAC['QAC_POS'] = $qacPOSTable; addToMemoryModelBatch($qacPOSTableBatchApcArr); //$MODEL_QAC['QAC_FEATURES'] = $qacFeaturesTable; addToMemoryModelBatch($qacFeatureTableBatchApcArr); //$MODEL_QAC['QAC_ROOTS_LOOKUP'] = $rootsLookupArray; //file_put_contents("$serializedModelFile.qac", (json_encode($MODEL_QAC))); //$res = apc_store("MODEL_QAC",$MODEL_QAC); //if ( 
$res===false){ throw new Exception("Can't cache MODEL_QAC"); } rsortBy($quranaConcecpts, 'FREQ'); $MODEL_QURANA['QURANA_CONCEPTS'] = $quranaConcecpts; $MODEL_QURANA['QURANA_PRONOUNS'] = $quranaResolvedPronouns; //file_put_contents("$serializedModelFile.qurana", (json_encode($MODEL_QURANA))); $res = apc_store("MODEL_QURANA", $MODEL_QURANA); if ($res === false) { throw new Exception("Can't cache MODEL_QURANA"); } } //preprint_r($MODEL['INVERTED_INDEX'] );exit; //preprint_r($WORDS_FREQUENCY_ARR['VERSE_ENDINGS']); //echo serialize(json_encode($MODEL)); //preprint_r($MODEL['EN']); }
$weight = 0; foreach ($biGramWords as $biGramTerm) { $weight += floatval($WORDS_FREQUENCY['WORDS_TFIDF'][$biGramTerm]['TFIDF']); } $weight = $weight / 2; ////// //$weight = round($freq/$maxConceptFreq,2); $quranaConceptArr = getQuranaConceptEntryByARWord($biGramConcept); // ADD QURANA TRANSLATION FOR QURANA BIGRAMS $engTranslation = ucfirst($quranaConceptArr['EN']); addNewConcept($finalConcepts, $biGramConcept, "A-BOX", "PHRASE", $freq, $engTranslation); $finalConcepts[$biGramConcept]['EXTRA']['POS'] = $pos; $finalConcepts[$biGramConcept]['EXTRA']['WEIGHT'] = $weight; $finalConcepts[$biGramConcept]['EXTRA']['IS_QURANA_NGRAM_CONCEPT'] = true; } rsortBy($finalConcepts, "FREQ"); echoN("FINAL CONCEPTS COUNT:" . count($finalConcepts)); //preprint_r($finalConcepts); file_put_contents("{$ONTOLOGY_EXTRACTION_FOLDER}/temp.final.concepts.stage1", serialize($finalConcepts)); file_put_contents("{$ONTOLOGY_EXTRACTION_FOLDER}/temp.all.terms", serialize($finalTerms)); } if ($GENERATE_NONTAXONOMIC_RELATIONS) { $finalConcepts = unserialize(file_get_contents("{$ONTOLOGY_EXTRACTION_FOLDER}/temp.final.concepts.stage1")); //preprint_r($finalConcepts);exit; $MODEL_CORE_UTH = loadUthmaniDataModel(); /* SURA'S LOOP **/ for ($s = 0; $s < $numberOfSuras; $s++) { $suraSize = count($MODEL_CORE_UTH['QURAN_TEXT'][$s]); /* VERSES LOOP **/ for ($a = 0; $a < $suraSize; $a++) { $i++;