/**
 * Builds a word co-occurrence graph (nodes + links) from a list of texts.
 *
 * Every text is split on single spaces. Each distinct kept word becomes a node
 * whose "size" counts its occurrences, and every pair of consecutive kept words
 * produces a link from the earlier word's node id to the later one's. The
 * "previous word" is not reset between texts, so the last word of one text is
 * linked to the first word of the next.
 *
 * Reads the globals $pauseMarksFile (handed to getPauseMarksArrByFile()) and
 * $lang (when "EN", words go through cleanAndTrim() and strtolower()).
 *
 * @param array $searchResultTextArr Texts to convert.
 * @param array $excludes            Map of word => 1 for words to leave out.
 *
 * @return array Keys: "capped" (0, or the cap value when processing stopped
 *               early), "nodes" (keyed by word) and "links".
 */
function textToGraph($searchResultTextArr, $excludes)
{
    global $pauseMarksFile, $lang;

    // Checked after each text: once the node count exceeds this, the remaining
    // texts are not processed.
    $MAX_CAP = 300;

    $graphObj = array();
    $graphObj["capped"] = 0;
    $graphNodes = array();
    $graphLinks = array();

    $pauseMarksArr = getPauseMarksArrByFile($pauseMarksFile);

    // Node ids must be zero based for D3 to work.
    $nodeSerialNumber = 0;
    $lastWord = null;

    foreach ($searchResultTextArr as $text) {
        foreach (preg_split("/ /", $text) as $word) {
            if ($lang == "EN") {
                $word = strtolower(cleanAndTrim($word));
            }

            // Consecutive spaces (or a token that cleans down to nothing) yield
            // an empty word; it must not become a node of its own.
            if ((string) $word === "") {
                continue;
            }

            // Existence is checked first so unknown words do not trigger
            // undefined-index diagnostics on every lookup.
            if (!empty($pauseMarksArr[$word])) {
                continue;
            }
            if (isset($excludes[$word]) && $excludes[$word] == 1) {
                continue;
            }

            if (isset($graphNodes[$word])) {
                $graphNodes[$word]["size"]++;
            } else {
                $graphNodes[$word] = array(
                    "id" => $nodeSerialNumber++,
                    "word" => $word,
                    "size" => 1,
                    "x" => rand(1, 800),
                    "y" => rand(1, 400),
                );
            }

            if ($lastWord !== null) {
                $graphLinks[] = array(
                    "source" => $graphNodes[$lastWord]["id"],
                    "target" => $graphNodes[$word]["id"],
                );
            }
            $lastWord = $word;
        }

        if (count($graphNodes) > $MAX_CAP) {
            $graphObj["capped"] = $MAX_CAP;
            break;
        }
    }

    $graphObj["nodes"] = $graphNodes;
    $graphObj["links"] = $graphLinks;

    return $graphObj;
}
$noDerivationsConstraint = true; } if ($columnSearchArr[1] == "NOEXTENTIONFROMONTOLOGY") { $query = str_replace("CONSTRAINT:NOEXTENTIONFROMONTOLOGY", "", $query); $noOntologyExtentionConstraint = true; } } } } } //preprint_r($columnSearchKeyValParams);exit; //echoN("IS QUESTION:$isQuestion"); //echoN("noOntologyExtentionConstraint:$noOntologyExtentionConstraint"); //echoN("noDerivationsConstraint:$noDerivationsConstraint"); /// CLEANING $query = cleanAndTrim($query); //$query = removeTashkeel($query); // remove tashkeel - convert from uthmani to simple // didn't use remove tashkeel since it leaves "hamzet el wasl" which is not in the simple text if (!isSimpleQuranWord($query)) { $query = convertUthamniQueryToSimple($query); } // CASE HANDLING if ($lang == "EN") { $query = strtolower($query); $query = removeSpecialCharactersFromMidQuery($query); } else { $query = removeNonArabicAndSpaceChars($query); } $originalQueryWordsArr = preg_split("/ /", $query); //for faster access
/**
 * Re-scores search results against a user question and returns the best answers.
 *
 * For every entry of $scoringTable the verse text is looked up and its score is
 * raised by 10 points for each of:
 *  - a concept found both in the question and in the verse,
 *  - a verse concept that is a source of an "is-a" relation whose target is the
 *    concept mapped from the question type,
 *  - ("EN" only) a verse word longer than 2 characters that contains, or is
 *    contained in, a significant question word tagged VBN/VBD/VBG/NN/NNS,
 *  - (other languages) a root shared by a verse word and a question word
 *    tagged NN/NNS.
 * The entries are then ordered with rsortBy() on "SCORE", cut to the first
 * three, and any entry whose four overlap counts are all zero is removed.
 *
 * @param string $query                  The question text.
 * @param array  $queryWordsArr          Not used by this function.
 * @param array  $taggedSignificantWords Map of question word => POS tag.
 * @param array  $scoringTable           Map of document ID => array holding at
 *                                       least 'SURA', 'AYA' and 'SCORE'.
 * @param string $lang                   "EN" selects the English code paths.
 *
 * @return array "ANSWER_CONCEPTS" => result of
 *               extendQueryWordsByConceptTaxRelations(), "ANSWER_VERSES" => the
 *               remaining scoring-table entries with their updated 'SCORE'.
 */
function answerUserQuestion($query, $queryWordsArr, $taggedSignificantWords, $scoringTable, $lang)
{
    global $is_a_relation_name_ar;

    $QURAN_TEXT = getModelEntryFromMemory($lang, "MODEL_CORE", "QURAN_TEXT", "");

    // answering by relevance and similarity
    $conceptsFromTaxRelations = extendQueryWordsByConceptTaxRelations($taggedSignificantWords, $lang, true);

    $COMMON_CONCEPTS_FACTOR = 10;
    $COMMON_QUESTION_TYPE_CONCEPTS_FACTOR = 10;
    $COMMON_ROOTS_FACTOR = 10;
    $COMMON_DERIVATIONS_FACTOR = 10;

    $scoredAnswerVersesArr = array();

    $questionType = containsQuestionWords($query, $lang);

    // Concepts mentioned in the question itself.
    $conceptsInQuestionTextArr = getConceptsFoundInText($query, $lang);
    $questionConceptIDs = array_keys($conceptsInQuestionTextArr);

    // Concepts that are instances ("is-a") of the question-type class.
    // The cast keeps strtolower() quiet when no question word was detected.
    $questionType = cleanAndTrim(strtolower((string) $questionType));
    $conceptID = getModelEntryFromMemory("ALL", "MODEL_QA_ONTOLOGY", "CONCEPTS_EN_AR_NAME_MAP", $questionType);
    $relationsOfConceptAsTarget = getModelEntryFromMemory("ALL", "MODEL_QA_ONTOLOGY", "GRAPH_INDEX_TARGETS", $conceptID);

    $questionTypeConceptsArr = array();
    // The lookup may come back empty when the question type maps to no concept.
    if (!empty($relationsOfConceptAsTarget)) {
        foreach ($relationsOfConceptAsTarget as $relArr) {
            if ($relArr["link_verb"] == $is_a_relation_name_ar) {
                $questionTypeConceptsArr[] = $relArr["source"];
            }
        }
    }

    // Per-document overlap counts, consulted again by the final filter.
    $debugArr = array();

    foreach ($scoringTable as $documentID => $documentScoreArr) {
        $SURA = $documentScoreArr['SURA'];
        $AYA = $documentScoreArr['AYA'];
        $TEXT = $QURAN_TEXT[$SURA][$AYA];
        $score = $documentScoreArr['SCORE'];

        if ($lang == "EN") {
            $TEXT = strtolower($TEXT);
        }

        $verseConceptIDs = array_keys(getConceptsFoundInText($TEXT, $lang));

        // Concepts shared by the question and the verse.
        $commonQuestionVerseConceptsCount = getIntersectionCountOfTwoArrays($questionConceptIDs, $verseConceptIDs);
        $debugArr[$documentID]['COMMON_CONCEPTS'] = $commonQuestionVerseConceptsCount;
        $debugArr[$documentID]['COMMON_CONCEPTS_LIST'] = join(" ", array_intersect($questionConceptIDs, $verseConceptIDs));
        $score += $commonQuestionVerseConceptsCount * $COMMON_CONCEPTS_FACTOR;

        // Verse concepts that belong to the question-type class.
        $numberOfSharedConceptsForThisQuestionType = getIntersectionCountOfTwoArrays($questionTypeConceptsArr, $verseConceptIDs);
        $score += $numberOfSharedConceptsForThisQuestionType * $COMMON_QUESTION_TYPE_CONCEPTS_FACTOR;
        $debugArr[$documentID]['COMMON_QUESTION_TYPE_CONCEPTS'] = $numberOfSharedConceptsForThisQuestionType;
        $debugArr[$documentID]['COMMON_QUESTION_TYPE_CONCEPTS_LIST'] = join(" ", array_intersect($questionTypeConceptsArr, $verseConceptIDs));

        // Question/verse similarity. Everything is initialised up front so that
        // neither language path reads an undefined value or one left over from
        // the previous document.
        $wordsInVerseTextArr = explode(" ", $TEXT);
        $derivationHandledB4 = array();
        $commonDerivations = 0;
        $questionWordsRootsArr = array();
        $verseWordsRootsArr = array();

        if ($lang == "EN") {
            foreach ($taggedSignificantWords as $wordInQuestion => $pos) {
                // Purely numeric words arrive as integer array keys.
                $wordInQuestion = (string) $wordInQuestion;

                // for words like "i" (a NOUN in the lexicon for some reason)
                if (mb_strlen($wordInQuestion) <= 2) {
                    continue;
                }
                if (!($pos == "VBN" || $pos == "VBD" || $pos == "VBG" || $pos == "NN" || $pos == "NNS")) {
                    continue;
                }

                foreach ($wordsInVerseTextArr as $wordInArray) {
                    $wordInArray = cleanAndTrim($wordInArray);
                    if (mb_strlen($wordInArray) <= 2) {
                        continue;
                    }

                    // Either word being a substring of the other counts as a
                    // derivation; each verse word is counted once only.
                    $isSubstringMatch = strpos($wordInArray, $wordInQuestion) !== false
                        || strpos($wordInQuestion, $wordInArray) !== false;
                    if ($isSubstringMatch && !isset($derivationHandledB4[$wordInArray])) {
                        $commonDerivations++;
                        $derivationHandledB4[$wordInArray] = 1;
                    }
                }
            }
        } else {
            // NOTE(review): the question roots do not depend on the document and
            // could be computed once before this loop if getRootOfSimpleWord()
            // is side-effect free - confirm before hoisting.
            foreach ($taggedSignificantWords as $wordInQuestion => $pos) {
                if (mb_strlen($wordInQuestion) <= 2) {
                    continue;
                }
                if ($pos == "NN" || $pos == "NNS") {
                    $root = getRootOfSimpleWord($wordInQuestion, array("N", "V"));
                    if (!empty($root)) {
                        $questionWordsRootsArr[] = $root;
                    }
                }
            }

            foreach ($wordsInVerseTextArr as $wordInArray) {
                if (mb_strlen($wordInArray) <= 2) {
                    continue;
                }
                $root = getRootOfSimpleWord($wordInArray, array("N", "V"));
                if (!empty($root)) {
                    $verseWordsRootsArr[] = $root;
                }
            }
        }

        // Recorded for every language (zero outside "EN") because the final
        // filter reads this entry for every document.
        $score += $commonDerivations * $COMMON_DERIVATIONS_FACTOR;
        $debugArr[$documentID]['COMMON_DERIVATIONS'] = $commonDerivations;

        $commonRootsCount = getIntersectionCountOfTwoArrays($verseWordsRootsArr, $questionWordsRootsArr);
        $score += $commonRootsCount * $COMMON_ROOTS_FACTOR;
        $debugArr[$documentID]['COMMON_ROOTS'] = $commonRootsCount;

        $scoringTable[$documentID]['SCORE'] = $score;
        $scoredAnswerVersesArr[$documentID] = $scoringTable[$documentID];
    }

    rsortBy($scoredAnswerVersesArr, "SCORE");

    // Keep the keys: the filter below uses them to index $debugArr, and
    // array_slice() would otherwise renumber integer document IDs.
    // NOTE(review): assumes rsortBy() itself keeps the keys - confirm.
    $scoredAnswerVersesArr = array_slice($scoredAnswerVersesArr, 0, 3, true);

    // Remove any verse from the final list which has no obvious similarity
    // with the question.
    foreach ($scoredAnswerVersesArr as $documentID => $verseArr) {
        if ($debugArr[$documentID]['COMMON_ROOTS'] == 0
            && $debugArr[$documentID]['COMMON_CONCEPTS'] == 0
            && $debugArr[$documentID]['COMMON_QUESTION_TYPE_CONCEPTS'] == 0
            && $debugArr[$documentID]['COMMON_DERIVATIONS'] == 0
        ) {
            unset($scoredAnswerVersesArr[$documentID]);
        }
    }

    return array("ANSWER_CONCEPTS" => $conceptsFromTaxRelations, "ANSWER_VERSES" => $scoredAnswerVersesArr);
}
/**
 * Returns the ontology concepts mentioned in a text, keyed by concept ID.
 *
 * The text is split on single spaces. For "EN" each word is cleaned,
 * lower-cased and translated through the CONCEPTS_EN_AR_NAME_MAP model entry;
 * for any other language convertWordToConceptID() is used. Words whose ID has
 * no entry under CONCEPTS, and the root class named by the global
 * $thing_class_name_ar, are ignored.
 *
 * @param string $text Text to scan.
 * @param string $lang "EN" selects the English lookup and the English label.
 *
 * @return array Map of concept ID => concept object built by
 *               createNewConceptObj() (level 1, no start location).
 */
function getConceptsFoundInText($text, $lang)
{
    global $thing_class_name_ar;

    $conceptsInTextArr = array();

    // These three were previously passed to createNewConceptObj() without ever
    // being defined. Positions are irrelevant here, so they stay null as before;
    // the serial now starts at a real zero.
    // NOTE(review): createNewConceptObj() appears to advance the serial itself
    // (ontologyTextToD3Graph() never increments it either) - confirm.
    $nodeSerialNumber = 0;
    $randomXLocation = null;
    $randomYLocation = null;

    foreach (preg_split("/ /", $text) as $word) {
        if ($lang == "EN") {
            $word = strtolower(cleanAndTrim($word));

            // translate English name to arabic concept name/id
            $wordConveretedToConceptID = getModelEntryFromMemory("ALL", "MODEL_QA_ONTOLOGY", "CONCEPTS_EN_AR_NAME_MAP", $word);
        } else {
            $wordConveretedToConceptID = convertWordToConceptID($word);
        }

        if (!modelEntryExistsInMemory("ALL", "MODEL_QA_ONTOLOGY", "CONCEPTS", $wordConveretedToConceptID)) {
            continue;
        }

        $mainConceptArr = getModelEntryFromMemory("ALL", "MODEL_QA_ONTOLOGY", "CONCEPTS", $wordConveretedToConceptID);

        if ($lang == "EN") {
            $finalNodeLabel = $mainConceptArr['label_en'];
        } else {
            $finalNodeLabel = $mainConceptArr['label_ar'];
        }

        if ($wordConveretedToConceptID == $thing_class_name_ar) {
            continue;
        }

        $conceptsInTextArr[$wordConveretedToConceptID] = createNewConceptObj($nodeSerialNumber, $lang, $finalNodeLabel, $mainConceptArr, $randomXLocation, $randomYLocation, 1);
    }

    return $conceptsInTextArr;
}
<div id='main-container'> <div id='graph-maingraph-area'> <?php
// Count how often each cleaned, lower-cased word occurs in sura $SURA and
// collect the verse texts that are turned into the concept graph below.
$customFreqArr = array();
$QURAN_TEXT = getModelEntryFromMemory($lang, "MODEL_CORE", "QURAN_TEXT", "");
$suraSize = count($QURAN_TEXT[$SURA]);
for ($a = 0; $a < $suraSize; $a++) {
    $verseText = $QURAN_TEXT[$SURA][$a];
    $verseTextArr = explode(" ", $verseText);
    foreach ($verseTextArr as $index => $word) {
        $word = cleanAndTrim($word);
        $word = strtolower($word);
        // Start the counter explicitly: incrementing a missing key raises an
        // undefined-index diagnostic, which can end up in this page's output
        // when error display is on.
        if (!isset($customFreqArr[$word])) {
            $customFreqArr[$word] = 0;
        }
        $customFreqArr[$word]++;
    }
    $arrOfTextToGraph[] = $verseText;
}

// Only relations between concepts found in the verses are drawn
// (main concepts only, canvas of 960x600).
$graphObj = ontologyTextToD3Graph($MODEL_QA_ONTOLOGY, "SEARCH_RESULTS_TEXT_ARRAY", $arrOfTextToGraph, 0, array(960, 600), $lang, 1);

// Size every node by the frequency of its label inside this sura; labels that
// never occur as a single word fall back to 1.
foreach ($graphObj['nodes'] as $index => $nodeArr) {
    $word = strtolower($nodeArr['word']);
    if (isset($customFreqArr[$word])) {
        $graphObj['nodes'][$index]['size'] = $customFreqArr[$word];
    } else {
        $graphObj['nodes'][$index]['size'] = 1;
    }
}
/**
 * Builds a D3 graph (nodes + links) of ontology concepts and their relations.
 *
 * Phase 1 turns words into level-1 concept nodes. The words come from each
 * text split on spaces when $inputType is "SEARCH_RESULTS_TEXT_ARRAY";
 * otherwise from the keys of $searchResultTextArr, or from $query alone when
 * $isPhraseSearch is set. Concepts whose 'frequency' is below $minFreq and the
 * root class ($thing_class_name_ar) are left out.
 *
 * Phase 2 walks the relations in which every level-1 concept is the source or
 * the target and adds one link per node pair; further verbs between the same
 * pair are appended to that link's "link_verb", separated by commas. Related
 * concepts that are not nodes yet are added as level-2 nodes, unless
 * $mainConceptsOnly is set or the input is a search-result text array. When
 * phase 1 produced more than 200 nodes, "is-a" relations are skipped.
 *
 * Note on the signature: $minFreq keeps its historical default even though
 * required parameters follow it, so callers always have to pass it. The
 * signature is intentionally left untouched for existing callers.
 *
 * @param mixed  $MODEL_QA_ONTOLOGY   Not used; ontology data is read through
 *                                    getModelEntryFromMemory().
 * @param string $inputType           "SEARCH_RESULTS_TEXT_ARRAY" or any other
 *                                    value for a word-keyed array.
 * @param array  $searchResultTextArr Texts, or a map whose keys are the words.
 * @param int    $minFreq             Minimum concept frequency for phase 1.
 * @param array  $widthHeigthArr      array(width, height) of the drawing area;
 *                                    nodes start within 100 units of its centre.
 * @param string $lang                "EN" selects English labels and verbs.
 * @param bool   $mainConceptsOnly    Only link concepts found in phase 1.
 * @param bool   $isPhraseSearch      Look up $query as one phrase.
 * @param bool   $isQuestion          Not used by this function.
 * @param string $query               Phrase used when $isPhraseSearch is set.
 *
 * @return array "nodes" => zero-indexed list of concept objects,
 *               "links" => list of arrays with "source", "target",
 *               "link_verb" and "link_frequency".
 */
function ontologyTextToD3Graph($MODEL_QA_ONTOLOGY, $inputType, $searchResultTextArr, $minFreq = 0, $widthHeigthArr, $lang, $mainConceptsOnly = false, $isPhraseSearch = false, $isQuestion = false, $query = "")
{
    global $thing_class_name_ar, $is_a_relation_name_ar;

    $graphObj = array();
    $graphNodes = array();
    $graphLinks = array();

    // Start locations: a 200x200 box around the centre of the drawing area.
    // Cast to int because rand() takes integers and an odd width/height would
    // otherwise hand it a fractional value.
    $width = $widthHeigthArr[0];
    $height = $widthHeigthArr[1];
    $startLocationXMin = (int) ($width / 2 - 100);
    $startLocationXMax = (int) ($width / 2 + 100);
    $startLocationYMin = (int) ($height / 2 - 100);
    $startLocationYMax = (int) ($height / 2 + 100);

    // Node ids must be zero based for D3 to work.
    $nodeSerialNumber = 0;

    $isTextInput = ($inputType == "SEARCH_RESULTS_TEXT_ARRAY");

    // ---- Phase 1: concepts mentioned in the input ----
    foreach ($searchResultTextArr as $text) {
        if ($isTextInput) {
            $textWordsArr = preg_split("/ /", $text);
        } elseif (!$isPhraseSearch) {
            // extended query words: the words are the array keys
            $textWordsArr = array_keys($searchResultTextArr);
        } else {
            // a phrase should be checked as is
            $textWordsArr = array($query);
        }

        foreach ($textWordsArr as $word) {
            if ($lang == "EN") {
                $word = strtolower(cleanAndTrim($word));

                // translate English name to arabic concept name/id
                $wordConveretedToConceptID = getModelEntryFromMemory("ALL", "MODEL_QA_ONTOLOGY", "CONCEPTS_EN_AR_NAME_MAP", $word);
            } else {
                $wordConveretedToConceptID = convertWordToConceptID($word);
            }

            if (!modelEntryExistsInMemory("ALL", "MODEL_QA_ONTOLOGY", "CONCEPTS", $wordConveretedToConceptID)) {
                continue;
            }

            $mainConceptArr = getModelEntryFromMemory("ALL", "MODEL_QA_ONTOLOGY", "CONCEPTS", $wordConveretedToConceptID);

            if ($mainConceptArr['frequency'] < $minFreq) {
                continue;
            }
            if ($wordConveretedToConceptID == $thing_class_name_ar) {
                continue;
            }
            if (isset($graphNodes[$wordConveretedToConceptID])) {
                continue;
            }

            if ($lang == "EN") {
                $finalNodeLabel = $mainConceptArr['label_en'];
            } else {
                $finalNodeLabel = $mainConceptArr['label_ar'];
            }

            $randomXLocation = rand($startLocationXMin, $startLocationXMax);
            $randomYLocation = rand($startLocationYMin, $startLocationYMax);
            $graphNodes[$wordConveretedToConceptID] = createNewConceptObj($nodeSerialNumber, $lang, $finalNodeLabel, $mainConceptArr, $randomXLocation, $randomYLocation, 1);
        }
    }

    $tooManyConcepts = count($graphNodes) > 200;

    $ONTOLOGY_RELATIONS = getModelEntryFromMemory("ALL", "MODEL_QA_ONTOLOGY", "RELATIONS", "");

    // "source/target" hash => index in $graphLinks, used to merge parallel links.
    $linksHashLookupTable = array();

    // ---- Phase 2: relations of every phase-1 concept ----
    // Nodes added inside this loop are not visited themselves: foreach walks
    // the array as it was when the loop started.
    foreach ($graphNodes as $concept => $conceptNodeArr) {
        $conceptID = convertWordToConceptID($concept);

        $relationsOfConceptAsSource = getModelEntryFromMemory("ALL", "MODEL_QA_ONTOLOGY", "GRAPH_INDEX_SOURCES", $conceptID);
        $relationsOfConceptAsTarget = getModelEntryFromMemory("ALL", "MODEL_QA_ONTOLOGY", "GRAPH_INDEX_TARGETS", $conceptID);

        // A concept may have no relations in one direction; treat a missing
        // entry as an empty list instead of handing it to foreach.
        if (empty($relationsOfConceptAsSource)) {
            $relationsOfConceptAsSource = array();
        }
        if (empty($relationsOfConceptAsTarget)) {
            $relationsOfConceptAsTarget = array();
        }

        // Relations where this concept is the source: concept -> object
        foreach ($relationsOfConceptAsSource as $relArr) {
            $verb = $relArr["link_verb"];
            $object = $relArr["target"];

            // ignore is-a thing relations
            if ($verb == $is_a_relation_name_ar && $object == $thing_class_name_ar) {
                continue;
            }
            if ($tooManyConcepts && $verb == $is_a_relation_name_ar) {
                continue;
            }
            // if showing main concepts only, ignore concepts not in the main list
            if ($mainConceptsOnly && !isset($graphNodes[$object])) {
                continue;
            }
            // No extending by relations for search-result text: to reduce the
            // number of concepts only relations with other concepts found in
            // the text are added.
            if ($isTextInput && !isset($graphNodes[$object])) {
                continue;
            }

            $relHashID = buildRelationHashID($conceptID, $verb, $object);
            $fullRelationArr = $ONTOLOGY_RELATIONS[$relHashID];

            $objectConceptArr = getModelEntryFromMemory("ALL", "MODEL_QA_ONTOLOGY", "CONCEPTS", $object);

            $finalNodeLabel = $objectConceptArr['label_ar'];
            if ($lang == "EN") {
                $finalNodeLabel = formatEnglishConcept($objectConceptArr['label_en']);
                $verb = $fullRelationArr['VERB_TRANSLATION_EN'];
            }

            if (!isset($graphNodes[$object])) {
                $randomXLocation = rand($startLocationXMin, $startLocationXMax);
                $randomYLocation = rand($startLocationYMin, $startLocationYMax);
                $graphNodes[$object] = createNewConceptObj($nodeSerialNumber, $lang, $finalNodeLabel, $objectConceptArr, $randomXLocation, $randomYLocation, 2);
            }

            $linkArr = array(
                "source" => $graphNodes[$concept]["id"],
                "target" => $graphNodes[$object]["id"],
                "link_verb" => $verb,
                "link_frequency" => $fullRelationArr['FREQUENCY'],
            );

            // Multiple relations between the same two nodes share one link.
            $arrHash = getArrayHashForFields($linkArr, array('source', 'target'));
            if (!isset($linksHashLookupTable[$arrHash])) {
                $graphLinks[] = $linkArr;
                $linksHashLookupTable[$arrHash] = count($graphLinks) - 1;
            } else {
                $linkIndex = $linksHashLookupTable[$arrHash];
                if (strpos($graphLinks[$linkIndex]['link_verb'], (string) $verb) === false) {
                    $graphLinks[$linkIndex]['link_verb'] .= "," . $verb;
                }
            }
        }

        // Relations where this concept is the target: subject -> concept
        foreach ($relationsOfConceptAsTarget as $relArr) {
            $verb = $relArr["link_verb"];
            $subject = $relArr["source"];

            // if showing main concepts only, ignore concepts not in the main list
            if ($mainConceptsOnly && !isset($graphNodes[$subject])) {
                continue;
            }
            if ($tooManyConcepts && $verb == $is_a_relation_name_ar) {
                continue;
            }
            // Same "no extension" rule as above. The other end of the relation
            // is $subject here; this check used to test $object, a value left
            // over from the previous loop.
            if ($isTextInput && !isset($graphNodes[$subject])) {
                continue;
            }

            $relHashID = buildRelationHashID($subject, $verb, $concept);
            $fullRelationArr = $ONTOLOGY_RELATIONS[$relHashID];

            $subjectConceptArr = getModelEntryFromMemory("ALL", "MODEL_QA_ONTOLOGY", "CONCEPTS", $subject);

            $finalNodeLabel = $subjectConceptArr['label_ar'];
            if ($lang == "EN") {
                $finalNodeLabel = formatEnglishConcept($subjectConceptArr['label_en']);
                $verb = $fullRelationArr['VERB_TRANSLATION_EN'];
            }

            if (!isset($graphNodes[$subject])) {
                $randomXLocation = rand($startLocationXMin, $startLocationXMax);
                $randomYLocation = rand($startLocationYMin, $startLocationYMax);
                $graphNodes[$subject] = createNewConceptObj($nodeSerialNumber, $lang, $finalNodeLabel, $subjectConceptArr, $randomXLocation, $randomYLocation, 2);
            }

            $linkArr = array(
                "source" => $graphNodes[$subject]["id"],
                "target" => $graphNodes[$concept]["id"],
                "link_verb" => $verb,
                // Same key as the source-relations loop above; this loop used
                // to read a lower-case 'frequency' from the same relation
                // records. NOTE(review): confirm relation records expose
                // 'FREQUENCY' (they are read with upper-case keys elsewhere
                // in this function).
                "link_frequency" => $fullRelationArr['FREQUENCY'],
            );

            // Multiple relations between the same two nodes share one link.
            $arrHash = getArrayHashForFields($linkArr, array('source', 'target'));
            if (!isset($linksHashLookupTable[$arrHash])) {
                $graphLinks[] = $linkArr;
                $linksHashLookupTable[$arrHash] = count($graphLinks) - 1;
            } else {
                $linkIndex = $linksHashLookupTable[$arrHash];
                if (strpos($graphLinks[$linkIndex]['link_verb'], (string) $verb) === false) {
                    $graphLinks[$linkIndex]['link_verb'] .= "," . $verb;
                }
            }
        }
    }

    // D3 expects a plain zero-indexed list of nodes, not a map keyed by concept.
    $graphObj["nodes"] = array_values($graphNodes);
    $graphObj["links"] = $graphLinks;

    return $graphObj;
}
function loadModel($lang, $type, $file) { global $WORDS_FREQUENCY_ARR, $TOTALS_ARR, $MODEL_CORE, $MODEL_SEARCH, $MODEL_QAC, $MODEL_QURANA; global $sajdahMark, $saktaLatifaMark, $pauseMarksFile, $serializedModelFile, $basmalaTextUthmani; global $numberOfSuras, $numberOfVerses, $quranMetaDataFile, $arabicStopWordsFile, $englishStopWordsFile; global $META_DATA, $basmalaText, $englishResourceFile, $arabicResourceFile, $quranCorpusMorphologyFile; global $quranaPronounResolutionConceptsFile, $quranaPronounResolutionDataFileTemplate, $quranFileUthmaniAR; global $TRANSLATION_MAP_EN_TO_AR, $TRANSLATION_MAP_AR_TO_EN, $TRANSLITERATION_WORDS_MAP, $TRANSLITERATION_VERSES_MAP; global $basmalaTextUthmani2, $arabicStopWordsFileL2; global $TRANSLITERATION_WORDS_LOCATION_MAP; $QURAN_TEXT = array(); $invertedIndexBatchApcArr = array(); $qacMasterTableBatchApcArr = array(); $qacPOSTableBatchApcArr = array(); $qacFeatureTableBatchApcArr = array(); $TOTALS_ARR = array(); $TOTALS_ARR['CHARS'] = 0; $TOTALS_ARR['WORDS'] = 0; $TOTALS_ARR['NRWORDS'] = 0; $TOTALS_ARR['VERSES'] = 0; $TOTALS_ARR['SURAS'] = $numberOfSuras; $TOTALS_ARR['CHAPTERS'] = 30; $TOTALS_ARR['TOTAL_PER_SURA'] = array(); $TOTALS_ARR['SAJDAT_TELAWA'] = array(); $TOTALS_ARR['PAUSEMARKS'] = array(); $TOTALS_ARR['MIN_WORD_LENGTH'] = 0; $TOTALS_ARR['AVG_WORD_LENGTH'] = 0; $TOTALS_ARR['MAX_WORD_LENGTH'] = 0; $TOTALS_ARR['MIN_WORD'] = null; $TOTALS_ARR['MAX_WORD'] = null; $TOTALS_ARR['MIN_VERSE_LENGTH'] = 0; $TOTALS_ARR['AVG_VERSE_LENGTH'] = 0; $TOTALS_ARR['MAX_VERSE_LENGTH'] = 0; $TOTALS_ARR['MIN_VERSE'] = null; $TOTALS_ARR['MAX_VERSE'] = null; $TOTALS_ARR['SAJDAT_TELAWA']['COUNT'] = 0; $TOTALS_ARR['SAJDAT_TELAWA']['VERSES'] = array(); $TOTALS_ARR['SAKTA_LATIFA']['COUNT'] = 0; $TOTALS_ARR['SAKTA_LATIFA']['VERSES'] = array(); $INVERTED_INDEX = array(); $WORDS_FREQUENCY_ARR = array(); $WORDS_FREQUENCY_ARR['WORDS'] = array(); $WORDS_FREQUENCY_ARR['WORDS_PER_SURA'] = array(); $WORDS_FREQUENCY_ARR['TOTAL_PER_VERSE'] = array(); 
$WORDS_FREQUENCY_ARR['WORDS_TFIDF'] = array(); $WORDS_FREQUENCY_ARR['VERSE_ENDINGS'] = array(); $WORDS_FREQUENCY_ARR['VERSE_BEGINNINGS'] = array(); /** WORD LENGTH **/ $minWordLength = 1000; $minWord = null; $maxWordLength = -1; $maxWord = null; $avgWordLength = 0; /** VERSE LENGTH **/ $minVerseLength = 1000; $minVerse = null; $maxVerseLength = -1; $maxVerse = null; $avgVerseLength = 0; /** QAC Model **/ // Master model, contains all QAC data $qacMasterSegmentTable = array(); //pinters/indexes on the master table for POS and features $qacPOSTable = array(); $qacFeaturesTable = array(); //$qacWordsTable = array(); $qacSegmentToWordTable = array(); /** QURANA Corpus **/ $quranaConcecpts = array(); $quranaResolvedPronouns = array(); ########### LOAD DATA ACCORDING TO MODEL SOURCE TYPE if ($type == "XML") { $sourceContent = simplexml_load_file($file); } else { $sourceContent = file($file, FILE_SKIP_EMPTY_LINES | FILE_IGNORE_NEW_LINES); } if ($type == "TXT") { for ($s = 0; $s < $numberOfVerses; $s++) { $line = $sourceContent[$s]; $lineArr = preg_split("/\\|/", $line); $suraIndex = $lineArr[0]; $ayaIndex = $lineArr[1]; $text = $lineArr[2]; //strip "besm allah alrahman al raheem" from furst aya of all suras except the first one if (strpos($lang, "AR") !== false && $ayaIndex == 1 && $s != 0) { if ($lang == "AR") { $text = trim(str_replace($basmalaText, "", $text)); } else { if ($lang == "AR_UTH") { $text = trim(str_replace($basmalaTextUthmani, "", $text)); $text = trim(str_replace($basmalaTextUthmani2, "", $text)); } } } if (!isset($QURAN_TEXT[$suraIndex - 1])) { $QURAN_TEXT[$suraIndex - 1] = array(); } $QURAN_TEXT[$suraIndex - 1][$ayaIndex - 1] = $text; } } else { if ($type == "XML") { for ($s = 0; $s < $numberOfSuras; $s++) { $suraSize = $META_DATA['SURAS'][$s]['ayas']; for ($a = 0; $a < $suraSize; $a++) { $QURAN_TEXT[$s][$a] = (string) $sourceContent->sura[$s]->aya[$a]['text']; } } } else { throw new Exception("Invalid Source Type ({$type})"); } } 
############################################################## // free resources $sourceContent = null; unset($sourceContent); if ($lang == "AR") { ############ LOAD QAC (Quranic Arabic Corpus) FILE ################################### //dont skip new lines here (FILE_SKIP_EMPTY_LINES) for the skipping "57" condition below to work $qacFileLinesArr = file($quranCorpusMorphologyFile, FILE_IGNORE_NEW_LINES); $rootsLookupArray = array(); $headerIndex = 0; $segmentIndex = 1; foreach ($qacFileLinesArr as $line) { $headerIndex++; //ignore header sections if ($headerIndex <= 57) { continue; } //if ( $segmentIndex >= 2) exit; //echoN($line); // convert columns to array $lineArr = preg_split("/\t/", $line); $location = $lineArr[0]; $formOrSegment = $lineArr[1]; $posTAG = $lineArr[2]; $featuresList = $lineArr[3]; //preprint_r($lineArr); // remove brackets from location and keep it only SURA/AYA/WORDINDEX/SEGMENTINDEX $masterID = preg_replace("/\\(|\\)|/", "", $location); $locationArr = preg_split("/\\:/", $masterID); $wordSegmentID = $locationArr[count($locationArr) - 1]; $wordIndex = $locationArr[count($locationArr) - 2]; $verseID = $locationArr[count($locationArr) - 3]; $suraID = $locationArr[count($locationArr) - 4]; // Remove segment index from location ( will be added as new array below ) $masterID = substr($masterID, 0, strlen($masterID) - 2); // get the reversed buackwalter transliteration for the segment $formOrSegmentReverseTransliterated = buckwalterReverseTransliteration($formOrSegment); //echoN($formOrSegmentReverseTransliterated); // separate features $featuresTempArr = preg_split("/\\|/", $featuresList); //preprint_r($featuresTempArr); $featuresArr = array(); foreach ($featuresTempArr as $oneFeature) { // feature is a key/value set if (strpos($oneFeature, ":") !== false) { $oneFeatureKeyValueArr = preg_split("/\\:/", $oneFeature); $featureName = $oneFeatureKeyValueArr[0]; $featureValue = $oneFeatureKeyValueArr[1]; if ($featureName == "LEM" || $featureName == 
"ROOT") { //echoN($featureValue); $featureValue = buckwalterReverseTransliteration($featureValue); } } else { $featureName = $oneFeature; // 1 here just a dummy value $featureValue = -1; } $featureValue = trim($featureValue); // fill Features Index table //$qacFeaturesTable[$featureName][$masterID]= $featureValue; $apcMemoryEntryKey = "{$lang}/MODEL_QAC/QAC_FEATURES/{$featureName}"; $qacFeatureTableBatchApcArr[$apcMemoryEntryKey][$masterID] = $featureValue; $featuresArr[$featureName] = $featureValue; // non-word features should not be included if ($featureName == "LEM" || $featureName == "ROOT") { addToInvertedIndex($invertedIndexBatchApcArr, $lang, trim($featureValue), $suraID - 1, $verseID - 1, $wordIndex, trim($featureName), $formOrSegmentReverseTransliterated); if ($featureName == "ROOT") { //$rootsLookupArray[$formOrSegmentReverseTransliterated]=$featureValue; addValueToMemoryModel($lang, "MODEL_QAC", "QAC_ROOTS_LOOKUP", $formOrSegmentReverseTransliterated, $featureValue); } } } //location significant before increment below $qacSegmentToWordTable[$segmentIndex] = $wordIndex; // Fill master table //$qacMasterSegmentTable[$masterID][] $qacMasterTableEntry = array("FORM_EN" => $formOrSegment, "FORM_AR" => $formOrSegmentReverseTransliterated, "TAG" => $posTAG, "SEGMENT_INDEX" => $segmentIndex++, "FEATURES" => $featuresArr); $apcMemoryEntryKey = "{$lang}/MODEL_QAC/QAC_MASTERTABLE/{$masterID}"; $qacMasterTableBatchApcArr[$apcMemoryEntryKey][] = $qacMasterTableEntry; // Fill Part of Speech tagging table $qacPOSTable[$posTAG][$masterID] = $wordSegmentID; $apcMemoryEntryKey = "{$lang}/MODEL_QAC/QAC_POS/{$posTAG}"; $qacPOSTableBatchApcArr[$apcMemoryEntryKey][$masterID] = $wordSegmentID; } //preprint_r($qacMasterSegmentTable); //preprint_r($qacFeaturesTable); //preprint_r($qacPOSTable); ############################################################## // free resources $qacFileLinesArr = null; unset($qacFileLinesArr); // need to fluch tabel in memory since it is needed by 
Qurana - in segment function addToMemoryModelBatch($qacMasterTableBatchApcArr); } ######### Qurana Pronomial Anaphone Corpus ################### //echoN($quranaPronounResolutionConceptsFile); // GET XML FILE CONTENT $xmlContent = file_get_contents($quranaPronounResolutionConceptsFile); // LOAD XML OBJECT - trim used to avoid first line empty error $concepts = simplexml_load_string(trim(stripHTMLComments($xmlContent))); // LOAD CONCEPTS foreach ($concepts->con as $index => $conceptObj) { $conceptID = (string) $conceptObj['id']; $conceptNameEN = (string) $conceptObj->english; $conceptNameAR = (string) $conceptObj->arabic; $quranaConcecpts[$conceptID] = array("EN" => trim($conceptNameEN), "AR" => trim($conceptNameAR), "FREQ" => 0); } $pronounsCount = 0; $segmentsCount = 0; //preprint_r($quranaConcecpts); // LOAD PRONOUNS // load & parse the file of each SURA and load it in the model for ($s = 0; $s < $numberOfSuras; $s++) { $suraID = $s + 1; $pronounDataFileName = preg_replace("/%s/", $suraID, $quranaPronounResolutionDataFileTemplate); //echoN($pronounDataFileName); // GET XML FILE CONTENT of the current SURA by customizing file name $xmlContent = file_get_contents($pronounDataFileName); // LOAD XML OBJECT - trim used to avoid first line empty error $chapter = simplexml_load_string(trim(stripHTMLComments($xmlContent))); // LOAD CONCEPTS foreach ($chapter->verse as $index => $verseObj) { $verseLocalSegmentIndex = 0; $versesCount++; // Loop on all children foreach ($verseObj->children() as $index => $childObj) { // get tag name $tagName = $childObj->getName(); $verseLocalSegmentIndex++; $segmentsCount++; // we got a prounoun tag if ($tagName == "pron") { $pronounsCount++; // get the verse including this pronoun $verseID = (string) $verseObj['id']; // get pronoun concept ID and antecendent $conceptID = (string) $childObj['con']; $pronounAntecedent = (string) $childObj['ant']; // get segment ID and word form $quranaSegmentID = (string) $childObj->seg['id']; 
$quranaSegmentForm = (string) $childObj->seg->__toString(); $quranaSegmentForm = trim($quranaSegmentForm); // convert Qurana Segment ID to QAC segment for cross referenceing $qacSegment = getQACSegmentByQuranaSeqment($suraID, $verseID, $verseLocalSegmentIndex, $quranaSegmentForm); //echo("$qacSegment,$quranaSegmentID\n"); // get the id of the word where the segment is $wordId = $qacSegmentToWordTable[$qacSegment]; $quranaConcecpts[$conceptID]["FREQ"]++; // fill pronouns array $quranaResolvedPronouns["{$suraID}:{$verseID}:{$wordId}"][] = array("CONCEPT_ID" => $conceptID, "SEGMENT_INDEX" => $qacSegment, "ANTECEDENT_SEGMENTS" => preg_split("/ /", $pronounAntecedent)); if ($lang == "EN") { addToInvertedIndex($invertedIndexBatchApcArr, $lang, strtolower($quranaConcecpts[$conceptID]['EN']), $suraID - 1, $verseID - 1, $wordId, "PRONOUN_ANTECEDENT", $quranaSegmentForm); } else { addToInvertedIndex($invertedIndexBatchApcArr, $lang, $quranaConcecpts[$conceptID]['AR'], $suraID - 1, $verseID - 1, $wordId, "PRONOUN_ANTECEDENT", $quranaSegmentForm); } } } } } //echoN("SEG:$segmentsCount PRON:$pronounsCount"); //preprint_r($quranaResolvedPronouns); //preprint_r($INVERTED_INDEX);exit; ############################################################## // free resources $xmlContent = null; $concepts = null; unset($xmlContent); unset($concepts); //echo preprint_r($QURAN_TEXT);; if (strpos($lang, "AR") !== false) { $stopWordsArr = getStopWordsArrByFile($arabicStopWordsFile); $stopWordsStrictL2Arr = getStopWordsArrByFile($arabicStopWordsFileL2); $pauseMarksArr = getPauseMarksArrByFile($pauseMarksFile); } else { $stopWordsArr = getStopWordsArrByFile($englishStopWordsFile); $pauseMarksArr = array(); } //preprint_r($stopWordsArr); //preprint_r($pauseMarksArr); if (strpos($lang, "AR") !== false) { // SETTING PAUSE MARKS COUNTER ARRAY foreach ($pauseMarksArr as $pauseMark => $constant) { $TOTALS_ARR['PAUSEMARKS'][$pauseMark] = 0; } } /* SURA'S LOOP **/ for ($s = 0; $s < $numberOfSuras; $s++) { 
$TOTALS_ARR['TOTAL_PER_SURA'][$s] = array(); $suraNameLang = $lang; if ($suraNameLang == "AR_UTH") { $suraNameLang = "AR"; } $suraNameLang = strtolower($lang); $TOTALS_ARR['TOTAL_PER_SURA'][$s]['NAME'] = $META_DATA['SURAS'][$s]['name_' . $suraNameLang]; $TOTALS_ARR['TOTAL_PER_SURA'][$s]['CHARS'] = 0; $TOTALS_ARR['TOTAL_PER_SURA'][$s]['NRWORDS'] = 0; $TOTALS_ARR['TOTAL_PER_SURA'][$s]['WORDS'] = 0; $TOTALS_ARR['TOTAL_PER_SURA'][$s]['VERSES'] = 0; $WORDS_FREQUENCY_ARR['WORDS_PER_SURA'][$s] = array(); } /* SURA'S LOOP **/ for ($s = 0; $s < $numberOfSuras; $s++) { //echoN($quranXMLObj->sura[$s]['name']); $suraSize = $META_DATA['SURAS'][$s]['ayas']; /* VERSES LOOP **/ for ($a = 0; $a < $suraSize; $a++) { //$verseText = $verseText = $QURAN_TEXT[$s][$a]; //echoN("- ".$verseText); $wordsArr = preg_split("/ /", $verseText); /** CALCULATE VERSE LENGTH **/ $wordsInVerseIncludingPauses = count($wordsArr); $wordsInVerse = $wordsInVerseIncludingPauses - count(array_intersect($wordsArr, array_keys($pauseMarksArr))); if ($wordsInVerse >= $maxVerseLength) { $maxVerseLength = $wordsInVerse; $maxVerse = $verseText; } if ($wordsInVerse <= $minWordLength) { if ($wordsInVerse == $minWordLength) { if (mb_strlen($verseText) < mb_strlen($minVerse)) { $minVerseLength = $wordsInVerse; $minVerse = $verseText; } } else { $minVerseLength = $wordsInVerse; $minVerse = $verseText; } } $avgVerseLength += $wordsInVerse; /** END CALCULATE VERSE LENGTH **/ $wordIndex = 0; /* WORDS IN VERSE LOOP **/ foreach ($wordsArr as $word) { $word = trim($word); // PAUSE MARK if (strpos($lang, "AR") !== false && isset($pauseMarksArr[$word])) { $TOTALS_ARR['PAUSEMARKS'][$word]++; continue; } else { // SAJDAH MARK if ($word == $sajdahMark) { $TOTALS_ARR['SAJDAT_TELAWA']['COUNT']++; $TOTALS_ARR['SAJDAT_TELAWA']['VERSES'][] = array($s, $a, $verseText); continue; } else { // SAKTA LATIFA if ($word == $saktaLatifaMark) { $TOTALS_ARR['SAKTA_LATIFA']['COUNT']++; $TOTALS_ARR['SAKTA_LATIFA']['VERSES'][] = array($s, $a, 
$verseText); continue; } } } // Mainly for english translations if ($lang == "EN") { $word = strtolower(cleanAndTrim($word)); } // ignore empty words - result of trimming if (empty($word)) { // the case of " - " in english translations continue; } $wordIndex++; if ($wordIndex == 1) { if (!isset($WORDS_FREQUENCY_ARR['VERSE_BEGINNINGS'][$word])) { $WORDS_FREQUENCY_ARR['VERSE_BEGINNINGS'][$word] = 0; } $WORDS_FREQUENCY_ARR['VERSE_BEGINNINGS'][$word]++; } else { if ($wordIndex == count($wordsArr)) { if (!isset($WORDS_FREQUENCY_ARR['VERSE_ENDINGS'][$word])) { $WORDS_FREQUENCY_ARR['VERSE_ENDINGS'][$word] = 0; } $WORDS_FREQUENCY_ARR['VERSE_ENDINGS'][$word]++; } } $TOTALS_ARR['WORDS']++; if (!isset($WORDS_FREQUENCY_ARR['TOTAL_PER_VERSE'][$s])) { $WORDS_FREQUENCY_ARR['TOTAL_PER_VERSE'][$s] = array(); } if (!isset($WORDS_FREQUENCY_ARR['TOTAL_PER_VERSE'][$s][$a])) { $WORDS_FREQUENCY_ARR['TOTAL_PER_VERSE'][$s][$a] = array(); } if (!isset($WORDS_FREQUENCY_ARR['TOTAL_PER_VERSE'][$s][$a][$word])) { $WORDS_FREQUENCY_ARR['TOTAL_PER_VERSE'][$s][$a][$word] = 0; } $WORDS_FREQUENCY_ARR['TOTAL_PER_VERSE'][$s][$a][$word]++; if (!isset($WORDS_FREQUENCY_ARR['WORDS'][$word])) { $WORDS_FREQUENCY_ARR['WORDS'][$word] = 0; } $WORDS_FREQUENCY_ARR['WORDS'][$word]++; $TOTALS_ARR['TOTAL_PER_SURA'][$s]['WORDS']++; if (!isset($WORDS_FREQUENCY_ARR['WORDS_PER_SURA'][$s][$word])) { $WORDS_FREQUENCY_ARR['WORDS_PER_SURA'][$s][$word] = 0; } $WORDS_FREQUENCY_ARR['WORDS_PER_SURA'][$s][$word]++; //if (!isset($INVERTED_INDEX[$word]) ) $INVERTED_INDEX[$word] = array(); //$INVERTED_INDEX[$word][] = array("SURA"=>$s,"AYA"=>$a,"INDEX_IN_AYA_EMLA2Y"=>$wordIndex,"WORD_TYPE"=>"NORMAL_WORD"); addToInvertedIndex($invertedIndexBatchApcArr, $lang, $word, $s, $a, $wordIndex, "NORMAL_WORD"); /** CALCULATE WORD LENGTHG **/ $wordLength = mb_strlen($word); if ($wordLength >= $maxWordLength) { $maxWordLength = $wordLength; $maxWord = $word; } if ($wordLength <= $minWordLength) { $minWordLength = $wordLength; $minWord = $word; 
} $avgWordLength += $wordLength; /** END CALCULATE WORD LENGTHG **/ $charsInWordArr = preg_split("//u", $word, -1, PREG_SPLIT_NO_EMPTY); /* CHARS IN EACH WORD LOOP **/ foreach ($charsInWordArr as $char) { //echoN($char." ".in_array($char,$pauseMarksArrTemp)); // SPACE if ($char == " ") { continue; } $TOTALS_ARR['CHARS']++; $TOTALS_ARR['TOTAL_PER_SURA'][$s]['CHARS']++; } } $TOTALS_ARR['VERSES']++; $TOTALS_ARR['TOTAL_PER_SURA'][$s]['VERSES']++; // if ( $TOTALS_ARR['VERSES']>30) // exit; } /** END AYA's LOOP **/ } /** END SURA's LOOP **/ /* SURA'S LOOP **/ for ($s = 0; $s < $numberOfSuras; $s++) { $TOTALS_ARR['TOTAL_PER_SURA'][$s]['NRWORDS'] = count($WORDS_FREQUENCY_ARR['WORDS_PER_SURA'][$s]); arsort($WORDS_FREQUENCY_ARR['WORDS_PER_SURA'][$s]); } $TOTALS_ARR['NRWORDS'] = count($WORDS_FREQUENCY_ARR['WORDS']); $TOTALS_ARR['PAUSEMARKS_AGGREGATION'] = 0; // AGGREGATE PAUSE MARKS foreach ($TOTALS_ARR['PAUSEMARKS'] as $pmLabel => $pmCount) { //echo $pmLabel.$pmCount; $TOTALS_ARR['PAUSEMARKS_AGGREGATION'] += $pmCount; } /** * CALCULATING TF-IDF TABLE */ foreach ($WORDS_FREQUENCY_ARR['WORDS'] as $wordLabel => $wordFreq) { $termFrequency = $wordFreq; $termFrequencyPercentage = $termFrequency / $TOTALS_ARR['WORDS'] * 100; // DOCUMENT = VERSE $documentFrequency = 0; $inverseDocumentFrequency = 0; //CHECKING VERSES for ($s = 0; $s < $numberOfSuras; $s++) { //$versesPerSura = $TOTALS_ARR['TOTAL_PER_SURA'][$s]['VERSES']; //for ($a=0;$a<$versesPerSura;$a++) //{ if (isset($WORDS_FREQUENCY_ARR['WORDS_PER_SURA'][$s][$wordLabel])) { //= $WORDS_FREQUENCY_ARR['TOTAL_PER_VERSE'][$s][$a][$wordLabel] $documentFrequency++; } //} } $inverseDocumentFrequency = log($numberOfSuras / $documentFrequency, 10); $TFIDF = $termFrequency * $inverseDocumentFrequency; //echoN("WORD:$wordLabel PRCG:$termFrequencyPercentage TF:$termFrequency DF:$documentFrequency IDF:$inverseDocumentFrequency TFIDF:$TFIDF "); $WORDS_FREQUENCY_ARR['WORDS_TFIDF'][$wordLabel] = array("TF" => $termFrequency, "TPC" => 
$termFrequencyPercentage, "DF" => $documentFrequency, "IDF" => $inverseDocumentFrequency, "TFIDF" => $TFIDF); } /** END OF TFIDF TABLE **/ rsortBy($WORDS_FREQUENCY_ARR['WORDS_TFIDF'], 'TF'); //preprint_r($WORDS_FREQUENCY_ARR['WORDS_TFIDF']); /** Continuing WORD/VERSE LENGTH CALCULATE **/ $avgWordLength = $avgWordLength / $TOTALS_ARR['WORDS']; $avgVerseLength = $avgVerseLength / $TOTALS_ARR['VERSES']; /* echoN($minWordLength." - ".$minWord); echoN($maxWordLength." - ".$maxWord); echoN($avgWordLength); echoN($minVerseLength." - ".$minVerse); echoN($maxVerseLength." - ".$maxVerse); echoN($avgVerseLength); */ $TOTALS_ARR['MIN_WORD_LENGTH'] = $minWordLength; $TOTALS_ARR['AVG_WORD_LENGTH'] = round($avgWordLength, 2); $TOTALS_ARR['MAX_WORD_LENGTH'] = $maxWordLength; $TOTALS_ARR['MIN_WORD'] = $minWord; $TOTALS_ARR['MAX_WORD'] = $maxWord; $TOTALS_ARR['MIN_VERSE_LENGTH'] = $minVerseLength; $TOTALS_ARR['AVG_VERSE_LENGTH'] = round($avgVerseLength, 2); $TOTALS_ARR['MAX_VERSE_LENGTH'] = $maxVerseLength; $TOTALS_ARR['MIN_VERSE'] = $minVerse; $TOTALS_ARR['MAX_VERSE'] = $maxVerse; /** end CALCULATE WORD/VERSE LENGTH **/ //exit;; arsort($WORDS_FREQUENCY_ARR['WORDS']); arsort($WORDS_FREQUENCY_ARR['VERSE_BEGINNINGS']); arsort($WORDS_FREQUENCY_ARR['VERSE_ENDINGS']); //preprint_r($WORDS_FREQUENCY_ARR); /////// LOADING LANGUAGE RESOURCE FILES $resourceFile = $englishResourceFile; if (strpos($lang, "AR") !== false) { $resourceFile = $arabicResourceFile; } $languageResourcesArr = file($resourceFile, FILE_SKIP_EMPTY_LINES | FILE_IGNORE_NEW_LINES); $RESOURCES = array(); foreach ($languageResourcesArr as $index => $resourceLine) { $resourcePairsArr = preg_split("/\\|/", $resourceLine); $resourceID = $resourcePairsArr[0]; $resourceValue = $resourcePairsArr[1]; $RESOURCES[$resourceID] = $resourceValue; } //$MODEL_CORE['LOADED']=1; //$MODEL_CORE[$lang]['META_DATA'] = $META_DATA; addValueToMemoryModel($lang, "MODEL_CORE", "META_DATA", "", $META_DATA); //$MODEL_CORE[$lang]['TOTALS'] = $TOTALS_ARR; 
addValueToMemoryModel($lang, "MODEL_CORE", "TOTALS", "", $TOTALS_ARR); //$MODEL_CORE[$lang]['WORDS_FREQUENCY'] = $WORDS_FREQUENCY_ARR; addValueToMemoryModel($lang, "MODEL_CORE", "WORDS_FREQUENCY", "", $WORDS_FREQUENCY_ARR); addValueToMemoryModel($lang, "MODEL_CORE", "WORDS_FREQUENCY", "WORDS", $WORDS_FREQUENCY_ARR['WORDS']); //$MODEL_CORE[$lang]['QURAN_TEXT'] = $QURAN_TEXT; addValueToMemoryModel($lang, "MODEL_CORE", "QURAN_TEXT", "", $QURAN_TEXT); //$MODEL_CORE[$lang]['RESOURCES']=$RESOURCES; addValueToMemoryModel($lang, "MODEL_CORE", "RESOURCES", "", $RESOURCES); //$MODEL_CORE[$lang]['STOP_WORDS']= $stopWordsArr; addValueToMemoryModel($lang, "MODEL_CORE", "STOP_WORDS", "", $stopWordsArr); //$MODEL_CORE[$lang]['STOP_WORDS_STRICT_L2']= $stopWordsStrictL2Arr; addValueToMemoryModel($lang, "MODEL_CORE", "STOP_WORDS_STRICT_L2", "", $stopWordsStrictL2Arr); //file_put_contents("$serializedModelFile.core", (json_encode($MODEL_CORE))); //$MODEL_SEARCH[$lang]['INVERTED_INDEX'] = $INVERTED_INDEX; /*$invertedIndexIterator = getAPCIterator("MODEL_SEARCH.*"); foreach($invertedIndexIterator as $cursor) { preprint_r($cursor); }*/ addToMemoryModelBatch($invertedIndexBatchApcArr); //$res = apc_store("MODEL_CORE[$lang]",$MODEL_CORE[$lang]); //if ( $res===false){ throw new Exception("Can't cache MODEL_CORE[$lang]"); } //$res = apc_store("MODEL_SEARCH[$lang]",$MODEL_SEARCH[$lang]); //if ( $res===false){ throw new Exception("Can't cache MODEL_SEARCH[$lang]"); } //file_put_contents("$serializedModelFile.search", (json_encode($MODEL_SEARCH))); if ($lang == "AR") { //$MODEL_QAC['QAC_MASTERTABLE'] = $qacMasterSegmentTable; //$MODEL_QAC['QAC_POS'] = $qacPOSTable; addToMemoryModelBatch($qacPOSTableBatchApcArr); //$MODEL_QAC['QAC_FEATURES'] = $qacFeaturesTable; addToMemoryModelBatch($qacFeatureTableBatchApcArr); //$MODEL_QAC['QAC_ROOTS_LOOKUP'] = $rootsLookupArray; //file_put_contents("$serializedModelFile.qac", (json_encode($MODEL_QAC))); //$res = apc_store("MODEL_QAC",$MODEL_QAC); //if ( 
$res===false){ throw new Exception("Can't cache MODEL_QAC"); } rsortBy($quranaConcecpts, 'FREQ'); $MODEL_QURANA['QURANA_CONCEPTS'] = $quranaConcecpts; $MODEL_QURANA['QURANA_PRONOUNS'] = $quranaResolvedPronouns; //file_put_contents("$serializedModelFile.qurana", (json_encode($MODEL_QURANA))); $res = apc_store("MODEL_QURANA", $MODEL_QURANA); if ($res === false) { throw new Exception("Can't cache MODEL_QURANA"); } } //preprint_r($MODEL['INVERTED_INDEX'] );exit; //preprint_r($WORDS_FREQUENCY_ARR['VERSE_ENDINGS']); //echo serialize(json_encode($MODEL)); //preprint_r($MODEL['EN']); }
/**
 * Counts the words that appear directly next to a query term in the scored
 * verses and returns the most frequent ones.
 *
 * For each entry of $scoringTable the verse text is read from
 * $MODEL_CORE['QURAN_TEXT'], split on single spaces, and every word is passed
 * through cleanAndTrim() and strtolower(). Empty words, stop words
 * ($MODEL_CORE['STOP_WORDS']) and pause marks are skipped, so "next to" means
 * adjacent among the words that survive this filtering. Adjacency never
 * crosses a verse boundary.
 *
 * A pair of neighbouring words is counted only when exactly one of them is a
 * query term; the counter of the other word (the non-query one) is incremented.
 *
 * @param array $extendedQueryWordsArr Query terms as array keys (checked with isset()).
 * @param array $scoringTable          Entries that each provide 'SURA' and 'AYA' indexes
 *                                     into $MODEL_CORE['QURAN_TEXT'].
 *
 * @return array At most 10 words mapped to their neighbour count, highest count first.
 */
function getStatisticallySginificantWords($extendedQueryWordsArr, $scoringTable)
{
    global $MODEL_CORE;
    global $saktaLatifaMark, $sajdahMark;

    $queryTermsCollocation = array();

    foreach ($scoringTable as $documentScoreArr) {
        $SURA = $documentScoreArr['SURA'];
        $AYA = $documentScoreArr['AYA'];
        $TEXT = $MODEL_CORE['QURAN_TEXT'][$SURA][$AYA];

        $wordsArr = explode(" ", $TEXT);

        // Reset per verse so the last word of one verse is never paired
        // with the first word of the next.
        $lastWord = null;

        foreach ($wordsArr as $word) {
            $word = cleanAndTrim($word);

            if (empty($word)) {
                continue;
            }

            $word = strtolower($word);

            if (isset($MODEL_CORE['STOP_WORDS'][$word])) {
                continue;
            }

            // ignore pause marks
            if (isPauseMark($word, $MODEL_CORE['TOTALS']['PAUSEMARKS'], $saktaLatifaMark, $sajdahMark)) {
                continue;
            }

            if (!empty($lastWord)) {
                $wordIsQueryTerm = isset($extendedQueryWordsArr[$word]);
                $lastIsQueryTerm = isset($extendedQueryWordsArr[$lastWord]);

                // Count only pairs where exactly one side is a query term;
                // the non-query side is the collocated word.
                if ($wordIsQueryTerm !== $lastIsQueryTerm) {
                    $neighbour = $wordIsQueryTerm ? $lastWord : $word;

                    // Initialise explicitly: incrementing a missing key
                    // raises an "undefined array key" warning.
                    if (!isset($queryTermsCollocation[$neighbour])) {
                        $queryTermsCollocation[$neighbour] = 0;
                    }

                    $queryTermsCollocation[$neighbour]++;
                }
            }

            $lastWord = $word;
        }
    }

    arsort($queryTermsCollocation);

    // preserve_keys = true: a numeric-string word is stored as an integer key,
    // and array_slice() would otherwise renumber it and lose the word.
    return array_slice($queryTermsCollocation, 0, 10, true);
}