Exemplo n.º 1
0
/**
 * Builds a word co-occurrence graph (nodes + links) from a list of texts.
 *
 * Every distinct word becomes a node whose "size" counts its occurrences;
 * every pair of consecutive kept words becomes a link from the earlier word
 * to the later one. The previous word is carried across text boundaries, so
 * the last word of one text links to the first kept word of the next.
 *
 * Reads the globals $pauseMarksFile (passed to getPauseMarksArrByFile()) and
 * $lang (when "EN", words are cleaned with cleanAndTrim() and lower-cased).
 *
 * @param array $searchResultTextArr List of texts; each is split on single spaces.
 * @param array $excludes            Map word => 1 for words that must be ignored.
 *
 * @return array "nodes" (keyed by word), "links" (list of source/target node
 *               ids) and "capped" (0, or the cap value when the node limit
 *               was exceeded and processing stopped early).
 */
function textToGraph($searchResultTextArr, $excludes)
{
    global $pauseMarksFile, $lang;
    $MAX_CAP = 300;
    $graphObj = array();
    $graphObj["capped"] = 0;
    $graphNodes = array();
    $graphLinks = array();
    $pauseMarksArr = getPauseMarksArrByFile($pauseMarksFile);
    /** SHOULD BE ZERO BASED FOR D3 TO WORK - o.target.weight = NULL**/
    $nodeSerialNumber = 0;
    $lastWord = null;
    foreach ($searchResultTextArr as $text) {
        $textWordsArr = preg_split("/ /", $text);
        foreach ($textWordsArr as $word) {
            if ($lang == "EN") {
                $word = cleanAndTrim($word);
                $word = strtolower($word);
            }
            // consecutive spaces (or cleaning) can leave an empty token;
            // it must not become a node with an empty label
            if ($word === "" || $word === null || $word === false) {
                continue;
            }
            // isset/empty guards avoid undefined-index warnings for ordinary words
            if (!empty($pauseMarksArr[$word])) {
                continue;
            }
            if (isset($excludes[$word]) && $excludes[$word] == 1) {
                continue;
            }
            if (!isset($graphNodes[$word])) {
                $graphNodes[$word] = array("id" => $nodeSerialNumber++, "word" => $word, "size" => 1, "x" => rand(1, 800), "y" => rand(1, 400));
            } else {
                $graphNodes[$word]["size"] = $graphNodes[$word]["size"] + 1;
            }
            // strict check: only "no previous word yet" suppresses the link
            if ($lastWord !== null) {
                $graphLinks[] = array("source" => $graphNodes[$lastWord]["id"], "target" => $graphNodes[$word]["id"]);
            }
            $lastWord = $word;
        }
        // the cap is only checked between texts, so the node count may
        // exceed $MAX_CAP by the number of new words in the last text
        if (count($graphNodes) > $MAX_CAP) {
            $graphObj["capped"] = $MAX_CAP;
            break;
        }
    }
    $graphObj["nodes"] = $graphNodes;
    $graphObj["links"] = $graphLinks;
    return $graphObj;
}
                    $noDerivationsConstraint = true;
                }
                if ($columnSearchArr[1] == "NOEXTENTIONFROMONTOLOGY") {
                    $query = str_replace("CONSTRAINT:NOEXTENTIONFROMONTOLOGY", "", $query);
                    $noOntologyExtentionConstraint = true;
                }
            }
        }
    }
}
//preprint_r($columnSearchKeyValParams);exit;
//echoN("IS QUESTION:$isQuestion");
//echoN("noOntologyExtentionConstraint:$noOntologyExtentionConstraint");
//echoN("noDerivationsConstraint:$noDerivationsConstraint");
/// CLEANING: normalise the raw query before it is split into words
$query = cleanAndTrim($query);
//$query = removeTashkeel($query);
// Remove tashkeel by converting the query from Uthmani to simple script.
// removeTashkeel() is not used because it leaves "hamzet el wasl", which is not in the simple text.
if (!isSimpleQuranWord($query)) {
    $query = convertUthamniQueryToSimple($query);
}
// CASE HANDLING: English queries are lower-cased and stripped of special
// characters; any other language goes through removeNonArabicAndSpaceChars()
if ($lang == "EN") {
    $query = strtolower($query);
    $query = removeSpecialCharactersFromMidQuery($query);
} else {
    $query = removeNonArabicAndSpaceChars($query);
}
// split on single spaces; consecutive spaces would yield empty entries
$originalQueryWordsArr = preg_split("/ /", $query);
//for faster access
/**
 * Re-scores search results as candidate answers to a user question and
 * returns the best ones.
 *
 * For every entry of $scoringTable the existing SCORE is increased by:
 *  - the number of ontology concepts shared by the question and the verse,
 *  - the number of verse concepts that are "is-a" instances of the question
 *    type's concept,
 *  - for $lang == "EN": the number of verse words that contain, or are
 *    contained in, a significant noun/verb of the question,
 *  - otherwise: the number of roots shared by question nouns and verse words,
 * each multiplied by its factor (all 10). The three highest scored entries
 * are kept, and any of them with no similarity signal at all is dropped.
 *
 * @param string $query                  The (already normalised) question text.
 * @param array  $queryWordsArr          Not used by this function.
 * @param array  $taggedSignificantWords Map word => POS tag.
 * @param array  $scoringTable           Map documentID => array with at least
 *                                       the keys SURA, AYA and SCORE.
 * @param string $lang                   "EN" selects the substring matching path;
 *                                       any other value selects the root path.
 *
 * @return array "ANSWER_CONCEPTS" (result of extendQueryWordsByConceptTaxRelations())
 *               and "ANSWER_VERSES" (up to three re-scored $scoringTable entries).
 */
function answerUserQuestion($query, $queryWordsArr, $taggedSignificantWords, $scoringTable, $lang)
{
    global $is_a_relation_name_ar;
    $QURAN_TEXT = getModelEntryFromMemory($lang, "MODEL_CORE", "QURAN_TEXT", "");
    // answering by relevance and similarity
    $conceptsFromTaxRelations = extendQueryWordsByConceptTaxRelations($taggedSignificantWords, $lang, true);
    $COMMON_CONCEPTS_FACTOR = 10;
    $COMMON_QUESTION_TYPE_CONCEPTS_FACTOR = 10;
    $COMMON_ROOTS_FACTOR = 10;
    $COMMON_DERIVATIONS_FACTOR = 10;
    $scoredAnswerVersesArr = array();
    $questionType = containsQuestionWords($query, $lang);

    ////////// COMMON CONCEPTS IN QUESTION
    $conceptsInQuestionTextArr = getConceptsFoundInText($query, $lang);
    $questionConceptIDsArr = array_keys($conceptsInQuestionTextArr);

    /////////// GET INSTANCE CONCEPTS FROM QUESTION TYPE CLASS
    $questionType = cleanAndTrim(strtolower($questionType));
    $conceptID = getModelEntryFromMemory("ALL", "MODEL_QA_ONTOLOGY", "CONCEPTS_EN_AR_NAME_MAP", $questionType);
    $relationsOfConceptAsTarget = getModelEntryFromMemory("ALL", "MODEL_QA_ONTOLOGY", "GRAPH_INDEX_TARGETS", $conceptID);
    $questionTypeConceptsArr = array();
    // the lookup may yield a non-array when the question type has no concept
    if (is_array($relationsOfConceptAsTarget)) {
        foreach ($relationsOfConceptAsTarget as $relArr) {
            // only "is-a" relations pointing at the question type's concept count
            if ($relArr["link_verb"] != $is_a_relation_name_ar) {
                continue;
            }
            $questionTypeConceptsArr[] = $relArr["source"];
        }
    }

    /////////// ROOTS OF THE QUESTION NOUNS (non-English path only)
    // Depends only on the question, so it is computed once instead of once
    // per document. Stays empty for English, where roots are not compared.
    $questionWordsRootsArr = array();
    if ($lang != "EN") {
        foreach ($taggedSignificantWords as $wordInQuestion => $pos) {
            if (mb_strlen($wordInQuestion) <= 2) {
                continue;
            }
            if ($pos == "NN" || $pos == "NNS") {
                $root = getRootOfSimpleWord($wordInQuestion, array("N", "V"));
                if (!empty($root)) {
                    $questionWordsRootsArr[] = $root;
                }
            }
        }
    }

    $debugArr = array();
    //// Answering by similarity and relevance
    foreach ($scoringTable as $documentID => $documentScoreArr) {
        $SURA = $documentScoreArr['SURA'];
        $AYA = $documentScoreArr['AYA'];
        $TEXT = $QURAN_TEXT[$SURA][$AYA];
        $score = $documentScoreArr['SCORE'];
        if ($lang == "EN") {
            $TEXT = strtolower($TEXT);
        }
        $conceptsInTextArr = getConceptsFoundInText($TEXT, $lang);
        $verseConceptIDsArr = array_keys($conceptsInTextArr);

        /////////// COMMON CONCEPTS BETWEEN QUESTION AND VERSE TEXT
        $commonQuestionVerseConceptsCount = getIntersectionCountOfTwoArrays($questionConceptIDsArr, $verseConceptIDsArr);
        $debugArr[$documentID]['COMMON_CONCEPTS'] = $commonQuestionVerseConceptsCount;
        $debugArr[$documentID]['COMMON_CONCEPTS_LIST'] = join(" ", array_intersect($questionConceptIDsArr, $verseConceptIDsArr));
        $score += $commonQuestionVerseConceptsCount * $COMMON_CONCEPTS_FACTOR;

        /////////// VERSE CONCEPTS THAT ARE INSTANCES OF THE QUESTION TYPE
        $numberOfSharedConceptsForThisQuestionType = getIntersectionCountOfTwoArrays($questionTypeConceptsArr, $verseConceptIDsArr);
        $score += $numberOfSharedConceptsForThisQuestionType * $COMMON_QUESTION_TYPE_CONCEPTS_FACTOR;
        $debugArr[$documentID]['COMMON_QUESTION_TYPE_CONCEPTS'] = $numberOfSharedConceptsForThisQuestionType;
        $debugArr[$documentID]['COMMON_QUESTION_TYPE_CONCEPTS_LIST'] = join(" ", array_intersect($questionTypeConceptsArr, $verseConceptIDsArr));

        //// QUESTION-VERSE SIMILARITY MEASUREMENT (WITH DERIVATIONS CONSIDERED)
        $wordsInVerseTextArr = explode(" ", $TEXT);
        $derivationHandledB4 = array();
        $commonDerivations = 0;
        // defined for both language paths so the later reads never hit
        // an undefined variable / undefined index
        $verseWordsRootsArr = array();
        $debugArr[$documentID]['COMMON_DERIVATIONS'] = 0;
        if ($lang == "EN") {
            foreach ($taggedSignificantWords as $wordInQuestion => $pos) {
                // for words like "i" (NOUN in the lexicon for some reason)
                if (mb_strlen($wordInQuestion) <= 2) {
                    continue;
                }
                if ($pos == "VBN" || $pos == "VBD" || $pos == "VBG" || $pos == "NN" || $pos == "NNS") {
                    foreach ($wordsInVerseTextArr as $wordInArray) {
                        $wordInArray = cleanAndTrim($wordInArray);
                        if (mb_strlen($wordInArray) <= 2) {
                            continue;
                        }
                        // a question noun/verb and a verse word match when
                        // either one is a substring of the other
                        if (strpos($wordInArray, $wordInQuestion) !== false || strpos($wordInQuestion, $wordInArray) !== false) {
                            // count each verse word once per document
                            if (isset($derivationHandledB4[$wordInArray])) {
                                continue;
                            }
                            $commonDerivations++;
                            $derivationHandledB4[$wordInArray] = 1;
                        }
                    }
                }
            }
            $score += $commonDerivations * $COMMON_DERIVATIONS_FACTOR;
            $debugArr[$documentID]['COMMON_DERIVATIONS'] = $commonDerivations;
        } else {
            foreach ($wordsInVerseTextArr as $wordInArray) {
                if (mb_strlen($wordInArray) <= 2) {
                    continue;
                }
                $root = getRootOfSimpleWord($wordInArray, array("N", "V"));
                if (!empty($root)) {
                    $verseWordsRootsArr[] = $root;
                }
            }
        }
        $commonRootsCount = getIntersectionCountOfTwoArrays($verseWordsRootsArr, $questionWordsRootsArr);
        $score += $commonRootsCount * $COMMON_ROOTS_FACTOR;
        $debugArr[$documentID]['COMMON_ROOTS'] = $commonRootsCount;

        $scoringTable[$documentID]['SCORE'] = $score;
        $scoredAnswerVersesArr[$documentID] = $scoringTable[$documentID];
    }
    rsortBy($scoredAnswerVersesArr, "SCORE");
    // NOTE(review): array_slice() renumbers integer keys (string keys are kept).
    // The filter below looks $debugArr up by document id, so it relies on the
    // ids surviving rsortBy() and this slice - confirm the ids are strings.
    $scoredAnswerVersesArr = array_slice($scoredAnswerVersesArr, 0, 3);
    //// REMOVE ANY VERSE FROM THE FINAL LIST WHICH HAS NO OBVIOUS SIMILARITY WITH THE QUESTION
    foreach ($scoredAnswerVersesArr as $documentID => $verseArr) {
        if ($debugArr[$documentID]['COMMON_ROOTS'] == 0 && $debugArr[$documentID]['COMMON_CONCEPTS'] == 0 && $debugArr[$documentID]['COMMON_QUESTION_TYPE_CONCEPTS'] == 0 && $debugArr[$documentID]['COMMON_DERIVATIONS'] == 0) {
            unset($scoredAnswerVersesArr[$documentID]);
        }
    }
    return array("ANSWER_CONCEPTS" => $conceptsFromTaxRelations, "ANSWER_VERSES" => $scoredAnswerVersesArr);
}
Exemplo n.º 4
0
/**
 * Collects the ontology concepts that are mentioned word-by-word in a text.
 *
 * The text is split on single spaces. Each word is mapped to a concept id
 * (for "EN" through the CONCEPTS_EN_AR_NAME_MAP model entry after cleaning
 * and lower-casing, otherwise through convertWordToConceptID()). Words whose
 * id exists in the CONCEPTS model entry are added to the result, except the
 * root "thing" class named by the global $thing_class_name_ar.
 *
 * @param string $text Text to scan.
 * @param string $lang "EN" for English; any other value uses the non-English path.
 *
 * @return array Map conceptID => concept object built by createNewConceptObj().
 */
function getConceptsFoundInText($text, $lang)
{
    global $thing_class_name_ar;
    $conceptsInTextArr = array();
    // createNewConceptObj() expects a node serial number and a start
    // location. The original passed three never-defined variables; they are
    // now defined explicitly. The counter starts at 0 as in the graph
    // builders; the locations stay null, which is the value an undefined
    // variable evaluated to.
    $nodeSerialNumber = 0;
    $noXLocation = null;
    $noYLocation = null;
    foreach (preg_split("/ /", $text) as $word) {
        if ($lang == "EN") {
            $word = strtolower(cleanAndTrim($word));
            // translate English name to arabic concept name/id
            $wordConveretedToConceptID = getModelEntryFromMemory("ALL", "MODEL_QA_ONTOLOGY", "CONCEPTS_EN_AR_NAME_MAP", $word);
        } else {
            $wordConveretedToConceptID = convertWordToConceptID($word);
        }
        if (!modelEntryExistsInMemory("ALL", "MODEL_QA_ONTOLOGY", "CONCEPTS", $wordConveretedToConceptID)) {
            continue;
        }
        // the root "thing" class is never reported as a concept
        if ($wordConveretedToConceptID == $thing_class_name_ar) {
            continue;
        }
        $mainConceptArr = getModelEntryFromMemory("ALL", "MODEL_QA_ONTOLOGY", "CONCEPTS", $wordConveretedToConceptID);
        $finalNodeLabel = $lang == "EN" ? $mainConceptArr['label_en'] : $mainConceptArr['label_ar'];
        $conceptsInTextArr[$wordConveretedToConceptID] = createNewConceptObj($nodeSerialNumber, $lang, $finalNodeLabel, $mainConceptArr, $noXLocation, $noYLocation, 1);
    }
    return $conceptsInTextArr;
}
  <div id='main-container'>
			  	
			
			  	
			

			  		<div id='graph-maingraph-area'>
					<?php 
// Build the concept graph for one sura ($SURA) and size every node by how
// often its word occurs in that sura's text.
$customFreqArr = array();
$QURAN_TEXT = getModelEntryFromMemory($lang, "MODEL_CORE", "QURAN_TEXT", "");
$suraSize = count($QURAN_TEXT[$SURA]);
// make sure the list exists even for an empty sura, without discarding
// anything code above this snippet may have put in it
if (!isset($arrOfTextToGraph)) {
    $arrOfTextToGraph = array();
}
for ($a = 0; $a < $suraSize; $a++) {
    $verseText = $QURAN_TEXT[$SURA][$a];
    foreach (explode(" ", $verseText) as $word) {
        $word = strtolower(cleanAndTrim($word));
        // start the counter explicitly instead of incrementing an undefined key
        if (!isset($customFreqArr[$word])) {
            $customFreqArr[$word] = 0;
        }
        $customFreqArr[$word]++;
    }
    $arrOfTextToGraph[] = $verseText;
}
$graphObj = ontologyTextToD3Graph($MODEL_QA_ONTOLOGY, "SEARCH_RESULTS_TEXT_ARRAY", $arrOfTextToGraph, 0, array(960, 600), $lang, 1);
// replace each node's size with the word's frequency in this sura (1 if unseen)
foreach ($graphObj['nodes'] as $index => $nodeArr) {
    $word = strtolower($nodeArr['word']);
    $graphObj['nodes'][$index]['size'] = isset($customFreqArr[$word]) ? $customFreqArr[$word] : 1;
}
//preprint_r($graphNodesArr);
Exemplo n.º 6
0
/**
 * Builds a D3-style graph (nodes + links) of the ontology concepts found in
 * the given input, connected by their ontology relations.
 *
 * Step 1 turns every input word that maps to a known concept into a node.
 * Step 2 walks the outgoing (GRAPH_INDEX_SOURCES) and incoming
 * (GRAPH_INDEX_TARGETS) relations of each of those nodes and adds links,
 * merging the verbs of multiple relations between the same two nodes into
 * one comma separated "link_verb".
 *
 * NOTE(review): $minFreq has a default but precedes the required
 * $widthHeigthArr, so the default can never be used (and PHP 8 reports the
 * signature as deprecated). It is left as is to keep the interface unchanged.
 *
 * @param mixed  $MODEL_QA_ONTOLOGY   Not read here; data comes from getModelEntryFromMemory().
 * @param string $inputType           "SEARCH_RESULTS_TEXT_ARRAY": $searchResultTextArr is a list
 *                                    of texts and no concepts outside the text are added.
 *                                    Any other value: the array KEYS are the words (or $query
 *                                    is used as the single word when $isPhraseSearch is true).
 * @param array  $searchResultTextArr Input texts or word-keyed array, see $inputType.
 * @param int    $minFreq             Concepts with a lower 'frequency' are skipped in step 1.
 * @param array  $widthHeigthArr      array(width, height); nodes start at a random position
 *                                    within 100 units of the centre.
 * @param string $lang                "EN" uses English labels/verbs, otherwise Arabic.
 * @param bool   $mainConceptsOnly    When true, only relations between step 1 nodes are linked.
 * @param bool   $isPhraseSearch      See $inputType.
 * @param bool   $isQuestion          Not used by this function.
 * @param string $query               Phrase used when $isPhraseSearch is true.
 *
 * @return array "nodes" (list of concept objects) and "links" (list of arrays
 *               with source, target, link_verb, link_frequency).
 */
function ontologyTextToD3Graph($MODEL_QA_ONTOLOGY, $inputType, $searchResultTextArr, $minFreq = 0, $widthHeigthArr, $lang, $mainConceptsOnly = false, $isPhraseSearch = false, $isQuestion = false, $query = "")
{
    global $thing_class_name_ar, $is_a_relation_name_ar;
    $graphObj = array();
    $graphNodes = array();
    $graphLinks = array();
    ////// calculate start points
    $width = $widthHeigthArr[0];
    $height = $widthHeigthArr[1];
    // rand() needs integer bounds; odd sizes would otherwise produce floats
    $startLocationXMin = (int) ($width / 2 - 100);
    $startLocationXMax = (int) ($width / 2 + 100);
    $startLocationYMin = (int) ($height / 2 - 100);
    $startLocationYMax = (int) ($height / 2 + 100);
    ////////////////////////////
    /** SHOULD BE ZERO BASED FOR D3 TO WORK - o.target.weight = NULL**/
    $nodeSerialNumber = 0;

    ////// STEP 1: nodes for the concepts found in the input
    foreach ($searchResultTextArr as $index => $text) {
        if ($inputType == "SEARCH_RESULTS_TEXT_ARRAY") {
            $textWordsArr = preg_split("/ /", $text);
        } else {
            if (!$isPhraseSearch) {
                // extendedQueryParam
                $textWordsArr = array_keys($searchResultTextArr);
            } else {
                // phrase should be checked as is
                $textWordsArr[0] = $query;
            }
        }
        foreach ($textWordsArr as $word) {
            if ($lang == "EN") {
                $word = cleanAndTrim($word);
                $word = strtolower($word);
                // translate English name to arabic concept name/id
                $wordConveretedToConceptID = getModelEntryFromMemory("ALL", "MODEL_QA_ONTOLOGY", "CONCEPTS_EN_AR_NAME_MAP", $word);
            } else {
                $wordConveretedToConceptID = convertWordToConceptID($word);
            }
            if (modelEntryExistsInMemory("ALL", "MODEL_QA_ONTOLOGY", "CONCEPTS", $wordConveretedToConceptID)) {
                $mainConceptArr = getModelEntryFromMemory("ALL", "MODEL_QA_ONTOLOGY", "CONCEPTS", $wordConveretedToConceptID);
                $finalNodeLabel = $mainConceptArr['label_ar'];
                if ($lang == "EN") {
                    $finalNodeLabel = $mainConceptArr['label_en'];
                }
                if ($mainConceptArr['frequency'] < $minFreq) {
                    continue;
                }
                if ($wordConveretedToConceptID == $thing_class_name_ar) {
                    continue;
                }
                if (!isset($graphNodes[$wordConveretedToConceptID])) {
                    $randomXLocation = rand($startLocationXMin, $startLocationXMax);
                    $randomYLocation = rand($startLocationYMin, $startLocationYMax);
                    $graphNodes[$wordConveretedToConceptID] = createNewConceptObj($nodeSerialNumber, $lang, $finalNodeLabel, $mainConceptArr, $randomXLocation, $randomYLocation, 1);
                }
            }
        }
    }

    ////// STEP 2: links (and, where allowed, extra nodes) from ontology relations
    // with many concepts the "is-a" relations are dropped to keep the graph readable
    $tooManyConcepts = count($graphNodes) > 200;
    $ONTOLOGY_RELATIONS = getModelEntryFromMemory("ALL", "MODEL_QA_ONTOLOGY", "RELATIONS", "");
    // "source/target" hash => index into $graphLinks, used to merge parallel links
    $linksHashLookupTable = array();
    // foreach iterates over a copy, so nodes added below are not visited themselves
    foreach ($graphNodes as $concept => $unusedNodeArr) {
        $conceptID = convertWordToConceptID($concept);
        $relationsOfConceptAsSource = getModelEntryFromMemory("ALL", "MODEL_QA_ONTOLOGY", "GRAPH_INDEX_SOURCES", $conceptID);
        $relationsOfConceptAsTarget = getModelEntryFromMemory("ALL", "MODEL_QA_ONTOLOGY", "GRAPH_INDEX_TARGETS", $conceptID);
        // a concept without relations may yield a non-array lookup result
        if (!is_array($relationsOfConceptAsSource)) {
            $relationsOfConceptAsSource = array();
        }
        if (!is_array($relationsOfConceptAsTarget)) {
            $relationsOfConceptAsTarget = array();
        }

        //// outgoing relations: $concept --verb--> $object
        foreach ($relationsOfConceptAsSource as $relArr) {
            $verb = $relArr["link_verb"];
            $object = $relArr["target"];
            // ignore is-a thing relations
            if ($verb == $is_a_relation_name_ar && $object == $thing_class_name_ar) {
                continue;
            }
            if ($tooManyConcepts && $verb == $is_a_relation_name_ar) {
                continue;
            }
            // IF SHOWING MAIN CONCEPTS ONLY, IGNORE CONCEPTS NOT IN MAIN CONCEPTS LIST
            if ($mainConceptsOnly && !isset($graphNodes[$object])) {
                continue;
            }
            // NO extending by relations in case of search result text
            // to reduce number of concepts we only add relations with other concepts
            // found in the text
            if ($inputType == "SEARCH_RESULTS_TEXT_ARRAY" && !isset($graphNodes[$object])) {
                continue;
            }
            $randomXLocation = rand($startLocationXMin, $startLocationXMax);
            $randomYLocation = rand($startLocationYMin, $startLocationYMax);
            $relHashID = buildRelationHashID($conceptID, $verb, $object);
            $fullRelationArr = $ONTOLOGY_RELATIONS[$relHashID];
            $objectConceptArr = getModelEntryFromMemory("ALL", "MODEL_QA_ONTOLOGY", "CONCEPTS", $object);
            $finalNodeLabel = $objectConceptArr['label_ar'];
            if ($lang == "EN") {
                $finalNodeLabel = formatEnglishConcept($objectConceptArr['label_en']);
                $verb = $fullRelationArr['VERB_TRANSLATION_EN'];
            }
            if (!isset($graphNodes[$object])) {
                $graphNodes[$object] = createNewConceptObj($nodeSerialNumber, $lang, $finalNodeLabel, $objectConceptArr, $randomXLocation, $randomYLocation, 2);
            }
            // the two loops used differently-cased keys for the same field;
            // accept either so both directions report a frequency
            $linkFrequency = null;
            if (isset($fullRelationArr['FREQUENCY'])) {
                $linkFrequency = $fullRelationArr['FREQUENCY'];
            } elseif (isset($fullRelationArr['frequency'])) {
                $linkFrequency = $fullRelationArr['frequency'];
            }
            $linkArr = array("source" => $graphNodes[$concept]["id"], "target" => $graphNodes[$object]["id"], "link_verb" => $verb, "link_frequency" => $linkFrequency);
            //////// HANDLING MULTIPLE LINKS BETWEEN SAME NODES BEFORE ASSIGNING LINK
            $arrHash = getArrayHashForFields($linkArr, array('source', 'target'));
            if (!isset($linksHashLookupTable[$arrHash])) {
                $graphLinks[] = $linkArr;
                $linksHashLookupTable[$arrHash] = count($graphLinks) - 1;
            } else {
                $linkIndex = $linksHashLookupTable[$arrHash];
                if (strpos($graphLinks[$linkIndex]['link_verb'], "{$verb}") === false) {
                    $graphLinks[$linkIndex]['link_verb'] .= "," . $verb;
                }
            }
        }

        //// incoming relations: $subject --verb--> $concept
        foreach ($relationsOfConceptAsTarget as $relArr) {
            $verb = $relArr["link_verb"];
            $subject = $relArr["source"];
            // IF SHOWING MAIN CONCEPTS ONLY, IGNORE CONCEPTS NOT IN MAIN CONCEPTS LIST
            if ($mainConceptsOnly && !isset($graphNodes[$subject])) {
                continue;
            }
            if ($tooManyConcepts && $verb == $is_a_relation_name_ar) {
                continue;
            }
            // NO extending by relations in case of search result text
            // to reduce number of concepts we only add relations with other concepts
            // found in the text.
            // The check must look at $subject (the other end of THIS relation);
            // it used to test $object, a leftover from the outgoing loop.
            if ($inputType == "SEARCH_RESULTS_TEXT_ARRAY" && !isset($graphNodes[$subject])) {
                continue;
            }
            $relHashID = buildRelationHashID($subject, $verb, $concept);
            $fullRelationArr = $ONTOLOGY_RELATIONS[$relHashID];
            $randomXLocation = rand($startLocationXMin, $startLocationXMax);
            $randomYLocation = rand($startLocationYMin, $startLocationYMax);
            $subjectConceptArr = getModelEntryFromMemory("ALL", "MODEL_QA_ONTOLOGY", "CONCEPTS", $subject);
            $finalNodeLabel = $subjectConceptArr['label_ar'];
            if ($lang == "EN") {
                $finalNodeLabel = formatEnglishConcept($subjectConceptArr['label_en']);
                $verb = $fullRelationArr['VERB_TRANSLATION_EN'];
            }
            if (!isset($graphNodes[$subject])) {
                $graphNodes[$subject] = createNewConceptObj($nodeSerialNumber, $lang, $finalNodeLabel, $subjectConceptArr, $randomXLocation, $randomYLocation, 2);
            }
            $linkFrequency = null;
            if (isset($fullRelationArr['FREQUENCY'])) {
                $linkFrequency = $fullRelationArr['FREQUENCY'];
            } elseif (isset($fullRelationArr['frequency'])) {
                $linkFrequency = $fullRelationArr['frequency'];
            }
            $linkArr = array("source" => $graphNodes[$subject]["id"], "target" => $graphNodes[$concept]["id"], "link_verb" => $verb, "link_frequency" => $linkFrequency);
            //////// HANDLING MULTIPLE LINKS BETWEEN SAME NODES BEFORE ASSIGNING LINK
            $arrHash = getArrayHashForFields($linkArr, array('source', 'target'));
            if (!isset($linksHashLookupTable[$arrHash])) {
                $graphLinks[] = $linkArr;
                $linksHashLookupTable[$arrHash] = count($graphLinks) - 1;
            } else {
                $linkIndex = $linksHashLookupTable[$arrHash];
                if (strpos($graphLinks[$linkIndex]['link_verb'], "{$verb}") === false) {
                    $graphLinks[$linkIndex]['link_verb'] .= "," . $verb;
                }
            }
        }
    }

    // drop the concept-id keys so "nodes" is a plain list in insertion order
    $graphNodesArr = array();
    foreach ($graphNodes as $nodeArr) {
        $graphNodesArr[] = $nodeArr;
    }
    $graphObj["nodes"] = $graphNodesArr;
    $graphObj["links"] = $graphLinks;
    return $graphObj;
}
Exemplo n.º 7
0
function loadModel($lang, $type, $file)
{
    global $WORDS_FREQUENCY_ARR, $TOTALS_ARR, $MODEL_CORE, $MODEL_SEARCH, $MODEL_QAC, $MODEL_QURANA;
    global $sajdahMark, $saktaLatifaMark, $pauseMarksFile, $serializedModelFile, $basmalaTextUthmani;
    global $numberOfSuras, $numberOfVerses, $quranMetaDataFile, $arabicStopWordsFile, $englishStopWordsFile;
    global $META_DATA, $basmalaText, $englishResourceFile, $arabicResourceFile, $quranCorpusMorphologyFile;
    global $quranaPronounResolutionConceptsFile, $quranaPronounResolutionDataFileTemplate, $quranFileUthmaniAR;
    global $TRANSLATION_MAP_EN_TO_AR, $TRANSLATION_MAP_AR_TO_EN, $TRANSLITERATION_WORDS_MAP, $TRANSLITERATION_VERSES_MAP;
    global $basmalaTextUthmani2, $arabicStopWordsFileL2;
    global $TRANSLITERATION_WORDS_LOCATION_MAP;
    $QURAN_TEXT = array();
    $invertedIndexBatchApcArr = array();
    $qacMasterTableBatchApcArr = array();
    $qacPOSTableBatchApcArr = array();
    $qacFeatureTableBatchApcArr = array();
    $TOTALS_ARR = array();
    $TOTALS_ARR['CHARS'] = 0;
    $TOTALS_ARR['WORDS'] = 0;
    $TOTALS_ARR['NRWORDS'] = 0;
    $TOTALS_ARR['VERSES'] = 0;
    $TOTALS_ARR['SURAS'] = $numberOfSuras;
    $TOTALS_ARR['CHAPTERS'] = 30;
    $TOTALS_ARR['TOTAL_PER_SURA'] = array();
    $TOTALS_ARR['SAJDAT_TELAWA'] = array();
    $TOTALS_ARR['PAUSEMARKS'] = array();
    $TOTALS_ARR['MIN_WORD_LENGTH'] = 0;
    $TOTALS_ARR['AVG_WORD_LENGTH'] = 0;
    $TOTALS_ARR['MAX_WORD_LENGTH'] = 0;
    $TOTALS_ARR['MIN_WORD'] = null;
    $TOTALS_ARR['MAX_WORD'] = null;
    $TOTALS_ARR['MIN_VERSE_LENGTH'] = 0;
    $TOTALS_ARR['AVG_VERSE_LENGTH'] = 0;
    $TOTALS_ARR['MAX_VERSE_LENGTH'] = 0;
    $TOTALS_ARR['MIN_VERSE'] = null;
    $TOTALS_ARR['MAX_VERSE'] = null;
    $TOTALS_ARR['SAJDAT_TELAWA']['COUNT'] = 0;
    $TOTALS_ARR['SAJDAT_TELAWA']['VERSES'] = array();
    $TOTALS_ARR['SAKTA_LATIFA']['COUNT'] = 0;
    $TOTALS_ARR['SAKTA_LATIFA']['VERSES'] = array();
    $INVERTED_INDEX = array();
    $WORDS_FREQUENCY_ARR = array();
    $WORDS_FREQUENCY_ARR['WORDS'] = array();
    $WORDS_FREQUENCY_ARR['WORDS_PER_SURA'] = array();
    $WORDS_FREQUENCY_ARR['TOTAL_PER_VERSE'] = array();
    $WORDS_FREQUENCY_ARR['WORDS_TFIDF'] = array();
    $WORDS_FREQUENCY_ARR['VERSE_ENDINGS'] = array();
    $WORDS_FREQUENCY_ARR['VERSE_BEGINNINGS'] = array();
    /** WORD LENGTH **/
    $minWordLength = 1000;
    $minWord = null;
    $maxWordLength = -1;
    $maxWord = null;
    $avgWordLength = 0;
    /** VERSE LENGTH **/
    $minVerseLength = 1000;
    $minVerse = null;
    $maxVerseLength = -1;
    $maxVerse = null;
    $avgVerseLength = 0;
    /** QAC Model **/
    // Master model, contains all QAC data
    $qacMasterSegmentTable = array();
    //pinters/indexes on the master table for POS and features
    $qacPOSTable = array();
    $qacFeaturesTable = array();
    //$qacWordsTable = array();
    $qacSegmentToWordTable = array();
    /** QURANA Corpus **/
    $quranaConcecpts = array();
    $quranaResolvedPronouns = array();
    ########### LOAD DATA ACCORDING TO MODEL SOURCE TYPE
    if ($type == "XML") {
        $sourceContent = simplexml_load_file($file);
    } else {
        $sourceContent = file($file, FILE_SKIP_EMPTY_LINES | FILE_IGNORE_NEW_LINES);
    }
    if ($type == "TXT") {
        for ($s = 0; $s < $numberOfVerses; $s++) {
            $line = $sourceContent[$s];
            $lineArr = preg_split("/\\|/", $line);
            $suraIndex = $lineArr[0];
            $ayaIndex = $lineArr[1];
            $text = $lineArr[2];
            //strip "besm allah alrahman al raheem" from furst aya of all suras except the first one
            if (strpos($lang, "AR") !== false && $ayaIndex == 1 && $s != 0) {
                if ($lang == "AR") {
                    $text = trim(str_replace($basmalaText, "", $text));
                } else {
                    if ($lang == "AR_UTH") {
                        $text = trim(str_replace($basmalaTextUthmani, "", $text));
                        $text = trim(str_replace($basmalaTextUthmani2, "", $text));
                    }
                }
            }
            if (!isset($QURAN_TEXT[$suraIndex - 1])) {
                $QURAN_TEXT[$suraIndex - 1] = array();
            }
            $QURAN_TEXT[$suraIndex - 1][$ayaIndex - 1] = $text;
        }
    } else {
        if ($type == "XML") {
            for ($s = 0; $s < $numberOfSuras; $s++) {
                $suraSize = $META_DATA['SURAS'][$s]['ayas'];
                for ($a = 0; $a < $suraSize; $a++) {
                    $QURAN_TEXT[$s][$a] = (string) $sourceContent->sura[$s]->aya[$a]['text'];
                }
            }
        } else {
            throw new Exception("Invalid Source Type ({$type})");
        }
    }
    ##############################################################
    // free resources
    $sourceContent = null;
    unset($sourceContent);
    if ($lang == "AR") {
        ############ LOAD QAC (Quranic Arabic Corpus) FILE ###################################
        //dont skip new lines here (FILE_SKIP_EMPTY_LINES) for the skipping "57" condition below to work
        $qacFileLinesArr = file($quranCorpusMorphologyFile, FILE_IGNORE_NEW_LINES);
        $rootsLookupArray = array();
        $headerIndex = 0;
        $segmentIndex = 1;
        foreach ($qacFileLinesArr as $line) {
            $headerIndex++;
            //ignore header sections
            if ($headerIndex <= 57) {
                continue;
            }
            //if ( $segmentIndex >= 2) exit;
            //echoN($line);
            // convert columns to array
            $lineArr = preg_split("/\t/", $line);
            $location = $lineArr[0];
            $formOrSegment = $lineArr[1];
            $posTAG = $lineArr[2];
            $featuresList = $lineArr[3];
            //preprint_r($lineArr);
            // remove brackets from location and keep it only SURA/AYA/WORDINDEX/SEGMENTINDEX
            $masterID = preg_replace("/\\(|\\)|/", "", $location);
            $locationArr = preg_split("/\\:/", $masterID);
            $wordSegmentID = $locationArr[count($locationArr) - 1];
            $wordIndex = $locationArr[count($locationArr) - 2];
            $verseID = $locationArr[count($locationArr) - 3];
            $suraID = $locationArr[count($locationArr) - 4];
            // Remove segment index from location ( will be added as new array below )
            $masterID = substr($masterID, 0, strlen($masterID) - 2);
            // get the reversed buackwalter transliteration for the segment
            $formOrSegmentReverseTransliterated = buckwalterReverseTransliteration($formOrSegment);
            //echoN($formOrSegmentReverseTransliterated);
            // separate features
            $featuresTempArr = preg_split("/\\|/", $featuresList);
            //preprint_r($featuresTempArr);
            $featuresArr = array();
            foreach ($featuresTempArr as $oneFeature) {
                // feature is a key/value set
                if (strpos($oneFeature, ":") !== false) {
                    $oneFeatureKeyValueArr = preg_split("/\\:/", $oneFeature);
                    $featureName = $oneFeatureKeyValueArr[0];
                    $featureValue = $oneFeatureKeyValueArr[1];
                    if ($featureName == "LEM" || $featureName == "ROOT") {
                        //echoN($featureValue);
                        $featureValue = buckwalterReverseTransliteration($featureValue);
                    }
                } else {
                    $featureName = $oneFeature;
                    // 1 here just a dummy value
                    $featureValue = -1;
                }
                $featureValue = trim($featureValue);
                // fill Features Index table
                //$qacFeaturesTable[$featureName][$masterID]= $featureValue;
                $apcMemoryEntryKey = "{$lang}/MODEL_QAC/QAC_FEATURES/{$featureName}";
                $qacFeatureTableBatchApcArr[$apcMemoryEntryKey][$masterID] = $featureValue;
                $featuresArr[$featureName] = $featureValue;
                // non-word features should not be included
                if ($featureName == "LEM" || $featureName == "ROOT") {
                    addToInvertedIndex($invertedIndexBatchApcArr, $lang, trim($featureValue), $suraID - 1, $verseID - 1, $wordIndex, trim($featureName), $formOrSegmentReverseTransliterated);
                    if ($featureName == "ROOT") {
                        //$rootsLookupArray[$formOrSegmentReverseTransliterated]=$featureValue;
                        addValueToMemoryModel($lang, "MODEL_QAC", "QAC_ROOTS_LOOKUP", $formOrSegmentReverseTransliterated, $featureValue);
                    }
                }
            }
            //location significant before increment below
            $qacSegmentToWordTable[$segmentIndex] = $wordIndex;
            // Fill master table
            //$qacMasterSegmentTable[$masterID][]
            $qacMasterTableEntry = array("FORM_EN" => $formOrSegment, "FORM_AR" => $formOrSegmentReverseTransliterated, "TAG" => $posTAG, "SEGMENT_INDEX" => $segmentIndex++, "FEATURES" => $featuresArr);
            $apcMemoryEntryKey = "{$lang}/MODEL_QAC/QAC_MASTERTABLE/{$masterID}";
            $qacMasterTableBatchApcArr[$apcMemoryEntryKey][] = $qacMasterTableEntry;
            // Fill Part of Speech tagging table
            $qacPOSTable[$posTAG][$masterID] = $wordSegmentID;
            $apcMemoryEntryKey = "{$lang}/MODEL_QAC/QAC_POS/{$posTAG}";
            $qacPOSTableBatchApcArr[$apcMemoryEntryKey][$masterID] = $wordSegmentID;
        }
        //preprint_r($qacMasterSegmentTable);
        //preprint_r($qacFeaturesTable);
        //preprint_r($qacPOSTable);
        ##############################################################
        // free resources
        $qacFileLinesArr = null;
        unset($qacFileLinesArr);
        // need to fluch tabel in memory since it is needed by Qurana - in segment function
        addToMemoryModelBatch($qacMasterTableBatchApcArr);
    }
    ######### Qurana Pronomial Anaphone Corpus ###################
    //echoN($quranaPronounResolutionConceptsFile);
    // GET XML FILE CONTENT
    $xmlContent = file_get_contents($quranaPronounResolutionConceptsFile);
    // LOAD XML OBJECT - trim used to avoid first line empty error
    $concepts = simplexml_load_string(trim(stripHTMLComments($xmlContent)));
    // LOAD CONCEPTS
    foreach ($concepts->con as $index => $conceptObj) {
        $conceptID = (string) $conceptObj['id'];
        $conceptNameEN = (string) $conceptObj->english;
        $conceptNameAR = (string) $conceptObj->arabic;
        $quranaConcecpts[$conceptID] = array("EN" => trim($conceptNameEN), "AR" => trim($conceptNameAR), "FREQ" => 0);
    }
    $pronounsCount = 0;
    $segmentsCount = 0;
    //preprint_r($quranaConcecpts);
    // LOAD PRONOUNS // load & parse the file of each SURA and load it in the model
    for ($s = 0; $s < $numberOfSuras; $s++) {
        $suraID = $s + 1;
        $pronounDataFileName = preg_replace("/%s/", $suraID, $quranaPronounResolutionDataFileTemplate);
        //echoN($pronounDataFileName);
        // GET XML FILE CONTENT of the current SURA by customizing file name
        $xmlContent = file_get_contents($pronounDataFileName);
        // LOAD XML OBJECT - trim used to avoid first line empty error
        $chapter = simplexml_load_string(trim(stripHTMLComments($xmlContent)));
        // LOAD CONCEPTS
        foreach ($chapter->verse as $index => $verseObj) {
            $verseLocalSegmentIndex = 0;
            $versesCount++;
            // Loop on all children
            foreach ($verseObj->children() as $index => $childObj) {
                // get tag name
                $tagName = $childObj->getName();
                $verseLocalSegmentIndex++;
                $segmentsCount++;
                // we got a prounoun tag
                if ($tagName == "pron") {
                    $pronounsCount++;
                    // get the verse including this pronoun
                    $verseID = (string) $verseObj['id'];
                    // get pronoun concept ID and antecendent
                    $conceptID = (string) $childObj['con'];
                    $pronounAntecedent = (string) $childObj['ant'];
                    // get segment ID and word form
                    $quranaSegmentID = (string) $childObj->seg['id'];
                    $quranaSegmentForm = (string) $childObj->seg->__toString();
                    $quranaSegmentForm = trim($quranaSegmentForm);
                    // convert Qurana Segment ID to QAC segment for cross referenceing
                    $qacSegment = getQACSegmentByQuranaSeqment($suraID, $verseID, $verseLocalSegmentIndex, $quranaSegmentForm);
                    //echo("$qacSegment,$quranaSegmentID\n");
                    // get the id of the word where the segment is
                    $wordId = $qacSegmentToWordTable[$qacSegment];
                    $quranaConcecpts[$conceptID]["FREQ"]++;
                    // fill pronouns array
                    $quranaResolvedPronouns["{$suraID}:{$verseID}:{$wordId}"][] = array("CONCEPT_ID" => $conceptID, "SEGMENT_INDEX" => $qacSegment, "ANTECEDENT_SEGMENTS" => preg_split("/ /", $pronounAntecedent));
                    if ($lang == "EN") {
                        addToInvertedIndex($invertedIndexBatchApcArr, $lang, strtolower($quranaConcecpts[$conceptID]['EN']), $suraID - 1, $verseID - 1, $wordId, "PRONOUN_ANTECEDENT", $quranaSegmentForm);
                    } else {
                        addToInvertedIndex($invertedIndexBatchApcArr, $lang, $quranaConcecpts[$conceptID]['AR'], $suraID - 1, $verseID - 1, $wordId, "PRONOUN_ANTECEDENT", $quranaSegmentForm);
                    }
                }
            }
        }
    }
    //echoN("SEG:$segmentsCount PRON:$pronounsCount");
    //preprint_r($quranaResolvedPronouns);
    //preprint_r($INVERTED_INDEX);exit;
    ##############################################################
    // free resources
    $xmlContent = null;
    $concepts = null;
    unset($xmlContent);
    unset($concepts);
    //echo preprint_r($QURAN_TEXT);;
    if (strpos($lang, "AR") !== false) {
        $stopWordsArr = getStopWordsArrByFile($arabicStopWordsFile);
        $stopWordsStrictL2Arr = getStopWordsArrByFile($arabicStopWordsFileL2);
        $pauseMarksArr = getPauseMarksArrByFile($pauseMarksFile);
    } else {
        $stopWordsArr = getStopWordsArrByFile($englishStopWordsFile);
        $pauseMarksArr = array();
    }
    //preprint_r($stopWordsArr);
    //preprint_r($pauseMarksArr);
    if (strpos($lang, "AR") !== false) {
        // SETTING PAUSE MARKS COUNTER ARRAY
        foreach ($pauseMarksArr as $pauseMark => $constant) {
            $TOTALS_ARR['PAUSEMARKS'][$pauseMark] = 0;
        }
    }
    /* SURA'S LOOP **/
    for ($s = 0; $s < $numberOfSuras; $s++) {
        $TOTALS_ARR['TOTAL_PER_SURA'][$s] = array();
        $suraNameLang = $lang;
        if ($suraNameLang == "AR_UTH") {
            $suraNameLang = "AR";
        }
        $suraNameLang = strtolower($lang);
        $TOTALS_ARR['TOTAL_PER_SURA'][$s]['NAME'] = $META_DATA['SURAS'][$s]['name_' . $suraNameLang];
        $TOTALS_ARR['TOTAL_PER_SURA'][$s]['CHARS'] = 0;
        $TOTALS_ARR['TOTAL_PER_SURA'][$s]['NRWORDS'] = 0;
        $TOTALS_ARR['TOTAL_PER_SURA'][$s]['WORDS'] = 0;
        $TOTALS_ARR['TOTAL_PER_SURA'][$s]['VERSES'] = 0;
        $WORDS_FREQUENCY_ARR['WORDS_PER_SURA'][$s] = array();
    }
    /* SURA'S LOOP **/
    for ($s = 0; $s < $numberOfSuras; $s++) {
        //echoN($quranXMLObj->sura[$s]['name']);
        $suraSize = $META_DATA['SURAS'][$s]['ayas'];
        /* VERSES LOOP **/
        for ($a = 0; $a < $suraSize; $a++) {
            //$verseText =
            $verseText = $QURAN_TEXT[$s][$a];
            //echoN("- ".$verseText);
            $wordsArr = preg_split("/ /", $verseText);
            /** CALCULATE VERSE LENGTH **/
            $wordsInVerseIncludingPauses = count($wordsArr);
            $wordsInVerse = $wordsInVerseIncludingPauses - count(array_intersect($wordsArr, array_keys($pauseMarksArr)));
            if ($wordsInVerse >= $maxVerseLength) {
                $maxVerseLength = $wordsInVerse;
                $maxVerse = $verseText;
            }
            if ($wordsInVerse <= $minWordLength) {
                if ($wordsInVerse == $minWordLength) {
                    if (mb_strlen($verseText) < mb_strlen($minVerse)) {
                        $minVerseLength = $wordsInVerse;
                        $minVerse = $verseText;
                    }
                } else {
                    $minVerseLength = $wordsInVerse;
                    $minVerse = $verseText;
                }
            }
            $avgVerseLength += $wordsInVerse;
            /** END CALCULATE VERSE LENGTH **/
            $wordIndex = 0;
            /* WORDS IN VERSE  LOOP **/
            foreach ($wordsArr as $word) {
                $word = trim($word);
                // PAUSE MARK
                if (strpos($lang, "AR") !== false && isset($pauseMarksArr[$word])) {
                    $TOTALS_ARR['PAUSEMARKS'][$word]++;
                    continue;
                } else {
                    // SAJDAH MARK
                    if ($word == $sajdahMark) {
                        $TOTALS_ARR['SAJDAT_TELAWA']['COUNT']++;
                        $TOTALS_ARR['SAJDAT_TELAWA']['VERSES'][] = array($s, $a, $verseText);
                        continue;
                    } else {
                        // SAKTA LATIFA
                        if ($word == $saktaLatifaMark) {
                            $TOTALS_ARR['SAKTA_LATIFA']['COUNT']++;
                            $TOTALS_ARR['SAKTA_LATIFA']['VERSES'][] = array($s, $a, $verseText);
                            continue;
                        }
                    }
                }
                // Mainly for english translations
                if ($lang == "EN") {
                    $word = strtolower(cleanAndTrim($word));
                }
                // ignore empty words - result of trimming
                if (empty($word)) {
                    // the case of " - " in english translations
                    continue;
                }
                $wordIndex++;
                if ($wordIndex == 1) {
                    if (!isset($WORDS_FREQUENCY_ARR['VERSE_BEGINNINGS'][$word])) {
                        $WORDS_FREQUENCY_ARR['VERSE_BEGINNINGS'][$word] = 0;
                    }
                    $WORDS_FREQUENCY_ARR['VERSE_BEGINNINGS'][$word]++;
                } else {
                    if ($wordIndex == count($wordsArr)) {
                        if (!isset($WORDS_FREQUENCY_ARR['VERSE_ENDINGS'][$word])) {
                            $WORDS_FREQUENCY_ARR['VERSE_ENDINGS'][$word] = 0;
                        }
                        $WORDS_FREQUENCY_ARR['VERSE_ENDINGS'][$word]++;
                    }
                }
                $TOTALS_ARR['WORDS']++;
                if (!isset($WORDS_FREQUENCY_ARR['TOTAL_PER_VERSE'][$s])) {
                    $WORDS_FREQUENCY_ARR['TOTAL_PER_VERSE'][$s] = array();
                }
                if (!isset($WORDS_FREQUENCY_ARR['TOTAL_PER_VERSE'][$s][$a])) {
                    $WORDS_FREQUENCY_ARR['TOTAL_PER_VERSE'][$s][$a] = array();
                }
                if (!isset($WORDS_FREQUENCY_ARR['TOTAL_PER_VERSE'][$s][$a][$word])) {
                    $WORDS_FREQUENCY_ARR['TOTAL_PER_VERSE'][$s][$a][$word] = 0;
                }
                $WORDS_FREQUENCY_ARR['TOTAL_PER_VERSE'][$s][$a][$word]++;
                if (!isset($WORDS_FREQUENCY_ARR['WORDS'][$word])) {
                    $WORDS_FREQUENCY_ARR['WORDS'][$word] = 0;
                }
                $WORDS_FREQUENCY_ARR['WORDS'][$word]++;
                $TOTALS_ARR['TOTAL_PER_SURA'][$s]['WORDS']++;
                if (!isset($WORDS_FREQUENCY_ARR['WORDS_PER_SURA'][$s][$word])) {
                    $WORDS_FREQUENCY_ARR['WORDS_PER_SURA'][$s][$word] = 0;
                }
                $WORDS_FREQUENCY_ARR['WORDS_PER_SURA'][$s][$word]++;
                //if (!isset($INVERTED_INDEX[$word]) ) $INVERTED_INDEX[$word] = array();
                //$INVERTED_INDEX[$word][] = array("SURA"=>$s,"AYA"=>$a,"INDEX_IN_AYA_EMLA2Y"=>$wordIndex,"WORD_TYPE"=>"NORMAL_WORD");
                addToInvertedIndex($invertedIndexBatchApcArr, $lang, $word, $s, $a, $wordIndex, "NORMAL_WORD");
                /** CALCULATE WORD LENGTHG **/
                $wordLength = mb_strlen($word);
                if ($wordLength >= $maxWordLength) {
                    $maxWordLength = $wordLength;
                    $maxWord = $word;
                }
                if ($wordLength <= $minWordLength) {
                    $minWordLength = $wordLength;
                    $minWord = $word;
                }
                $avgWordLength += $wordLength;
                /** END CALCULATE WORD LENGTHG **/
                $charsInWordArr = preg_split("//u", $word, -1, PREG_SPLIT_NO_EMPTY);
                /* CHARS IN EACH WORD  LOOP **/
                foreach ($charsInWordArr as $char) {
                    //echoN($char." ".in_array($char,$pauseMarksArrTemp));
                    // SPACE
                    if ($char == " ") {
                        continue;
                    }
                    $TOTALS_ARR['CHARS']++;
                    $TOTALS_ARR['TOTAL_PER_SURA'][$s]['CHARS']++;
                }
            }
            $TOTALS_ARR['VERSES']++;
            $TOTALS_ARR['TOTAL_PER_SURA'][$s]['VERSES']++;
            // 					  			if ( $TOTALS_ARR['VERSES']>30)
            // 					  				exit;
        }
        /** END AYA's LOOP **/
    }
    /** END SURA's LOOP **/
    /* SURA'S LOOP **/
    for ($s = 0; $s < $numberOfSuras; $s++) {
        $TOTALS_ARR['TOTAL_PER_SURA'][$s]['NRWORDS'] = count($WORDS_FREQUENCY_ARR['WORDS_PER_SURA'][$s]);
        arsort($WORDS_FREQUENCY_ARR['WORDS_PER_SURA'][$s]);
    }
    $TOTALS_ARR['NRWORDS'] = count($WORDS_FREQUENCY_ARR['WORDS']);
    $TOTALS_ARR['PAUSEMARKS_AGGREGATION'] = 0;
    // AGGREGATE PAUSE MARKS
    foreach ($TOTALS_ARR['PAUSEMARKS'] as $pmLabel => $pmCount) {
        //echo $pmLabel.$pmCount;
        $TOTALS_ARR['PAUSEMARKS_AGGREGATION'] += $pmCount;
    }
    /**
     * CALCULATING TF-IDF TABLE
     */
    foreach ($WORDS_FREQUENCY_ARR['WORDS'] as $wordLabel => $wordFreq) {
        $termFrequency = $wordFreq;
        $termFrequencyPercentage = $termFrequency / $TOTALS_ARR['WORDS'] * 100;
        // DOCUMENT = VERSE
        $documentFrequency = 0;
        $inverseDocumentFrequency = 0;
        //CHECKING VERSES
        for ($s = 0; $s < $numberOfSuras; $s++) {
            //$versesPerSura = $TOTALS_ARR['TOTAL_PER_SURA'][$s]['VERSES'];
            //for ($a=0;$a<$versesPerSura;$a++)
            //{
            if (isset($WORDS_FREQUENCY_ARR['WORDS_PER_SURA'][$s][$wordLabel])) {
                //= $WORDS_FREQUENCY_ARR['TOTAL_PER_VERSE'][$s][$a][$wordLabel]
                $documentFrequency++;
            }
            //}
        }
        $inverseDocumentFrequency = log($numberOfSuras / $documentFrequency, 10);
        $TFIDF = $termFrequency * $inverseDocumentFrequency;
        //echoN("WORD:$wordLabel PRCG:$termFrequencyPercentage TF:$termFrequency DF:$documentFrequency IDF:$inverseDocumentFrequency TFIDF:$TFIDF ");
        $WORDS_FREQUENCY_ARR['WORDS_TFIDF'][$wordLabel] = array("TF" => $termFrequency, "TPC" => $termFrequencyPercentage, "DF" => $documentFrequency, "IDF" => $inverseDocumentFrequency, "TFIDF" => $TFIDF);
    }
    /** END OF TFIDF TABLE **/
    rsortBy($WORDS_FREQUENCY_ARR['WORDS_TFIDF'], 'TF');
    //preprint_r($WORDS_FREQUENCY_ARR['WORDS_TFIDF']);
    /** Continuing  WORD/VERSE LENGTH CALCULATE **/
    $avgWordLength = $avgWordLength / $TOTALS_ARR['WORDS'];
    $avgVerseLength = $avgVerseLength / $TOTALS_ARR['VERSES'];
    /*
    echoN($minWordLength." - ".$minWord);
    echoN($maxWordLength." - ".$maxWord);
    echoN($avgWordLength);
    
    echoN($minVerseLength." - ".$minVerse);
    echoN($maxVerseLength." - ".$maxVerse);
    echoN($avgVerseLength);
    */
    $TOTALS_ARR['MIN_WORD_LENGTH'] = $minWordLength;
    $TOTALS_ARR['AVG_WORD_LENGTH'] = round($avgWordLength, 2);
    $TOTALS_ARR['MAX_WORD_LENGTH'] = $maxWordLength;
    $TOTALS_ARR['MIN_WORD'] = $minWord;
    $TOTALS_ARR['MAX_WORD'] = $maxWord;
    $TOTALS_ARR['MIN_VERSE_LENGTH'] = $minVerseLength;
    $TOTALS_ARR['AVG_VERSE_LENGTH'] = round($avgVerseLength, 2);
    $TOTALS_ARR['MAX_VERSE_LENGTH'] = $maxVerseLength;
    $TOTALS_ARR['MIN_VERSE'] = $minVerse;
    $TOTALS_ARR['MAX_VERSE'] = $maxVerse;
    /** end CALCULATE WORD/VERSE LENGTH **/
    //exit;;
    arsort($WORDS_FREQUENCY_ARR['WORDS']);
    arsort($WORDS_FREQUENCY_ARR['VERSE_BEGINNINGS']);
    arsort($WORDS_FREQUENCY_ARR['VERSE_ENDINGS']);
    //preprint_r($WORDS_FREQUENCY_ARR);
    /////// LOADING LANGUAGE RESOURCE FILES
    $resourceFile = $englishResourceFile;
    if (strpos($lang, "AR") !== false) {
        $resourceFile = $arabicResourceFile;
    }
    $languageResourcesArr = file($resourceFile, FILE_SKIP_EMPTY_LINES | FILE_IGNORE_NEW_LINES);
    $RESOURCES = array();
    foreach ($languageResourcesArr as $index => $resourceLine) {
        $resourcePairsArr = preg_split("/\\|/", $resourceLine);
        $resourceID = $resourcePairsArr[0];
        $resourceValue = $resourcePairsArr[1];
        $RESOURCES[$resourceID] = $resourceValue;
    }
    //$MODEL_CORE['LOADED']=1;
    //$MODEL_CORE[$lang]['META_DATA'] = $META_DATA;
    addValueToMemoryModel($lang, "MODEL_CORE", "META_DATA", "", $META_DATA);
    //$MODEL_CORE[$lang]['TOTALS'] = $TOTALS_ARR;
    addValueToMemoryModel($lang, "MODEL_CORE", "TOTALS", "", $TOTALS_ARR);
    //$MODEL_CORE[$lang]['WORDS_FREQUENCY'] = $WORDS_FREQUENCY_ARR;
    addValueToMemoryModel($lang, "MODEL_CORE", "WORDS_FREQUENCY", "", $WORDS_FREQUENCY_ARR);
    addValueToMemoryModel($lang, "MODEL_CORE", "WORDS_FREQUENCY", "WORDS", $WORDS_FREQUENCY_ARR['WORDS']);
    //$MODEL_CORE[$lang]['QURAN_TEXT'] = $QURAN_TEXT;
    addValueToMemoryModel($lang, "MODEL_CORE", "QURAN_TEXT", "", $QURAN_TEXT);
    //$MODEL_CORE[$lang]['RESOURCES']=$RESOURCES;
    addValueToMemoryModel($lang, "MODEL_CORE", "RESOURCES", "", $RESOURCES);
    //$MODEL_CORE[$lang]['STOP_WORDS']= $stopWordsArr;
    addValueToMemoryModel($lang, "MODEL_CORE", "STOP_WORDS", "", $stopWordsArr);
    //$MODEL_CORE[$lang]['STOP_WORDS_STRICT_L2']= $stopWordsStrictL2Arr;
    addValueToMemoryModel($lang, "MODEL_CORE", "STOP_WORDS_STRICT_L2", "", $stopWordsStrictL2Arr);
    //file_put_contents("$serializedModelFile.core", (json_encode($MODEL_CORE)));
    //$MODEL_SEARCH[$lang]['INVERTED_INDEX'] = $INVERTED_INDEX;
    /*$invertedIndexIterator = getAPCIterator("MODEL_SEARCH.*");
    			
    		foreach($invertedIndexIterator as $cursor)
    		{
    			preprint_r($cursor);
    		}*/
    addToMemoryModelBatch($invertedIndexBatchApcArr);
    //$res = apc_store("MODEL_CORE[$lang]",$MODEL_CORE[$lang]);
    //if ( $res===false){ throw new Exception("Can't cache MODEL_CORE[$lang]"); }
    //$res = apc_store("MODEL_SEARCH[$lang]",$MODEL_SEARCH[$lang]);
    //if ( $res===false){ throw new Exception("Can't cache MODEL_SEARCH[$lang]"); }
    //file_put_contents("$serializedModelFile.search", (json_encode($MODEL_SEARCH)));
    if ($lang == "AR") {
        //$MODEL_QAC['QAC_MASTERTABLE'] = $qacMasterSegmentTable;
        //$MODEL_QAC['QAC_POS'] = $qacPOSTable;
        addToMemoryModelBatch($qacPOSTableBatchApcArr);
        //$MODEL_QAC['QAC_FEATURES'] = $qacFeaturesTable;
        addToMemoryModelBatch($qacFeatureTableBatchApcArr);
        //$MODEL_QAC['QAC_ROOTS_LOOKUP'] = $rootsLookupArray;
        //file_put_contents("$serializedModelFile.qac", (json_encode($MODEL_QAC)));
        //$res = apc_store("MODEL_QAC",$MODEL_QAC);
        //if ( $res===false){ throw new Exception("Can't cache MODEL_QAC"); }
        rsortBy($quranaConcecpts, 'FREQ');
        $MODEL_QURANA['QURANA_CONCEPTS'] = $quranaConcecpts;
        $MODEL_QURANA['QURANA_PRONOUNS'] = $quranaResolvedPronouns;
        //file_put_contents("$serializedModelFile.qurana", (json_encode($MODEL_QURANA)));
        $res = apc_store("MODEL_QURANA", $MODEL_QURANA);
        if ($res === false) {
            throw new Exception("Can't cache MODEL_QURANA");
        }
    }
    //preprint_r($MODEL['INVERTED_INDEX'] );exit;
    //preprint_r($WORDS_FREQUENCY_ARR['VERSE_ENDINGS']);
    //echo serialize(json_encode($MODEL));
    //preprint_r($MODEL['EN']);
}
Exemplo n.º 8
0
/**
 * Counts the non-query words that appear directly next to query terms in the
 * scored verses and returns the most frequent ones.
 *
 * For every entry of $scoringTable the verse text is read from
 * $MODEL_CORE['QURAN_TEXT'][SURA][AYA] and split on single spaces. Each word
 * is passed through cleanAndTrim() and lower-cased; empty words, words that
 * are keys of $MODEL_CORE['STOP_WORDS'] and words that isPauseMark() reports
 * as pause marks are skipped (they neither count nor break adjacency).
 * Whenever exactly one word of an adjacent pair is a key of
 * $extendedQueryWordsArr, the other word's counter is incremented.
 *
 * @param array $extendedQueryWordsArr Query terms, looked up by array key.
 * @param array $scoringTable          Entries each holding 'SURA' and 'AYA'
 *                                     indexes into $MODEL_CORE['QURAN_TEXT'].
 *
 * @return array At most 10 entries of word => neighbour count, sorted by
 *               count in descending order.
 */
function getStatisticallySginificantWords($extendedQueryWordsArr, $scoringTable)
{
    global $MODEL_CORE;
    global $saktaLatifaMark, $sajdahMark;
    $queryTermsCollocation = array();
    foreach ($scoringTable as $documentID => $documentScoreArr) {
        $SURA = $documentScoreArr['SURA'];
        $AYA = $documentScoreArr['AYA'];
        $TEXT = $MODEL_CORE['QURAN_TEXT'][$SURA][$AYA];
        $wordsArr = explode(" ", $TEXT);
        // reset per verse: adjacency is never counted across verse boundaries
        $lastWord = null;
        foreach ($wordsArr as $word) {
            $word = cleanAndTrim($word);
            if (empty($word)) {
                continue;
            }
            $word = strtolower($word);
            if (isset($MODEL_CORE['STOP_WORDS'][$word])) {
                continue;
            }
            // ignore pause marks
            if (isPauseMark($word, $MODEL_CORE['TOTALS']['PAUSEMARKS'], $saktaLatifaMark, $sajdahMark)) {
                continue;
            }
            if (!empty($lastWord)) {
                $wordIsQueryTerm = isset($extendedQueryWordsArr[$word]);
                $lastWordIsQueryTerm = isset($extendedQueryWordsArr[$lastWord]);
                // current word is a query term: credit the word preceding it
                if ($wordIsQueryTerm && !$lastWordIsQueryTerm) {
                    if (!isset($queryTermsCollocation[$lastWord])) {
                        $queryTermsCollocation[$lastWord] = 0;
                    }
                    $queryTermsCollocation[$lastWord]++;
                }
                // previous word is a query term: credit the word following it
                if ($lastWordIsQueryTerm && !$wordIsQueryTerm) {
                    if (!isset($queryTermsCollocation[$word])) {
                        $queryTermsCollocation[$word] = 0;
                    }
                    $queryTermsCollocation[$word]++;
                }
            }
            $lastWord = $word;
        }
    }
    arsort($queryTermsCollocation);
    // preserve keys: numeric-looking words are stored as integer keys and
    // would otherwise be re-indexed (and therefore lost) by array_slice()
    return array_slice($queryTermsCollocation, 0, 10, true);
}