Пример #1
0
/**
 * Builds the model for one Quran text/translation and pushes it to the shared memory cache.
 *
 * The function
 *  1. reads the verses from $file (pipe-separated "sura|aya|text" lines for "TXT",
 *     <sura>/<aya text="..."> nodes for "XML") into a [sura][aya] text table,
 *  2. for $lang == "AR" parses the Quranic Arabic Corpus morphology file into
 *     master / POS / feature tables and indexes lemmas and roots,
 *  3. parses the Qurana pronoun-resolution XML files and indexes pronoun antecedents,
 *  4. walks every word of every verse to collect totals, frequencies, verse
 *     beginnings/endings, word/verse length statistics and the inverted index,
 *  5. computes a TF-IDF table (the "document" used for DF is the sura),
 *  6. loads the language resource file and stores everything through
 *     addValueToMemoryModel() / addToMemoryModelBatch() (and apc_store() for Qurana).
 *
 * Side effects: overwrites the globals $TOTALS_ARR and $WORDS_FREQUENCY_ARR and, for
 * $lang == "AR", fills $MODEL_QURANA.
 *
 * @param string $lang Language code. The code distinguishes "AR", "AR_UTH" and "EN";
 *                     any code containing "AR" is treated as Arabic.
 * @param string $type Source format: "TXT" or "XML".
 * @param string $file Path of the Quran text/translation source file.
 *
 * @return void
 *
 * @throws Exception When $type is neither "TXT" nor "XML", or when MODEL_QURANA cannot be cached.
 */
function loadModel($lang, $type, $file)
{
    global $WORDS_FREQUENCY_ARR, $TOTALS_ARR, $MODEL_CORE, $MODEL_SEARCH, $MODEL_QAC, $MODEL_QURANA;
    global $sajdahMark, $saktaLatifaMark, $pauseMarksFile, $serializedModelFile, $basmalaTextUthmani;
    global $numberOfSuras, $numberOfVerses, $quranMetaDataFile, $arabicStopWordsFile, $englishStopWordsFile;
    global $META_DATA, $basmalaText, $englishResourceFile, $arabicResourceFile, $quranCorpusMorphologyFile;
    global $quranaPronounResolutionConceptsFile, $quranaPronounResolutionDataFileTemplate, $quranFileUthmaniAR;
    global $TRANSLATION_MAP_EN_TO_AR, $TRANSLATION_MAP_AR_TO_EN, $TRANSLITERATION_WORDS_MAP, $TRANSLITERATION_VERSES_MAP;
    global $basmalaTextUthmani2, $arabicStopWordsFileL2;
    global $TRANSLITERATION_WORDS_LOCATION_MAP;
    // Verse texts indexed as [suraIndex][ayaIndex], both zero-based.
    $QURAN_TEXT = array();
    // Batches of cache entries; each one is flushed with addToMemoryModelBatch() further down.
    $invertedIndexBatchApcArr = array();
    $qacMasterTableBatchApcArr = array();
    $qacPOSTableBatchApcArr = array();
    $qacFeatureTableBatchApcArr = array();
    $TOTALS_ARR = array();
    $TOTALS_ARR['CHARS'] = 0;
    $TOTALS_ARR['WORDS'] = 0;
    $TOTALS_ARR['NRWORDS'] = 0;
    $TOTALS_ARR['VERSES'] = 0;
    $TOTALS_ARR['SURAS'] = $numberOfSuras;
    $TOTALS_ARR['CHAPTERS'] = 30;
    $TOTALS_ARR['TOTAL_PER_SURA'] = array();
    $TOTALS_ARR['SAJDAT_TELAWA'] = array();
    $TOTALS_ARR['PAUSEMARKS'] = array();
    $TOTALS_ARR['MIN_WORD_LENGTH'] = 0;
    $TOTALS_ARR['AVG_WORD_LENGTH'] = 0;
    $TOTALS_ARR['MAX_WORD_LENGTH'] = 0;
    $TOTALS_ARR['MIN_WORD'] = null;
    $TOTALS_ARR['MAX_WORD'] = null;
    $TOTALS_ARR['MIN_VERSE_LENGTH'] = 0;
    $TOTALS_ARR['AVG_VERSE_LENGTH'] = 0;
    $TOTALS_ARR['MAX_VERSE_LENGTH'] = 0;
    $TOTALS_ARR['MIN_VERSE'] = null;
    $TOTALS_ARR['MAX_VERSE'] = null;
    $TOTALS_ARR['SAJDAT_TELAWA']['COUNT'] = 0;
    $TOTALS_ARR['SAJDAT_TELAWA']['VERSES'] = array();
    $TOTALS_ARR['SAKTA_LATIFA']['COUNT'] = 0;
    $TOTALS_ARR['SAKTA_LATIFA']['VERSES'] = array();
    $WORDS_FREQUENCY_ARR = array();
    $WORDS_FREQUENCY_ARR['WORDS'] = array();
    $WORDS_FREQUENCY_ARR['WORDS_PER_SURA'] = array();
    $WORDS_FREQUENCY_ARR['TOTAL_PER_VERSE'] = array();
    $WORDS_FREQUENCY_ARR['WORDS_TFIDF'] = array();
    $WORDS_FREQUENCY_ARR['VERSE_ENDINGS'] = array();
    $WORDS_FREQUENCY_ARR['VERSE_BEGINNINGS'] = array();
    /** WORD LENGTH (running min/max; the avg variable accumulates a sum first) **/
    $minWordLength = 1000;
    $minWord = null;
    $maxWordLength = -1;
    $maxWord = null;
    $avgWordLength = 0;
    /** VERSE LENGTH (counted in words, pause marks excluded) **/
    $minVerseLength = 1000;
    $minVerse = null;
    $maxVerseLength = -1;
    $maxVerse = null;
    $avgVerseLength = 0;
    /** QAC Model **/
    // Maps the global (1-based) QAC segment counter to the word index of that segment.
    $qacSegmentToWordTable = array();
    /** QURANA Corpus **/
    $quranaConcecpts = array();
    $quranaResolvedPronouns = array();
    ########### LOAD DATA ACCORDING TO MODEL SOURCE TYPE
    if ($type == "XML") {
        $sourceContent = simplexml_load_file($file);
    } else {
        $sourceContent = file($file, FILE_SKIP_EMPTY_LINES | FILE_IGNORE_NEW_LINES);
    }
    if ($type == "TXT") {
        for ($s = 0; $s < $numberOfVerses; $s++) {
            // Each line is "suraIndex|ayaIndex|text" with 1-based indexes.
            $line = $sourceContent[$s];
            $lineArr = preg_split("/\\|/", $line);
            $suraIndex = $lineArr[0];
            $ayaIndex = $lineArr[1];
            $text = $lineArr[2];
            // Strip the basmala from the first aya of every sura except the very first line.
            if (strpos($lang, "AR") !== false && $ayaIndex == 1 && $s != 0) {
                if ($lang == "AR") {
                    $text = trim(str_replace($basmalaText, "", $text));
                } else {
                    if ($lang == "AR_UTH") {
                        $text = trim(str_replace($basmalaTextUthmani, "", $text));
                        $text = trim(str_replace($basmalaTextUthmani2, "", $text));
                    }
                }
            }
            if (!isset($QURAN_TEXT[$suraIndex - 1])) {
                $QURAN_TEXT[$suraIndex - 1] = array();
            }
            $QURAN_TEXT[$suraIndex - 1][$ayaIndex - 1] = $text;
        }
    } else {
        if ($type == "XML") {
            for ($s = 0; $s < $numberOfSuras; $s++) {
                $suraSize = $META_DATA['SURAS'][$s]['ayas'];
                for ($a = 0; $a < $suraSize; $a++) {
                    $QURAN_TEXT[$s][$a] = (string) $sourceContent->sura[$s]->aya[$a]['text'];
                }
            }
        } else {
            throw new Exception("Invalid Source Type ({$type})");
        }
    }
    ##############################################################
    // free resources
    $sourceContent = null;
    unset($sourceContent);
    if ($lang == "AR") {
        ############ LOAD QAC (Quranic Arabic Corpus) FILE ###################################
        // Empty lines are deliberately NOT skipped here so that the fixed-size
        // header skip (first 57 lines) below stays aligned with the file.
        $qacFileLinesArr = file($quranCorpusMorphologyFile, FILE_IGNORE_NEW_LINES);
        $headerIndex = 0;
        $segmentIndex = 1;
        foreach ($qacFileLinesArr as $line) {
            $headerIndex++;
            // ignore header section
            if ($headerIndex <= 57) {
                continue;
            }
            // Tab-separated columns: location, form, POS tag, feature list.
            $lineArr = preg_split("/\t/", $line);
            $location = $lineArr[0];
            $formOrSegment = $lineArr[1];
            $posTAG = $lineArr[2];
            $featuresList = $lineArr[3];
            // Remove brackets from the location, keeping SURA:AYA:WORDINDEX:SEGMENTINDEX.
            $masterID = preg_replace("/\\(|\\)|/", "", $location);
            $locationArr = preg_split("/\\:/", $masterID);
            $wordSegmentID = $locationArr[count($locationArr) - 1];
            $wordIndex = $locationArr[count($locationArr) - 2];
            $verseID = $locationArr[count($locationArr) - 3];
            $suraID = $locationArr[count($locationArr) - 4];
            // Drop the trailing ":SEGMENTINDEX" so the master ID identifies the word.
            // NOTE(review): cutting exactly 2 characters assumes a single-digit segment index - confirm.
            $masterID = substr($masterID, 0, strlen($masterID) - 2);
            // Reverse Buckwalter transliteration of the segment form.
            $formOrSegmentReverseTransliterated = buckwalterReverseTransliteration($formOrSegment);
            // Features are separated by "|"; each is either "NAME:VALUE" or a bare flag.
            $featuresTempArr = preg_split("/\\|/", $featuresList);
            $featuresArr = array();
            foreach ($featuresTempArr as $oneFeature) {
                if (strpos($oneFeature, ":") !== false) {
                    $oneFeatureKeyValueArr = preg_split("/\\:/", $oneFeature);
                    $featureName = $oneFeatureKeyValueArr[0];
                    $featureValue = $oneFeatureKeyValueArr[1];
                    if ($featureName == "LEM" || $featureName == "ROOT") {
                        $featureValue = buckwalterReverseTransliteration($featureValue);
                    }
                } else {
                    $featureName = $oneFeature;
                    // -1 is just a dummy value for flag-style features
                    $featureValue = -1;
                }
                $featureValue = trim($featureValue);
                // fill the features index table (one cache entry per feature name)
                $apcMemoryEntryKey = "{$lang}/MODEL_QAC/QAC_FEATURES/{$featureName}";
                $qacFeatureTableBatchApcArr[$apcMemoryEntryKey][$masterID] = $featureValue;
                $featuresArr[$featureName] = $featureValue;
                // Only word-like features (lemma, root) go to the inverted index.
                if ($featureName == "LEM" || $featureName == "ROOT") {
                    addToInvertedIndex($invertedIndexBatchApcArr, $lang, trim($featureValue), $suraID - 1, $verseID - 1, $wordIndex, trim($featureName), $formOrSegmentReverseTransliterated);
                    if ($featureName == "ROOT") {
                        addValueToMemoryModel($lang, "MODEL_QAC", "QAC_ROOTS_LOOKUP", $formOrSegmentReverseTransliterated, $featureValue);
                    }
                }
            }
            // Must be recorded before $segmentIndex is incremented below.
            $qacSegmentToWordTable[$segmentIndex] = $wordIndex;
            // Fill master table (several segments may share one word-level master ID).
            $qacMasterTableEntry = array("FORM_EN" => $formOrSegment, "FORM_AR" => $formOrSegmentReverseTransliterated, "TAG" => $posTAG, "SEGMENT_INDEX" => $segmentIndex++, "FEATURES" => $featuresArr);
            $apcMemoryEntryKey = "{$lang}/MODEL_QAC/QAC_MASTERTABLE/{$masterID}";
            $qacMasterTableBatchApcArr[$apcMemoryEntryKey][] = $qacMasterTableEntry;
            // Fill Part of Speech tagging table
            $apcMemoryEntryKey = "{$lang}/MODEL_QAC/QAC_POS/{$posTAG}";
            $qacPOSTableBatchApcArr[$apcMemoryEntryKey][$masterID] = $wordSegmentID;
        }
        ##############################################################
        // free resources
        $qacFileLinesArr = null;
        unset($qacFileLinesArr);
        // Flush the master table now: per the original note it is needed by the
        // Qurana segment lookup used below.
        addToMemoryModelBatch($qacMasterTableBatchApcArr);
    }
    ######### Qurana Pronominal Anaphora Corpus ###################
    // GET XML FILE CONTENT
    $xmlContent = file_get_contents($quranaPronounResolutionConceptsFile);
    // LOAD XML OBJECT - trim used to avoid first line empty error
    $concepts = simplexml_load_string(trim(stripHTMLComments($xmlContent)));
    // LOAD CONCEPTS
    foreach ($concepts->con as $conceptObj) {
        $conceptID = (string) $conceptObj['id'];
        $conceptNameEN = (string) $conceptObj->english;
        $conceptNameAR = (string) $conceptObj->arabic;
        $quranaConcecpts[$conceptID] = array("EN" => trim($conceptNameEN), "AR" => trim($conceptNameAR), "FREQ" => 0);
    }
    // Diagnostic counters only; they are not stored in the model.
    $pronounsCount = 0;
    $segmentsCount = 0;
    $versesCount = 0;
    // LOAD PRONOUNS: parse the pronoun file of each sura and load it in the model
    for ($s = 0; $s < $numberOfSuras; $s++) {
        $suraID = $s + 1;
        // The file name template contains "%s" where the sura number goes.
        $pronounDataFileName = preg_replace("/%s/", $suraID, $quranaPronounResolutionDataFileTemplate);
        $xmlContent = file_get_contents($pronounDataFileName);
        // LOAD XML OBJECT - trim used to avoid first line empty error
        $chapter = simplexml_load_string(trim(stripHTMLComments($xmlContent)));
        foreach ($chapter->verse as $verseObj) {
            $verseLocalSegmentIndex = 0;
            $versesCount++;
            // Every child of a verse counts as one segment; only <pron> children are resolved.
            foreach ($verseObj->children() as $childObj) {
                $tagName = $childObj->getName();
                $verseLocalSegmentIndex++;
                $segmentsCount++;
                if ($tagName == "pron") {
                    $pronounsCount++;
                    // verse containing this pronoun
                    $verseID = (string) $verseObj['id'];
                    // pronoun concept ID and antecedent
                    $conceptID = (string) $childObj['con'];
                    $pronounAntecedent = (string) $childObj['ant'];
                    // segment ID and word form
                    $quranaSegmentID = (string) $childObj->seg['id'];
                    $quranaSegmentForm = (string) $childObj->seg->__toString();
                    $quranaSegmentForm = trim($quranaSegmentForm);
                    // convert the Qurana segment to a QAC segment for cross referencing
                    $qacSegment = getQACSegmentByQuranaSeqment($suraID, $verseID, $verseLocalSegmentIndex, $quranaSegmentForm);
                    // Word holding the segment. The lookup table is only filled when the
                    // QAC file was parsed in this call ($lang == "AR"); otherwise there is
                    // no entry and the word id stays null, without raising a notice.
                    $wordId = isset($qacSegmentToWordTable[$qacSegment]) ? $qacSegmentToWordTable[$qacSegment] : null;
                    $quranaConcecpts[$conceptID]["FREQ"]++;
                    // fill pronouns array
                    $quranaResolvedPronouns["{$suraID}:{$verseID}:{$wordId}"][] = array("CONCEPT_ID" => $conceptID, "SEGMENT_INDEX" => $qacSegment, "ANTECEDENT_SEGMENTS" => preg_split("/ /", $pronounAntecedent));
                    if ($lang == "EN") {
                        addToInvertedIndex($invertedIndexBatchApcArr, $lang, strtolower($quranaConcecpts[$conceptID]['EN']), $suraID - 1, $verseID - 1, $wordId, "PRONOUN_ANTECEDENT", $quranaSegmentForm);
                    } else {
                        addToInvertedIndex($invertedIndexBatchApcArr, $lang, $quranaConcecpts[$conceptID]['AR'], $suraID - 1, $verseID - 1, $wordId, "PRONOUN_ANTECEDENT", $quranaSegmentForm);
                    }
                }
            }
        }
    }
    ##############################################################
    // free resources
    $xmlContent = null;
    $concepts = null;
    unset($xmlContent);
    unset($concepts);
    // The strict (L2) stop word list exists for Arabic only; default to an empty
    // list so the value stored at the end is always defined.
    $stopWordsStrictL2Arr = array();
    if (strpos($lang, "AR") !== false) {
        $stopWordsArr = getStopWordsArrByFile($arabicStopWordsFile);
        $stopWordsStrictL2Arr = getStopWordsArrByFile($arabicStopWordsFileL2);
        $pauseMarksArr = getPauseMarksArrByFile($pauseMarksFile);
    } else {
        $stopWordsArr = getStopWordsArrByFile($englishStopWordsFile);
        $pauseMarksArr = array();
    }
    if (strpos($lang, "AR") !== false) {
        // SETTING PAUSE MARKS COUNTER ARRAY (pause marks are the array keys)
        foreach ($pauseMarksArr as $pauseMark => $constant) {
            $TOTALS_ARR['PAUSEMARKS'][$pauseMark] = 0;
        }
    }
    /* SURA'S LOOP: initialise per-sura totals **/
    for ($s = 0; $s < $numberOfSuras; $s++) {
        $TOTALS_ARR['TOTAL_PER_SURA'][$s] = array();
        // Uthmani text shares the plain Arabic sura names.
        $suraNameLang = $lang;
        if ($suraNameLang == "AR_UTH") {
            $suraNameLang = "AR";
        }
        // Lowercase the mapped code (not $lang), otherwise the AR_UTH mapping above is lost.
        $suraNameLang = strtolower($suraNameLang);
        $TOTALS_ARR['TOTAL_PER_SURA'][$s]['NAME'] = $META_DATA['SURAS'][$s]['name_' . $suraNameLang];
        $TOTALS_ARR['TOTAL_PER_SURA'][$s]['CHARS'] = 0;
        $TOTALS_ARR['TOTAL_PER_SURA'][$s]['NRWORDS'] = 0;
        $TOTALS_ARR['TOTAL_PER_SURA'][$s]['WORDS'] = 0;
        $TOTALS_ARR['TOTAL_PER_SURA'][$s]['VERSES'] = 0;
        $WORDS_FREQUENCY_ARR['WORDS_PER_SURA'][$s] = array();
    }
    /* SURA'S LOOP: statistics and inverted index **/
    for ($s = 0; $s < $numberOfSuras; $s++) {
        $suraSize = $META_DATA['SURAS'][$s]['ayas'];
        /* VERSES LOOP **/
        for ($a = 0; $a < $suraSize; $a++) {
            $verseText = $QURAN_TEXT[$s][$a];
            $wordsArr = preg_split("/ /", $verseText);
            /** CALCULATE VERSE LENGTH (in words, pause marks excluded) **/
            $wordsInVerseIncludingPauses = count($wordsArr);
            $wordsInVerse = $wordsInVerseIncludingPauses - count(array_intersect($wordsArr, array_keys($pauseMarksArr)));
            if ($wordsInVerse >= $maxVerseLength) {
                $maxVerseLength = $wordsInVerse;
                $maxVerse = $verseText;
            }
            // Compare against the running minimum VERSE length (not the word length).
            if ($wordsInVerse <= $minVerseLength) {
                if ($wordsInVerse == $minVerseLength) {
                    // Same word count: keep the verse with fewer characters.
                    if ($minVerse === null || mb_strlen($verseText) < mb_strlen($minVerse)) {
                        $minVerseLength = $wordsInVerse;
                        $minVerse = $verseText;
                    }
                } else {
                    $minVerseLength = $wordsInVerse;
                    $minVerse = $verseText;
                }
            }
            $avgVerseLength += $wordsInVerse;
            /** END CALCULATE VERSE LENGTH **/
            // 1-based index of the current counted word; skipped tokens do not advance it.
            $wordIndex = 0;
            // Last counted word of the verse, used for the verse-endings statistic.
            $lastWord = null;
            /* WORDS IN VERSE LOOP **/
            foreach ($wordsArr as $word) {
                $word = trim($word);
                // PAUSE MARK
                if (strpos($lang, "AR") !== false && isset($pauseMarksArr[$word])) {
                    $TOTALS_ARR['PAUSEMARKS'][$word]++;
                    continue;
                }
                // SAJDAH MARK
                if ($word == $sajdahMark) {
                    $TOTALS_ARR['SAJDAT_TELAWA']['COUNT']++;
                    $TOTALS_ARR['SAJDAT_TELAWA']['VERSES'][] = array($s, $a, $verseText);
                    continue;
                }
                // SAKTA LATIFA
                if ($word == $saktaLatifaMark) {
                    $TOTALS_ARR['SAKTA_LATIFA']['COUNT']++;
                    $TOTALS_ARR['SAKTA_LATIFA']['VERSES'][] = array($s, $a, $verseText);
                    continue;
                }
                // Mainly for english translations
                if ($lang == "EN") {
                    $word = strtolower(cleanAndTrim($word));
                }
                // ignore empty words - result of trimming (e.g. " - " in english translations)
                if (empty($word)) {
                    continue;
                }
                $wordIndex++;
                $lastWord = $word;
                if ($wordIndex == 1) {
                    if (!isset($WORDS_FREQUENCY_ARR['VERSE_BEGINNINGS'][$word])) {
                        $WORDS_FREQUENCY_ARR['VERSE_BEGINNINGS'][$word] = 0;
                    }
                    $WORDS_FREQUENCY_ARR['VERSE_BEGINNINGS'][$word]++;
                }
                $TOTALS_ARR['WORDS']++;
                if (!isset($WORDS_FREQUENCY_ARR['TOTAL_PER_VERSE'][$s])) {
                    $WORDS_FREQUENCY_ARR['TOTAL_PER_VERSE'][$s] = array();
                }
                if (!isset($WORDS_FREQUENCY_ARR['TOTAL_PER_VERSE'][$s][$a])) {
                    $WORDS_FREQUENCY_ARR['TOTAL_PER_VERSE'][$s][$a] = array();
                }
                if (!isset($WORDS_FREQUENCY_ARR['TOTAL_PER_VERSE'][$s][$a][$word])) {
                    $WORDS_FREQUENCY_ARR['TOTAL_PER_VERSE'][$s][$a][$word] = 0;
                }
                $WORDS_FREQUENCY_ARR['TOTAL_PER_VERSE'][$s][$a][$word]++;
                if (!isset($WORDS_FREQUENCY_ARR['WORDS'][$word])) {
                    $WORDS_FREQUENCY_ARR['WORDS'][$word] = 0;
                }
                $WORDS_FREQUENCY_ARR['WORDS'][$word]++;
                $TOTALS_ARR['TOTAL_PER_SURA'][$s]['WORDS']++;
                if (!isset($WORDS_FREQUENCY_ARR['WORDS_PER_SURA'][$s][$word])) {
                    $WORDS_FREQUENCY_ARR['WORDS_PER_SURA'][$s][$word] = 0;
                }
                $WORDS_FREQUENCY_ARR['WORDS_PER_SURA'][$s][$word]++;
                addToInvertedIndex($invertedIndexBatchApcArr, $lang, $word, $s, $a, $wordIndex, "NORMAL_WORD");
                /** CALCULATE WORD LENGTH **/
                $wordLength = mb_strlen($word);
                if ($wordLength >= $maxWordLength) {
                    $maxWordLength = $wordLength;
                    $maxWord = $word;
                }
                if ($wordLength <= $minWordLength) {
                    $minWordLength = $wordLength;
                    $minWord = $word;
                }
                $avgWordLength += $wordLength;
                /** END CALCULATE WORD LENGTH **/
                // Split into UTF-8 characters and count every non-space one.
                $charsInWordArr = preg_split("//u", $word, -1, PREG_SPLIT_NO_EMPTY);
                foreach ($charsInWordArr as $char) {
                    if ($char == " ") {
                        continue;
                    }
                    $TOTALS_ARR['CHARS']++;
                    $TOTALS_ARR['TOTAL_PER_SURA'][$s]['CHARS']++;
                }
            }
            // Verse ending = last counted word. It is recorded after the loop because the
            // raw token count also contains skipped tokens (pause/sajdah/sakta marks, empty
            // words), so comparing the word index with count($wordsArr) missed such verses.
            // As before, a single-word verse counts as a beginning only.
            if ($wordIndex > 1) {
                if (!isset($WORDS_FREQUENCY_ARR['VERSE_ENDINGS'][$lastWord])) {
                    $WORDS_FREQUENCY_ARR['VERSE_ENDINGS'][$lastWord] = 0;
                }
                $WORDS_FREQUENCY_ARR['VERSE_ENDINGS'][$lastWord]++;
            }
            $TOTALS_ARR['VERSES']++;
            $TOTALS_ARR['TOTAL_PER_SURA'][$s]['VERSES']++;
        }
        /** END AYA's LOOP **/
    }
    /** END SURA's LOOP **/
    /* SURA'S LOOP: distinct word counts, per-sura frequencies sorted descending **/
    for ($s = 0; $s < $numberOfSuras; $s++) {
        $TOTALS_ARR['TOTAL_PER_SURA'][$s]['NRWORDS'] = count($WORDS_FREQUENCY_ARR['WORDS_PER_SURA'][$s]);
        arsort($WORDS_FREQUENCY_ARR['WORDS_PER_SURA'][$s]);
    }
    $TOTALS_ARR['NRWORDS'] = count($WORDS_FREQUENCY_ARR['WORDS']);
    $TOTALS_ARR['PAUSEMARKS_AGGREGATION'] = 0;
    // AGGREGATE PAUSE MARKS
    foreach ($TOTALS_ARR['PAUSEMARKS'] as $pmLabel => $pmCount) {
        $TOTALS_ARR['PAUSEMARKS_AGGREGATION'] += $pmCount;
    }
    /**
     * CALCULATING TF-IDF TABLE
     * TF is the corpus-wide frequency of the word; DF is the number of suras
     * containing it (the sura, not the verse, is the "document" here).
     */
    foreach ($WORDS_FREQUENCY_ARR['WORDS'] as $wordLabel => $wordFreq) {
        $termFrequency = $wordFreq;
        $termFrequencyPercentage = $termFrequency / $TOTALS_ARR['WORDS'] * 100;
        $documentFrequency = 0;
        $inverseDocumentFrequency = 0;
        for ($s = 0; $s < $numberOfSuras; $s++) {
            if (isset($WORDS_FREQUENCY_ARR['WORDS_PER_SURA'][$s][$wordLabel])) {
                $documentFrequency++;
            }
        }
        // Every word in WORDS was counted in at least one sura, so DF >= 1 here.
        $inverseDocumentFrequency = log($numberOfSuras / $documentFrequency, 10);
        $TFIDF = $termFrequency * $inverseDocumentFrequency;
        $WORDS_FREQUENCY_ARR['WORDS_TFIDF'][$wordLabel] = array("TF" => $termFrequency, "TPC" => $termFrequencyPercentage, "DF" => $documentFrequency, "IDF" => $inverseDocumentFrequency, "TFIDF" => $TFIDF);
    }
    /** END OF TFIDF TABLE **/
    rsortBy($WORDS_FREQUENCY_ARR['WORDS_TFIDF'], 'TF');
    /** Continuing WORD/VERSE LENGTH CALCULATE: turn the accumulated sums into averages **/
    // Guard against an empty corpus to avoid a division by zero.
    $avgWordLength = $TOTALS_ARR['WORDS'] > 0 ? $avgWordLength / $TOTALS_ARR['WORDS'] : 0;
    $avgVerseLength = $TOTALS_ARR['VERSES'] > 0 ? $avgVerseLength / $TOTALS_ARR['VERSES'] : 0;
    $TOTALS_ARR['MIN_WORD_LENGTH'] = $minWordLength;
    $TOTALS_ARR['AVG_WORD_LENGTH'] = round($avgWordLength, 2);
    $TOTALS_ARR['MAX_WORD_LENGTH'] = $maxWordLength;
    $TOTALS_ARR['MIN_WORD'] = $minWord;
    $TOTALS_ARR['MAX_WORD'] = $maxWord;
    $TOTALS_ARR['MIN_VERSE_LENGTH'] = $minVerseLength;
    $TOTALS_ARR['AVG_VERSE_LENGTH'] = round($avgVerseLength, 2);
    $TOTALS_ARR['MAX_VERSE_LENGTH'] = $maxVerseLength;
    $TOTALS_ARR['MIN_VERSE'] = $minVerse;
    $TOTALS_ARR['MAX_VERSE'] = $maxVerse;
    /** end CALCULATE WORD/VERSE LENGTH **/
    arsort($WORDS_FREQUENCY_ARR['WORDS']);
    arsort($WORDS_FREQUENCY_ARR['VERSE_BEGINNINGS']);
    arsort($WORDS_FREQUENCY_ARR['VERSE_ENDINGS']);
    /////// LOADING LANGUAGE RESOURCE FILES ("id|value" per line)
    $resourceFile = $englishResourceFile;
    if (strpos($lang, "AR") !== false) {
        $resourceFile = $arabicResourceFile;
    }
    $languageResourcesArr = file($resourceFile, FILE_SKIP_EMPTY_LINES | FILE_IGNORE_NEW_LINES);
    $RESOURCES = array();
    foreach ($languageResourcesArr as $resourceLine) {
        $resourcePairsArr = preg_split("/\\|/", $resourceLine);
        $resourceID = $resourcePairsArr[0];
        $resourceValue = $resourcePairsArr[1];
        $RESOURCES[$resourceID] = $resourceValue;
    }
    /////// STORE THE CORE MODEL
    addValueToMemoryModel($lang, "MODEL_CORE", "META_DATA", "", $META_DATA);
    addValueToMemoryModel($lang, "MODEL_CORE", "TOTALS", "", $TOTALS_ARR);
    addValueToMemoryModel($lang, "MODEL_CORE", "WORDS_FREQUENCY", "", $WORDS_FREQUENCY_ARR);
    addValueToMemoryModel($lang, "MODEL_CORE", "WORDS_FREQUENCY", "WORDS", $WORDS_FREQUENCY_ARR['WORDS']);
    addValueToMemoryModel($lang, "MODEL_CORE", "QURAN_TEXT", "", $QURAN_TEXT);
    addValueToMemoryModel($lang, "MODEL_CORE", "RESOURCES", "", $RESOURCES);
    addValueToMemoryModel($lang, "MODEL_CORE", "STOP_WORDS", "", $stopWordsArr);
    addValueToMemoryModel($lang, "MODEL_CORE", "STOP_WORDS_STRICT_L2", "", $stopWordsStrictL2Arr);
    /////// STORE THE SEARCH MODEL (inverted index)
    addToMemoryModelBatch($invertedIndexBatchApcArr);
    if ($lang == "AR") {
        /////// STORE THE QAC INDEX TABLES (the master table was flushed earlier)
        addToMemoryModelBatch($qacPOSTableBatchApcArr);
        addToMemoryModelBatch($qacFeatureTableBatchApcArr);
        /////// STORE THE QURANA MODEL
        rsortBy($quranaConcecpts, 'FREQ');
        $MODEL_QURANA['QURANA_CONCEPTS'] = $quranaConcecpts;
        $MODEL_QURANA['QURANA_PRONOUNS'] = $quranaResolvedPronouns;
        $res = apc_store("MODEL_QURANA", $MODEL_QURANA);
        if ($res === false) {
            throw new Exception("Can't cache MODEL_QURANA");
        }
    }
}
Пример #2
0
/**
 * Removes stop words and empty tokens from a space-separated string.
 *
 * The string is split on single spaces; tokens that are empty (caused by
 * leading, trailing or repeated spaces) or that are keys of the stop word list
 * are dropped, and the remaining tokens are joined back with single spaces.
 * For "EN" the input is lowercased first and the English stop word file is
 * used; every other language code uses the Arabic stop word file.
 *
 * @param string $str  Text to filter.
 * @param string $lang Language code; only "EN" is treated specially.
 *
 * @return string The filtered text, possibly empty.
 */
function removeStopwordsAndTrim($str, $lang = "AR")
{
    global $englishStopWordsFile, $arabicStopWordsFile;
    $stopWordsFile = $arabicStopWordsFile;
    if ($lang == "EN") {
        $stopWordsFile = $englishStopWordsFile;
        $str = strtolower($str);
    }
    // The stop word list is used as a lookup table keyed by the word itself.
    $stopWordsArr = getStopWordsArrByFile($stopWordsFile);
    $keptWords = array();
    foreach (preg_split("/ /", $str) as $word) {
        // Strict comparison on purpose: empty("0") is true, which would
        // silently drop a legitimate "0" token.
        if ($word === '' || isset($stopWordsArr[$word])) {
            continue;
        }
        $keptWords[] = $word;
    }
    return implode(" ", $keptWords);
}
    //echoN(count($finalConcepts));
    //exit;
    file_put_contents("{$ONTOLOGY_EXTRACTION_FOLDER}/temp.final.concepts.stage4", serialize($finalConcepts));
}
if ($ENRICH_CONCEPTS_METADATA_DBPEDIA) {
    $newConcepts = array();
    $dbpediaCacheArr = array();
    $relationsArr = unserialize(file_get_contents("{$ONTOLOGY_EXTRACTION_FOLDER}/temp.final.relations"));
    $finalConcepts = unserialize(file_get_contents("{$ONTOLOGY_EXTRACTION_FOLDER}/temp.final.concepts.stage4"));
    $finalTerms = unserialize(file_get_contents("{$ONTOLOGY_EXTRACTION_FOLDER}/temp.all.terms"));
    $dbpediaCacheArr = unserialize(file_get_contents("../data/cache/dbpedia.resources"));
    $typeNS = "http://www.w3.org/1999/02/22-rdf-syntax-ns#type";
    $resourceIDTemplate = "http://live.dbpedia.org/resource/{NAME}";
    $urlTemplate = "http://live.dbpedia.org/data/{NAME}.json";
    $wikipediaIDTemplate = "http://en.wikipedia.org/wiki/{NAME}";
    $stopWordsArr = getStopWordsArrByFile($englishStopWordsFile);
    $typesArr = array();
    $conceptsFiltered = 0;
    $conceptsEnriched = 0;
    $newConceptsAdded = 0;
    $newRelationsAdded = 0;
    $enrichedFinalConcepts = $finalConcepts;
    foreach ($finalConcepts as $concept => $coneptArr) {
        $conceptNameEn = $coneptArr['EXTRA']['TRANSLATION_EN'];
        $conceptNameAr = $concept;
        //$coneptArr['EXTRA']['SIMPLE_WORD'];
        if (!empty($conceptNameEn) && preg_match("/ /", $conceptNameEn) == 0 && !isset($stopWordsArr[strtolower($conceptNameEn)])) {
            //echoN("NOT FILTERED:$conceptNameEn|$conceptNameAr");
            $conceptsFiltered++;
            $conceptName = $conceptNameEn;
            $url = str_replace("{NAME}", $conceptName, $urlTemplate);