// NOTE(review): the statements up to the first closing brace at this level are the
// tail of a method whose beginning lies outside this excerpt. For every non-empty,
// trimmed entry of $out, two keys are registered in $this->ChangedVerbs -- the results
// of $this->get_ed($str) and $this->get_ing($str) -- and both map back to the entry
// itself. get_ed()/get_ing() are defined elsewhere; presumably they build the "-ed"
// and "-ing" forms of a verb -- confirm against their definitions.
foreach ($out as $idx => $str) {
    $str = trim($str);
    // Skip blank entries.
    if (strlen($str) == 0) {
        continue;
    }
    $this->ChangedVerbs[$this->get_ed($str)] = $str;
    $this->ChangedVerbs[$this->get_ing($str)] = $str;
}
// Returning $this allows the caller to chain further calls.
return $this;
}

/**
 * Print every entry of $this->ChangedVerbs as "key => value <br>", one per entry.
 *
 * Output is written directly with printf(); nothing is returned.
 */
public function show() {
    foreach ($this->ChangedVerbs as $key => $past) {
        printf("%s => %s <br>", $key, $past);
    }
}

/**
 * Look up $word in $this->ChangedVerbs.
 *
 * @param mixed $word      Key to look up (used as an array key).
 * @param bool  $bChanged  Output flag, passed by reference: set to true when $word
 *                         is a key of $this->ChangedVerbs, false otherwise.
 * @return mixed The value stored under $word when the key exists; otherwise $word
 *               itself, unchanged.
 */
public function getBaseForm($word, &$bChanged) {
    if (array_key_exists($word, $this->ChangedVerbs)) {
        $bChanged = true;
        return $this->ChangedVerbs[$word];
    } else {
        $bChanged = false;
        return $word;
    }
}
} // end of the enclosing class (header not visible here; presumably EnglishRegularVerbs -- confirm)

// Disabled manual demo: the condition is the constant 0, so this branch never runs.
if (0) {
    $rv = new EnglishRegularVerbs();
    $rv->show();
}
/**
 * Read a text file and accumulate per-word occurrence counts into $this->WordFreqArr.
 *
 * Steps:
 *  1. Calls $this->init2() (defined elsewhere) and creates the two verb lookup helpers.
 *  2. Reads the file, lower-cases it, removes verse references such as "abc12:34 ",
 *     removes possessive "'s", and turns every remaining non-letter into a space.
 *  3. Splits the text on whitespace and, for each non-empty token, tries to reduce it
 *     to a base form, in this order:
 *       - tokens ending in "s": $this->wordends() is called, then $this->del_s()
 *         (both defined elsewhere; presumably plural / 3rd-person handling -- confirm);
 *       - EnglishIrregularVerbs::getBaseForm();
 *       - EnglishRegularVerbs::getBaseForm();
 *       - finally the $this->Ignores map, when the word is one of its keys.
 *  4. Increments "afreq" for the resulting word (creating its record on first sight),
 *     mirrors it into "sort", resets "arank" to 0, and increments $this->TotalWords.
 *  5. Merges both helpers' ChangedVerbs maps into $this->Ignores.
 *
 * A token whose resulting word is the empty string is reported with a
 * "bad word process" message and is not counted.
 *
 * On read failure a message is printed and the script terminates (die). Only a real
 * read failure (file_get_contents() returning false) is treated as an error; an
 * empty file is processed normally and simply contributes no words.
 *
 * @param string $filename Path of the text file to analyse.
 * @return void
 */
public function load2freq($filename) {
    $this->init2();
    $IrregVerb = new EnglishIrregularVerbs();
    $RegVerb = new EnglishRegularVerbs();

    $data = file_get_contents($filename);
    // Strict check: a loose "false == $data" also fires for "" and "0", which would
    // wrongly abort on an empty file or a file containing just "0".
    if (false === $data) {
        print "failed6 to read file:" . $filename;
        die;
    }

    $data = strtolower($data);
    // Remove verse names: letters, digits, ':', digits, then one whitespace character.
    $data = preg_replace("/([a-z]+[0-9]+[:][0-9]+[\\s])/i", " ", $data);
    // Remove possessive 's. The lookbehind keeps the letter before the apostrophe,
    // so "god's " becomes "god " instead of losing its last letter.
    $data = preg_replace("/(?<=[a-z])'s\\s/i", " ", $data);
    // Anything that is not a letter (digits, punctuation, newlines) becomes a space.
    $data = preg_replace("/[^a-z]/i", " ", $data);

    $out = preg_split("/[\\s]+/", $data);
    foreach ($out as $idx => $str) {
        $str = trim($str);
        if (strlen($str) === 0) {
            continue;
        }

        $bChanged = false;
        // Words ending in "s" get a first chance at reduction through del_s().
        if ("s" === substr($str, -1)) {
            $this->wordends($str);
            $word = $this->del_s($str, $bChanged);
        }
        // Each later stage runs only if no earlier stage changed the token; every
        // stage works on the original token, so $word is always assigned here.
        if (!$bChanged) {
            $word = $IrregVerb->getBaseForm($str, $bChanged);
        }
        if (!$bChanged) {
            $word = $RegVerb->getBaseForm($str, $bChanged);
        }
        if (!$bChanged) {
            // Irregular noun plurals / verb tenses collected in the Ignores map.
            if (array_key_exists($word, $this->Ignores)) {
                $word = $this->Ignores[$word];
            }
        }

        // An empty result cannot be counted; report it and move on.
        if (strlen($word) === 0) {
            echo count($out) . "," . $idx . $str . " == bad word process<br>";
            continue;
        }

        if (!isset($this->WordFreqArr[$word])) {
            $this->WordFreqArr[$word] = array("sort" => 0, "afreq" => 0, "arank" => 0, "rfreq" => 0, "rrank" => 0);
        }
        $arr = $this->WordFreqArr[$word];
        $arr["afreq"] += 1;
        $arr["arank"] = 0;
        // "sort" mirrors the absolute frequency.
        $arr["sort"] = $arr["afreq"];
        $this->WordFreqArr[$word] = $arr;
        $this->TotalWords += 1;
    }

    // Add the inflected-form => base-form maps of both helpers to the Ignores map.
    $this->Ignores = array_merge($this->Ignores, $IrregVerb->ChangedVerbs);
    $this->Ignores = array_merge($this->Ignores, $RegVerb->ChangedVerbs);
}