コード例 #1
0
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
and the GNU Affero General Public License along with this program.
If not, see <http://www.gnu.org/licenses/>.
*********************************************************************
*/
// Standalone run: when no including script has set $chain, load the helper
// functions and the configuration here and use the default table name and
// input-file prefix.
// NOTE(review): $db_handle is not defined in this block; presumably
// config.php provides it -- confirm.
if (!isset($chain)) {
    include "includes/fns.php";
    include "/opt/gaidhlig/config.php";
    $cgfinished = "cgfinished";
    $words = "words";
}

// Rebuild the output table from scratch, then add its primary key.
drop_existing_table($cgfinished);
$sql_table = "\nCREATE TABLE {$cgfinished} (\n    id serial NOT NULL,\n\tsentence integer,\n\tlocation integer,\n\tsurface character varying(100),\n    lemma character varying(100),\n    enlemma character varying(100),\n    pos character varying(200),\n    extra character varying(100),\n\tseg character varying(100)\n);\n";
$result_table = pg_query($db_handle, $sql_table);
$sql_pkey = "\nALTER TABLE ONLY " . $cgfinished . " ADD CONSTRAINT " . $cgfinished . "_pk PRIMARY KEY (id);\n";
$result_pkey = pg_query($db_handle, $sql_pkey);

// Read the input file into an array of lines.
$cg_input = "outputs/{$words}_cg_applied.txt";
$lines = file($cg_input);
if ($lines === false) {
    // file() returns false when the file cannot be read. Report which path
    // was missing and fall back to an empty list so that the loop below has
    // a valid array to iterate over instead of a boolean.
    trigger_error("Could not read input file {$cg_input}", E_USER_WARNING);
    $lines = array();
}
foreach ($lines as $line_num => $line) {
    if (preg_match("/^\"</", $line)) {
        preg_match("/<(?P<surface>.*)>/", $line, $quote);
        // Get the surface form.
        $surface = $quote[surface];
        //echo $surface."\n";
    } elseif (preg_match("/^\t\"/", $line)) {
        preg_match("/\"(?P<lemma>.+)\"/", $line, $mylemma);
        // Get the lemma.
コード例 #2
0
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License or the GNU
Affero General Public License as published by the Free Software
Foundation, either version 3 of the License, or (at your option)
any later version.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
and the GNU Affero General Public License along with this program.
If not, see <http://www.gnu.org/licenses/>.
*********************************************************************
*/
// This script copies all the multiwords from glalist to a separate table, so
// that they can be used to mark up the additional copy of the input file that
// will form the sentences table that produces the words table.
if (!isset($chain)) {
    include "includes/fns.php";
    include "/opt/gaidhlig/config.php";
}
$multiwords = "multiwords";
drop_existing_table($multiwords);
// The table name is taken from $multiwords throughout, so that the drop, the
// create, the key and the inserts always refer to the same table.
$sql_table = query("\nCREATE TABLE {$multiwords} (\n\tid serial NOT NULL,\n    glaid integer,\n    surface character varying(100) DEFAULT ''::character varying,\n    lemma character varying(100) DEFAULT ''::character varying,\n    enlemma character varying(100) DEFAULT ''::character varying,\n    clar character varying(100) DEFAULT ''::character varying,\n    pos character varying(20) DEFAULT ''::character varying,\n    gender character varying(20) DEFAULT ''::character varying,\n    number character varying(50) DEFAULT ''::character varying,\n    gcase character varying(20) DEFAULT ''::character varying,\n    tense character varying(100) DEFAULT ''::character varying,\n    notes character varying(250) DEFAULT ''::character varying,\n    extra character varying(100) DEFAULT ''::character varying\n);\n");
$sql_pkey = query("\nALTER TABLE ONLY {$multiwords} ADD CONSTRAINT {$multiwords}_pk PRIMARY KEY (id);\n");

// Columns copied unchanged from glalist; glalist.id is stored as glaid.
$mw_columns = "surface, lemma, enlemma, clar, pos, gender, number, gcase, tense, notes, extra";

// Regex fragments for the surface filter: one "word", and a separator
// followed by another word.
// NOTE(review): after PHP unescaping, these reach PostgreSQL with doubled
// backslashes (\\s). That only means "whitespace" if the server un-escapes
// backslashes in ordinary string literals (standard_conforming_strings off)
// -- confirm against the server configuration.
$mw_word = ".[^\\\\s]*";
$mw_gap_word = "\\\\s" . $mw_word;

// The multiwords with the greatest number of spaces need to come first, to
// prevent shorter phrases containing some of the same words from firing.
// One insert is issued per separator count, from four separators down to one.
for ($mw_spaces = 4; $mw_spaces >= 1; $mw_spaces--) {
    $mw_pattern = "^" . $mw_word . str_repeat($mw_gap_word, $mw_spaces) . "\$";
    // Each result is kept under its historical name ($sql4 ... $sql1) in case
    // an including script reads those variables.
    ${"sql" . $mw_spaces} = query("insert into {$multiwords} (glaid, {$mw_columns}) select id, {$mw_columns} from glalist where surface~'{$mw_pattern}' order by surface;");
}
コード例 #3
0
    include "includes/fns.php";
    include "/opt/gaidhlig/config.php";
}
// Run configuration. The active settings load one multiword-marked file into
// the taic_mw table; earlier configurations are kept below for reference.
//
// Every file in the input directory:
//   $files = scandir($dir);
// Corpus run:
//   $sentences = "sentences";
//   $files = array("inputs/smo.txt");
//   $end = ".txt";
// Multiwords run:
//   $sentences = "sentences_mw";
//   $files = array("inputs/smo.out");
$sentences = "taic_mw";
$end = ".out";
$files = array("taic1-10.out");
$dir = "./inputs";

// Recreate the target table and give it a primary key on id.
drop_existing_table($sentences);
$sql_table = query("\nCREATE TABLE {$sentences} (\n    id serial NOT NULL,\n    filename character varying(100)  DEFAULT ''::character varying,\n    surface text,\n    english text,\n    word_g integer,\n    word_e integer\n);\n");
$sql_pkey = query("\nALTER TABLE ONLY {$sentences} ADD CONSTRAINT {$sentences}_pk PRIMARY KEY (id);\n");
foreach ($files as $file) {
    if (preg_match("/{$end}/", $file)) {
        $filename = basename(preg_replace("/\\..*\$/", "", $file));
        echo $filename;
        $lineno = 0;
        $lines = file($dir . "/" . $file, FILE_IGNORE_NEW_LINES | FILE_SKIP_EMPTY_LINES);
        foreach ($lines as $line) {
            $line = pg_escape_string(trim($line));
            if ($lineno % 2 == 0) {
                $sql = query("insert into {$sentences} (filename, surface) values ('{$filename}', '{$line}');");
                echo $line . "\n";
            } else {
                $sql = query("update {$sentences} set english='{$line}' where id=currval('{$sentences}_id_seq');");
コード例 #4
0
<?php

// Sets up an empty table in the glalist format, then fills its surface
// column. Edit the select query near the end to change what the new table is
// populated with.
include "includes/fns.php";
include "/opt/gaidhlig/config.php";

$newtable = "taic_unk";

// Recreate the table and add a primary key on its serial id column.
drop_existing_table($newtable);
$sql_table = query("\nCREATE TABLE {$newtable} (\n    {$newtable}_id serial NOT NULL,\n    surface character varying(100) DEFAULT ''::character varying,\n    lemma character varying(100) DEFAULT ''::character varying,\n    enlemma character varying(100) DEFAULT ''::character varying,\n    clar character varying(100) DEFAULT ''::character varying,\n    pos character varying(20) DEFAULT ''::character varying,\n    gender character varying(20) DEFAULT ''::character varying,\n    number character varying(50) DEFAULT ''::character varying,\n    gcase character varying(20) DEFAULT ''::character varying,\n    tense character varying(100) DEFAULT ''::character varying,\n    notes character varying(250) DEFAULT ''::character varying,\n    extra character varying(100) DEFAULT ''::character varying\n);\n");
$sql_pkey = query("alter table only " . $newtable . " add constraint " . $newtable . "_id_pkey primary key (" . $newtable . "_id);");

// Select the rows of taic_words whose auto value matches '\.UNK', grouped by
// surface and auto, and insert the trimmed surface of each into the new table.
$sql = query("select surface, auto, count(surface) from taic_words where auto~'\\.UNK' group by surface, auto order by surface;");
while ($unknown = pg_fetch_object($sql)) {
    $escaped_surface = pg_escape_string(trim($unknown->surface));
    query("insert into {$newtable} (surface) values ('" . $escaped_surface . "');");
}
コード例 #5
0
ファイル: store_words.php プロジェクト: donnekgit/gaidhlig
You should have received a copy of the GNU General Public License
and the GNU Affero General Public License along with this program.
If not, see <http://www.gnu.org/licenses/>.
*********************************************************************
*/
// Splits the sentences stored by store_sentences.php into words and stores
// the words in a db table.
if (!isset($chain)) {
    // Standalone run: load helpers and configuration, then pick the tables.
    include "includes/fns.php";
    include "/opt/gaidhlig/config.php";
    // Source tables used by earlier configurations:
    //   $sentences = "sentences";
    //   $sentences = "sentences_mw";
    $words = "taic_words";
    $sentences = "taic_mw";
}
$thesewords = array();

// Recreate the words table and add a primary key on its serial id column.
drop_existing_table($words);
$sql_table = query("\nCREATE TABLE {$words} (\n    {$words}_id serial NOT NULL,\n    filename character varying(100),\n    sentence_id integer,\n    location integer,\n    surface character varying(100),\n    auto character varying(250)\n);\n");
$sql_pkey = query("\nALTER TABLE ONLY {$words} ADD CONSTRAINT {$words}_pk PRIMARY KEY ({$words}_id);\n");

// Fetch every stored sentence in id order for the loop that follows.
$sql = query("select * from " . $sentences . " order by id;");
while ($row = pg_fetch_object($sql)) {
    $newutt = trim($row->surface);
    $i = 1;
    $surface_words = array_filter(explode(' ', $newutt));
    // Store each word of the current sentence as its own row; $i is the
    // word's position within the sentence.
    foreach ($surface_words as $surface_word) {
        echo $row->id . ": " . $surface_word . "\n";
        // Clean the raw word first, then turn underscores into spaces.
        $clean_word = wordclean(trim($surface_word));
        $clean_word = preg_replace("/_/", " ", $clean_word);
        // Escape last, on the exact values that go into the statement, so
        // that the cleaning steps above cannot alter or undo the escaping.
        $clean_word = pg_escape_string($clean_word);
        $filename = pg_escape_string((string) $row->filename);
        $sql_w = query("insert into {$words} (sentence_id, location, surface, filename) values ('{$row->id}', '{$i}', '{$clean_word}', '{$filename}')");
        $i++;
    }