/**
 * Converts a dictionary export into LMF individuals and writes them to a file.
 *
 * The source XML is scanned for <return> elements. Every child node of such an
 * element is stored under its node name; the text of the <metadata> child is
 * parsed as a nested XML document whose first <record> holds
 * <el name="..." value="..."> items:
 *   - AntrastinisZodis -> lemma
 *   - Forma            -> word form
 *   - Tarimas          -> pronunciation (parsed, not emitted yet)
 *   - Reiksme          -> sense; its children KalbosDalis (part of speech)
 *                         and Atitikmuo (equivalent) are collected
 *
 * For every record one lexical entry is created per distinct part of speech
 * found among its senses. Records without a lemma are skipped.
 *
 * @param string $filename          Path of the XML file to read.
 * @param string $fileOfIndividuals Path of the output file (created or truncated).
 *
 * @return void
 *
 * @throws \RuntimeException If the source file cannot be read or the output
 *                           file cannot be opened.
 */
private function buildLmfIndividuals($filename, $fileOfIndividuals) {
    $resourceName = $this->getResourceName();

    $xml = file_get_contents($filename);
    if ($xml === FALSE) {
        throw new \RuntimeException('Unable to read source file: ' . $filename);
    }

    $dom = new \DOMDocument('1.0', 'UTF-8');
    // loadXML() cannot take an empty string, so an empty file is handled as a
    // document that simply has no records.
    if ($xml !== '') {
        $dom->loadXML($xml);
    }

    $fileIndividuals = fopen($fileOfIndividuals, 'w+');
    if ($fileIndividuals === FALSE) {
        throw new \RuntimeException('Unable to open target file: ' . $fileOfIndividuals);
    }

    foreach ($dom->getElementsByTagName('return') as $domRecord) {
        /* @var $domRecord \DOMElement */

        // Step 1: flatten the record into an array keyed by child node name.
        $arr = array();
        foreach ($domRecord->childNodes as $node) {
            /* @var $node \DOMNode */
            if ($node->nodeName !== 'metadata') {
                $arr[$node->nodeName] = $node->nodeValue;
                continue;
            }

            $ins = array();
            $record = NULL;
            // An empty payload or one without <record> yields an empty
            // metadata array, so the record is skipped further below.
            if ((string) $node->nodeValue !== '') {
                $metadata = new \DOMDocument('1.0', 'UTF-8');
                $metadata->loadXML($node->nodeValue);
                /* @var $record \DOMElement */
                $record = $metadata->getElementsByTagName('record')->item(0);
            }

            if ($record !== NULL) {
                foreach ($record->getElementsByTagName('el') as $el) {
                    /* @var $el \DOMElement */
                    $name = $el->getAttribute('name');
                    $value = $el->getAttribute('value');

                    // Only sense containers are allowed to have no value.
                    if (!$value && $name !== 'Reiksme') {
                        continue;
                    }

                    if ($name === 'AntrastinisZodis') {
                        // Lemma
                        $ins['lemma'] = htmlspecialchars($value);
                    } elseif ($name === 'Forma') {
                        // Word form; escaped like the lemma and the
                        // equivalents because the dictionary can contain
                        // characters that are illegal in XML.
                        $ins['wordForms'][] = htmlspecialchars($value);
                    } elseif ($name === 'Tarimas') {
                        // Pronunciation
                        $ins['pronunciation'] = $value;
                    } elseif ($name === 'Reiksme') {
                        // Sense
                        $senseArr = array();
                        foreach ($el->childNodes as $child) {
                            // Text nodes between the elements are ignored.
                            if (!($child instanceof \DOMElement)) {
                                continue;
                            }
                            $childName = $child->getAttribute('name');
                            if ($childName === 'KalbosDalis') {
                                // Part of speech
                                $senseArr['partOfSpeach'] = $this->fullAbbreviation($child->getAttribute('value'));
                            } elseif ($childName === 'Atitikmuo') {
                                // Equivalent; the dictionary can contain
                                // characters that are illegal in XML.
                                $senseArr['equivalent'][] = htmlspecialchars($child->getAttribute('value'));
                            }
                        }
                        $ins['senses'][] = $senseArr;
                    }
                }
            }

            $arr[$node->nodeName] = $ins;
        }

        // Step 2: convert the array into lexical entries.
        // TODO: emit the pronunciation as well (it is parsed but unused).
        if (!isset($arr['metadata']['lemma'])) {
            continue;
        }

        $lemma = $arr['metadata']['lemma'];
        $id = isset($arr['id']) ? $arr['id'] : NULL;
        $senses = isset($arr['metadata']['senses']) ? $arr['metadata']['senses'] : array();

        $lexicalEntries = array();
        // Reset per record so that an entry of a previous record can never
        // leak into this one.
        $lexicalEntry = NULL;
        $senseNr = 1;

        foreach ($senses as $sense) {
            $partOfSpeech = isset($sense['partOfSpeach']) ? $sense['partOfSpeach'] : NULL;
            $equivalents = isset($sense['equivalent']) ? $sense['equivalent'] : array();

            $lmfSense = new Owl\LmfSense();

            // Reuse the entry that already represents this part of speech.
            $lexicalEntry = NULL;
            foreach ($lexicalEntries as $candidate) {
                /* @var $candidate Owl\LmfLexicalEntry */
                if ($candidate->getPartOfSpeech() == $partOfSpeech) {
                    $lexicalEntry = $candidate;
                }
            }

            if (!$lexicalEntry) {
                // The first entry of a record is identified by the bare lemma,
                // every further one gets a "-<n>" suffix.
                $label = $lemma;
                if (!empty($lexicalEntries)) {
                    $label = $lemma . '-' . (count($lexicalEntries) + 1);
                }

                $lexicalEntry = new Owl\LmfLexicalEntry($resourceName);
                $lexicalEntry->setUri($this->getUriFactory()->create('LexicalEntry', $label, $id));

                $lmfLemma = new Owl\LmfLemma();
                $lmfLemma->setWrittenForm($lemma);
                $lmfLemma->setUri($this->getUriFactory()->create('Lemma', $label, $id));
                $lexicalEntry->setLemma($lmfLemma);
                $lexicalEntry->setPartOfSpeech($partOfSpeech);

                $lexicalEntries[] = $lexicalEntry;
            }

            $writtenForm = $lexicalEntry->getLemma()->getWrittenForm();
            $lmfSense->setUri($this->getUriFactory()->create('Sense', $writtenForm, $id . '-' . $senseNr++));
            $lmfSense->setLemmaWrittenForm($lexicalEntry->getLemma()->getWrittenForm());

            $rank = 1;
            foreach ($equivalents as $equivalent) {
                $lmfEquivalent = new Owl\LmfEquivalent();
                // Known data glitch: in this one phrase the first space is
                // reportedly not a regular space, so it is normalised here.
                // NOTE(review): both literals look identical as plain text -
                // confirm the left-hand one still carries the irregular byte.
                if ($equivalent == 'patekti į nepatogią padėtį') {
                    $equivalent = 'patekti į nepatogią padėtį';
                }
                $lmfEquivalent->setUri($this->getUriFactory()->create('Equivalent', $equivalent, $id . '-' . $rank));
                $lmfEquivalent->setLanguage('Lietuvių');
                $lmfEquivalent->setWrittenForm($equivalent);
                $lmfEquivalent->setRank($rank++);
                $lmfSense->addEquivalent($lmfEquivalent);
            }

            $lexicalEntry->addSense($lmfSense);
        }

        // Word forms are attached to the entry used by the last sense. When
        // the record has no senses there is no entry to attach them to.
        if ($lexicalEntry !== NULL && !empty($arr['metadata']['wordForms'])) {
            $rank = 1;
            foreach ($arr['metadata']['wordForms'] as $wordForm) {
                $lmfWordForm = new Owl\LmfWordForm();
                $lmfWordForm->setUri($this->getUriFactory()->create('WordForm', $wordForm, $id . '-' . $rank++));
                $lmfWordForm->setWrittenForm($wordForm);
                $lexicalEntry->addWordForm($lmfWordForm);
            }
        }

        // One entry per part of speech may have been produced.
        foreach ($lexicalEntries as $entry) {
            fwrite($fileIndividuals, $entry->toLmfString());
        }
    }

    fclose($fileIndividuals);
}
/**
 * Converts an idiom dictionary export into LMF individuals and writes them to
 * a file.
 *
 * The source XML is scanned for <return> elements. Every child node of such an
 * element is stored under its node name (id, header, status, ...); the text of
 * a non-empty <metadata> child is parsed as a nested XML document whose first
 * <record> holds <el name="..." value="..."> items:
 *   - word  -> the first one is the lemma, further ones are word forms
 *   - idiom -> its child elements are grouped into "explanations" (expl),
 *              "links" (link) and plain value lists keyed by the child name
 *              (e.g. idiomtag)
 *
 * Each record whose status is not "-1" and which has a lemma becomes one
 * lexical entry with a single sense. The idioms are rendered as an HTML
 * fragment wrapped in CDATA and stored as the text representation of the
 * sense definition.
 *
 * @param string $filename          Path of the XML file to read.
 * @param string $fileOfIndividuals Path of the output file (created or truncated).
 *
 * @return void
 *
 * @throws \RuntimeException If the source file cannot be read or the output
 *                           file cannot be opened.
 */
protected function buildLmfIndividuals($filename, $fileOfIndividuals) {
    $resourceName = $this->getResourceName();

    $xml = file_get_contents($filename);
    if ($xml === FALSE) {
        throw new \RuntimeException('Unable to read source file: ' . $filename);
    }

    $dom = new \DOMDocument('1.0', 'UTF-8');
    // loadXML() cannot take an empty string, so an empty file is handled as a
    // document that simply has no records.
    if ($xml !== '') {
        $dom->loadXML($xml);
    }

    $fileIndividuals = fopen($fileOfIndividuals, 'w+');
    if ($fileIndividuals === FALSE) {
        throw new \RuntimeException('Unable to open target file: ' . $fileOfIndividuals);
    }

    foreach ($dom->getElementsByTagName('return') as $domRecord) {
        /* @var $domRecord \DOMElement */

        // Step 1: flatten the record into an array keyed by child node name.
        $arr = array();
        foreach ($domRecord->childNodes as $node) {
            /* @var $node \DOMNode */
            if ($node->nodeName !== 'metadata' || !$node->nodeValue) {
                $arr[$node->nodeName] = $node->nodeValue;
                continue;
            }

            $metadata = new \DOMDocument('1.0', 'UTF-8');
            $metadata->loadXML($node->nodeValue);
            $ins = array();

            /* @var $record \DOMElement */
            $record = $metadata->getElementsByTagName('record')->item(0);
            // A payload without <record> yields an empty metadata array, so
            // the record is skipped further below.
            if ($record !== NULL) {
                foreach ($record->getElementsByTagName('el') as $el) {
                    /* @var $el \DOMElement */
                    $name = $el->getAttribute('name');
                    $value = $el->getAttribute('value');

                    // Only "word" items are allowed to have no value.
                    if (!$value && $name !== 'word') {
                        continue;
                    }

                    if ($name === 'word') {
                        if (!isset($ins['lemma'])) {
                            // The first word is the lemma ...
                            $ins['lemma'] = htmlspecialchars($value);
                        } else {
                            // ... every further one is a word form.
                            $ins['wordForms'][] = htmlspecialchars($value);
                        }
                    } elseif ($name === 'idiom') {
                        $idiomsArr = array('idiom' => $value);
                        $idiomsArr['explanations'] = array();
                        // All child elements belong to this idiom.
                        foreach ($el->childNodes as $child) {
                            // Text nodes between the elements are ignored.
                            if (!($child instanceof \DOMElement)) {
                                continue;
                            }
                            $childName = $child->getAttribute('name');
                            if ($childName === 'expl') {
                                $idiomsArr['explanations'][] = array($childName => $this->getChildNodesArray($child));
                            } elseif ($childName === 'link') {
                                $idiomsArr['links'][] = array($childName => $this->getChildNodesArray($child));
                            } else {
                                $idiomsArr[$childName][] = $child->getAttribute('value');
                            }
                        }
                        $ins['idioms'][] = $idiomsArr;
                    }
                }
            }

            $arr[$node->nodeName] = $ins;
        }

        // Step 2: convert the array into a lexical entry.
        $status = isset($arr['status']) ? $arr['status'] : NULL;
        // The metadata is a plain string when its payload was empty.
        $meta = (isset($arr['metadata']) && is_array($arr['metadata'])) ? $arr['metadata'] : array();
        if ($status == '-1' || empty($meta['lemma'])) {
            continue;
        }

        $lemma = $meta['lemma'];
        $id = isset($arr['id']) ? $arr['id'] : NULL;
        // An entry without idioms still gets an (empty) definition.
        $idioms = isset($meta['idioms']) ? $meta['idioms'] : array();

        $lexicalEntry = new Owl\LmfLexicalEntry($resourceName);
        $lexicalEntry->setUri($this->getUriFactory()->create('LexicalEntry', $lemma, $id));

        $lmfLemma = new Owl\LmfLemma();
        $lmfLemma->setWrittenForm($lemma);
        $lmfLemma->setUri($this->getUriFactory()->create('Lemma', $lemma, $id));
        $lexicalEntry->setLemma($lmfLemma);

        $lmfSense = new Owl\LmfSense();
        $lmfSense->setLemmaWrittenForm($lmfLemma->getWrittenForm());
        $lmfSense->setUri($this->getUriFactory()->create('Sense', $lemma, $id));

        $lmfDefintion = new Owl\LmfDefinition();
        $lmfDefintion->setUri($this->getUriFactory()->create('Definition', $lemma, $id));

        $lmfTextRepresentation = new Owl\LmfTextRepresentation();
        $lmfTextRepresentation->setUri($this->getUriFactory()->create('TextRepresentation', $lemma, $id));

        // Render all idioms into one HTML fragment.
        $writtenForm = "<![CDATA[";
        foreach ($idioms as $idiom) {
            if (!isset($idiom['idiom'])) {
                continue;
            }

            $writtenForm .= "\n<br/><span style=\"font-weight: bold;\">{$idiom['idiom']}</span> ";

            // Idiom tags
            if (isset($idiom['idiomtag'])) {
                $writtenForm .= implode('., ', $idiom['idiomtag']) . ". ";
            }

            // Explanations; they are numbered only when there are several.
            if (isset($idiom['explanations'])) {
                $countExpl = count($idiom['explanations']);
                foreach ($idiom['explanations'] as $explIndex => $expls) {
                    if (!isset($expls['expl'])) {
                        continue;
                    }

                    if ($countExpl > 1) {
                        $writtenForm .= "\n<br/> <em>" . ($explIndex + 1) . ".</em><i>{$expls['expl']['value']}:</i>";
                    } else {
                        $writtenForm .= "<i>{$expls['expl']['value']}:</i>";
                    }

                    // Examples with their optional tag
                    if (isset($expls['expl']['children'])) {
                        foreach ($expls['expl']['children'] as $example) {
                            if (isset($example['example']['value'])) {
                                $writtenForm .= " {$example['example']['value']}.";
                            }
                            if (isset($example['example']['children'][0]['exampletag']['value'])) {
                                $writtenForm .= " " . $example['example']['children'][0]['exampletag']['value'] . ".";
                            }
                        }
                    }
                }
            }

            // TODO make real links between lemmas
            if (isset($idiom['links'])) {
                foreach ($idiom['links'] as $link) {
                    if (!isset($link['link']['value'])) {
                        continue;
                    }
                    if (isset($link['link']['children'][0]['linktag']['value'])) {
                        $writtenForm .= "<i>{$link['link']['children'][0]['linktag']['value']}</i> ";
                    }
                    $writtenForm .= $link['link']['value'];
                }
            }
        }
        $writtenForm .= "]]>";

        $lmfTextRepresentation->setWrittenForm($writtenForm);
        $lmfDefintion->addTextRepresentation($lmfTextRepresentation);
        $lmfSense->setDefinition($lmfDefintion);
        $lexicalEntry->addSense($lmfSense);

        // Word forms
        if (!empty($meta['wordForms'])) {
            $rank = 1;
            foreach ($meta['wordForms'] as $wordForm) {
                $lmfWordForm = new Owl\LmfWordForm();
                $lmfWordForm->setUri($this->getUriFactory()->create('WordForm', $wordForm, $id . '-' . $rank++));
                $lmfWordForm->setWrittenForm($wordForm);
                $lexicalEntry->addWordForm($lmfWordForm);
            }
        }

        fwrite($fileIndividuals, $lexicalEntry->toLmfString());
    }

    fclose($fileIndividuals);
}
/**
 * Converts an antonym dictionary export into LMF individuals and writes them
 * to a file.
 *
 * The source XML is scanned for <return> elements. Every child node of such an
 * element is stored under its node name (id, header, status, ...); the text of
 * a non-empty <metadata> child is parsed as a nested XML document whose first
 * <record> holds <el name="..." value="..."> items:
 *   - homonym    -> homonym number of the record
 *   - word       -> the first one is the lemma, further ones are word forms
 *   - antonym    -> antonym plus value lists keyed by its child names
 *   - valcontext -> "examples" (example children) plus value lists keyed by
 *                   the other child names (e.g. expl, antexpl)
 *
 * Each record whose status is not "-1" and which has a lemma contributes one
 * sense. Records carrying a homonym number are collected per lemma into a
 * shared lexical entry that is written once after all records were read;
 * other records are written immediately.
 *
 * @param string $filename          Path of the XML file to read.
 * @param string $fileOfIndividuals Path of the output file (created or truncated).
 *
 * @return void
 *
 * @throws \RuntimeException If the source file cannot be read or the output
 *                           file cannot be opened.
 */
protected function buildLmfIndividuals($filename, $fileOfIndividuals) {
    $resourceName = $this->getResourceName();

    $xml = file_get_contents($filename);
    if ($xml === FALSE) {
        throw new \RuntimeException('Unable to read source file: ' . $filename);
    }

    $dom = new \DOMDocument('1.0', 'UTF-8');
    // loadXML() cannot take an empty string, so an empty file is handled as a
    // document that simply has no records.
    if ($xml !== '') {
        $dom->loadXML($xml);
    }

    $fileIndividuals = fopen($fileOfIndividuals, 'w+');
    if ($fileIndividuals === FALSE) {
        throw new \RuntimeException('Unable to open target file: ' . $fileOfIndividuals);
    }

    // Lexical entries shared by homonym records, keyed by lemma. They are
    // written after the last record has been processed.
    $lexEntries = array();

    foreach ($dom->getElementsByTagName('return') as $domRecord) {
        /* @var $domRecord \DOMElement */

        // Step 1: flatten the record into an array keyed by child node name.
        $arr = array();
        foreach ($domRecord->childNodes as $node) {
            /* @var $node \DOMNode */
            if ($node->nodeName !== 'metadata' || !$node->nodeValue) {
                $arr[$node->nodeName] = $node->nodeValue;
                continue;
            }

            $metadata = new \DOMDocument('1.0', 'UTF-8');
            $metadata->loadXML($node->nodeValue);
            $ins = array();

            /* @var $record \DOMElement */
            $record = $metadata->getElementsByTagName('record')->item(0);
            // A payload without <record> yields an empty metadata array, so
            // the record is skipped further below.
            if ($record !== NULL) {
                foreach ($record->getElementsByTagName('el') as $el) {
                    /* @var $el \DOMElement */
                    $name = $el->getAttribute('name');
                    $value = $el->getAttribute('value');

                    // Only "valcontext" items are allowed to have no value.
                    if (!$value && $name !== 'valcontext') {
                        continue;
                    }

                    if ($name === 'homonym') {
                        $ins['homonym'] = $value;
                    } elseif ($name === 'word') {
                        if (!isset($ins['lemma'])) {
                            // The first word is the lemma ...
                            $ins['lemma'] = htmlspecialchars($value);
                        } else {
                            // ... every further one is a word form.
                            $ins['wordForms'][] = htmlspecialchars($value);
                        }
                    } elseif ($name === 'antonym') {
                        $ant = array('antonym' => $value);
                        foreach ($el->childNodes as $param) {
                            // Text nodes between the elements are ignored.
                            if ($param instanceof \DOMElement) {
                                $ant[$param->getAttribute('name')][] = $param->getAttribute('value');
                            }
                        }
                        $ins['antonyms'][] = $ant;
                    } elseif ($name === 'valcontext') {
                        $valcontextArr = array();
                        // All child elements belong to this value context.
                        foreach ($el->childNodes as $child) {
                            // Text nodes between the elements are ignored.
                            if (!($child instanceof \DOMElement)) {
                                continue;
                            }
                            $childName = $child->getAttribute('name');
                            if ($childName === 'example') {
                                $valcontextArr['examples'][] = array($childName => $this->getChildNodesArray($child));
                            } else {
                                $valcontextArr[$childName][] = $child->getAttribute('value');
                            }
                        }
                        $ins['valcontexts'][] = $valcontextArr;
                    }
                }
            }

            $arr[$node->nodeName] = $ins;
        }

        // Step 2: convert the array into a sense of a lexical entry.
        $status = isset($arr['status']) ? $arr['status'] : NULL;
        // The metadata is a plain string when its payload was empty.
        $meta = (isset($arr['metadata']) && is_array($arr['metadata'])) ? $arr['metadata'] : array();
        if ($status == '-1' || empty($meta['lemma'])) {
            continue;
        }

        $lemma = $meta['lemma'];
        $id = isset($arr['id']) ? $arr['id'] : NULL;
        $homonym = isset($meta['homonym']) ? $meta['homonym'] : '';
        $antonyms = isset($meta['antonyms']) ? $meta['antonyms'] : array();
        $valcontexts = isset($meta['valcontexts']) ? $meta['valcontexts'] : array();

        // An entry that is already collected for this lemma receives one more
        // sense instead of being created again.
        $isPending = isset($lexEntries[$lemma]);
        if ($isPending) {
            $lexicalEntry = $lexEntries[$lemma];
            $lmfLemma = $lexicalEntry->getLemma();
        } else {
            $lexicalEntry = new Owl\LmfLexicalEntry($resourceName);
            $lexicalEntry->setUri($this->getUriFactory()->create('LexicalEntry', $lemma, 0));
            $lmfLemma = new Owl\LmfLemma();
            $lmfLemma->setWrittenForm($lemma);
            $lmfLemma->setUri($this->getUriFactory()->create('Lemma', $lemma, $id));
            $lexicalEntry->setLemma($lmfLemma);
        }

        $lmfSense = new Owl\LmfSense();
        $lmfSense->setLemmaWrittenForm($lmfLemma->getWrittenForm());
        if ($homonym) {
            $lmfSense->setRank($homonym);
        }
        $lmfSense->setUri($this->getUriFactory()->create('Sense', $lemma, $id . '-' . $homonym));

        $lmfDefintion = new Owl\LmfDefinition();
        $lmfDefintion->setUri($this->getUriFactory()->create('Definition', $lemma, $id . '-' . $homonym));

        $lmfTextRepresentation = new Owl\LmfTextRepresentation();
        $lmfTextRepresentation->setUri($this->getUriFactory()->create('TextRepresentation', $lemma, $id . '-' . $homonym));

        // Render all value contexts into one HTML fragment; they are numbered
        // only when there are several.
        $isNumbered = count($valcontexts) > 1;
        $writtenForm = "<![CDATA[";
        foreach ($valcontexts as $position => $valcontext) {
            if ($isNumbered) {
                $writtenForm .= "\n<span style=\"font-weight: bold;\">" . ($position + 1) . "</span>";
            }

            // Explanation followed by the comma separated antonyms
            if (isset($valcontext['expl'])) {
                $writtenForm .= "\n<em>{$valcontext['expl'][0]}</em> <br />";
                // Joining avoids cutting off a trailing separator, which used
                // to damage the "<br />" above when there were no antonyms.
                if (!empty($antonyms)) {
                    $spans = array();
                    foreach ($antonyms as $antonym) {
                        $spans[] = "\n<span style=\"font-weight: bold;\">{$antonym['antonym']}</span>";
                    }
                    $writtenForm .= implode(', ', $spans) . ' ';
                }
            }

            // Antonym explanation
            if (isset($valcontext['antexpl'])) {
                $writtenForm .= "\n<br/> <em>{$valcontext['antexpl'][0]}</em>";
            }

            // Examples with their tags
            if (isset($valcontext['examples'])) {
                foreach ($valcontext['examples'] as $expls) {
                    if (!isset($expls['example']['value'])) {
                        continue;
                    }
                    $writtenForm .= "\n<br />{$expls['example']['value']}";
                    if (isset($expls['example']['children'])) {
                        foreach ($expls['example']['children'] as $example) {
                            if (isset($example['exampletag']['value'])) {
                                $writtenForm .= " {$example['exampletag']['value']}.";
                            }
                        }
                    }
                }
            }
        }
        $writtenForm .= " ]]>";

        $lmfTextRepresentation->setWrittenForm($writtenForm);
        $lmfDefintion->addTextRepresentation($lmfTextRepresentation);
        $lmfSense->setDefinition($lmfDefintion);

        // Sense relations: one per antonym, referenced by written form only.
        foreach ($antonyms as $index => $antonym) {
            $senseRelation = new Owl\LmfSenseRelation();
            $senseRelation->setUri($this->getUriFactory()->create('SenseRelation', $lemma, $id . '-' . $index));
            $senseRelation->setType('Antonimas');
            $senseRelation->setRank($index + 1);
            $senseRelation->setWrittenForm($antonym['antonym']);
            $lmfSense->addSenseRelation($senseRelation);
        }

        $lexicalEntry->addSense($lmfSense);

        // Word forms
        if (!empty($meta['wordForms'])) {
            $rank = 1;
            foreach ($meta['wordForms'] as $wordForm) {
                $lmfWordForm = new Owl\LmfWordForm();
                $lmfWordForm->setUri($this->getUriFactory()->create('WordForm', $wordForm, $id . '-' . $rank++));
                $lmfWordForm->setWrittenForm($wordForm);
                $lexicalEntry->addWordForm($lmfWordForm);
            }
        }

        if ($homonym) {
            // Homonyms are collected and written after the loop.
            $lexEntries[$lemma] = $lexicalEntry;
        } elseif (!$isPending) {
            // An entry that is pending in $lexEntries is written after the
            // loop; writing it here as well would emit it twice.
            fwrite($fileIndividuals, $lexicalEntry->toLmfString());
        }
    }

    // Save the collected homonym entries.
    foreach ($lexEntries as $lexEntry) {
        fwrite($fileIndividuals, $lexEntry->toLmfString());
    }

    fclose($fileIndividuals);
}