// Read the input given as $argv[2] line by line. Each line is expected to hold
// a tab-separated subject / predicate / object triple. Only triples whose
// subject is accepted by $mapping->isFreebaseMapped() are processed further.
echo "\nLoad Freebase and compute\n";

$count = 0;
$input = fopen($argv[2], 'r');
if ($input === false) {
    // Fail loudly instead of looping over an invalid handle.
    fwrite(STDERR, "Unable to open input file: " . $argv[2] . "\n");
    exit(1);
}

$mapping = new Mapping(new DummyCVTProvider(), new DummyReviewedFacts(), $argv[3]);
$isoDateParser = new ValueParsers\IsoTimestampParser(
    new ValueParsers\CalendarModelParser(new ValueParsers\ParserOptions()),
    new ValueParsers\ParserOptions()
);

// Compare against false explicitly: a bare truthiness test would stop early on
// a final line consisting only of "0".
while (($line = fgets($input)) !== false) {
    // Progress indicator: one dot per million input lines.
    $count++;
    if ($count % 1000000 === 0) {
        echo '.';
    }

    // Strip surrounding whitespace and the terminating " ." before splitting.
    $fields = explode("\t", trim($line, " .\t\n\r\v"), 3);
    if (count($fields) < 3) {
        // Not a complete triple (e.g. blank line): nothing to convert.
        continue;
    }
    list($s, $p, $o) = $fields;

    // Drop a fixed 28-character prefix and the last character of the subject
    // (presumably the "<http://rdf.freebase.com/ns/" IRI prefix and the
    // closing ">" -- confirm against the dump format).
    $s = substr($s, 28, -1);
    if (!$mapping->isFreebaseMapped($s)) {
        continue;
    }
    $s = $mapping->mapMid($s);

    // Same idea for the predicate, with a 24-character prefix.
    $p = substr($p, 24, -1);

    //Format object
    if ($o[0] === '"') {
        if (preg_match('/^"(.+)"(@en)?$/', $o, $matches)) {
            // Plain (optionally @en-tagged) literal: replace newlines and
            // embedded double quotes with spaces, then re-quote.
            $o = '"' . str_replace(["\n", '"'], [' ', ' '], $matches[1]) . '"';
        } elseif (preg_match('/"(.+)"\\^\\^<([^<>]+)>/', $o, $matches)) {
            // Typed literal: $matches[1] is the lexical value, $matches[2]
            // the datatype IRI.
            $value = $matches[1];
            switch ($matches[2]) {
                case 'http://www.w3.org/2001/XMLSchema#gYear':
                    // Pad the missing month, then fall through to pad the day.
                    $value .= '-00';
                    // FALLTHROUGH
                case 'http://www.w3.org/2001/XMLSchema#gYearMonth':
                    // Pad the missing day.
                    $value .= '-00';
                    // FALLTHROUGH
                case 'http://www.w3.org/2001/XMLSchema#date':
}
        $language = $parts[$i];
    }

    // Normalise the language code: lower-case it, then apply the conversion
    // table when it has an entry for this code.
    $language = strtolower($language);
    if (array_key_exists($language, $LANGUAGE_CODE_CONVERSION)) {
        $language = $LANGUAGE_CODE_CONVERSION[$language];
    }

    // Logical OR (short-circuiting) rather than bitwise OR on the two booleans.
    if ($language === 'no' || $language === 'zh-hant') {
        continue; //TODO: what should we do with these two languages?
    }

    // Skip topics the mapping does not accept.
    if (!$mapping->isFreebaseMapped($mid)) {
        continue;
    }
    addToStat('mapped-topic-labels', $language);

    $qid = $mapping->mapMid($mid);
    if (!array_key_exists($qid, $wikidataLabels)) {
        // No label data loaded for this item: count it and move on.
        $stats['missing-data']++;
        continue;
    }

    // Strict comparison avoids loose-typing matches; assumes the stored
    // language codes are strings -- TODO confirm where $wikidataLabels is built.
    if (in_array($language, $wikidataLabels[$qid], true)) {
        addToStat('existing-labels', $language);
        continue;
    }

    // New label: emit one "<qid>\t<language>\t<label>" line.
    fwrite($output, $qid . "\t" . $language . "\t" . $label . "\n");
    addToStat('new-labels', $language);

    // Progress indicator: one dot per 100000 labels written.
    $count++;
    if ($count % 100000 === 0) {
        echo '.';
    }
}