/**
 * Adds a reference for $sourceUrl to every statement of $item whose value
 * matches a value extracted from the given microdata.
 *
 * For each property in $this->callbackMap the mapped callback extracts the
 * candidate values from $microData. A statement of that property gets a new
 * reference when the lower-cased candidate value is one of the main terms of
 * the item the statement points at, and the statement does not already have
 * a reference for a URL on the same domain.
 *
 * @param MicroData $microData Microdata extracted from the page at $sourceUrl
 * @param mixed $item Item whose statements should be referenced (must expose getId() and getStatements())
 * @param string $sourceUrl URL the microdata was read from; used for the reference and the edit summary
 *
 * @return int Number of references that were successfully added
 */
public function addReferences(MicroData $microData, $item, $sourceUrl)
{
    // Only cache entity lookups per item we are adding references for; the
    // cache is reused when the same item is processed for several source URLs.
    $itemId = $item->getId();
    if (!$itemId->equals($this->lastEntityId)) {
        $this->inMemoryEntityLookup = new InMemoryEntityLookup();
        // Remember which item the cache belongs to, otherwise it would be
        // thrown away on every single call.
        $this->lastEntityId = $itemId;
    }

    // Loop-invariant services, created once per call instead of once per statement.
    $itemLookup = $this->wikibaseFactory->newItemLookup();
    $referenceSetter = $this->wikibaseFactory->newReferenceSetter();

    $referenceCounter = 0;
    foreach ($this->callbackMap as $propertyIdString => $valueGetterFunction) {
        $values = $valueGetterFunction($microData);
        $statements = $item->getStatements()->getByPropertyId(new PropertyId($propertyIdString));

        foreach ($values as $value) {
            $lowerCaseValue = strtolower($value);

            // Statements are objects, so no by-reference iteration is needed
            // for the addNewReference() call below to affect the item.
            foreach ($statements->getIterator() as $statement) {
                $mainSnak = $statement->getMainSnak();
                if (!$mainSnak instanceof PropertyValueSnak) {
                    continue; // Ignore some and no value statements
                }

                /** @var EntityIdValue $valueEntityIdValue */
                $valueEntityIdValue = $mainSnak->getDataValue();
                /** @var ItemId $valueItemId */
                $valueItemId = $valueEntityIdValue->getEntityId();

                if ($this->inMemoryEntityLookup->hasEntity($valueItemId)) {
                    $valueItem = $this->inMemoryEntityLookup->getEntity($valueItemId);
                } else {
                    $valueItem = $itemLookup->getItemForId($valueItemId);
                    if ($valueItem === null) {
                        continue; // The value item could not be loaded, so nothing to compare against
                    }
                    $this->inMemoryEntityLookup->addEntity($valueItem);
                }

                $mainTerms = DataModelUtils::getMainTermsAsLowerCaseStrings($valueItem->getFingerprint());
                if (!in_array($lowerCaseValue, $mainTerms, true)) {
                    continue; // Ignore things that don't appear to have the correct value
                }

                if (DataModelUtils::statementHasReferenceForUrlWithSameDomain($statement, $sourceUrl)) {
                    continue; // Ignore statements that already have this URL domain as a ref
                }

                // Add the new reference!
                $newReference = DataModelUtils::getReferenceForUrl($sourceUrl);
                try {
                    $referenceSetter->set(
                        $newReference,
                        $statement,
                        null,
                        new EditInfo(urldecode($sourceUrl), EditInfo::NOTMINOR, EditInfo::BOT)
                    );
                    // Keep our in-memory item copy up to date, so the same-domain
                    // check above sees this reference on later iterations.
                    $statement->addNewReference($newReference->getSnaks());
                    $referenceCounter++;
                } catch (UsageException $e) {
                    // Deliberate best effort: a failed API edit must not stop
                    // the remaining statements from being processed.
                }
            }
        }
    }

    return $referenceCounter;
}
/**
 * Removes every statement of the given property from all items that use it.
 *
 * Items are found with a SPARQL query (limited to 10000 results) for the
 * property passed via the "property" option; each matching statement is then
 * removed through the Wikibase API as the configured user.
 *
 * @param InputInterface $input Provides the "user", "wiki", "sparql" and "property" options
 * @param OutputInterface $output Receives progress and error messages
 *
 * @return int 0 on success, -1 when logging in fails
 *
 * @throws RuntimeException When the user, wiki, SPARQL endpoint or property option is missing or unknown
 */
protected function execute(InputInterface $input, OutputInterface $output)
{
    $user = $input->getOption('user');
    $userDetails = $this->appConfig->offsetGet('users.' . $user);
    if ($userDetails === null) {
        throw new RuntimeException('User not found in config');
    }

    $wiki = $input->getOption('wiki');
    $wikiDetails = $this->appConfig->offsetGet('wikis.' . $wiki);
    if ($wikiDetails === null) {
        throw new RuntimeException('Wiki not found in config');
    }

    $sparql = $input->getOption('sparql');
    if (empty($sparql)) {
        throw new RuntimeException('SPARQL endpoint must be passed');
    }

    $this->setServices($wikiDetails['url'], $sparql);

    // Validate the raw option BEFORE building the PropertyId: the constructor
    // rejects null / empty input itself, which would hide the message below.
    $propertyString = $input->getOption('property');
    if ($propertyString === null || $propertyString === '') {
        throw new RuntimeException('No property given');
    }
    $property = new PropertyId($propertyString);

    $output->writeln('Running SPARQL query to find items to check');
    $queryBuilder = new QueryBuilder(array('wdt' => 'http://www.wikidata.org/prop/direct/'));
    $query = $queryBuilder
        ->select('?item')
        ->where('?item', 'wdt:' . $propertyString, '?value')
        ->limit(10000)
        ->__toString();
    $itemIds = $this->sparqlQueryRunner->getItemIdsFromQuery($query);

    $loggedIn = $this->wikibaseApi->login(new ApiUser($userDetails['username'], $userDetails['password']));
    if (!$loggedIn) {
        $output->writeln('Failed to log in to wikibase wiki');
        return -1;
    }

    $itemLookup = $this->wikibaseFactory->newItemLookup();
    $statementRemover = $this->wikibaseFactory->newStatementRemover();

    foreach ($itemIds as $itemId) {
        $item = $itemLookup->getItemForId($itemId);
        if ($item === null) {
            // The query result may be stale (e.g. the item was deleted); skip it.
            $output->writeln('Failed to load item ' . $itemId->getSerialization());
            continue;
        }

        foreach ($item->getStatements()->getByPropertyId($property)->toArray() as $statement) {
            $statementRemover->remove($statement, new EditInfo('Removing Statement'));
        }
    }

    return 0;
}
/**
 * Tries to add references to each of the given items.
 *
 * For every item that has not been processed yet (or for all items when
 * $force is set):
 *  1. the item is loaded and its "instance of" (P31) values are mapped to
 *     microdata types through $this->instanceMap,
 *  2. all of its Wikipedia pages are parsed to collect their external links,
 *  3. each external link is downloaded, microdata is extracted from it and
 *     the referencers registered for the item's types are run against it,
 *  4. the item is marked as processed.
 *
 * @param OutputInterface $output
 * @param ItemId[] $itemIds
 * @param bool $force Process items even if they are already marked as processed
 */
private function executeForItemIds(OutputInterface $output, array $itemIds, $force)
{
    $itemLookup = $this->wikibaseFactory->newItemLookup();
    $processedItemIdStrings = $this->getProcessedItemIdStrings();
    $loopCounter = 0;
    /** @var FormatterHelper $formatter */
    $formatter = $this->getHelper('formatter');

    foreach ($itemIds as $itemId) {
        $loopCounter++;
        $itemIdString = $itemId->getSerialization();
        $output->writeln('----------------------------------------------------');

        // Refresh the processed list every 10th item, to pick up IDs that were
        // marked as processed since the list was last read.
        if ($loopCounter % 10 === 0) {
            $processedItemIdStrings = $this->getProcessedItemIdStrings();
        }
        if (!$force && in_array($itemIdString, $processedItemIdStrings)) {
            $output->writeln($formatter->formatSection($itemIdString, 'Already processed'));
            continue;
        }

        try {
            $output->writeln($formatter->formatSection($itemIdString, 'Loading Item'));
            $item = $itemLookup->getItemForId($itemId);
        } catch (ItemLookupException $e) {
            $output->writeln($formatter->formatSection($itemIdString, 'Failed to load item (exception)', 'error'));
            continue;
        }
        if ($item === null) {
            $output->writeln($formatter->formatSection($itemIdString, 'Failed to load item (null)', 'error'));
            continue;
        }

        // Get the item types..
        $types = array();
        foreach ($item->getStatements()->getByPropertyId(new PropertyId('P31'))->toArray() as $instanceStatement) {
            $mainSnak = $instanceStatement->getMainSnak();
            if (!$mainSnak instanceof PropertyValueSnak) {
                continue;
            }
            /** @var EntityIdValue $instanceItemIdValue */
            $instanceItemIdValue = $mainSnak->getDataValue();
            $idSerialization = $instanceItemIdValue->getEntityId()->getSerialization();
            if (array_key_exists($idSerialization, $this->instanceMap)) {
                $types[] = $this->instanceMap[$idSerialization];
            }
        }
        if (empty($types)) {
            $output->writeln($formatter->formatSection(
                $itemIdString,
                'Didn\'t find any useful instance of statements',
                'comment'
            ));
            continue;
        }

        // Note: only load Wikipedias
        $siteLinkList = DataModelUtils::getSitelinksWiteSiteIdSuffix($item->getSiteLinkList(), 'wiki');

        $output->writeln($formatter->formatSection(
            $itemIdString,
            $siteLinkList->count() . ' Wikipedia pages to request'
        ));
        $parseProgressBar = new ProgressBar($output, $siteLinkList->count());
        $parseProgressBar->display();

        // Fire off all parse requests first, then collect the results below.
        /** @var PromiseInterface[] $parsePromises */
        $parsePromises = array();
        foreach ($siteLinkList->getIterator() as $siteLink) {
            $siteId = $siteLink->getSiteId();
            $pageName = $item->getSiteLinkList()->getBySiteId($siteId)->getPageName();
            $sourceMwFactory = $this->wmFactoryFactory->getFactory($siteId);
            $sourceParser = $sourceMwFactory->newParser();
            $pageIdentifier = new PageIdentifier(new Title($pageName));
            $parsePromises[$siteId] = $sourceParser->parsePageAsync($pageIdentifier);
            $parseProgressBar->advance();
        }

        $links = array();
        foreach ($parsePromises as $siteId => $promise) {
            try {
                $parseResult = $promise->wait();
                if (array_key_exists('externallinks', $parseResult)) {
                    foreach ($parseResult['externallinks'] as $externalLink) {
                        // Ignore archive.org links
                        if (strpos($externalLink, 'archive.org') === false) {
                            $links[] = $this->normalizeExternalLink($externalLink);
                        }
                    }
                }
            } catch (Exception $e) {
                // Report the failed request, but carry on with the other pages.
                $parseProgressBar->clear();
                $output->writeln($formatter->formatSection($itemIdString, $e->getMessage(), 'error'));
                $parseProgressBar->display();
            }
        }
        $parseProgressBar->finish();
        $output->writeln('');

        $links = array_unique($links);
        shuffle($links);

        /** @var Request[] $linkRequests */
        $linkRequests = array();
        foreach ($links as $link) {
            $linkRequests[] = new Request(
                'GET',
                $link,
                array(
                    'allow_redirects' => array('track_redirects' => true),
                    'connect_timeout' => 3.14,
                    'timeout' => 10,
                )
            );
        }

        $output->writeln($formatter->formatSection(
            $itemIdString,
            count($linkRequests) . ' External links to (download, action)'
        ));
        if (empty($linkRequests)) {
            continue;
        }

        // Make a bunch of requests and act on the responses.
        // Every link accounts for two progress steps: download and action.
        $referencesAddedToItem = 0;
        $externalLinkProgressBar = new ProgressBar($output, count($linkRequests) * 2);
        $externalLinkProgressBar->display();

        $pool = new Pool(
            $this->externalLinkClient,
            $linkRequests,
            array(
                // $referencesAddedToItem is captured BY REFERENCE so that the
                // total reported after the pool has finished includes what the
                // callbacks added; a by-value capture would always leave it at 0.
                'fulfilled' => function ($response) use (
                    $externalLinkProgressBar,
                    $item,
                    $types,
                    &$referencesAddedToItem,
                    $output
                ) {
                    $externalLinkProgressBar->advance(); // 1st advance point
                    if ($response instanceof ResponseInterface) {
                        $link = $response->getHeaderLine('X-GUZZLE-EFFECTIVE-URL');
                        $html = $response->getBody();
                        $referencesAddedFromLink = 0;

                        foreach ($this->microDataExtractor->extract($html) as $microData) {
                            foreach ($types as $type) {
                                if (!$microData->hasType($type) || !array_key_exists($type, $this->referencerMap)) {
                                    continue;
                                }
                                foreach ($this->referencerMap[$type] as $referencer) {
                                    /** @var Referencer $referencer */
                                    $addedReferences = $referencer->addReferences($microData, $item, $link);
                                    $referencesAddedToItem += $addedReferences;
                                    $referencesAddedFromLink += $addedReferences;
                                }
                            }
                        }

                        if ($referencesAddedFromLink > 0) {
                            $externalLinkProgressBar->clear();
                            $output->write("\r");
                            $output->writeln(
                                $referencesAddedFromLink . ' reference(s) added from ' . urldecode($link)
                            );
                            $externalLinkProgressBar->display();
                        }
                    }
                    $externalLinkProgressBar->advance(); // 2nd advance point
                },
                'rejected' => function () use ($externalLinkProgressBar) {
                    // TODO add this to some kind of verbose log?
                    $externalLinkProgressBar->advance(); // 1st advance point
                },
            )
        );
        $pool->promise()->wait();
        $externalLinkProgressBar->finish();

        $output->writeln('');
        $output->writeln($formatter->formatSection($itemIdString, $referencesAddedToItem . ' References added'));
        $this->markIdAsProcessed($itemId);
    }
}
/**
 * Makes sure the mediawiki.org page given by the "title" option has a
 * Wikidata item describing it as an extension.
 *
 * The page is parsed on mediawiki.org. If it is not yet connected to an item
 * a new item (English label + sitelink) is created, otherwise the connected
 * item is loaded. An "instance of" (P31) Q6805426 statement and, when the
 * page is in a known licence category, a licence (P275) statement are then
 * added unless the item already has them.
 *
 * @param InputInterface $input Provides the "user" and "title" options
 * @param OutputInterface $output Receives progress and error messages
 *
 * @return int 0 on success, -1 when logging in fails
 *
 * @throws RuntimeException When the user is not configured or no title was given
 */
protected function execute(InputInterface $input, OutputInterface $output)
{
    $user = $input->getOption('user');
    $userDetails = $this->appConfig->get('users.' . $user);
    if ($userDetails === null) {
        throw new RuntimeException('User not found in config');
    }

    $sourceTitle = $input->getOption('title');
    if ($sourceTitle === null || $sourceTitle === '') {
        throw new RuntimeException('No titles was set!');
    }
    $pageIdentifier = new PageIdentifier(new Title($sourceTitle));

    $sourceApi = new MediawikiApi("https://www.mediawiki.org/w/api.php");
    $targetApi = new MediawikiApi("https://www.wikidata.org/w/api.php");

    $loggedIn = $targetApi->login(new ApiUser($userDetails['username'], $userDetails['password']));
    if (!$loggedIn) {
        $output->writeln('Failed to log in to target wiki');
        return -1;
    }

    $sourceMwFactory = new MediawikiFactory($sourceApi);
    $sourceParser = $sourceMwFactory->newParser();
    $parseResult = $sourceParser->parsePage($pageIdentifier);

    // Get the wikibase item if it exists
    $itemIdString = null;
    if (array_key_exists('properties', $parseResult)) {
        foreach ($parseResult['properties'] as $pageProp) {
            if ($pageProp['name'] === 'wikibase_item') {
                $itemIdString = $pageProp['*'];
            }
        }
    }

    $targetWbFactory = new WikibaseFactory(
        $targetApi,
        new DataValueDeserializer(array(
            'boolean' => 'DataValues\\BooleanValue',
            'number' => 'DataValues\\NumberValue',
            'string' => 'DataValues\\StringValue',
            'unknown' => 'DataValues\\UnknownValue',
            'globecoordinate' => 'DataValues\\Geo\\Values\\GlobeCoordinateValue',
            'monolingualtext' => 'DataValues\\MonolingualTextValue',
            'multilingualtext' => 'DataValues\\MultilingualTextValue',
            'quantity' => 'DataValues\\QuantityValue',
            'time' => 'DataValues\\TimeValue',
            'wikibase-entityid' => 'Wikibase\\DataModel\\Entity\\EntityIdValue',
        )),
        new DataValueSerializer()
    );

    // Create an item if there is no item yet!
    if ($itemIdString === null) {
        $output->writeln("Creating a new Item");
        $item = new Item();
        $item->setLabel('en', $sourceTitle);
        //TODO this siteid should come from somewhere?
        $item->getSiteLinkList()->setNewSiteLink('mediawikiwiki', $sourceTitle);
        $targetRevSaver = $targetWbFactory->newRevisionSaver();
        $item = $targetRevSaver->save(new Revision(new Content($item)));
    } else {
        $item = $targetWbFactory->newItemLookup()->getItemForId(new ItemId($itemIdString));
    }

    // Add instance of if not already there
    $instanceOfProperty = new PropertyId('P31');
    $extensionItemId = new ItemId('Q6805426');
    if (!$this->hasStatementWithItemValue($item, $instanceOfProperty, $extensionItemId)) {
        $output->writeln("Creating instance of Statement");
        $targetWbFactory->newStatementCreator()->create(
            new PropertyValueSnak($instanceOfProperty, new EntityIdValue($extensionItemId)),
            $item->getId()
        );
    }

    // Try to add a licence
    $catLicenseMap = array('Public_domain_licensed_extensions' => 'Q19652');
    $extensionLicenseItemIdString = null;
    if (array_key_exists('categories', $parseResult)) {
        foreach ($parseResult['categories'] as $categoryInfo) {
            if (array_key_exists($categoryInfo['*'], $catLicenseMap)) {
                $extensionLicenseItemIdString = $catLicenseMap[$categoryInfo['*']];
            }
        }
    }
    if ($extensionLicenseItemIdString !== null) {
        $licenceProperty = new PropertyId('P275');
        $licenceItemId = new ItemId($extensionLicenseItemIdString);
        // Only create the statement when it is missing, so that running the
        // command again for the same page does not pile up duplicates.
        if (!$this->hasStatementWithItemValue($item, $licenceProperty, $licenceItemId)) {
            $output->writeln("Creating Licence Statement");
            $targetWbFactory->newStatementCreator()->create(
                new PropertyValueSnak($licenceProperty, new EntityIdValue($licenceItemId)),
                $item->getId()
            );
        }
    }

    return 0;
}

/**
 * Tells whether $item has a statement for $propertyId whose main snak value is the item $valueItemId.
 *
 * @param mixed $item Item to inspect (must expose getStatements())
 * @param PropertyId $propertyId Property of the statements to look at
 * @param ItemId $valueItemId Item the statement value has to point at
 *
 * @return bool
 */
private function hasStatementWithItemValue($item, PropertyId $propertyId, ItemId $valueItemId)
{
    foreach ($item->getStatements()->getByPropertyId($propertyId)->getMainSnaks() as $mainSnak) {
        if (!$mainSnak instanceof PropertyValueSnak) {
            continue; // Some value / no value snaks carry no data value
        }
        $dataValue = $mainSnak->getDataValue();
        if ($dataValue instanceof EntityIdValue && $dataValue->getEntityId()->equals($valueItemId)) {
            return true;
        }
    }

    return false;
}
/**
 * Fixes "retrieved" (P813) reference dates that use the Julian calendar model.
 *
 * The items to check are either given through the "item" option or found
 * with a SPARQL query (limited to 10000 results). For each Julian P813
 * reference snak the reference is re-saved with a Gregorian value built from
 * getFixedTimestamp(); when no fixed timestamp is available the snak is
 * dropped from the reference instead.
 *
 * Progress output per item: "." for a fixed date, "x" for a removed one.
 *
 * @param InputInterface $input Provides the "user" and "item" options
 * @param OutputInterface $output Receives progress and error messages
 *
 * @return int 0 on success, -1 when logging in fails
 *
 * @throws RuntimeException When the user is not configured
 */
protected function execute(InputInterface $input, OutputInterface $output)
{
    // Get options
    $user = $input->getOption('user');
    $userDetails = $this->appConfig->offsetGet('users.' . $user);
    if ($userDetails === null) {
        throw new RuntimeException('User not found in config');
    }

    $items = $input->getOption('item');
    if (empty($items)) {
        $output->writeln('Running SPARQL query to find items to check');
        $queryBuilder = new QueryBuilder(array(
            'prov' => 'http://www.w3.org/ns/prov#',
            'wd' => 'http://www.wikidata.org/entity/',
            'wikibase' => 'http://wikiba.se/ontology#',
            'prv' => 'http://www.wikidata.org/prop/reference/value/',
        ));
        $query = $queryBuilder
            ->select('?item')
            ->where('?ref', 'prv:P813', '?value')
            ->also('?value', 'wikibase:timeCalendarModel', 'wd:Q1985786')
            ->also('?st', 'prov:wasDerivedFrom', '?ref')
            ->also('?item', '?pred', '?st')
            ->limit(10000)
            ->__toString();
        $itemIds = $this->sparqlQueryRunner->getItemIdsFromQuery($query);
    } else {
        /** @var ItemId[] $itemIds */
        $itemIds = array();
        foreach (array_unique($items) as $itemIdString) {
            $itemIds[] = new ItemId($itemIdString);
        }
    }
    $itemIds = array_unique($itemIds);
    $output->writeln('Running for ' . count($itemIds) . ' items');

    // Log in to Wikidata
    $loggedIn = $this->wikibaseApi->login(new ApiUser($userDetails['username'], $userDetails['password']));
    if (!$loggedIn) {
        $output->writeln('Failed to log in to wikidata wiki');
        return -1;
    }

    $itemLookup = $this->wikibaseFactory->newItemLookup();
    $referenceSetter = $this->wikibaseFactory->newReferenceSetter();

    foreach ($itemIds as $itemId) {
        $output->write($itemId->getSerialization() . ' ');

        $item = $itemLookup->getItemForId($itemId);
        if ($item === null) {
            // Do not let one unloadable item abort the whole run.
            $output->writeln('Failed to load item');
            continue;
        }

        foreach ($item->getStatements()->getIterator() as $statement) {
            foreach ($statement->getReferences() as $reference) {
                /** @var Reference $reference */
                // NOTE(review): each matching snak is submitted as its own edit against the
                // hash read when the item was loaded; a reference holding several Julian
                // P813 snaks would presumably fail on the second edit — confirm if that occurs.
                foreach ($reference->getSnaks()->getIterator() as $snak) {
                    if (!$snak instanceof PropertyValueSnak) {
                        continue;
                    }
                    if ($snak->getPropertyId()->getSerialization() !== 'P813') {
                        continue;
                    }

                    $dataValue = $snak->getDataValue();
                    if (!$dataValue instanceof TimeValue) {
                        continue; // Not a time value, so there is no calendar model to fix
                    }

                    // We can assume ALL retrieval dates should be Gregorian!
                    if ($dataValue->getCalendarModel() !== TimeValue::CALENDAR_JULIAN) {
                        continue;
                    }

                    $oldRefHash = $reference->getHash();
                    $statementGuid = $statement->getGuid();

                    // Work on a copy so the loaded reference is left untouched.
                    $snakList = new SnakList($reference->getSnaks()->getArrayCopy());
                    $snakList->removeSnak($snak);

                    $fixedTimestamp = $this->getFixedTimestamp($dataValue->getTime());
                    if ($fixedTimestamp) {
                        $snakList->addSnak(new PropertyValueSnak(
                            new PropertyId('P813'),
                            new TimeValue(
                                $fixedTimestamp,
                                $dataValue->getTimezone(),
                                $dataValue->getBefore(),
                                $dataValue->getAfter(),
                                $dataValue->getPrecision(),
                                TimeValue::CALENDAR_GREGORIAN
                            )
                        ));
                        $editSummary = 'Fix reference retrieval date';
                        $output->write('.');
                    } else {
                        //TODO optionally remove rather than always doing so?
                        $editSummary = 'Removing bad reference retrieval date';
                        $output->write('x');
                    }

                    try {
                        $referenceSetter->set(
                            new Reference($snakList),
                            $statementGuid,
                            $oldRefHash,
                            new EditInfo($editSummary)
                        );
                    } catch (UsageException $e) {
                        // Report the API error and carry on with the next snak.
                        $output->writeln('');
                        $output->write($e->getMessage());
                    }
                }
            }
        }
        $output->writeln('');
    }

    return 0;
}