/**
 * Adds a reference for $sourceUrl to the statements of $item whose value is
 * confirmed by a string found in the given microdata.
 *
 * For every entry of $this->propMap (property id string => schema property name)
 * the string values of that schema property are collected, skipping values that
 * contain "//" (treated as URLs). A statement of that property gets a new reference
 * when the regex registered in $this->regexMap for the statement's value entity id
 * matches one of the collected strings and the statement has no reference for the
 * same URL domain yet.
 *
 * @param MicroData $microData microdata to take the schema property values from
 * @param mixed $item object whose getStatements() result is searched and updated
 * @param string $sourceUrl URL to reference; also passed (urldecoded) to EditInfo
 *
 * @return int number of references that were added
 */
public function addReferences(MicroData $microData, $item, $sourceUrl)
{
    $referenceCounter = 0;

    foreach ($this->propMap as $propertyIdString => $schemaPropertyString) {
        $regexMap = $this->regexMap[$propertyIdString];

        $values = array();
        foreach ($microData->getProperty($schemaPropertyString, MicroData::PROP_STRING) as $propertyValue) {
            // Don't match URLs!
            if (strpos($propertyValue, '//') !== false) {
                continue;
            }
            $values[] = $propertyValue;
        }

        $statements = $item->getStatements()->getByPropertyId(new PropertyId($propertyIdString));

        foreach ($values as $value) {
            // Statements are objects, so iterating by value still mutates the
            // statement held by $item and leaves no dangling reference behind.
            foreach ($statements->getIterator() as $statement) {
                $mainSnak = $statement->getMainSnak();
                if (!$mainSnak instanceof PropertyValueSnak) {
                    continue; // Ignore some and no value statements
                }
                if (DataModelUtils::statementHasReferenceForUrlWithSameDomain($statement, $sourceUrl)) {
                    continue; // Ignore statements that already have this URL domain as a ref
                }

                /** @var EntityIdValue $valueEntityIdValue */
                $valueEntityIdValue = $mainSnak->getDataValue();
                /** @var EntityId $valueEntityId */
                $valueEntityId = $valueEntityIdValue->getEntityId();
                $valueEntityIdString = $valueEntityId->getSerialization();

                if (!array_key_exists($valueEntityIdString, $regexMap)) {
                    //TODO log that this ItemId is missing?
                    continue;
                }
                $regex = $regexMap[$valueEntityIdString];
                if (!preg_match($regex, $value)) {
                    // ItemId regex didn't match this schema value
                    continue;
                }

                // Add the new reference!
                $newReference = DataModelUtils::getReferenceForUrl($sourceUrl);
                try {
                    $this->wikibaseFactory->newReferenceSetter()->set(
                        $newReference,
                        $statement,
                        null,
                        new EditInfo(urldecode($sourceUrl), EditInfo::NOTMINOR, EditInfo::BOT)
                    );
                    // NOTE: keep our in-memory item copy up to date so that later
                    // values see this statement as already referenced.
                    $statement->addNewReference($newReference->getSnaks());
                    $referenceCounter++;
                } catch (UsageException $e) {
                    // Deliberately best effort: a failed edit is skipped and not counted.
                }
            }
        }
    }

    return $referenceCounter;
}
/**
 * Adds a reference for $sourceUrl to the statements of $item whose time value
 * equals a date found in the given microdata.
 *
 * For every entry of $this->propMap (property id string => schema property name)
 * the string values of that schema property are parsed as dates; values that are
 * empty or cannot be parsed are skipped. A statement of that property gets a new
 * reference when its main snak value equals one of the parsed values and the
 * statement has no reference for the same URL domain yet.
 *
 * @param MicroData $microData microdata to take the schema property values from
 * @param mixed $item object whose getStatements() result is searched and updated
 * @param string $sourceUrl URL to reference; also passed (urldecoded) to EditInfo
 *
 * @return int number of references that were added
 */
public function addReferences(MicroData $microData, $item, $sourceUrl)
{
    $referenceCounter = 0;

    foreach ($this->propMap as $propertyIdString => $schemaPropertyString) {
        /** @var TimeValue[] $timeValues */
        $timeValues = array();
        foreach ($microData->getProperty($schemaPropertyString, MicroData::PROP_STRING) as $propertyValue) {
            $dateString = trim($propertyValue);
            if ($dateString === '') {
                // new DateTime('') does not fail, it yields the current time. An
                // empty property must not turn into "today" and confirm a statement.
                continue;
            }
            try {
                $date = new DateTime($dateString);
                $timeValues[] = $this->timeParser->parse($date->format('Y m d'));
            } catch (Exception $e) {
                // Ignore failed parsing
            }
        }

        $statements = $item->getStatements()->getByPropertyId(new PropertyId($propertyIdString));

        foreach ($timeValues as $timeValue) {
            // Statements are objects, so iterating by value still mutates the
            // statement held by $item and leaves no dangling reference behind.
            foreach ($statements->getIterator() as $statement) {
                $mainSnak = $statement->getMainSnak();
                if (!$mainSnak instanceof PropertyValueSnak) {
                    continue; // Ignore some and no value statements
                }
                if (DataModelUtils::statementHasReferenceForUrlWithSameDomain($statement, $sourceUrl)) {
                    continue; // Ignore statements that already have this URL domain as a ref
                }
                if (!$timeValue->equals($mainSnak->getDataValue())) {
                    continue; // The date on the page is not the date of this statement
                }

                // Add the new reference!
                $newReference = DataModelUtils::getReferenceForUrl($sourceUrl);
                try {
                    $this->wikibaseFactory->newReferenceSetter()->set(
                        $newReference,
                        $statement,
                        null,
                        new EditInfo(urldecode($sourceUrl), EditInfo::NOTMINOR, EditInfo::BOT)
                    );
                    // NOTE: keep our in-memory item copy up to date so that later
                    // values see this statement as already referenced.
                    $statement->addNewReference($newReference->getSnaks());
                    $referenceCounter++;
                } catch (UsageException $e) {
                    // Deliberately best effort: a failed edit is skipped and not counted.
                }
            }
        }
    }

    return $referenceCounter;
}
/**
 * Tries to add references to each of the given items.
 *
 * Per item: skips it when it is already marked as processed (unless $force),
 * loads it, maps its P31 values to types through $this->instanceMap, requests the
 * parsed Wikipedia pages linked from the item to collect their external links,
 * downloads every external link and runs the referencers registered in
 * $this->referencerMap over the microdata extracted from each response. Items
 * that reach the end of that pipeline are marked as processed; items skipped
 * earlier (load failure, no usable type, no external links) are not.
 *
 * @param OutputInterface $output
 * @param ItemId[] $itemIds
 * @param bool $force process items even when they are already marked as processed
 */
private function executeForItemIds(OutputInterface $output, array $itemIds, $force)
{
    $itemLookup = $this->wikibaseFactory->newItemLookup();
    $processedItemIdStrings = $this->getProcessedItemIdStrings();
    $loopCounter = 0;
    /** @var FormatterHelper $formatter */
    $formatter = $this->getHelper('formatter');

    foreach ($itemIds as $itemId) {
        $loopCounter++;
        $itemIdString = $itemId->getSerialization();

        $output->writeln('----------------------------------------------------');

        // NOTE(review): this reloads the processed list on every iteration EXCEPT
        // each 10th one. If the intent was "reload every 10 items" the condition
        // is inverted - confirm before changing, reloading more often is harmless.
        if ($loopCounter % 10 !== 0) {
            $processedItemIdStrings = $this->getProcessedItemIdStrings();
        }
        if (!$force && in_array($itemIdString, $processedItemIdStrings, true)) {
            $output->writeln($formatter->formatSection($itemIdString, 'Already processed'));
            continue;
        }

        try {
            $output->writeln($formatter->formatSection($itemIdString, 'Loading Item'));
            $item = $itemLookup->getItemForId($itemId);
        } catch (ItemLookupException $e) {
            $output->writeln($formatter->formatSection($itemIdString, 'Failed to load item (exception)', 'error'));
            continue;
        }
        if ($item === null) {
            $output->writeln($formatter->formatSection($itemIdString, 'Failed to load item (null)', 'error'));
            continue;
        }

        // Get the item types..
        $types = array();
        foreach ($item->getStatements()->getByPropertyId(new PropertyId('P31'))->toArray() as $instanceStatement) {
            $mainSnak = $instanceStatement->getMainSnak();
            if ($mainSnak instanceof PropertyValueSnak) {
                /** @var EntityIdValue $instanceItemIdValue */
                $instanceItemIdValue = $mainSnak->getDataValue();
                $idSerialization = $instanceItemIdValue->getEntityId()->getSerialization();
                if (array_key_exists($idSerialization, $this->instanceMap)) {
                    $types[] = $this->instanceMap[$idSerialization];
                }
            }
        }
        if (empty($types)) {
            $output->writeln(
                $formatter->formatSection($itemIdString, "Didn't find any useful instance of statements", 'comment')
            );
            continue;
        }

        // Note: only load Wikipedias
        $siteLinkList = DataModelUtils::getSitelinksWiteSiteIdSuffix($item->getSiteLinkList(), 'wiki');

        $output->writeln(
            $formatter->formatSection($itemIdString, $siteLinkList->count() . ' Wikipedia pages to request')
        );
        $parseProgressBar = new ProgressBar($output, $siteLinkList->count());
        $parseProgressBar->display();

        // Start all parse requests first, the results are collected below.
        /** @var PromiseInterface[] $parsePromises */
        $parsePromises = array();
        foreach ($siteLinkList->getIterator() as $siteLink) {
            $siteId = $siteLink->getSiteId();
            $pageName = $item->getSiteLinkList()->getBySiteId($siteId)->getPageName();
            $sourceMwFactory = $this->wmFactoryFactory->getFactory($siteId);
            $sourceParser = $sourceMwFactory->newParser();
            $pageIdentifier = new PageIdentifier(new Title($pageName));
            $parsePromises[$siteId] = $sourceParser->parsePageAsync($pageIdentifier);
            $parseProgressBar->advance();
        }

        $links = array();
        foreach ($parsePromises as $siteId => $promise) {
            try {
                $parseResult = $promise->wait();
                if (array_key_exists('externallinks', $parseResult)) {
                    foreach ($parseResult['externallinks'] as $externalLink) {
                        // Ignore archive.org links
                        if (strstr($externalLink, 'archive.org') === false) {
                            $links[] = $this->normalizeExternalLink($externalLink);
                        }
                    }
                }
            } catch (Exception $e) {
                // Ignore failed requests, but show why they failed
                $parseProgressBar->clear();
                $output->writeln($formatter->formatSection($itemIdString, $e->getMessage(), 'error'));
                $parseProgressBar->display();
            }
        }
        $parseProgressBar->finish();
        $output->writeln('');

        $links = array_unique($links);
        shuffle($links);

        // NOTE(review): if Request is GuzzleHttp\Psr7\Request, its third constructor
        // argument is the header list, so this array would be treated as headers
        // rather than as request options - confirm against the imported class.
        /** @var Request[] $linkRequests */
        $linkRequests = array();
        foreach ($links as $link) {
            $linkRequests[] = new Request(
                'GET',
                $link,
                array(
                    'allow_redirects' => array('track_redirects' => true),
                    'connect_timeout' => 3.14,
                    'timeout' => 10,
                )
            );
        }

        $output->writeln(
            $formatter->formatSection($itemIdString, count($linkRequests) . ' External links to (download, action)')
        );
        if (empty($linkRequests)) {
            continue;
        }

        // Make a bunch of requests and act on the responses.
        // The progress bar has two steps per request: response received, response handled.
        $referencesAddedToItem = 0;
        $externalLinkProgressBar = new ProgressBar($output, count($linkRequests) * 2);
        $externalLinkProgressBar->display();

        $pool = new Pool(
            $this->externalLinkClient,
            $linkRequests,
            array(
                // $referencesAddedToItem is captured by reference: the closure has to
                // update the counter that is reported after the pool has finished.
                'fulfilled' => function ($response) use (
                    $externalLinkProgressBar,
                    $item,
                    $types,
                    &$referencesAddedToItem,
                    $output
                ) {
                    $externalLinkProgressBar->advance(); // 1st advance point
                    if ($response instanceof ResponseInterface) {
                        $link = $response->getHeaderLine('X-GUZZLE-EFFECTIVE-URL');
                        $html = $response->getBody();
                        $referencesAddedFromLink = 0;

                        foreach ($this->microDataExtractor->extract($html) as $microData) {
                            foreach ($types as $type) {
                                if ($microData->hasType($type) && array_key_exists($type, $this->referencerMap)) {
                                    foreach ($this->referencerMap[$type] as $referencer) {
                                        /** @var Referencer $referencer */
                                        $addedReferences = $referencer->addReferences($microData, $item, $link);
                                        $referencesAddedToItem += $addedReferences;
                                        $referencesAddedFromLink += $addedReferences;
                                    }
                                }
                            }
                        }

                        if ($referencesAddedFromLink > 0) {
                            $externalLinkProgressBar->clear();
                            $output->write("\r");
                            $output->writeln($referencesAddedFromLink . ' reference(s) added from ' . urldecode($link));
                            $externalLinkProgressBar->display();
                        }
                    }
                    $externalLinkProgressBar->advance(); // 2nd advance point
                },
                'rejected' => function () use ($externalLinkProgressBar) {
                    // TODO add this to some kind of verbose log?
                    // A rejected request consumes both of its progress steps at once.
                    $externalLinkProgressBar->advance(2);
                },
            )
        );
        $pool->promise()->wait();

        $externalLinkProgressBar->finish();
        $output->writeln('');
        $output->writeln($formatter->formatSection($itemIdString, $referencesAddedToItem . ' References added'));

        $this->markIdAsProcessed($itemId);
    }
}
/**
 * Builds the reference used to cite the given URL.
 *
 * The reference consists of two snaks: P854 carrying the URL as a string value
 * and P813 carrying the value returned by DataModelUtils::getCurrentTimeValue().
 *
 * @param string $url
 *
 * @return Reference
 */
public static function getReferenceForUrl($url)
{
    $urlSnak = new PropertyValueSnak(new PropertyId('P854'), new StringValue($url));
    $timeSnak = new PropertyValueSnak(new PropertyId('P813'), DataModelUtils::getCurrentTimeValue());

    return new Reference(array($urlSnak, $timeSnak));
}
/**
 * Adds a reference for $sourceUrl to the statements of $item whose value item
 * has a main term equal to a value found in the given microdata.
 *
 * For every entry of $this->callbackMap (property id string => callable returning
 * the candidate values for a MicroData object) each statement of that property is
 * checked: the item it points to is loaded (cached in $this->inMemoryEntityLookup
 * for as long as the same $item is being handled) and the statement gets a new
 * reference when the lower-cased candidate value is one of that item's main terms
 * and the statement has no reference for the same URL domain yet.
 *
 * @param MicroData $microData microdata handed to the value getter callbacks
 * @param mixed $item object whose getStatements() result is searched and updated
 * @param string $sourceUrl URL to reference; also passed (urldecoded) to EditInfo
 *
 * @return int number of references that were added
 */
public function addReferences(MicroData $microData, $item, $sourceUrl)
{
    // Only cache entity lookup stuff per item we are adding references for!
    // (but it can be reused for multiple source URLs of that item)
    $itemId = $item->getId();
    if (!$itemId->equals($this->lastEntityId)) {
        $this->inMemoryEntityLookup = new InMemoryEntityLookup();
        // Remember whose cache this is, otherwise it would be thrown away on every call.
        $this->lastEntityId = $itemId;
    }

    $referenceCounter = 0;

    foreach ($this->callbackMap as $propertyIdString => $valueGetterFunction) {
        $values = $valueGetterFunction($microData);
        $statements = $item->getStatements()->getByPropertyId(new PropertyId($propertyIdString));

        foreach ($values as $value) {
            // Statements are objects, so iterating by value still mutates the
            // statement held by $item and leaves no dangling reference behind.
            foreach ($statements->getIterator() as $statement) {
                $mainSnak = $statement->getMainSnak();
                if (!$mainSnak instanceof PropertyValueSnak) {
                    continue; // Ignore some and no value statements
                }

                // Checked before the lookup below so that no value item is loaded
                // for a statement that would be skipped anyway.
                if (DataModelUtils::statementHasReferenceForUrlWithSameDomain($statement, $sourceUrl)) {
                    continue; // Ignore statements that already have this URL domain as a ref
                }

                /** @var EntityIdValue $valueEntityIdValue */
                $valueEntityIdValue = $mainSnak->getDataValue();
                /** @var ItemId $valueItemId */
                $valueItemId = $valueEntityIdValue->getEntityId();

                if ($this->inMemoryEntityLookup->hasEntity($valueItemId)) {
                    $valueItem = $this->inMemoryEntityLookup->getEntity($valueItemId);
                } else {
                    $valueItem = $this->wikibaseFactory->newItemLookup()->getItemForId($valueItemId);
                    if ($valueItem === null) {
                        continue; // The value item could not be loaded, nothing to compare with
                    }
                    $this->inMemoryEntityLookup->addEntity($valueItem);
                }

                $mainTerms = DataModelUtils::getMainTermsAsLowerCaseStrings($valueItem->getFingerprint());
                if (!in_array(strtolower($value), $mainTerms, true)) {
                    continue; // Ignore things that don't appear to have the correct value
                }

                // Add the new reference!
                $newReference = DataModelUtils::getReferenceForUrl($sourceUrl);
                try {
                    $this->wikibaseFactory->newReferenceSetter()->set(
                        $newReference,
                        $statement,
                        null,
                        new EditInfo(urldecode($sourceUrl), EditInfo::NOTMINOR, EditInfo::BOT)
                    );
                    // NOTE: keep our in-memory item copy up to date so that later
                    // values see this statement as already referenced.
                    $statement->addNewReference($newReference->getSnaks());
                    $referenceCounter++;
                } catch (UsageException $e) {
                    // Deliberately best effort: a failed edit is skipped and not counted.
                }
            }
        }
    }

    return $referenceCounter;
}