/**
 * Process h-entry
 *
 * Turns a parsed h-entry microformat into a cleansed array suitable for indexing, and
 * (optionally) fetches and processes the posts it refers to via in-reply-to, like-of
 * and repost-of.
 *
 * Keys written onto the cleansed array by this function: content, display_content,
 * published, published_utc, photo, logo, in-reply-to, like-of, repost-of, author,
 * location and location_point (the last five only when the relevant data is found).
 *
 * @param array $hEntry the h-entry microformat structure to process
 * @param array $mf the full parsed microformats document; its 'rels' are consulted for authorship
 * @param string $url the URL associated with $mf, passed on to M\getAuthor() and flattenHCard()
 * @param bool $resolveRelationships when true, each referenced URL is fetched with $client and
 *                                   processed (non-recursively); when false the raw URLs are stored
 * @param Guzzle\Http\ClientInterface|null $client HTTP client; a new Guzzle\Http\Client is created if null
 * @param callable|null $purifier applied to HTML values (content, photo, logo); identity if null
 * @return array a two-element array: [cleansed post, list of cleansed referenced posts]
 */
function processHEntry($hEntry, $mf, $url, $resolveRelationships = true, Guzzle\Http\ClientInterface $client = null, $purifier = null) {
    if ($client === null) {
        $client = new Guzzle\Http\Client();
    }

    if ($purifier === null) {
        $purifier = function ($value) {
            return $value;
        };
    }

    // Use comment-presentation algorithm to clean up.
    $cleansed = comments\parse($hEntry);

    $referencedPosts = [];
    // Used internally to keep track of what referenced posts have been processed already.
    $referencedPostUrls = [];

    $cleansed['content'] = M\getPlaintext($hEntry, 'content', $cleansed['text']);
    $cleansed['display_content'] = $purifier(M\getHtml($hEntry, 'content'));

    // Handle all datetime cases, as per http://indiewebcamp.com/h-entry#How_to_consume_h-entry
    $now = new DateTime('now', new DateTimeZone('UTC'));
    $utcPublished = false;

    // Only attempt to parse a non-empty value: DateTime happily turns false/'' into "now",
    // which previously hid the "no datetime given" case from the check below.
    if (!empty($cleansed['published'])) {
        try {
            $utcPublished = new DateTime($cleansed['published']);
            $utcPublished->setTimezone(new DateTimeZone('UTC'));
        } catch (Exception $e) {
            $utcPublished = false;
        }
    }

    if (empty($cleansed['published']) or ($utcPublished !== false and $utcPublished > $now)) {
        // If there's absolutely no datetime (or it claims to be in the future), our best guess has to be "now".
        // Additional heuristics could be used in the bizarre case of having a feed where an item without datetime is
        // published in between two items with datetimes, allowing us to guess the published datetime is between the two,
        // but until that actually happens it's not worth coding for.
        $cleansed['published'] = gmdate('c');
        $utcPublished = $now;
    } elseif ($utcPublished === false) {
        // "published" was given but could not be parsed. The original string is left in place and the
        // indexed UTC value falls back to "now".
        $utcPublished = $now;
    }
    // Otherwise "published" is given and parses correctly.
    // Currently it's not trivial to figure out if a given datetime is floating or not, so assume that the timezone
    // given here is correct for the moment. When this can be determined, follow
    // http://indiewebcamp.com/datetime#implying_timezone_from_webmentions

    // Store a string representation of published to be indexed+queried upon.
    $cleansed['published_utc'] = $utcPublished->format(DateTime::W3C);

    if (M\hasProp($hEntry, 'photo')) {
        $cleansed['photo'] = $purifier(M\getHtml($hEntry, 'photo'));
    }

    if (M\hasProp($hEntry, 'logo')) {
        $cleansed['logo'] = $purifier(M\getHtml($hEntry, 'logo'));
    }

    // For every post this post has a relation (in-reply-to, repost-of, like-of etc.), fetch and resolve that URL,
    // index it as its own post (if it doesn't already exist) and store only a reference to it here.
    $references = ['in-reply-to' => [], 'like-of' => [], 'repost-of' => []];

    foreach (array_keys($references) as $relation) {
        $refUrls = [];

        // These will be feed pages not permalink pages so cannot check rels, only microformats properties.
        if (M\hasProp($hEntry, $relation)) {
            foreach ($hEntry['properties'][$relation] as $value) {
                if (is_string($value)) {
                    $refUrls[] = $value;
                } elseif (is_array($value) and isset($value['html'])) {
                    // e-* properties unlikely to be URLs but try all the same.
                    if (isset($value['value'])) {
                        $refUrls[] = $value['value'];
                    }
                } elseif (M\isMicroformat($value)) {
                    if (M\hasProp($value, 'url')) {
                        $refUrls[] = M\getProp($value, 'url');
                    } elseif (M\hasProp($value, 'uid')) {
                        $refUrls[] = M\getProp($value, 'uid');
                    }
                }
                // Anything else means the microformats parsing spec has changed. Currently do nothing as we
                // don't know how to interpret it.
            }
        }

        if ($resolveRelationships) {
            foreach ($refUrls as $refUrl) {
                try {
                    $resp = $client->get($refUrl)->send();
                    $refResolvedUrl = $resp->getEffectiveUrl();
                    $refMf = Mf2\parse($resp->getBody(true), $refResolvedUrl);
                    $refHEntries = M\findMicroformatsByType($refMf, 'h-entry');
                    $relatedUrl = $refResolvedUrl;

                    if (count($refHEntries) > 0) {
                        $refHEntry = $refHEntries[0];
                        $refSearchUrl = M\hasProp($refHEntry, 'url') ? M\getProp($refHEntry, 'url') : $refResolvedUrl;

                        if (!in_array($refSearchUrl, $referencedPostUrls, true)) {
                            // Relationships of the referenced post are deliberately not resolved (no recursion).
                            list($refCleansed) = processHEntry($refHEntry, $refMf, $refResolvedUrl, false, $client, $purifier);
                            $referencedPosts[] = $refCleansed;
                            $referencedPostUrls[] = $refSearchUrl;
                        }

                        // Always refer to the post by the URL it is tracked under, even when it was already
                        // processed for an earlier relation, so one post is never referenced by two URLs.
                        $relatedUrl = $refSearchUrl;
                    }

                    $references[$relation][] = $relatedUrl;
                } catch (Guzzle\Common\Exception\GuzzleException $e) {
                    // Fetching failed; the unresolved URL is the best data available.
                    $references[$relation][] = $refUrl;
                }
            }
        } else {
            // If we're not resolving relationships, the most accurate data we have is the data given already.
            $references[$relation] = $refUrls;
        }

        // Now we have the best possible list of URLs, attach it to $cleansed.
        // array_values() re-indexes after array_unique() so the result stays a plain list.
        $cleansed[$relation] = array_values(array_unique($references[$relation]));
    }

    if (!M\hasProp($hEntry, 'author') or !M\isMicroformat($hEntry['properties']['author'][0])) {
        // No authorship data given, we need to find the author!
        // TODO: proper /authorship implementation.
        // TODO: wrap proper /authorship implementation in layer which does purification, simplification, fallback.
        $potentialAuthor = M\getAuthor($hEntry, $mf, $url);

        if (M\isMicroformat($potentialAuthor)) {
            $cleansed['author'] = flattenHCard($potentialAuthor, $url);
        } elseif (!empty($mf['rels']['author'])) {
            // TODO: look in elasticsearch index for a person with the first rel-author URL then fall back to fetching.
            // Fetch the first author URL and look for a representative h-card there.
            $relAuthorUrl = $mf['rels']['author'][0];
            $relAuthorMf = Mf2\fetch($relAuthorUrl);

            // Mf2\fetch() yields null when the response is not parseable HTML; nothing to search in that case.
            if (is_array($relAuthorMf)) {
                $relMes = !empty($relAuthorMf['rels']['me']) ? $relAuthorMf['rels']['me'] : [];

                foreach (M\findMicroformatsByType($relAuthorMf, 'h-card') as $raHCard) {
                    $cardUrl = M\getProp($raHCard, 'url');

                    if ($cardUrl === $relAuthorUrl and M\getProp($raHCard, 'uid') === $relAuthorUrl) {
                        // url == uid == page URL is the strongest signal of a representative h-card, so stop here.
                        $cleansed['author'] = flattenHCard($raHCard, $relAuthorUrl);
                        break;
                    } elseif (M\hasProp($raHCard, 'url') and count(array_intersect($raHCard['properties']['url'], $relMes)) > 0) {
                        // One of the card's URLs is also a rel=me URL of the author page.
                        $cleansed['author'] = flattenHCard($raHCard, $relAuthorUrl);
                    }
                }
            }
        }

        // If after all that there's still no authorship data, fake some.
        if ($cleansed['author']['name'] === false) {
            $cleansed['author'] = flattenHCard(['properties' => []], $url);

            try {
                $response = $client->head("{$cleansed['author']['url']}/favicon.ico")->send();
                // Cast so a missing header is treated as an empty string rather than passed to strpos() as-is.
                if (strpos((string) $response->getHeader('content-type'), 'image') !== false) {
                    // This appears to be a valid image!
                    $cleansed['author']['photo'] = $response->getEffectiveUrl();
                }
            } catch (Guzzle\Common\Exception\GuzzleException $e) {
                // No photo fallback could be found.
            }
        }
    }

    // TODO: this will be M\getLocation when it's ported to the other library.
    if (($location = getLocation($hEntry)) !== null) {
        $cleansed['location'] = $location;

        // TODO: do additional reverse lookups of address details if none are provided.
        // NOTE(review): !empty() also rejects a latitude or longitude of exactly 0 - confirm whether
        // getLocation() can return 0 for a genuine coordinate before tightening this check.
        if (!empty($location['latitude']) and !empty($location['longitude'])) {
            // If this is a valid point, add a point with mashed names for elasticsearch to index.
            $cleansed['location_point'] = ['lat' => $location['latitude'], 'lon' => $location['longitude']];
        }
    }

    // TODO: figure out what other properties need storing/indexing, and whether anything else needs mashing for
    // elasticsearch to index more easily.

    return [$cleansed, $referencedPosts];
}
/**
 * Exercises Mf2\fetch() against live URLs: an HTML page must produce a result with an
 * 'items' key, while an image URL must produce null and report a jpeg content type in
 * the third argument.
 *
 * @group network
 */
public function testFetchMicroformats() {
    $htmlResult = Mf2\fetch('http://waterpigs.co.uk/');
    $this->assertArrayHasKey('items', $htmlResult);

    // Filled in by Mf2\fetch(); read below for the reported content type.
    $transferInfo = null;
    $imageResult = Mf2\fetch('http://waterpigs.co.uk/photo.jpg', null, $transferInfo);

    $this->assertNull($imageResult);
    $this->assertContains('jpeg', $transferInfo['content_type']);
}
/**
 * Regression test for the linked issue: fetches a short URL and checks that the photo
 * property of the first parsed item is the expected absolute URL.
 *
 * @see https://github.com/indieweb/php-mf2/issues/84
 */
public function testRelativeURLResolvedWithFinalURL() {
    $expectedPhoto = 'https://aaronparecki.com/2014/12/23/5/photo.jpeg';

    $parsed = Mf2\fetch('http://aaron.pk/4Zn5');
    $actualPhoto = $parsed['items'][0]['properties']['photo'][0];

    $this->assertEquals($expectedPhoto, $actualPhoto);
}
} if (isset($_GET['url'])) { $url = $_GET['url']; } else { //header("HTTP/1.0 404 Not Found"); echo 'no url provided'; exit; } $parse = parse_url($url); $host = $parse['host']; /* * lets parse the url */ require 'vendor/autoload.php'; use Mf2; $mf = Mf2\fetch($url); if (count($mf['items']) < 1) { header("HTTP/1.0 404 Not Found"); echo "nothing found"; exit; } foreach ($mf['items'] as $microformat) { //echo "A {$microformat['type'][0]} called {$microformat['properties']['name'][0]}\n"; if ($microformat['type'][0] == "h-card") { $author_photo = $microformat['properties']['photo'][0]; } if ($microformat['type'][0] == "h-entry" or $microformat['type'][0] == "h-as-article") { $hentryfound = 1; } } if (!$hentryfound) {