/**
 * Collects location details for a microformat into one flat array.
 *
 * Data is gathered from the microformat itself and, when present, from the first
 * value of its "location", "adr" and "geo" properties. A "geo" value may be a nested
 * microformat or a plain "geo:" URI string; a URI is converted into an h-geo-shaped
 * structure so it can be searched like the other sources.
 *
 * For each known location property, the value from the FIRST source that provides it
 * is used (sources are searched in the order: the microformat itself, location, adr, geo).
 *
 * @param array $mf A microformat structure (expected to carry a 'properties' key).
 * @return array|null Map of property name => plaintext value, or null when nothing was found.
 */
function getLocation(array $mf) {
    $location = [];
    $locationDataSources = [$mf];

    if (M\hasProp($mf, 'location') and M\isMicroformat($mf['properties']['location'][0])) {
        $locationDataSources[] = $mf['properties']['location'][0];
    }

    if (M\hasProp($mf, 'adr') and M\isMicroformat($mf['properties']['adr'][0])) {
        $locationDataSources[] = $mf['properties']['adr'][0];
    }

    if (M\hasProp($mf, 'geo')) {
        $geo = $mf['properties']['geo'][0];

        if (M\isMicroformat($geo)) {
            $locationDataSources[] = $geo;
        } elseif (is_string($geo)) {
            $parts = parse_url($geo);

            if (!empty($parts['scheme']) and $parts['scheme'] === 'geo' and !empty($parts['path'])) {
                // A geo URI may carry parameters after the coordinates
                // (e.g. "geo:37.78,-122.39;u=35"); only the coordinate list is wanted.
                $pathPieces = explode(';', $parts['path'], 2);
                $geoParts = explode(',', $pathPieces[0]);

                // Latitude and longitude are both required; a URI with fewer than two
                // coordinates is ignored rather than producing a half-filled h-geo.
                if (count($geoParts) >= 2) {
                    $derivedGeo = [
                        'type' => ['h-geo'],
                        'properties' => [
                            'latitude' => [$geoParts[0]],
                            'longitude' => [$geoParts[1]],
                        ],
                    ];

                    if (count($geoParts) > 2) {
                        $derivedGeo['properties']['altitude'] = [$geoParts[2]];
                    }

                    $locationDataSources[] = $derivedGeo;
                }
            }
        }
    }

    $locationProperties = [
        'street-address',
        'extended-address',
        'post-office-box',
        'locality',
        'region',
        'postal-code',
        'country-name',
        'label',
        'latitude',
        'longitude',
        'altitude',
    ];

    // Search all the location data sources for each property, storing the first one we come across.
    foreach ($locationProperties as $propName) {
        foreach ($locationDataSources as $source) {
            if (M\hasProp($source, $propName)) {
                $location[$propName] = M\getPlaintext($source, $propName);
                // Stop at the first source that has it so later sources cannot overwrite it.
                break;
            }
        }
    }

    return empty($location) ? null : $location;
}
/**
 * Cleans up one h-entry for indexing and display, optionally resolving the posts it refers to.
 *
 * The entry is first passed through comments\parse(); the result is then extended with
 * plaintext and purified HTML content, a normalised UTC publication datetime, photo/logo,
 * relationship URLs (in-reply-to, like-of, repost-of), authorship and location data.
 *
 * @param array $hEntry The h-entry microformat to process.
 * @param array $mf The full parsed microformats structure the entry came from; its 'rels'
 *                  are consulted when looking for an author.
 * @param string $url URL the entry was found at.
 * @param bool $resolveRelationships When true, each related URL is fetched and, if it contains
 *                                   an h-entry, that entry is processed too (without resolving
 *                                   its own relationships).
 * @param Guzzle\Http\ClientInterface|null $client HTTP client; a new Guzzle client is created when null.
 * @param callable|null $purifier Callback applied to HTML values; defaults to the identity function.
 * @return array Two-element array: [the cleansed entry, list of cleansed referenced posts].
 */
function processHEntry($hEntry, $mf, $url, $resolveRelationships = true, Guzzle\Http\ClientInterface $client = null, $purifier = null) {
    if ($client === null) {
        $client = new Guzzle\Http\Client();
    }

    if ($purifier === null) {
        $purifier = function ($value) {
            return $value;
        };
    }

    // Use comment-presentation algorithm to clean up.
    // The result is read below for its 'text', 'published' and 'author' keys.
    $cleansed = comments\parse($hEntry);

    $referencedPosts = [];
    $referencedPostUrls = []; // Used internally to keep track of what referenced posts have been processed already.

    $cleansed['content'] = M\getPlaintext($hEntry, 'content', $cleansed['text']);
    $cleansed['display_content'] = $purifier(M\getHtml($hEntry, 'content'));

    // Handle all datetime cases, as per http://indiewebcamp.com/h-entry#How_to_consume_h-entry
    $utc = new DateTimeZone('UTC');

    try {
        $published = new DateTime((string) $cleansed['published']);
        $utcPublished = clone $published;
        $utcPublished->setTimezone($utc);
    } catch (Exception $e) {
        $published = $utcPublished = false;
    }

    // "Now" is taken after parsing so that an empty datetime (which DateTime parses as the
    // current time) can never compare as being in the future.
    $now = new DateTime('now', $utc);
    $inTheFuture = $utcPublished > $now;

    // DateTime() accepts an empty/false value and silently yields "now", so a missing datetime
    // has to be detected from the raw value, and an unparseable one from the failed parse.
    // Either case (or a datetime in the future) falls back to the current time.
    if (!$published || !$cleansed['published'] || $inTheFuture) {
        // If there’s absolutely no usable datetime, our best guess has to be “now”.
        // Additional heuristics could be used in the bizarre case of having a feed where an item without datetime is
        // published in between two items with datetimes, allowing us to guess the published datetime is between the two,
        // but until that actually happens it’s not worth coding for.
        $cleansed['published'] = gmdate('c');
        $utcPublished = new DateTime('now', $utc);
    }
    // Otherwise “published” is given and parsed correctly.
    // Currently it’s not trivial to figure out if a given datetime is floating or not, so assume that the timezone
    // given is correct for the moment. When this can be determined, follow
    // http://indiewebcamp.com/datetime#implying_timezone_from_webmentions

    // Store a string representation of published to be indexed+queried upon.
    $cleansed['published_utc'] = $utcPublished->format(DateTime::W3C);

    if (M\hasProp($hEntry, 'photo')) {
        $cleansed['photo'] = $purifier(M\getHtml($hEntry, 'photo'));
    }

    if (M\hasProp($hEntry, 'logo')) {
        $cleansed['logo'] = $purifier(M\getHtml($hEntry, 'logo'));
    }

    // For every post this post has a relation (in-reply-to, repost-of, like-of etc.), fetch and resolve that URL,
    // index it as its own post (if it doesn’t already exist) and store only a reference to it here.
    $references = [
        'in-reply-to' => [],
        'like-of' => [],
        'repost-of' => [],
    ];

    foreach (array_keys($references) as $relation) {
        $refUrls = [];

        // These will be feed pages not permalink pages so cannot check rels, only microformats properties.
        if (M\hasProp($hEntry, $relation)) {
            foreach ($hEntry['properties'][$relation] as $value) {
                if (is_string($value)) {
                    $refUrls[] = $value;
                } elseif (is_array($value) and isset($value['html'])) {
                    // e-* properties unlikely to be URLs but try all the same.
                    $refUrls[] = $value['value'];
                } elseif (M\isMicroformat($value)) {
                    if (M\hasProp($value, 'url')) {
                        $refUrls[] = M\getProp($value, 'url');
                    } elseif (M\hasProp($value, 'uid')) {
                        $refUrls[] = M\getProp($value, 'uid');
                    }
                }
                // Anything else means the microformats parsing spec has changed; do nothing as we
                // don’t know how to interpret it.
            }
        }

        if ($resolveRelationships) {
            foreach ($refUrls as $refUrl) {
                try {
                    $resp = $client->get($refUrl)->send();
                    $refResolvedUrl = $resp->getEffectiveUrl();
                    $refMf = Mf2\parse($resp->getBody(true), $refResolvedUrl);
                    $refHEntries = M\findMicroformatsByType($refMf, 'h-entry');
                    $relatedUrl = $refResolvedUrl;

                    if (count($refHEntries) > 0) {
                        $refHEntry = $refHEntries[0];
                        $refSearchUrl = M\hasProp($refHEntry, 'url') ? M\getProp($refHEntry, 'url') : $refResolvedUrl;

                        if (!in_array($refSearchUrl, $referencedPostUrls)) {
                            // Referenced posts are processed one level deep only (no further resolution).
                            $refResult = processHEntry($refHEntry, $refMf, $refResolvedUrl, false, $client, $purifier);
                            $referencedPosts[] = $refResult[0];
                            $referencedPostUrls[] = $refSearchUrl;
                            $relatedUrl = $refSearchUrl;
                        }
                    }

                    $references[$relation][] = $relatedUrl;
                } catch (Guzzle\Common\Exception\GuzzleException $e) {
                    // Could not fetch it; keep the URL exactly as it was given.
                    $references[$relation][] = $refUrl;
                }
            }
        } else {
            // If we’re not resolving relationships, the most accurate data we have is the data given already.
            $references[$relation] = $refUrls;
        }

        // Now we have the best possible list of URLs, attach it to $cleansed.
        $cleansed[$relation] = array_unique($references[$relation]);
    }

    if (!M\hasProp($hEntry, 'author') or !M\isMicroformat($hEntry['properties']['author'][0])) {
        // No authorship data given, we need to find the author!
        // TODO: proper /authorship implementation.
        // TODO: wrap proper /authorship implementation in layer which does purification, simplification, fallback.
        $potentialAuthor = M\getAuthor($hEntry, $mf, $url);

        if (M\isMicroformat($potentialAuthor)) {
            $cleansed['author'] = flattenHCard($potentialAuthor, $url);
        } elseif (!empty($mf['rels']['author'])) {
            // TODO: look in elasticsearch index for a person with the first rel-author URL then fall back to fetching.
            // Fetch the first author URL and look for a representative h-card there.
            $relAuthorUrl = $mf['rels']['author'][0];
            $relAuthorMf = Mf2\fetch($relAuthorUrl);

            // Only inspect the result when the fetch actually produced a parsed structure.
            if (is_array($relAuthorMf)) {
                $relAuthorHCards = M\findMicroformatsByType($relAuthorMf, 'h-card');
                $relMes = empty($relAuthorMf['rels']['me']) ? [] : $relAuthorMf['rels']['me'];

                foreach ($relAuthorHCards as $raHCard) {
                    // An h-card is accepted when its url is the author page itself, or when one of
                    // its urls is also a rel=me link on that page.
                    // NOTE(review): the representative h-card algorithm also compares uid — confirm
                    // whether that stricter check is wanted here.
                    $urlIsAuthorPage = M\getProp($raHCard, 'url') === $relAuthorUrl;
                    $urlIsRelMe = M\hasProp($raHCard, 'url')
                        && count(array_intersect($raHCard['properties']['url'], $relMes)) > 0;

                    if ($urlIsAuthorPage || $urlIsRelMe) {
                        $cleansed['author'] = flattenHCard($raHCard, $relAuthorUrl);
                    }
                }
            }
        }

        // If after all that there’s still no authorship data, fake some.
        if ($cleansed['author']['name'] === false) {
            $cleansed['author'] = flattenHCard(['properties' => []], $url);

            try {
                $response = $client->head("{$cleansed['author']['url']}/favicon.ico")->send();

                // Cast so a missing header is treated as an empty string.
                if (strpos((string) $response->getHeader('content-type'), 'image') !== false) {
                    // This appears to be a valid image!
                    $cleansed['author']['photo'] = $response->getEffectiveUrl();
                }
            } catch (Guzzle\Common\Exception\GuzzleException $e) {
                // No photo fallback could be found.
            }
        }
    }

    // TODO: this will be M\getLocation when it’s ported to the other library.
    $location = getLocation($hEntry);
    if ($location !== null) {
        $cleansed['location'] = $location;

        // TODO: do additional reverse lookups of address details if none are provided.
        if (!empty($location['latitude']) and !empty($location['longitude'])) {
            // If this is a valid point, add a point with mashed names for elasticsearch to index.
            $cleansed['location_point'] = [
                'lat' => $location['latitude'],
                'lon' => $location['longitude'],
            ];
        }
    }

    // TODO: figure out what other properties need storing/indexing, and whether anything else needs mashing for
    // elasticsearch to index more easily.

    return [$cleansed, $referencedPosts];
}
/**
 * Refreshes a single feed: fetches it, (re)subscribes to its PuSH hub when needed, then
 * fetches and stores every h-entry it lists, along with tags, syndication URLs and the
 * channel entries for each channel that uses this feed.
 *
 * Progress is echoed to standard output. The feed is flagged as "refresh in progress"
 * while this runs; the flag is cleared afterwards, including when an Exception was caught.
 *
 * @param mixed $feed_id Identifier passed to db\get_feed() and used to look up channel sources.
 * @return void
 */
public static function refresh_feed($feed_id) {
    $feed = db\get_feed($feed_id);

    echo "Refreshing feed " . $feed->feed_url . " ({$feed_id})\n\n";

    // Check if this feed is already being refreshed, and re-queue the job for later to give
    // the first job a chance to finish.
    // NOTE(review): an earlier comment said "30 seconds" but the delay argument is 5 — confirm units.
    if ($feed->refresh_in_progress) {
        echo "This feed is already being processed, re-queuing for later\n";
        DeferredTask::queue('FeedTask', 'refresh_feed', $feed_id, 5);
        return;
    }

    // Mark that this feed is currently being refreshed.
    $feed->refresh_started = date('Y-m-d H:i:s');
    $feed->refresh_in_progress = 1;
    $feed->save();

    // Only deal with mf2 feeds for now.
    try {
        $response = request\get_url($feed->feed_url, true);
        $header_rels = IndieWeb\http_rels($response['headers']);
        $html = $response['body'];
        $mf2 = feeds\parse_mf2($html, $feed->feed_url);

        // Prefer a hub advertised in the HTTP headers over one in the HTML.
        $hub_url = false;
        if (k($header_rels, 'hub')) {
            $hub_url = $header_rels['hub'][0];
            $hub_url_source = 'http';
        } elseif (k($mf2, 'rels') && k($mf2['rels'], 'hub')) {
            $hub_url = $mf2['rels']['hub'][0];
            $hub_url_source = 'html';
        }

        // Check for PuSH info and subscribe to the hub if found.
        if ($hub_url) {
            if (k($header_rels, 'self')) {
                $self_url = $header_rels['self'][0];
                $self_url_source = 'http';
            } elseif (k($mf2, 'rels') && k($mf2['rels'], 'self')) {
                $self_url = $mf2['rels']['self'][0];
                $self_url_source = 'html';
            } else {
                $self_url = $feed->feed_url;
                $self_url_source = 'default';
            }

            // Keep track of what the hub URL was last time we saw it.
            $last_hub_url = $feed->push_hub_url;

            // Store the new hub and topic.
            $feed->push_hub_url = $hub_url;
            $feed->push_topic_url = $self_url;

            // Re-subscribe if never subscribed, if the hub has changed, or if the
            // expiration date is less than 300 seconds away.
            $expiring_soon = $feed->push_expiration && strtotime($feed->push_expiration) - 300 < time();
            if ($feed->push_subscribed == 0 || $hub_url != $last_hub_url || $expiring_soon) {
                echo "Attempting to subscribe to the hub!\n";
                echo "Hub: " . $feed->push_hub_url . " (found in {$hub_url_source})\n";
                echo "Topic: " . $feed->push_topic_url . " (found in {$self_url_source})\n";

                // This will cause the hub to make a GET request to the callback URL, which we then verify.
                $response = request\post($feed->push_hub_url, [
                    'hub.mode' => 'subscribe',
                    'hub.topic' => $feed->push_topic_url,
                    'hub.callback' => 'http://' . Config::$hostname . '/push/feed/' . $feed->hash,
                ]);

                echo "Hub responded:\n";
                echo $response['status'] . "\n";
                echo $response['body'] . "\n";
            }

            $feed->save();
        }

        // Check if there are any h-entry posts.
        $info = feeds\find_feed_info($mf2);
        if ($info) {
            foreach ($info['entries'] as $i => $feed_entry) {
                echo "\nProcessing entry {$i}\n";

                // Find the canonical URL for the entry and fetch the page.
                $entry_url = Mf2\getPlaintext($feed_entry, 'url');
                if (!$entry_url) {
                    echo "No URL was found for this entry\n";
                    continue;
                }

                echo $entry_url . "\n";

                // Parse the entry for all required info and store in the "entries" table.
                $entry_html = request\get_url($entry_url);
                if (!$entry_html) {
                    // Bad response returned, might be 410 deleted.
                    // TODO: Figure out if it's a deleted post or just temporary error
                    continue;
                }

                $entry_mf2 = feeds\parse_mf2($entry_html, $entry_url);
                $entries = Mf2\findMicroformatsByType($entry_mf2['items'], 'h-entry');
                // The permalink page may contain no h-entry at all; avoid reading a missing offset.
                $entry_mf2 = isset($entries[0]) ? $entries[0] : null;

                if (!Mf2\isMicroformat($entry_mf2)) {
                    echo "Does not appear to be a microformat\n";
                    continue;
                }

                if (!in_array('h-entry', $entry_mf2['type'])) {
                    print_r($entry_mf2);
                    continue;
                }

                // Reuse the stored row for this feed + URL when there is one, otherwise create it.
                $entry = ORM::for_table('entries')
                    ->where('feed_id', $feed->id)
                    ->where('url', $entry_url)
                    ->find_one();
                if (!$entry) {
                    $entry = ORM::for_table('entries')->create();
                    $entry->feed_id = $feed->id;
                    $entry->url = $entry_url;
                }

                // Decide whether to store the name, summary and content depending on whether they are unique.
                $name = Mf2\getPlaintext($entry_mf2, 'name');
                $summary = Mf2\getPlaintext($entry_mf2, 'summary');
                $content = Mf2\getHtml($entry_mf2, 'content');
                $content_text = Mf2\getPlaintext($entry_mf2, 'content');

                // Store the name if it's different from the summary and the content.
                if (!feeds\content_is_equal($name, $summary) && !feeds\content_is_equal($name, $content_text)) {
                    $entry->name = $name;
                    echo "Entry has a name: {$name}\n";
                } else {
                    $entry->name = '';
                }

                // Store the summary if it's different from the content.
                if ($summary && !feeds\content_is_equal($summary, $content_text)) {
                    $entry->summary = $summary;
                    echo "Entry has a summary\n";
                } else {
                    $entry->summary = '';
                }

                $entry->content = $content;

                $date_string = Mf2\getPlaintext($entry_mf2, 'published');
                if ($date_string) {
                    try {
                        $date = new DateTime($date_string);
                        // Record the original offset before converting to UTC for storage.
                        $entry->timezone_offset = $date->format('Z');
                        $date->setTimeZone(new DateTimeZone('UTC'));
                        $entry->date_published = $date->format('Y-m-d H:i:s');
                        echo "Published: {$entry->date_published}\n";
                    } catch (Exception $date_error) {
                        echo "Error parsing date: {$date_string}\n";
                    }
                }

                // Set the date published to now if none was found in the entry.
                if (!$entry->date_published) {
                    $entry->date_published = date('Y-m-d H:i:s');
                }

                // Simple URL-valued properties: only overwrite the column when the entry has a value.
                $like_of = Mf2\getPlaintext($entry_mf2, 'like-of');
                if ($like_of) {
                    $entry->like_of_url = $like_of;
                }

                $repost_of = Mf2\getPlaintext($entry_mf2, 'repost-of');
                if ($repost_of) {
                    $entry->repost_of_url = $repost_of;
                }

                // TODO: move this to a helper
                // Finds the URL for a property if the property is a plain string or a nested h-cite.
                $in_reply_to = Mf2\getPlaintext($entry_mf2, 'in-reply-to');
                if ($in_reply_to) {
                    $reply_target = $entry_mf2['properties']['in-reply-to'][0];
                    if (Mf2\isMicroformat($reply_target)) {
                        // A nested microformat is not guaranteed to carry a url.
                        $entry->in_reply_to_url = isset($reply_target['properties']['url'][0])
                            ? $reply_target['properties']['url'][0]
                            : null;
                    } else {
                        $entry->in_reply_to_url = $in_reply_to;
                    }
                }

                $photo = Mf2\getPlaintext($entry_mf2, 'photo');
                if ($photo) {
                    $entry->photo_url = $photo;
                }

                $video = Mf2\getPlaintext($entry_mf2, 'video');
                if ($video) {
                    $entry->video_url = $video;
                }

                $audio = Mf2\getPlaintext($entry_mf2, 'audio');
                if ($audio) {
                    $entry->audio_url = $audio;
                }

                // Prefer the entry's own author, falling back to the feed-level author.
                $author_mf2 = false;
                if (Mf2\hasProp($entry_mf2, 'author')) {
                    $author_mf2 = $entry_mf2['properties']['author'][0];
                } elseif (Mf2\hasProp($info, 'author')) {
                    $author_mf2 = $info['properties']['author'][0];
                }

                if ($author_mf2) {
                    $entry->author_name = Mf2\getPlaintext($author_mf2, 'name');
                    $entry->author_url = Mf2\getPlaintext($author_mf2, 'url');
                    $entry->author_photo = Mf2\getPlaintext($author_mf2, 'photo');
                } else {
                    echo "NO AUTHOR WAS FOUND!!\n";
                }

                // Response counts are simply the number of values of each property.
                if (Mf2\hasProp($entry_mf2, 'like')) {
                    $entry->num_likes = count($entry_mf2['properties']['like']);
                }
                if (Mf2\hasProp($entry_mf2, 'repost')) {
                    $entry->num_reposts = count($entry_mf2['properties']['repost']);
                }
                if (Mf2\hasProp($entry_mf2, 'comment')) {
                    $entry->num_comments = count($entry_mf2['properties']['comment']);
                }
                if (Mf2\hasProp($entry_mf2, 'rsvp')) {
                    $entry->num_rsvps = count($entry_mf2['properties']['rsvp']);
                }

                $entry->date_retrieved = date('Y-m-d H:i:s');
                $entry->date_updated = date('Y-m-d H:i:s');
                $entry->save();

                // Add or update all tags for this entry.
                $entry_tags = [];
                if (Mf2\hasProp($entry_mf2, 'category')) {
                    // Only plain-string categories can be normalised into tags; a nested
                    // microformat value would make trim() fail.
                    $string_categories = array_filter($entry_mf2['properties']['category'], 'is_string');
                    $entry_tags = array_unique(array_map(function ($c) {
                        return strtolower(trim($c, '#'));
                    }, $string_categories));

                    foreach ($entry_tags as $tag) {
                        $existing_tag = ORM::for_table('entry_tags')
                            ->where('entry_id', $entry->id)
                            ->where('tag', $tag)
                            ->find_one();
                        if (!$existing_tag) {
                            $et = ORM::for_table('entry_tags')->create();
                            $et->entry_id = $entry->id;
                            $et->tag = $tag;
                            $et->save();
                        }
                    }
                }
                // TODO: Remove tags that are no longer found in the entry

                // Add syndication URLs.
                if (Mf2\hasProp($entry_mf2, 'syndication')) {
                    $syndications = array_unique(array_filter($entry_mf2['properties']['syndication'], 'is_string'));
                    foreach ($syndications as $syn) {
                        $existing_syndication = ORM::for_table('entry_syndications')
                            ->where('entry_id', $entry->id)
                            ->where('syndication_url', $syn)
                            ->find_one();
                        if (!$existing_syndication) {
                            $es = ORM::for_table('entry_syndications')->create();
                            $es->entry_id = $entry->id;
                            $es->syndication_url = $syn;
                            $es->save();
                        }
                    }
                }
                // TODO: Remove urls that are no longer found in the entry

                // Run through all the channels that have this feed and add the entry to each channel.
                $sources = ORM::for_table('channel_sources')->where('feed_id', $feed_id)->find_many();
                foreach ($sources as $source) {
                    $add = false;

                    if ($source->filter) {
                        // The filter is a comma-separated list of terms; the entry is added when any
                        // term appears as a whole word in its text or matches one of its tags.
                        $searchable_text = $entry->content . "\n" . $entry->name . "\n" . $entry->summary;
                        foreach (explode(',', $source->filter) as $filter_tag) {
                            // Quote the term so characters such as "/", "+" or "(" are matched
                            // literally instead of breaking or altering the pattern.
                            if (preg_match('/\\b' . preg_quote($filter_tag, '/') . '\\b/', $searchable_text)) {
                                $add = true;
                            }
                            if (in_array(strtolower($filter_tag), $entry_tags)) {
                                $add = true;
                            }
                        }
                    } else {
                        // No filter: every entry of the feed goes into the channel.
                        $add = true;
                    }

                    if ($add) {
                        $ce = ORM::for_table('channel_entries')
                            ->where('channel_id', $source->channel_id)
                            ->where('entry_id', $entry->id)
                            ->find_one();
                        if (!$ce) {
                            $ce = ORM::for_table('channel_entries')->create();
                            $ce->channel_id = $source->channel_id;
                            $ce->entry_id = $entry->id;
                        }
                        $ce->entry_published = $entry->date_published;
                        $ce->date_created = date('Y-m-d H:i:s');
                        $ce->save();
                        echo "Adding to channel\n";
                    }
                }
            }
        }

        $feed->last_retrieved = date('Y-m-d H:i:s');
    } catch (Exception $e) {
        echo "Error processing feed!\n";
        echo $e->getMessage() . "\n";
        echo $e->getTraceAsString() . "\n";
    }

    // Mark complete.
    // TODO: add some exception handling that will set this to 0 on errors?
    $feed->refresh_in_progress = 0;
    $feed->save();
}
<?php } elseif ($postType == 'repost') { ?> <p class="property-block-name">Repost Of</p> <?php if (is_string(Mf2\getProp($hEntry, 'repost-of'))) { ?> <a href="<?php echo Mf2\getProp($hEntry, 'repost-of'); ?> "><?php echo Mf2\getProp($hEntry, 'repost-of'); ?> </a> <?php } elseif (Mf2\isMicroformat(Mf2\getProp($hEntry, 'repost-of'))) { ?> <?php if (!in_array('h-cite', Mf2\getProp($hEntry, 'repost-of'))) { ?> <p>The nested <code>h-cite</code> microformat should be an <a href="http://microformats.org/wiki/h-cite"><code>h-cite</code></a> as it refers to off-site content.</p> <?php } ?> <?php if (Mf2\hasProp(Mf2\getProp($hEntry, 'repost-of'), 'url')) { ?> <a href="<?php echo Mf2\getProp(Mf2\getProp($hEntry, 'repost-of'), 'url'); ?>