Example #1
0
/**
 * Cleans a parsed h-entry into a flat array ready for indexing and collects the posts it references.
 *
 * The entry is first run through comments\parse(), then enriched with plaintext/HTML content, a normalised
 * UTC publication datetime, photo/logo, relation URLs (in-reply-to, like-of, repost-of), author and location.
 *
 * @param array         $hEntry               Parsed microformats2 h-entry (read via its 'properties' key).
 * @param array         $mf                   Parsed microformats document the entry came from; its
 *                                            ['rels']['author'] list is used as an authorship fallback.
 * @param string        $url                  URL handed to the authorship helpers and used for the fake-author fallback.
 * @param bool          $resolveRelationships When true, every related URL is fetched and, if it holds an h-entry,
 *                                            processed (non-recursively) and returned as a referenced post.
 * @param Guzzle\Http\ClientInterface|null $client   HTTP client; a default Guzzle client is created when null.
 * @param callable|null $purifier             Callback applied to HTML values; identity when null.
 *
 * @return array Two-element list: [cleansed entry array, list of cleansed referenced posts].
 */
function processHEntry($hEntry, $mf, $url, $resolveRelationships = true, Guzzle\Http\ClientInterface $client = null, $purifier = null)
{
    if ($client === null) {
        $client = new Guzzle\Http\Client();
    }
    if ($purifier === null) {
        $purifier = function ($value) {
            return $value;
        };
    }
    // Use comment-presentation algorithm to clean up.
    $cleansed = comments\parse($hEntry);
    $referencedPosts = [];
    // Used internally to keep track of what referenced posts have been processed already.
    $referencedPostUrls = [];
    $cleansed['content'] = M\getPlaintext($hEntry, 'content', $cleansed['text']);
    $cleansed['display_content'] = $purifier(M\getHtml($hEntry, 'content'));

    // Handle all datetime cases, as per http://indiewebcamp.com/h-entry#How_to_consume_h-entry
    $utcZone = new DateTimeZone('UTC');
    $now = new DateTime('now', $utcZone);
    $utcPublished = false;
    // Only attempt to parse a non-empty value: DateTime treats an empty/false argument as "now", which would
    // silently hide the fact that no datetime was given.
    if (!empty($cleansed['published'])) {
        try {
            $utcPublished = new DateTime($cleansed['published']);
            $utcPublished->setTimezone($utcZone);
        } catch (Exception $e) {
            // Unparsable datetime: treated the same as a missing one below.
            $utcPublished = false;
        }
    }
    if ($utcPublished === false or $utcPublished > $now) {
        // Missing, unparsable or future datetime: our best guess has to be "now".
        // Additional heuristics could be used in the bizarre case of having a feed where an item without datetime
        // is published in between two items with datetimes, but until that actually happens it is not worth coding for.
        $cleansed['published'] = gmdate('c');
        $utcPublished = $now;
    }
    // Otherwise "published" was given and parsed correctly. It is currently not trivial to figure out whether a
    // datetime is floating, so the given timezone is assumed correct. When this can be determined, follow
    // http://indiewebcamp.com/datetime#implying_timezone_from_webmentions

    // Store a string representation of published to be indexed+queried upon.
    $cleansed['published_utc'] = $utcPublished->format(DateTime::W3C);

    if (M\hasProp($hEntry, 'photo')) {
        $cleansed['photo'] = $purifier(M\getHtml($hEntry, 'photo'));
    }
    if (M\hasProp($hEntry, 'logo')) {
        $cleansed['logo'] = $purifier(M\getHtml($hEntry, 'logo'));
    }

    // For every post this post has a relation to (in-reply-to, repost-of, like-of), fetch and resolve that URL,
    // index it as its own post (if it has not been seen already) and store only a reference to it here.
    $references = ['in-reply-to' => [], 'like-of' => [], 'repost-of' => []];
    foreach (array_keys($references) as $relation) {
        $refUrls = [];
        // These will be feed pages not permalink pages so cannot check rels, only microformats properties.
        if (M\hasProp($hEntry, $relation)) {
            foreach ($hEntry['properties'][$relation] as $value) {
                if (is_string($value)) {
                    $refUrls[] = $value;
                } elseif (is_array($value) and isset($value['html'])) {
                    // e-* properties are unlikely to be URLs but try all the same (when a plain value exists).
                    if (isset($value['value'])) {
                        $refUrls[] = $value['value'];
                    }
                } elseif (M\isMicroformat($value)) {
                    if (M\hasProp($value, 'url')) {
                        $refUrls[] = M\getProp($value, 'url');
                    } elseif (M\hasProp($value, 'uid')) {
                        $refUrls[] = M\getProp($value, 'uid');
                    }
                }
                // Anything else means the microformats parsing spec has changed; we do not know how to
                // interpret it, so it is ignored.
            }
        }
        if ($resolveRelationships) {
            foreach ($refUrls as $refUrl) {
                try {
                    $resp = $client->get($refUrl)->send();
                    $refResolvedUrl = $resp->getEffectiveUrl();
                    $refMf = Mf2\parse($resp->getBody(true), $refResolvedUrl);
                    $refHEntries = M\findMicroformatsByType($refMf, 'h-entry');
                    $relatedUrl = $refResolvedUrl;
                    if (count($refHEntries) > 0) {
                        $refHEntry = $refHEntries[0];
                        $refSearchUrl = M\hasProp($refHEntry, 'url') ? M\getProp($refHEntry, 'url') : $refResolvedUrl;
                        // Always reference the post by its own URL, even when it was already processed for
                        // another relation, so the same post is never recorded under two different URLs.
                        $relatedUrl = $refSearchUrl;
                        if (!in_array($refSearchUrl, $referencedPostUrls, true)) {
                            // Relationships of the referenced post are deliberately not resolved (one level only).
                            list($refCleansed) = processHEntry($refHEntry, $refMf, $refResolvedUrl, false, $client, $purifier);
                            $referencedPosts[] = $refCleansed;
                            $referencedPostUrls[] = $refSearchUrl;
                        }
                    }
                    $references[$relation][] = $relatedUrl;
                } catch (Guzzle\Common\Exception\GuzzleException $e) {
                    // Fetching failed: the URL as given is the best data available.
                    $references[$relation][] = $refUrl;
                }
            }
        } else {
            // If we are not resolving relationships, the most accurate data we have is the data given already.
            $references[$relation] = $refUrls;
        }
        // Now we have the best possible list of URLs, attach it to $cleansed. Reindexed so that removing
        // duplicates never leaves gaps in the keys.
        $cleansed[$relation] = array_values(array_unique($references[$relation]));
    }

    if (!M\hasProp($hEntry, 'author') or !M\isMicroformat($hEntry['properties']['author'][0])) {
        // No authorship data given, we need to find the author!
        // TODO: proper /authorship implementation.
        // TODO: wrap proper /authorship implementation in layer which does purification, simplification, fallback.
        $potentialAuthor = M\getAuthor($hEntry, $mf, $url);
        if (M\isMicroformat($potentialAuthor)) {
            $cleansed['author'] = flattenHCard($potentialAuthor, $url);
        } elseif (!empty($mf['rels']['author'])) {
            // TODO: look in elasticsearch index for a person with the first rel-author URL then fall back to fetching.
            // Fetch the first author URL and look for a representative h-card there.
            $authorPageUrl = $mf['rels']['author'][0];
            $relAuthorMf = Mf2\fetch($authorPageUrl);
            // NOTE(review): guarding against a non-array result in case the fetch yields nothing parseable.
            if (is_array($relAuthorMf)) {
                $relMes = !empty($relAuthorMf['rels']['me']) ? $relAuthorMf['rels']['me'] : [];
                foreach (M\findMicroformatsByType($relAuthorMf, 'h-card') as $raHCard) {
                    if (M\getProp($raHCard, 'url') === $authorPageUrl and M\getProp($raHCard, 'uid') === $authorPageUrl) {
                        // h-card whose url and uid are both the author page URL.
                        $cleansed['author'] = flattenHCard($raHCard, $authorPageUrl);
                    } elseif (M\hasProp($raHCard, 'url') and count(array_intersect($raHCard['properties']['url'], $relMes)) > 0) {
                        // h-card whose url matches one of the author page's rel=me links.
                        $cleansed['author'] = flattenHCard($raHCard, $authorPageUrl);
                    }
                }
            }
        }
        // If after all that there is still no authorship data, fake some.
        // NOTE(review): relies on comments\parse() / flattenHCard() reporting a missing name as boolean false - confirm.
        if ($cleansed['author']['name'] === false) {
            $cleansed['author'] = flattenHCard(['properties' => []], $url);
            try {
                $response = $client->head("{$cleansed['author']['url']}/favicon.ico")->send();
                // Cast so that an absent content-type header is treated as an empty string.
                if (strpos((string) $response->getHeader('content-type'), 'image') !== false) {
                    // This appears to be a valid image!
                    $cleansed['author']['photo'] = $response->getEffectiveUrl();
                }
            } catch (Guzzle\Common\Exception\GuzzleException $e) {
                // No photo fallback could be found.
            }
        }
    }

    // TODO: this will be M\getLocation when it is ported to the other library.
    if (($location = getLocation($hEntry)) !== null) {
        $cleansed['location'] = $location;
        // TODO: do additional reverse lookups of address details if none are provided.
        if (!empty($location['latitude']) and !empty($location['longitude'])) {
            // If this is a valid point, add a point with mashed names for elasticsearch to index.
            $cleansed['location_point'] = ['lat' => $location['latitude'], 'lon' => $location['longitude']];
        }
    }
    // TODO: figure out what other properties need storing/indexing, and whether anything else needs mashing for
    // elasticsearch to index more easily.
    return [$cleansed, $referencedPosts];
}
Example #2
0
 /**
  * Fetches a feed, (re)subscribes to its PuSH hub when one is advertised, and stores every h-entry it lists.
  *
  * For each entry the permalink is fetched and parsed, the "entries" row is created or updated, tags and
  * syndication URLs are recorded, and the entry is attached to every channel whose source filter it matches.
  * Progress is echoed to standard output. The feed's refresh_in_progress flag is set for the duration of the
  * run and is always cleared again, even when processing fails.
  *
  * @param mixed $feed_id Identifier passed to db\get_feed() and used to look up channel sources.
  *
  * @return void
  */
 public static function refresh_feed($feed_id)
 {
     $feed = db\get_feed($feed_id);
     if (!$feed) {
         echo "Feed {$feed_id} was not found, nothing to refresh\n";
         return;
     }
     echo "Refreshing feed " . $feed->feed_url . " ({$feed_id})\n\n";
     // If this feed is already being refreshed, re-queue the job to give the first job a chance to finish.
     // NOTE(review): the delay argument is 5 although this was previously described as 30 seconds - confirm
     // the unit DeferredTask::queue() expects.
     if ($feed->refresh_in_progress) {
         echo "This feed is already being processed, re-queuing for later\n";
         DeferredTask::queue('FeedTask', 'refresh_feed', $feed_id, 5);
         return;
     }
     // mark that this feed is currently being refreshed
     $feed->refresh_started = date('Y-m-d H:i:s');
     $feed->refresh_in_progress = 1;
     $feed->save();
     // only deal with mf2 feeds for now
     try {
         $response = request\get_url($feed->feed_url, true);
         $header_rels = IndieWeb\http_rels($response['headers']);
         $html = $response['body'];
         $mf2 = feeds\parse_mf2($html, $feed->feed_url);
         // HTTP Link headers take priority over rels found in the HTML.
         $hub_url = false;
         if (k($header_rels, 'hub')) {
             $hub_url = $header_rels['hub'][0];
             $hub_url_source = 'http';
         } elseif (k($mf2, 'rels') && k($mf2['rels'], 'hub')) {
             $hub_url = $mf2['rels']['hub'][0];
             $hub_url_source = 'html';
         }
         // check for PuSH info and subscribe to the hub if found
         if ($hub_url) {
             if (k($header_rels, 'self')) {
                 $self_url = $header_rels['self'][0];
                 $self_url_source = 'http';
             } elseif (k($mf2, 'rels') && k($mf2['rels'], 'self')) {
                 $self_url = $mf2['rels']['self'][0];
                 $self_url_source = 'html';
             } else {
                 $self_url = $feed->feed_url;
                 $self_url_source = 'default';
             }
             // Keep track of what the hub URL was last time we saw it
             $last_hub_url = $feed->push_hub_url;
             // Store the new hub and topic
             $feed->push_hub_url = $hub_url;
             $feed->push_topic_url = $self_url;
             // (re-)subscribe if not subscribed yet, if the hub has changed,
             // or if the expiration date is less than five minutes away
             $expires_soon = $feed->push_expiration && strtotime($feed->push_expiration) - 300 < time();
             if ($feed->push_subscribed == 0 || $hub_url != $last_hub_url || $expires_soon) {
                 echo "Attempting to subscribe to the hub!\n";
                 echo "Hub: " . $feed->push_hub_url . " (found in {$hub_url_source})\n";
                 echo "Topic: " . $feed->push_topic_url . " (found in {$self_url_source})\n";
                 // This will cause the hub to make a GET request to the callback URL, which we have to verify
                 $hub_response = request\post($feed->push_hub_url, ['hub.mode' => 'subscribe', 'hub.topic' => $feed->push_topic_url, 'hub.callback' => 'http://' . Config::$hostname . '/push/feed/' . $feed->hash]);
                 echo "Hub responded:\n";
                 echo $hub_response['status'] . "\n";
                 echo $hub_response['body'] . "\n";
             }
             $feed->save();
         }
         // check if there are any h-entry posts
         $info = feeds\find_feed_info($mf2);
         if ($info) {
             foreach ($info['entries'] as $i => $feed_item) {
                 echo "\nProcessing entry {$i}\n";
                 // Find the canonical URL for the entry and fetch the page
                 $entry_url = Mf2\getPlaintext($feed_item, 'url');
                 if (!$entry_url) {
                     echo "No URL was found for this entry\n";
                     continue;
                 }
                 echo $entry_url . "\n";
                 // Parse the entry for all required info and store in the "entries" table
                 $entry_html = request\get_url($entry_url);
                 if (!$entry_html) {
                     // Bad response returned, might be 410 deleted
                     // TODO: Figure out if it's a deleted post or just temporary error
                     continue;
                 }
                 $parsed_page = feeds\parse_mf2($entry_html, $entry_url);
                 $entries = Mf2\findMicroformatsByType($parsed_page['items'], 'h-entry');
                 // The permalink page may contain no h-entry at all.
                 if (!isset($entries[0]) || !Mf2\isMicroformat($entries[0])) {
                     echo "Does not appear to be a microformat\n";
                     continue;
                 }
                 $entry_mf2 = $entries[0];
                 if (!in_array('h-entry', $entry_mf2['type'])) {
                     print_r($entry_mf2);
                     continue;
                 }
                 // Update the existing row for this feed+URL, or start a new one.
                 $entry = ORM::for_table('entries')->where('feed_id', $feed->id)->where('url', $entry_url)->find_one();
                 if (!$entry) {
                     $entry = ORM::for_table('entries')->create();
                     $entry->feed_id = $feed->id;
                     $entry->url = $entry_url;
                 }
                 // Decide whether to store the name, summary and content depending on whether they are unique
                 $name = Mf2\getPlaintext($entry_mf2, 'name');
                 $summary = Mf2\getPlaintext($entry_mf2, 'summary');
                 $content = Mf2\getHtml($entry_mf2, 'content');
                 $content_text = Mf2\getPlaintext($entry_mf2, 'content');
                 // Store the name if it's different from the summary and the content
                 if (!feeds\content_is_equal($name, $summary) && !feeds\content_is_equal($name, $content_text)) {
                     $entry->name = $name;
                     echo "Entry has a name: {$name}\n";
                 } else {
                     $entry->name = '';
                 }
                 // Store the summary if it's different from the content
                 if ($summary && !feeds\content_is_equal($summary, $content_text)) {
                     $entry->summary = $summary;
                     echo "Entry has a summary\n";
                 } else {
                     $entry->summary = '';
                 }
                 $entry->content = $content;
                 $date_string = Mf2\getPlaintext($entry_mf2, 'published');
                 if ($date_string) {
                     try {
                         $date = new DateTime($date_string);
                         // Record the author's offset before normalising the stored timestamp to UTC.
                         $entry->timezone_offset = $date->format('Z');
                         $date->setTimezone(new DateTimeZone('UTC'));
                         $entry->date_published = $date->format('Y-m-d H:i:s');
                         echo "Published: {$entry->date_published}\n";
                     } catch (Exception $date_error) {
                         echo "Error parsing date: {$date_string}\n";
                     }
                 }
                 // Set the date published to now if none was found in the entry
                 if (!$entry->date_published) {
                     $entry->date_published = date('Y-m-d H:i:s');
                 }
                 if (Mf2\getPlaintext($entry_mf2, 'like-of')) {
                     $entry->like_of_url = Mf2\getPlaintext($entry_mf2, 'like-of');
                 }
                 if (Mf2\getPlaintext($entry_mf2, 'repost-of')) {
                     $entry->repost_of_url = Mf2\getPlaintext($entry_mf2, 'repost-of');
                 }
                 // TODO: move this to a helper
                 // finds the URL for a property if the property is a plain string or a nested h-cite
                 if (Mf2\getPlaintext($entry_mf2, 'in-reply-to')) {
                     $reply_target = $entry_mf2['properties']['in-reply-to'][0];
                     if (Mf2\isMicroformat($reply_target) && isset($reply_target['properties']['url'][0])) {
                         $entry->in_reply_to_url = $reply_target['properties']['url'][0];
                     } else {
                         // Plain string, or a nested microformat without its own url property.
                         $entry->in_reply_to_url = Mf2\getPlaintext($entry_mf2, 'in-reply-to');
                     }
                 }
                 if (Mf2\getPlaintext($entry_mf2, 'photo')) {
                     $entry->photo_url = Mf2\getPlaintext($entry_mf2, 'photo');
                 }
                 if (Mf2\getPlaintext($entry_mf2, 'video')) {
                     $entry->video_url = Mf2\getPlaintext($entry_mf2, 'video');
                 }
                 if (Mf2\getPlaintext($entry_mf2, 'audio')) {
                     $entry->audio_url = Mf2\getPlaintext($entry_mf2, 'audio');
                 }
                 // Prefer the entry's own author, falling back to the feed-level author.
                 $author_mf2 = false;
                 if (Mf2\hasProp($entry_mf2, 'author')) {
                     $author_mf2 = $entry_mf2['properties']['author'][0];
                 } elseif (Mf2\hasProp($info, 'author')) {
                     $author_mf2 = $info['properties']['author'][0];
                 }
                 if ($author_mf2) {
                     $entry->author_name = Mf2\getPlaintext($author_mf2, 'name');
                     $entry->author_url = Mf2\getPlaintext($author_mf2, 'url');
                     $entry->author_photo = Mf2\getPlaintext($author_mf2, 'photo');
                 } else {
                     echo "NO AUTHOR WAS FOUND!!\n";
                 }
                 if (Mf2\hasProp($entry_mf2, 'like')) {
                     $entry->num_likes = count($entry_mf2['properties']['like']);
                 }
                 if (Mf2\hasProp($entry_mf2, 'repost')) {
                     $entry->num_reposts = count($entry_mf2['properties']['repost']);
                 }
                 if (Mf2\hasProp($entry_mf2, 'comment')) {
                     $entry->num_comments = count($entry_mf2['properties']['comment']);
                 }
                 if (Mf2\hasProp($entry_mf2, 'rsvp')) {
                     $entry->num_rsvps = count($entry_mf2['properties']['rsvp']);
                 }
                 $entry->date_retrieved = date('Y-m-d H:i:s');
                 $entry->date_updated = date('Y-m-d H:i:s');
                 $entry->save();
                 // Add or update all tags for this entry
                 $entry_tags = [];
                 if (Mf2\hasProp($entry_mf2, 'category')) {
                     // Categories may be nested microformats (arrays); only plain strings can be used as tags.
                     $plain_categories = array_filter($entry_mf2['properties']['category'], 'is_string');
                     $entry_tags = array_unique(array_map(function ($c) {
                         return strtolower(trim($c, '#'));
                     }, $plain_categories));
                     foreach ($entry_tags as $tag) {
                         if (!ORM::for_table('entry_tags')->where('entry_id', $entry->id)->where('tag', $tag)->find_one()) {
                             $et = ORM::for_table('entry_tags')->create();
                             $et->entry_id = $entry->id;
                             $et->tag = $tag;
                             $et->save();
                         }
                     }
                 }
                 // TODO: Remove tags that are no longer found in the entry
                 // Add syndication URLs
                 if (Mf2\hasProp($entry_mf2, 'syndication')) {
                     // Only plain string values can be stored as syndication URLs.
                     $syndications = array_unique(array_filter($entry_mf2['properties']['syndication'], 'is_string'));
                     foreach ($syndications as $syn) {
                         if (!ORM::for_table('entry_syndications')->where('entry_id', $entry->id)->where('syndication_url', $syn)->find_one()) {
                             $es = ORM::for_table('entry_syndications')->create();
                             $es->entry_id = $entry->id;
                             $es->syndication_url = $syn;
                             $es->save();
                         }
                     }
                 }
                 // TODO: Remove urls that are no longer found in the entry
                 // Run through all the channels that have this feed and add the entry to each channel
                 $sources = ORM::for_table('channel_sources')->where('feed_id', $feed_id)->find_many();
                 foreach ($sources as $source) {
                     $add = false;
                     if ($source->filter) {
                         // The filter is a comma-separated list of terms; one matching term is enough.
                         $searchable_text = $entry->content . "\n" . $entry->name . "\n" . $entry->summary;
                         foreach (explode(',', $source->filter) as $tag) {
                             $tag = trim($tag);
                             if ($tag === '') {
                                 // An empty term would build the pattern /\b\b/, which matches almost any text.
                                 continue;
                             }
                             // Quote the term so characters such as "/", "+" or "." are matched literally.
                             $matches_text = preg_match('/\\b' . preg_quote($tag, '/') . '\\b/', $searchable_text);
                             if ($matches_text || in_array(strtolower($tag), $entry_tags, true)) {
                                 $add = true;
                                 break;
                             }
                         }
                     } else {
                         // No filter: every entry of the feed goes into the channel.
                         $add = true;
                     }
                     if ($add) {
                         $ce = ORM::for_table('channel_entries')->where('channel_id', $source->channel_id)->where('entry_id', $entry->id)->find_one();
                         if (!$ce) {
                             $ce = ORM::for_table('channel_entries')->create();
                             $ce->channel_id = $source->channel_id;
                             $ce->entry_id = $entry->id;
                         }
                         $ce->entry_published = $entry->date_published;
                         // NOTE(review): date_created is refreshed on every run, also for existing rows - confirm intended.
                         $ce->date_created = date('Y-m-d H:i:s');
                         $ce->save();
                         echo "Adding to channel\n";
                     }
                 }
             }
         }
         $feed->last_retrieved = date('Y-m-d H:i:s');
     } catch (Exception $ex) {
         echo "Error processing feed!\n";
         echo $ex->getMessage() . "\n";
         echo $ex->getTraceAsString() . "\n";
     } finally {
         // Always mark complete, otherwise a failed run would leave the feed flagged as in progress
         // and every later job for it would just re-queue itself.
         $feed->refresh_in_progress = 0;
         $feed->save();
     }
 }