Example #1
0
/**
 * Build a simplified author array for the first h-entry found in $mf.
 *
 * When $mf contains no h-entry, an empty placeholder h-entry is passed to
 * M\getAuthor() instead, so author discovery still runs against $mf and $url.
 *
 * @param array       $mf           Parsed microformats structure to search.
 * @param string      $url          Passed through to M\getAuthor() (presumably the URL $mf was parsed from — confirm with callers).
 * @param string|null $defaultPhoto Fallback passed to M\getPlaintext() for the "photo" property.
 *
 * @return array|null Array with "name", "photo" and "url" keys, or null when M\getAuthor() returns null.
 */
function authorHCard(array $mf, $url, $defaultPhoto = null)
{
    $found = M\findMicroformatsByType($mf, 'h-entry');

    // Fall back to a stub entry so that author discovery can still be attempted.
    $entry = count($found) == 0
        ? ['type' => ['h-entry'], 'properties' => ['name' => '']]
        : $found[0];

    $author = M\getAuthor($entry, $mf, $url);

    if ($author !== null) {
        $name = M\getPlaintext($author, 'name');
        $photo = M\getPlaintext($author, 'photo', $defaultPhoto);
        $authorUrl = M\getPlaintext($author, 'url', null);

        return ['name' => $name, 'photo' => $photo, 'url' => $authorUrl];
    }

    return null;
}
Example #2
0
/**
 * Clean up a single h-entry into a flat array for indexing and display.
 *
 * Starting from the result of comments\parse(), this adds plaintext and purified
 * HTML content, a normalised UTC publication timestamp, photo/logo, the URLs of
 * related posts (in-reply-to, like-of, repost-of), author data and location.
 *
 * @param array         $hEntry               The h-entry microformat to process.
 * @param array         $mf                   Parsed microformats of the page containing $hEntry; its
 *                                            rels.author list is used as an authorship fallback.
 * @param string        $url                  URL passed to the author discovery helpers.
 * @param bool          $resolveRelationships When true, each related URL is fetched, parsed and its first
 *                                            h-entry processed (non-recursively) as a referenced post.
 * @param Guzzle\Http\ClientInterface|null $client HTTP client; a default Guzzle client is created when null.
 * @param callable|null $purifier             Callable applied to HTML values (content, photo, logo);
 *                                            defaults to the identity function.
 *
 * @return array Two-element list: [cleansed entry array, list of cleansed referenced posts].
 */
function processHEntry($hEntry, $mf, $url, $resolveRelationships = true, Guzzle\Http\ClientInterface $client = null, $purifier = null)
{
    if ($client === null) {
        $client = new Guzzle\Http\Client();
    }
    if ($purifier === null) {
        // No purifier supplied: pass HTML through untouched.
        $purifier = function ($value) {
            return $value;
        };
    }

    // Use comment-presentation algorithm to clean up.
    $cleansed = comments\parse($hEntry);
    $referencedPosts = [];
    // Used internally to keep track of what referenced posts have been processed already.
    $referencedPostUrls = [];

    $indexedContent = M\getPlaintext($hEntry, 'content', $cleansed['text']);
    $displayContent = $purifier(M\getHtml($hEntry, 'content'));
    $cleansed['content'] = $indexedContent;
    $cleansed['display_content'] = $displayContent;

    // Handle all datetime cases, as per http://indiewebcamp.com/h-entry#How_to_consume_h-entry
    $utc = new DateTimeZone('UTC');
    $now = new DateTime('now', $utc);
    $hasPublished = !empty($cleansed['published']);
    $utcPublished = null;
    if ($hasPublished) {
        // Only parse a real value: DateTime happily turns false/null/'' into "now",
        // which would hide the fact that no datetime was given.
        try {
            $utcPublished = new DateTime($cleansed['published']);
            $utcPublished->setTimezone($utc);
        } catch (Exception $e) {
            $utcPublished = null;
        }
    }

    if (!$hasPublished || ($utcPublished !== null && $utcPublished > $now)) {
        // If there’s absolutely no datetime (or it claims to be in the future), our best guess has to be “now”.
        // Additional heuristics could be used in the bizarre case of having a feed where an item without datetime is
        // published in between two items with datetimes, allowing us to guess the published datetime is between the two,
        // but until that actually happens it’s not worth coding for.
        $cleansed['published'] = gmdate('c');
        $utcPublished = $now;
    } elseif ($utcPublished === null) {
        // “published” was given but could not be parsed. The raw value is left in place and only the
        // indexed UTC timestamp falls back to “now”.
        $utcPublished = $now;
    }
    // Otherwise “published” is given and parses correctly.
    // Currently it’s not trivial to figure out if a given datetime is floating or not, so assume that the timezone
    // given here is correct for the moment. When this can be determined, follow http://indiewebcamp.com/datetime#implying_timezone_from_webmentions

    // Store a string representation of published to be indexed+queried upon.
    $cleansed['published_utc'] = $utcPublished->format(DateTime::W3C);

    if (M\hasProp($hEntry, 'photo')) {
        $cleansed['photo'] = $purifier(M\getHtml($hEntry, 'photo'));
    }
    if (M\hasProp($hEntry, 'logo')) {
        $cleansed['logo'] = $purifier(M\getHtml($hEntry, 'logo'));
    }

    // For every post this post has a relation to (in-reply-to, repost-of, like-of etc.), fetch and resolve that URL,
    // index it as its own post (if it doesn’t already exist) and store only a reference to it here.
    $references = ['in-reply-to' => [], 'like-of' => [], 'repost-of' => []];
    foreach (array_keys($references) as $relation) {
        $refUrls = [];
        // These will be feed pages not permalink pages so cannot check rels, only microformats properties.
        if (M\hasProp($hEntry, $relation)) {
            foreach ($hEntry['properties'][$relation] as $value) {
                if (is_string($value)) {
                    $refUrls[] = $value;
                } elseif (is_array($value) && isset($value['html'])) {
                    // e-* properties unlikely to be URLs but try all the same.
                    $refUrls[] = $value['value'];
                } elseif (M\isMicroformat($value)) {
                    if (M\hasProp($value, 'url')) {
                        $refUrls[] = M\getProp($value, 'url');
                    } elseif (M\hasProp($value, 'uid')) {
                        $refUrls[] = M\getProp($value, 'uid');
                    }
                }
                // Anything else means the microformats parsing spec has changed; we don’t know how to
                // interpret it, so it is ignored.
            }
        }

        if ($resolveRelationships) {
            foreach ($refUrls as $refUrl) {
                try {
                    $resp = $client->get($refUrl)->send();
                    $refResolvedUrl = $resp->getEffectiveUrl();
                    $refMf = Mf2\parse($resp->getBody(true), $refResolvedUrl);
                    $refHEntries = M\findMicroformatsByType($refMf, 'h-entry');
                    $relatedUrl = $refResolvedUrl;
                    if (count($refHEntries) > 0) {
                        $refHEntry = $refHEntries[0];
                        $refSearchUrl = M\hasProp($refHEntry, 'url') ? M\getProp($refHEntry, 'url') : $refResolvedUrl;
                        if (!in_array($refSearchUrl, $referencedPostUrls, true)) {
                            // Referenced posts are processed without resolving their own relationships,
                            // so this recursion is only ever one level deep.
                            list($refCleansed) = processHEntry($refHEntry, $refMf, $refResolvedUrl, false, $client, $purifier);
                            $referencedPosts[] = $refCleansed;
                            $referencedPostUrls[] = $refSearchUrl;
                        }
                        // Always reference the post by the same URL it was recorded under, whether or not
                        // this is the first time it has been seen.
                        $relatedUrl = $refSearchUrl;
                    }
                    $references[$relation][] = $relatedUrl;
                } catch (Guzzle\Common\Exception\GuzzleException $e) {
                    // Fetch failed: fall back to the URL exactly as given in the entry.
                    $references[$relation][] = $refUrl;
                }
            }
        } else {
            // If we’re not resolving relationships, the most accurate data we have is the data given already.
            $references[$relation] = $refUrls;
        }

        // Now we have the best possible list of URLs, attach it to $cleansed.
        // array_values() re-indexes after de-duplication so the result stays a proper list (gaps in the keys
        // would make it serialise as a JSON object rather than an array).
        $cleansed[$relation] = array_values(array_unique($references[$relation]));
    }

    if (!M\hasProp($hEntry, 'author') || !M\isMicroformat($hEntry['properties']['author'][0])) {
        // No authorship data given, we need to find the author!
        // TODO: proper /authorship implementation.
        // TODO: wrap proper /authorship implementation in layer which does purification, simplification, fallback.
        $potentialAuthor = M\getAuthor($hEntry, $mf, $url);
        if (M\isMicroformat($potentialAuthor)) {
            $cleansed['author'] = flattenHCard($potentialAuthor, $url);
        } elseif (!empty($mf['rels']['author'])) {
            // TODO: look in elasticsearch index for a person with the first rel-author URL then fall back to fetching.
            // Fetch the first author URL and look for a representative h-card there.
            $relAuthorUrl = $mf['rels']['author'][0];
            $relAuthorMf = Mf2\fetch($relAuthorUrl);
            // Guard against a failed/non-HTML fetch which yields no microformats array.
            if (is_array($relAuthorMf)) {
                $relMes = !empty($relAuthorMf['rels']['me']) ? $relAuthorMf['rels']['me'] : [];
                foreach (M\findMicroformatsByType($relAuthorMf, 'h-card') as $raHCard) {
                    // Representative h-card: uid and url both equal the author page URL…
                    $matchesAuthorUrl = M\getProp($raHCard, 'url') === $relAuthorUrl
                        && M\getProp($raHCard, 'uid') === $relAuthorUrl;
                    // …or one of its url values is also a rel=me URL of the author page.
                    $matchesRelMe = M\hasProp($raHCard, 'url')
                        && count(array_intersect($raHCard['properties']['url'], $relMes)) > 0;
                    if ($matchesAuthorUrl || $matchesRelMe) {
                        $cleansed['author'] = flattenHCard($raHCard, $relAuthorUrl);
                    }
                }
            }
        }

        // If after all that there’s still no authorship data, fake some.
        if (!isset($cleansed['author']['name']) || $cleansed['author']['name'] === false) {
            $cleansed['author'] = flattenHCard(['properties' => []], $url);
            try {
                $response = $client->head("{$cleansed['author']['url']}/favicon.ico")->send();
                // Cast so a missing header is treated as an empty string rather than passed to strpos() as null.
                if (strpos((string) $response->getHeader('content-type'), 'image') !== false) {
                    // This appears to be a valid image!
                    $cleansed['author']['photo'] = $response->getEffectiveUrl();
                }
            } catch (Guzzle\Common\Exception\GuzzleException $e) {
                // No photo fallback could be found.
            }
        }
    }

    // TODO: this will be M\getLocation when it’s ported to the other library.
    $location = getLocation($hEntry);
    if ($location !== null) {
        $cleansed['location'] = $location;
        // TODO: do additional reverse lookups of address details if none are provided.
        if (!empty($location['latitude']) && !empty($location['longitude'])) {
            // If this is a valid point, add a point with mashed names for elasticsearch to index.
            $cleansed['location_point'] = ['lat' => $location['latitude'], 'lon' => $location['longitude']];
        }
    }

    // TODO: figure out what other properties need storing/indexing, and whether anything else needs mashing for
    // elasticsearch to index more easily.
    return [$cleansed, $referencedPosts];
}