/**
 * Assemble the usage event that describes the current request.
 *
 * @param $hookName string Name of the hook that triggered this call.
 * @param $args array Hook arguments; the first entry is handed on as the
 *  template manager.
 * @return array|null|false The usage event data (keys: time, pubObject,
 *  assocType, canonicalUrl, mimeType, identifiers, docSize, downloadSuccess,
 *  serviceUri, ip, host, user, roles, userAgent, referrer, classification).
 *  Null for the 'FileManager::downloadFileFinished' hook. False when the
 *  request is not a page request, has no context, or when no publication
 *  object could be determined.
 */
protected function buildUsageEvent($hookName, $args) {
	// The event for a finished download has already been built and passed
	// to the other registered hooks, so there is nothing left to do here.
	if ($hookName == 'FileManager::downloadFileFinished') return null;

	$app = Application::getApplication();
	$request = $app->getRequest();
	$router = $request->getRouter(); /* @var $router PageRouter */
	$templateMgr = $args[0]; /* @var $templateMgr TemplateManager */

	// Only page requests are counted.
	if (!is_a($router, 'PageRouter')) return false;

	// A context is mandatory.
	$context = $router->getContext($request);
	if (!$context) return false;

	// Ask the concrete implementation to describe the requested object.
	$eventData = $this->getUsageEventData($hookName, $args, $request, $router, $templateMgr, $context);
	list($pubObject, $downloadSuccess, $assocType, $idParams, $urlPage, $urlOp, $urlParams) = $eventData;
	if (!$pubObject) return false;

	// Timestamp.
	$time = Core::getCurrentDate();

	// Document size and MIME type: HTML pages carry no file download,
	// everything else is treated as a file.
	$isHtmlPage = in_array($assocType, $this->getHtmlPageAssocTypes());
	$docSize = $isHtmlPage ? 0 : (int) $pubObject->getFileSize();
	$mimeType = $isHtmlPage ? 'text/html' : $pubObject->getFileType();

	// Canonical URL of the requested object.
	$canonicalUrl = $router->url($request, null, $urlPage, $urlOp, $urlParams);

	// Log the configured server name rather than an alias. A request base
	// url that is a configured override (context specific or index) is not
	// an alias and is left untouched.
	$configuredBaseUrl = Config::getVar('general', 'base_url');
	$currentBaseUrl = $request->getBaseUrl();
	$isAlias = $currentBaseUrl !== $configuredBaseUrl
		&& !in_array($currentBaseUrl, Config::getContextBaseUrls())
		&& $currentBaseUrl !== Config::getVar('general', 'base_url[index]');
	if ($isAlias) {
		// Prefer the base url override of this context, if there is one.
		$replacement = Config::getVar('general', 'base_url[' . $context->getPath() . ']') ?: $configuredBaseUrl;
		$canonicalUrl = str_replace($currentBaseUrl, $replacement, $canonicalUrl);
	}

	// Public identifiers.
	// 1) A system internal ID, built from the site ID, the context ID and
	// the object specific ID parts.
	$contextIdPart = 'c' . $context->getId();
	$siteId = $this->getUniqueSiteId();
	if (empty($siteId)) {
		// Create and persist a site ID so that publication objects of
		// this site stay identifiable even if the URL or any other
		// externally influenced information changes.
		$siteId = uniqid();
		$this->updateSetting(0, 'uniqueSiteId', $siteId);
	}
	array_unshift($idParams, $siteId, $contextIdPart);
	$appName = $app->getName();
	$identifiers = array(
		'other::' . $appName => $appName . ':' . implode('-', $idParams)
	);

	// 2) Standardized public identifiers, e.g. DOI, URN, etc.
	if ($this->isPubIdObjectType($pubObject)) {
		$pubIdPlugins = PluginRegistry::loadCategory('pubIds', true, $context->getId());
		if (is_array($pubIdPlugins)) {
			foreach ($pubIdPlugins as $plugin) {
				if (!$plugin->getEnabled()) continue;
				$storedPubId = $pubObject->getStoredPubId($plugin->getPubIdType());
				if ($storedPubId) {
					$identifiers[$plugin->getPubIdType()] = $storedPubId;
				}
			}
		}
	}

	// Service URI.
	$serviceUri = $router->url($request, $context->getPath());

	// IP and host. The remote host is never looked up actively (to avoid
	// the performance penalty); it is only recorded when the server
	// already provides it.
	$ip = $request->getRemoteAddr();
	$host = isset($_SERVER['REMOTE_HOST']) ? $_SERVER['REMOTE_HOST'] : null;

	// HTTP user agent.
	$userAgent = $request->getUserAgent();

	// HTTP referrer.
	if (isset($_SERVER['HTTP_REFERER'])) {
		$referrer = $_SERVER['HTTP_REFERER'];
	} else {
		$referrer = null;
	}

	// User and the role IDs held site wide and in the current context.
	$user = $request->getUser();
	$roles = array();
	if ($user) {
		$roleDao = DAORegistry::getDAO('RoleDAO'); /* @var $roleDao PKPRoleDAO */
		$groupedRoles = $roleDao->getByUserIdGroupedByContext($user->getId());
		foreach (array(CONTEXT_SITE, $context->getId()) as $contextId) {
			if (!isset($groupedRoles[$contextId])) continue;
			// The role IDs are the keys of the per-context list.
			foreach ($groupedRoles[$contextId] as $roleId => $unusedRole) {
				$roles[] = $roleId;
			}
		}
	}

	// Simple classification of the request: any role other than reader
	// marks the access as administrative ...
	$classification = null;
	if (!empty($roles) && array_diff($roles, array(ROLE_ID_READER))) {
		$classification = USAGE_EVENT_PLUGIN_CLASSIFICATION_ADMIN;
	}
	// ... and the bot classification overrides any other classification.
	if ($request->isBot()) {
		$classification = USAGE_EVENT_PLUGIN_CLASSIFICATION_BOT;
	}
	// TODO: Classify LOCKSS or similar as 'internal' access.

	/*
	 * Comparison of our event log format with Apache log parameters...
	 *
	 * 1) default parameters:
	 * %h: remote hostname or IP => $ip, $host
	 * %l: remote logname (identd) => not supported, see $user, $roles instead
	 * %u: remote user => not supported, see $user, $roles instead
	 * %t: request time => $time
	 * %r: query => derived objects: $pubObject, $assocType, $canonicalUrl, $identifiers, $serviceUri, $classification
	 * %s: status => not supported (always 200 in our case)
	 * %b: response size => $docSize
	 *
	 * 2) other common parameters
	 * %O: bytes sent => not supported (cannot be reliably determined from within PHP)
	 * %X: connection status => $downloadSuccess (not reliable!)
	 * %{ContentType}o: => $mimeType
	 * %{User-agent}i: => $userAgent
	 * %{Referer}i: => $referrer
	 *
	 * Several items, e.g. time etc., may differ from what Apache
	 * would actually log. But the differences do not matter for our use
	 * cases.
	 */

	// Collect all information into a single array.
	return array(
		'time' => $time,
		'pubObject' => $pubObject,
		'assocType' => $assocType,
		'canonicalUrl' => $canonicalUrl,
		'mimeType' => $mimeType,
		'identifiers' => $identifiers,
		'docSize' => $docSize,
		'downloadSuccess' => $downloadSuccess,
		'serviceUri' => $serviceUri,
		'ip' => $ip,
		'host' => $host,
		'user' => $user,
		'roles' => $roles,
		'userAgent' => $userAgent,
		'referrer' => $referrer,
		'classification' => $classification
	);
}
/**
 * Check if the passed base url is part of the passed url, based on the
 * context base url configuration. Both parameters can represent a full
 * url (host plus path) or just the path, but they have to be consistent:
 * if only one of them carries a host component the check fails.
 *
 * @param $baseUrl string Full base url or just its path info.
 * @param $url string Full url or just its path info.
 * @return boolean|null True if the url starts with the base url and the
 *  match is unambiguous; false if the url does not start with the base url
 *  or the two differ in having a host component; null if the url starts
 *  with the base url but another context base url combined with its
 *  context path equals this base url, so it cannot be told which base url
 *  the url contains.
 */
function _checkBaseUrl($baseUrl, $url) {
	// Both values must either carry a host component or not.
	$baseUrlHasHost = (bool) parse_url($baseUrl, PHP_URL_HOST);
	$urlHasHost = (bool) parse_url($url, PHP_URL_HOST);
	if ($baseUrlHasHost !== $urlHasHost) {
		return false;
	}

	// The url has to start with the base url.
	if (strpos($url, $baseUrl) !== 0) {
		return false;
	}

	// index.php appears right after the base url: no more possible paths.
	// The comparison is strict because strpos() returns false when
	// '/index.php' is absent, and false would loosely equal a 0 offset.
	// NOTE(review): this offset only matches when $baseUrl ends with a
	// slash; for a base url without trailing slash '/index.php' starts at
	// strlen($baseUrl). Confirm the form callers pass in.
	if (strpos($url, '/index.php') === strlen($baseUrl) - 1) {
		return true;
	}

	// Still have to check that there is no other context base url that,
	// combined with its context path, is equal to this base url. If one
	// exists, we can't tell which base url is contained in the url.
	foreach (Config::getContextBaseUrls() as $contextPath => $workingBaseUrl) {
		$urlToCheck = $workingBaseUrl . '/' . $contextPath;
		if (!$baseUrlHasHost) {
			// Compare paths only when the base url has no host.
			$urlToCheck = parse_url($urlToCheck, PHP_URL_PATH);
		}
		if ($baseUrl == $urlToCheck) {
			return null;
		}
	}

	return true;
}
/**
 * Build a usage event for the current request.
 *
 * @param $hookName string Name of the hook that triggered this call.
 * @param $args array Hook arguments. For 'TemplateManager::display' the
 *  first entry is used as the template manager; for the article and issue
 *  handler hooks the first entry is the article or issue and the second
 *  one the requested galley or file.
 * @return array|null|false The usage event data (keys: time, pubObject,
 *  assocType, canonicalUrl, mimeType, identifiers, docSize, downloadSuccess,
 *  serviceUri, ip, host, user, roles, userAgent, referrer, classification).
 *  Null for the 'FileManager::downloadFileFinished' hook. False when the
 *  request is not a page request, has no journal context, does not address
 *  a countable publication object, or comes from an unknown hook.
 */
function _buildUsageEvent($hookName, $args) {
	// Finished downloading a file?
	if ($hookName == 'FileManager::downloadFileFinished') {
		// The usage event for this request is already built and
		// passed to any other registered hook.
		return null;
	}

	$application = Application::getApplication();
	$request = $application->getRequest();
	$router = $request->getRouter(); /* @var $router PageRouter */
	$templateMgr = isset($args[0]) ? $args[0] : null; /* @var $templateMgr TemplateManager */

	// We are just interested in page requests.
	if (!is_a($router, 'PageRouter')) {
		return false;
	}

	// Check whether we are in journal context.
	$journal = $router->getContext($request);
	if (!$journal) {
		return false;
	}

	// Prepare request information.
	$downloadSuccess = false;
	$idParams = array();
	$canonicalUrlParams = array();
	switch ($hookName) {
		// Journal index, article abstract, HTML galley and issue pages.
		case 'TemplateManager::display':
			$page = $router->getRequestedPage($request);
			$op = $router->getRequestedOp($request);

			// First check for a journal index page view.
			if (($page == 'index' || empty($page)) && $op == 'index') {
				$pubObject = $templateMgr->get_template_vars('currentJournal');
				if (!is_a($pubObject, 'Journal')) {
					return false;
				}
				$assocType = ASSOC_TYPE_JOURNAL;
				$canonicalUrlOp = '';
				$downloadSuccess = true;
				break;
			}

			// We are interested in access to the article abstract/galley
			// and the issue view page.
			$wantedPages = array('article', 'issue');
			$wantedOps = array('view', 'articleView');
			if (!in_array($page, $wantedPages) || !in_array($op, $wantedOps)) {
				return false;
			}

			$issue = $templateMgr->get_template_vars('issue');
			$galley = $templateMgr->get_template_vars('galley'); /* @var $galley ArticleGalley */
			$article = $templateMgr->get_template_vars('article');

			// If there is no published object, there is no usage event.
			if (!$issue && !$galley && !$article) {
				return false;
			}

			if ($galley) {
				if (!$galley->isHTMLGalley()) {
					// This is an access to an intermediary galley page
					// which we do not count.
					return false;
				}
				$pubObject = $galley;
				$assocType = ASSOC_TYPE_GALLEY;
				$canonicalUrlParams = array($article->getBestArticleId(), $pubObject->getBestGalleyId($journal));
				$idParams = array('a' . $article->getId(), 'g' . $pubObject->getId());
			} elseif ($article) {
				$pubObject = $article;
				$assocType = ASSOC_TYPE_ARTICLE;
				$canonicalUrlParams = array($pubObject->getBestArticleId($journal));
				$idParams = array('a' . $pubObject->getId());
			} else {
				$pubObject = $issue;
				$assocType = ASSOC_TYPE_ISSUE;
				$canonicalUrlParams = array($pubObject->getBestIssueId($journal));
				$idParams = array('i' . $pubObject->getId());
			}

			// The article, issue and HTML/remote galley pages do not
			// download anything.
			$downloadSuccess = true;
			$canonicalUrlOp = 'view';
			break;

		// Remote galley.
		case 'ArticleHandler::viewRemoteGalley':
			$article = $args[0];
			$pubObject = $args[1];
			$assocType = ASSOC_TYPE_GALLEY;
			$canonicalUrlParams = array($article->getBestArticleId(), $pubObject->getBestGalleyId($journal));
			$idParams = array('a' . $article->getId(), 'g' . $pubObject->getId());
			$downloadSuccess = true;
			$canonicalUrlOp = 'view';
			break;

		// Article galley (except for HTML and remote galley).
		case 'ArticleHandler::viewFile':
		case 'ArticleHandler::downloadFile':
			$pubObject = $args[1];
			$assocType = ASSOC_TYPE_GALLEY;
			$canonicalUrlOp = 'download';
			$article = $args[0];
			$canonicalUrlParams = array($article->getBestArticleId(), $pubObject->getBestGalleyId($journal));
			$idParams = array('a' . $article->getId(), 'g' . $pubObject->getId());
			break;

		// Supplementary file.
		case 'ArticleHandler::downloadSuppFile':
			$pubObject = $args[1];
			$assocType = ASSOC_TYPE_SUPP_FILE;
			$canonicalUrlOp = 'downloadSuppFile';
			$article = $args[0];
			$canonicalUrlParams = array($article->getBestArticleId(), $pubObject->getBestSuppFileId($journal));
			$idParams = array('a' . $article->getId(), 's' . $pubObject->getId());
			break;

		// Issue galley.
		case 'IssueHandler::viewFile':
			$pubObject = $args[1];
			$assocType = ASSOC_TYPE_ISSUE_GALLEY;
			$canonicalUrlOp = 'download';
			$issue = $args[0];
			$canonicalUrlParams = array($issue->getBestIssueId(), $pubObject->getBestGalleyId($journal));
			$idParams = array('i' . $issue->getId(), 'ig' . $pubObject->getId());
			break;

		default:
			// Why are we called from an unknown hook?
			assert(false);
			// When assertions are disabled, stop here: none of the
			// request information needed below has been set.
			return false;
	}

	// Timestamp.
	$time = Core::getCurrentDate();

	// Actual document size, MIME type.
	$htmlPageAssocTypes = array(ASSOC_TYPE_ARTICLE, ASSOC_TYPE_ISSUE, ASSOC_TYPE_JOURNAL);
	if (in_array($assocType, $htmlPageAssocTypes)) {
		// Journal index, article abstract or issue view page.
		$docSize = 0;
		$mimeType = 'text/html';
	} else {
		// Files.
		$docSize = (int) $pubObject->getFileSize();
		$mimeType = $pubObject->getFileType();
	}

	// Canonical URL.
	switch ($assocType) {
		case ASSOC_TYPE_ISSUE:
		case ASSOC_TYPE_ISSUE_GALLEY:
			$canonicalUrlPage = 'issue';
			break;
		case ASSOC_TYPE_ARTICLE:
		case ASSOC_TYPE_GALLEY:
		case ASSOC_TYPE_SUPP_FILE:
			$canonicalUrlPage = 'article';
			break;
		case ASSOC_TYPE_JOURNAL:
			$canonicalUrlPage = 'index';
			break;
	}
	$canonicalUrl = $router->url($request, null, $canonicalUrlPage, $canonicalUrlOp, $canonicalUrlParams);

	// Make sure we log the server name and not aliases.
	$configBaseUrl = Config::getVar('general', 'base_url');
	$requestBaseUrl = $request->getBaseUrl();
	if ($requestBaseUrl !== $configBaseUrl) {
		// Make sure it's not a url override (no alias in that case).
		if (!in_array($requestBaseUrl, Config::getContextBaseUrls()) &&
				$requestBaseUrl !== Config::getVar('general', 'base_url[index]')) {
			// Alias found, replace it by the base url from the config file.
			// Make sure we use the correct base url override value for
			// the context, if any.
			$baseUrlReplacement = Config::getVar('general', 'base_url[' . $journal->getPath() . ']');
			if (!$baseUrlReplacement) {
				$baseUrlReplacement = $configBaseUrl;
			}
			$canonicalUrl = str_replace($requestBaseUrl, $baseUrlReplacement, $canonicalUrl);
		}
	}

	// Public identifiers.
	// 1) A unique OJS-internal ID that will help us to easily attribute
	// statistics to a specific publication object.
	array_unshift($idParams, 'j' . $journal->getId());
	$siteId = $this->getUniqueSiteId();
	if (empty($siteId)) {
		// Create a globally unique, persistent site ID
		// so that we can uniquely identify publication
		// objects from this site, even if the URL or any
		// other externally influenced information changes.
		$siteId = uniqid();
		$this->updateSetting(0, 'uniqueSiteId', $siteId);
	}
	array_unshift($idParams, $siteId);
	$ojsId = 'ojs:' . implode('-', $idParams);
	$identifiers = array('other::ojs' => $ojsId);

	// 2) Standardized public identifiers, e.g. DOI, URN, etc.
	// Issue galleys and journals are skipped here.
	if (!is_a($pubObject, 'IssueGalley') && !is_a($pubObject, 'Journal')) {
		$pubIdPlugins = PluginRegistry::loadCategory('pubIds', true, $journal->getId());
		if (is_array($pubIdPlugins)) {
			foreach ($pubIdPlugins as $pubIdPlugin) {
				if (!$pubIdPlugin->getEnabled()) {
					continue;
				}
				$pubId = $pubIdPlugin->getPubId($pubObject);
				if ($pubId) {
					$identifiers[$pubIdPlugin->getPubIdType()] = $pubId;
				}
			}
		}
	}

	// Service URI.
	$serviceUri = $router->url($request, $journal->getPath());

	// IP and Host.
	$ip = $request->getRemoteAddr();
	$host = null;
	if (isset($_SERVER['REMOTE_HOST'])) {
		// We do NOT actively look up the remote host to
		// avoid the performance penalty. We only set the remote
		// host if we get it "for free".
		$host = $_SERVER['REMOTE_HOST'];
	}

	// HTTP user agent.
	$userAgent = $request->getUserAgent();

	// HTTP referrer.
	$referrer = isset($_SERVER['HTTP_REFERER']) ? $_SERVER['HTTP_REFERER'] : null;

	// User and roles (site wide and within the current journal).
	$user = $request->getUser();
	$roles = array();
	if ($user) {
		$roleDao = DAORegistry::getDAO('RoleDAO');
		$rolesByContext = $roleDao->getByUserIdGroupedByContext($user->getId());
		foreach (array(CONTEXT_SITE, $journal->getId()) as $contextId) {
			if (isset($rolesByContext[$contextId])) {
				foreach ($rolesByContext[$contextId] as $role) {
					$roles[] = $role->getRoleId();
				}
			}
		}
	}

	// Try a simple classification of the request.
	$classification = null;
	if (!empty($roles)) {
		// Access by editors, authors, etc.
		$internalRoles = array_diff($roles, array(ROLE_ID_READER));
		if (!empty($internalRoles)) {
			$classification = USAGE_EVENT_PLUGIN_CLASSIFICATION_ADMIN;
		}
	}
	if ($request->isBot()) {
		// The bot classification overwrites other classifications.
		$classification = USAGE_EVENT_PLUGIN_CLASSIFICATION_BOT;
	}
	// TODO: Classify LOCKSS or similar as 'internal' access.

	/*
	 * Comparison of our event log format with Apache log parameters...
	 *
	 * 1) default parameters:
	 * %h: remote hostname or IP => $ip, $host
	 * %l: remote logname (identd) => not supported, see $user, $roles instead
	 * %u: remote user => not supported, see $user, $roles instead
	 * %t: request time => $time
	 * %r: query => derived objects: $pubObject, $assocType, $canonicalUrl, $identifiers, $serviceUri, $classification
	 * %s: status => not supported (always 200 in our case)
	 * %b: response size => $docSize
	 *
	 * 2) other common parameters
	 * %O: bytes sent => not supported (cannot be reliably determined from within PHP)
	 * %X: connection status => $downloadSuccess (not reliable!)
	 * %{ContentType}o: => $mimeType
	 * %{User-agent}i: => $userAgent
	 * %{Referer}i: => $referrer
	 *
	 * Several items, e.g. time etc., may differ from what Apache
	 * would actually log. But the differences do not matter for our use
	 * cases.
	 */

	// Collect all information into an array.
	$usageEvent = compact(
		'time', 'pubObject', 'assocType', 'canonicalUrl', 'mimeType',
		'identifiers', 'docSize', 'downloadSuccess', 'serviceUri',
		'ip', 'host', 'user', 'roles', 'userAgent', 'referrer', 'classification'
	);

	return $usageEvent;
}