/**
 * Turn every URL found in a string into an HTML hyperlink.
 *
 * The string is split on single spaces. A piece that starts with "www." or
 * "http(s)://" and is accepted by urltools::get_elements() is replaced by an
 * anchor that uses the normalized URL as both target and label. Every other
 * piece is passed through htmlspecialchars() unchanged. Trailing whitespace is
 * stripped from the result.
 */
private function find_urls($string)
{
    $pieces = [];

    foreach (explode(' ', $string) as $token) {
        /**
         * Only tokens that look like a clickable URL are handed to the validator.
         */
        $urldata = false;

        if (preg_match('/^(www\\.|https?:\\/\\/)/i', $token)) {
            $urldata = urltools::get_elements($token);
        }

        if ($urldata === false) {
            $pieces[] = htmlspecialchars($token);
            continue;
        }

        $escaped_url = htmlspecialchars($urldata['url']);
        $pieces[] = '<a href="' . $escaped_url . '">' . $escaped_url . '</a>';
    }

    return rtrim(implode(' ', $pieces));
}
/**
 * Normalize and validate a URL and return an array with its elements.
 *
 * The scheme and authority are converted to lower case before validation. If
 * the URL has no scheme, "http://" is assumed and prepended.
 *
 * Returns false if the URL does not match the expected syntax, if its TLD is
 * not in the list of valid TLDs (only checked when that list could be loaded),
 * or if its FQDN is longer than 254 characters. Otherwise returns an array with
 * the string elements "url", "scheme", "authority", "ipv4address", "fqdn",
 * "domain", "tld", "path", "query" and "fragment" (empty string when absent)
 * and the integer element "port" (0 when absent).
 */
public static function get_elements($url)
{
    /**
     * Assemble the regular expression if not already done so.
     */
    if (self::$regexp_complete === '') {
        $domain = '(?<domain>[a-z0-9]([a-z0-9-]{0,61}?[a-z0-9]|[a-z0-9]{0,62})?(\\.[a-z0-9]([a-z0-9-]{0,61}?[a-z0-9]|[a-z0-9]{0,62})?)*)';
        $tld = '(?<tld>\\.[a-z0-9]([a-z0-9-]{0,61}?[a-z0-9]|[a-z0-9]{0,62})?)';
        $fqdn = '(?<fqdn>' . $domain . $tld . ')\\.?';
        $ipv4address = '(?<ipv4address>(25[0-5]|(2[0-4]|1[0-9]|[1-9])?[0-9])(\\.(25[0-5]|(2[0-4]|1[0-9]|[1-9])?[0-9])){3})';
        $port = '(?<port>(6553[0-5]|(655[0-2]|(65[0-4]|(6[0-4]|[1-5][0-9]|[1-9])[0-9]|[1-9])[0-9]|[1-9])?[0-9]))';
        $authority = '(?<authority>(' . $ipv4address . '|' . $fqdn . ')(:' . $port . ')?)';
        $unreserved = '[a-z0-9_.~-]';
        $pct_encoded = '%[0-9a-f]{2}';
        $sub_delims = '[!$&\'()*+,;=]';
        $pchar = '(' . $unreserved . '|' . $pct_encoded . '|' . $sub_delims . '|[:@])';
        $fragment = '(?<fragment>(#(' . $pchar . '|[\\/?])*)?)';
        $path = '(?<path>(\\/\\/?(' . $pchar . '+\\/?)*)?)';
        $query = '(?<query>(\\?(' . $pchar . '|[\\/?])*)?)';
        $scheme = '(?<scheme>https?:\\/\\/)';
        self::$regexp_callback = '/^' . $scheme . '?' . $authority . '/i';
        self::$regexp_complete = '/^(?<url>' . $scheme . '?' . $authority . $path . $query . $fragment . ')$/i';

        /**
         * Read "tlds-alpha-by-domain.txt" and put all TLDs in an array against which we
         * can validate found URLs. If the aforementioned file does not exist or fails
         * to be read, the TLD check will not be done. This would be an unexpected and
         * undesired exception though.
         */
        if (($tlds = file(__DIR__ . '/tlds-alpha-by-domain.txt')) === false) {
            output::output('notice', __METHOD__ . '(): failed to open file: \'tlds-alpha-by-domain.txt\', tld validation disabled');
        } else {
            /**
             * Skip blank lines and any line that contains a "#" (comment lines). A
             * separate loop variable is used so the $tld pattern fragment above is not
             * overwritten.
             */
            foreach ($tlds as $tld_line) {
                $tld_line = trim($tld_line);

                if ($tld_line !== '' && strpos($tld_line, '#') === false) {
                    self::$valid_tlds[] = '.' . strtolower($tld_line);
                }
            }
        }
    }

    /**
     * Convert scheme and authority to lower case. preg_replace_callback() returns
     * null when the PCRE engine reports an error; treat that as an invalid URL
     * instead of feeding null to preg_match().
     */
    $url = preg_replace_callback(self::$regexp_callback, function ($matches) {
        return strtolower($matches[0]);
    }, $url);

    if ($url === null) {
        return false;
    }

    /**
     * Validate and further process the URL.
     */
    if (!preg_match(self::$regexp_complete, $url, $matches)) {
        return false;
    }

    /**
     * Verify if the TLD is valid. If the validation array is empty we skip this
     * step. Both sides are lower case strings, so compare strictly.
     */
    if (!empty(self::$valid_tlds) && !empty($matches['tld']) && !in_array($matches['tld'], self::$valid_tlds, true)) {
        return false;
    }

    /**
     * The maximum allowed length of the FQDN (root domain excluded) is 254
     * characters.
     */
    if (strlen($matches['fqdn']) > 254) {
        return false;
    }

    /**
     * If the URL has no scheme, http:// is assumed. Update the elements.
     */
    if (empty($matches['scheme'])) {
        $matches['scheme'] = 'http://';
        $matches['url'] = 'http://' . $matches['url'];
    }

    /**
     * Create and return an array with all the elements of this URL. Always pass
     * along an empty string for nonexistent elements. isset() is used rather than
     * empty() because empty() treats the string "0" as empty, which would drop a
     * legitimate element such as the domain of "http://0.com".
     */
    $urldata = [];
    $elements = ['url', 'scheme', 'authority', 'ipv4address', 'fqdn', 'domain', 'tld', 'path', 'query', 'fragment'];

    foreach ($elements as $element) {
        $urldata[$element] = isset($matches[$element]) ? $matches[$element] : '';
    }

    /**
     * Make sure the only numeric element isn't passed along as a string. A missing
     * or empty port yields 0.
     */
    $urldata['port'] = isset($matches['port']) ? (int) $matches['port'] : 0;

    return $urldata;
}
/**
 * Process a "normal" chat line typed by $csnick at $time.
 *
 * Updates the nick's character, line, word, smiley and URL counters, the
 * channel wide line counters per hour and part of day, the monologue streak
 * bookkeeping, and stores the line as a quote/example where it qualifies.
 * Returns null (without recording anything) when the nick is invalid.
 */
protected function set_normal($time, $csnick, $line)
{
    if (!$this->validate_nick($csnick)) {
        output::output('debug', __METHOD__ . '(): invalid nick: \'' . $csnick . '\' on line ' . $this->linenum);

        return null;
    }

    $nick = $this->add_nick($csnick, $time);
    $line_length = mb_strlen($line, 'UTF-8');
    $this->nick_objs[$nick]->add_value('characters', $line_length);
    $this->nick_objs[$nick]->set_value('lasttalked', $this->date . ' ' . $time);

    /**
     * Keep track of monologues. When someone else typed a line the previous streak
     * is interrupted; check if that streak qualifies as a monologue and store it.
     */
    if ($nick !== $this->prevnick) {
        if ($this->streak >= 5) {
            /**
             * If the current line count is 0 then $prevnick is not known yet (only seen in
             * previous parse run). It's safe to assume that $prevnick is a valid nick since
             * it was set by set_normal(). Create an object for it here so the monologue
             * data can be added. It doesn't matter if $prevnick is lowercase since it won't
             * be updated before it is actually seen (i.e. on any other activity).
             */
            if ($this->l_total === 0) {
                $this->add_nick($this->prevnick, null);
            }

            $previous_speaker = $this->nick_objs[$this->prevnick];
            $previous_speaker->add_value('monologues', 1);

            if ($this->streak > $previous_speaker->get_value('topmonologue')) {
                $previous_speaker->set_value('topmonologue', $this->streak);
            }
        }

        $this->prevnick = $nick;
        $this->streak = 0;
    }

    $this->streak++;

    /**
     * From here on only the object of the current nick is updated.
     */
    $speaker = $this->nick_objs[$nick];

    /**
     * Increase line counts for relevant day, part of day, and hour. Hours outside
     * of 0-23 do not count towards any part of day.
     */
    $day = strtolower(date('D', strtotime($this->date)));
    $hour = (int) substr($time, 0, 2);

    if ($hour >= 0 && $hour <= 23) {
        if ($hour < 6) {
            $this->l_night++;
            $speaker->add_value('l_' . $day . '_night', 1);
            $speaker->add_value('l_night', 1);
        } elseif ($hour < 12) {
            $this->l_morning++;
            $speaker->add_value('l_' . $day . '_morning', 1);
            $speaker->add_value('l_morning', 1);
        } elseif ($hour < 18) {
            $this->l_afternoon++;
            $speaker->add_value('l_' . $day . '_afternoon', 1);
            $speaker->add_value('l_afternoon', 1);
        } else {
            $this->l_evening++;
            $speaker->add_value('l_' . $day . '_evening', 1);
            $speaker->add_value('l_evening', 1);
        }
    }

    /**
     * Name shared by the per-nick value and the channel wide counter of this hour.
     */
    $hour_key = 'l_' . ($hour < 10 ? '0' . $hour : $hour);
    $speaker->add_value($hour_key, 1);
    $speaker->add_value('l_total', 1);
    $this->{$hour_key}++;
    $this->l_total++;

    /**
     * Words are simply considered character groups separated by whitespace.
     */
    $skipquote = false;
    $words = explode(' ', $line);
    $speaker->add_value('words', count($words));

    foreach ($words as $csword) {
        /**
         * Keep track of all character groups composed of the letters found in the Basic
         * Latin and Latin-1 Supplement character sets, the Hyphen (used properly), and
         * any multibyte characters beyond those two sets (found in UTF-8) regardless of
         * their meaning. The regular expression checks for any characters not wanted in
         * the word - from the aforementioned Latin sets. Note that normalize_line()
         * already took all the dirt out. This method of finding words is not 100%
         * accurate, but it serves its purpose.
         */
        if ($this->wordtracking && !preg_match('/^-|-$|--|[\\x21-\\x2C\\x2E-\\x40\\x5B-\\x60\\x7B-\\x7E]|\\xC2[\\xA1-\\xBF]|\\xC3\\x97|\\xC3\\xB7|\\xEF\\xBF\\xBD/', $csword)) {
            $word_length = mb_strlen($csword, 'UTF-8');

            /**
             * Words consisting of 30+ characters are most likely not real words.
             */
            if ($word_length <= 30) {
                $this->add_word($csword, $word_length);
            }

            continue;
        }

        /**
         * Behold the amazing smileys regular expression. Cannot evaluate as a word (see
         * above).
         */
        if (preg_match('/^(:([][)(pd\\/ox\\\\|3<>s]|-[)d\\/p(]|\'\\()|;([])(pxd\\/o]|-\\)|_;)|[:;](\\)\\)|\\(\\()|\\\\o\\/|<3|=[])p\\/\\\\d(x]|d:|8\\)|-[_.]-|>:\\()$/i', $csword)) {
            $speaker->add_value($this->smileys[strtolower($csword)], 1);

            continue;
        }

        /**
         * Only catch URLs which were intended to be clicked on. Most clients can handle
         * URLs that begin with "www." or a scheme like "http://".
         */
        if (!preg_match('/^(www\\.|https?:\\/\\/)/i', $csword)) {
            continue;
        }

        /**
         * Regardless of it being a valid URL or not, set $skipquote to true, which
         * ensures that lines which contain a URL are not used as a quote. Quotes with
         * URLs in them often look messy/confusing on the stats page.
         */
        $skipquote = true;
        $urldata = urltools::get_elements($csword);

        if ($urldata === false) {
            output::output('debug', __METHOD__ . '(): invalid url: \'' . $csword . '\' on line ' . $this->linenum);

            continue;
        }

        /**
         * Track URLs of up to a sensible limit of 1024 characters in length.
         */
        if (strlen($urldata['url']) <= 1024) {
            $this->add_url($urldata, $time, $nick);
            $speaker->add_value('urls', 1);
        }
    }

    /**
     * Track quotes/example lines of up to a sensible limit of 255 characters in
     * length. This applies to all of the types seen below.
     */
    $quotable = !$skipquote && $line_length <= 255;

    if ($quotable) {
        $speaker->add_quote('quote', $line, $line_length);
    }

    /**
     * Uppercased lines should consist of 2 or more characters, be completely
     * uppercased, and have less than 50% non-letter characters from the Basic Latin
     * and Latin-1 Supplement character sets in them.
     */
    if (
        $line_length >= 2
        && mb_strtoupper($line, 'UTF-8') === $line
        && mb_strlen(preg_replace('/[\\x21-\\x40\\x5B-\\x60\\x7B-\\x7E]|\\xC2[\\xA1-\\xBF]|\\xC3\\x97|\\xC3\\xB7|\\xEF\\xBF\\xBD/S', '', $line), 'UTF-8') * 2 > $line_length
    ) {
        $speaker->add_value('uppercased', 1);

        if ($quotable) {
            $speaker->add_quote('ex_uppercased', $line, $line_length);
        }
    }

    /**
     * A line counts as an exclamation or as a question, based on its final
     * character. The exclamation mark is checked first.
     */
    if (preg_match('/!$/', $line)) {
        $speaker->add_value('exclamations', 1);

        if ($quotable) {
            $speaker->add_quote('ex_exclamations', $line, $line_length);
        }
    } elseif (preg_match('/\\?$/', $line)) {
        $speaker->add_value('questions', 1);

        if ($quotable) {
            $speaker->add_quote('ex_questions', $line, $line_length);
        }
    }
}