示例#1
0
 /**
  * Turn every URL found in a string into an HTML hyperlink.
  *
  * The string is split on single spaces. A word that starts with "www." or
  * "http(s)://" and is accepted by urltools::get_elements() is replaced by an
  * anchor element whose target and label are both the URL as returned by that
  * method. Every other word is HTML-escaped and passed along unchanged.
  *
  * @param string $string The text to process.
  * @return string The resulting HTML, with trailing whitespace removed.
  */
 private function find_urls($string)
 {
     /**
      * Pass the escaping flags explicitly instead of relying on the defaults, which
      * differ between PHP versions. ENT_SUBSTITUTE keeps a word that contains an
      * invalid byte sequence from being turned into an empty string and ENT_QUOTES
      * makes sure both kinds of quotes are escaped.
      */
     $flags = ENT_QUOTES | ENT_SUBSTITUTE;
     $newstring = '';
     foreach (explode(' ', $string) as $word) {
         if (preg_match('/^(www\\.|https?:\\/\\/)/i', $word) && ($urldata = urltools::get_elements($word)) !== false) {
             /**
              * The escaped URL is used for both the link target and the link text.
              */
             $url = htmlspecialchars($urldata['url'], $flags);
             $newstring .= '<a href="' . $url . '">' . $url . '</a> ';
         } else {
             $newstring .= htmlspecialchars($word, $flags) . ' ';
         }
     }
     /**
      * Every word was appended with a trailing space; strip the surplus at the end.
      */
     return rtrim($newstring);
 }
示例#2
0
 /**
  * Normalize and validate a URL and return an array with its elements.
  *
  * The scheme and authority are converted to lower case, after which the URL
  * must match the complete regular expression, have a TLD that is present in
  * self::$valid_tlds (only checked when that list is populated and the URL has
  * a TLD) and have an FQDN of at most 254 characters. A URL without a scheme
  * gets "http://" prepended.
  *
  * @param string $url The URL to inspect.
  * @return array|false Array with the keys "url", "scheme", "authority",
  *     "ipv4address", "fqdn", "domain", "tld", "path", "query" and "fragment"
  *     (strings, empty when absent) and "port" (int, 0 when absent), or false
  *     if the URL is not valid.
  */
 public static function get_elements($url)
 {
     /**
      * Assemble the regular expressions if not already done so.
      */
     if (self::$regexp_complete === '') {
         /**
          * A single label: one alphanumeric character, optionally followed by either
          * up to 61 alphanumeric characters or hyphens and a closing alphanumeric
          * character, or up to 62 alphanumeric characters.
          */
         $label = '[a-z0-9]([a-z0-9-]{0,61}?[a-z0-9]|[a-z0-9]{0,62})?';
         $domain = '(?<domain>' . $label . '(\\.' . $label . ')*)';
         $tld = '(?<tld>\\.' . $label . ')';
         $fqdn = '(?<fqdn>' . $domain . $tld . ')\\.?';
         $ipv4address = '(?<ipv4address>(25[0-5]|(2[0-4]|1[0-9]|[1-9])?[0-9])(\\.(25[0-5]|(2[0-4]|1[0-9]|[1-9])?[0-9])){3})';
         $port = '(?<port>(6553[0-5]|(655[0-2]|(65[0-4]|(6[0-4]|[1-5][0-9]|[1-9])[0-9]|[1-9])[0-9]|[1-9])?[0-9]))';
         $authority = '(?<authority>(' . $ipv4address . '|' . $fqdn . ')(:' . $port . ')?)';
         $unreserved = '[a-z0-9_.~-]';
         $pct_encoded = '%[0-9a-f]{2}';
         $sub_delims = '[!$&\'()*+,;=]';
         $pchar = '(' . $unreserved . '|' . $pct_encoded . '|' . $sub_delims . '|[:@])';
         $fragment = '(?<fragment>(#(' . $pchar . '|[\\/?])*)?)';
         $path = '(?<path>(\\/\\/?(' . $pchar . '+\\/?)*)?)';
         $query = '(?<query>(\\?(' . $pchar . '|[\\/?])*)?)';
         $scheme = '(?<scheme>https?:\\/\\/)';
         /**
          * The callback expression only covers the optional scheme and the authority,
          * the complete expression must match the URL from start to end.
          */
         self::$regexp_callback = '/^' . $scheme . '?' . $authority . '/i';
         self::$regexp_complete = '/^(?<url>' . $scheme . '?' . $authority . $path . $query . $fragment . ')$/i';
         /**
          * Read "tlds-alpha-by-domain.txt" and put all TLDs in an array against which we
          * can validate found URLs. If the aforementioned file does not exist or fails
          * to be read, the TLD check will not be done. This would be an unexpected and
          * undesired exception though.
          */
         $lines = file(__DIR__ . '/tlds-alpha-by-domain.txt');
         if ($lines === false) {
             output::output('notice', __METHOD__ . '(): failed to open file: \'tlds-alpha-by-domain.txt\', tld validation disabled');
         } else {
             /**
              * Skip empty lines and lines containing a "#". TLDs are stored in lower case
              * with a leading dot so they can be compared with the "tld" match directly.
              */
             foreach ($lines as $line) {
                 $line = trim($line);
                 if ($line !== '' && strpos($line, '#') === false) {
                     self::$valid_tlds[] = '.' . strtolower($line);
                 }
             }
         }
     }
     /**
      * Convert scheme and authority to lower case.
      */
     $url = preg_replace_callback(self::$regexp_callback, function ($matches) {
         return strtolower($matches[0]);
     }, $url);
     /**
      * preg_replace_callback() returns null when the PCRE engine fails. Such a URL
      * cannot be validated so it is rejected right away.
      */
     if ($url === null) {
         return false;
     }
     /**
      * Validate and further process the URL.
      */
     if (!preg_match(self::$regexp_complete, $url, $matches)) {
         return false;
     }
     /**
      * Verify if the TLD is valid. If the validation array is empty we skip this
      * step.
      */
     if (!empty(self::$valid_tlds) && !empty($matches['tld']) && !in_array($matches['tld'], self::$valid_tlds, true)) {
         return false;
     }
     /**
      * The maximum allowed length of the FQDN (root domain excluded) is 254
      * characters.
      */
     if (strlen($matches['fqdn']) > 254) {
         return false;
     }
     /**
      * If the URL has no scheme, http:// is assumed. Update the elements.
      */
     if (empty($matches['scheme'])) {
         $matches['scheme'] = 'http://';
         $matches['url'] = 'http://' . $matches['url'];
     }
     /**
      * Create and return an array with all the elements of this URL. Always pass
      * along an empty string for nonexistent elements.
      */
     $urldata = [];
     foreach (['url', 'scheme', 'authority', 'ipv4address', 'fqdn', 'domain', 'tld', 'path', 'query', 'fragment'] as $element) {
         $urldata[$element] = empty($matches[$element]) ? '' : $matches[$element];
     }
     /**
      * Make sure the only numeric element isn't passed along as a string.
      */
     $urldata['port'] = empty($matches['port']) ? 0 : (int) $matches['port'];
     return $urldata;
 }
示例#3
0
 /**
  * Process a "normal" line: update the activity counters, the monologue streak,
  * the word, smiley and URL statistics and the quotes of the nick that typed it.
  *
  * @param string $time Time of the line; its first two characters are read as the hour.
  * @param string $csnick Nick as found on the line; it is validated before use.
  * @param string $line The text of the line.
  * @return null
  */
 protected function set_normal($time, $csnick, $line)
 {
     if (!$this->validate_nick($csnick)) {
         output::output('debug', __METHOD__ . '(): invalid nick: \'' . $csnick . '\' on line ' . $this->linenum);
         return null;
     }
     $nick = $this->add_nick($csnick, $time);
     $nick_obj = $this->nick_objs[$nick];
     $line_length = mb_strlen($line, 'UTF-8');
     $nick_obj->add_value('characters', $line_length);
     $nick_obj->set_value('lasttalked', $this->date . ' ' . $time);
     /**
      * Monologues. When a different nick than the previous one typed this line the
      * running streak ends; a streak of 5 or more lines is stored as a monologue.
      */
     if ($nick !== $this->prevnick) {
         if ($this->streak >= 5) {
             /**
              * With a line count of 0 the previous nick was only seen in an earlier parse
              * run and has no object yet. Create one so the monologue data can be stored.
              */
             if ($this->l_total === 0) {
                 $this->add_nick($this->prevnick, null);
             }
             $monologuer = $this->nick_objs[$this->prevnick];
             $monologuer->add_value('monologues', 1);
             if ($monologuer->get_value('topmonologue') < $this->streak) {
                 $monologuer->set_value('topmonologue', $this->streak);
             }
         }
         $this->streak = 0;
         $this->prevnick = $nick;
     }
     $this->streak++;
     /**
      * Line counts per day, part of day and hour. Each part of the day spans six
      * hours; an hour outside the 0-23 range is not counted towards any part.
      */
     $day = strtolower(date('D', strtotime($this->date)));
     $hour = (int) substr($time, 0, 2);
     if ($hour >= 0 && $hour <= 23) {
         $dayparts = ['night', 'morning', 'afternoon', 'evening'];
         $daypart = $dayparts[(int) ($hour / 6)];
         $this->{'l_' . $daypart}++;
         $nick_obj->add_value('l_' . $day . '_' . $daypart, 1);
         $nick_obj->add_value('l_' . $daypart, 1);
     }
     /**
      * The per-hour counters are named after the zero-padded hour.
      */
     $hour_key = 'l_' . ($hour < 10 ? '0' . $hour : $hour);
     $nick_obj->add_value($hour_key, 1);
     $nick_obj->add_value('l_total', 1);
     $this->{$hour_key}++;
     $this->l_total++;
     /**
      * Split the line on spaces; every resulting group counts as a word.
      */
     $skipquote = false;
     $words = explode(' ', $line);
     $nick_obj->add_value('words', count($words));
     foreach ($words as $csword) {
         /**
          * With word tracking enabled, a group that does not start or end with a
          * hyphen, has no double hyphen and contains none of the characters rejected
          * by the expression below is tracked as a word, provided it is at most 30
          * characters long. Such a group is never looked at as a smiley or URL.
          */
         if ($this->wordtracking && !preg_match('/^-|-$|--|[\\x21-\\x2C\\x2E-\\x40\\x5B-\\x60\\x7B-\\x7E]|\\xC2[\\xA1-\\xBF]|\\xC3\\x97|\\xC3\\xB7|\\xEF\\xBF\\xBD/', $csword)) {
             $word_length = mb_strlen($csword, 'UTF-8');
             if ($word_length <= 30) {
                 $this->add_word($csword, $word_length);
             }
             continue;
         }
         /**
          * Smileys. The lowercased smiley is looked up in $this->smileys to find the
          * name of the counter to increase.
          */
         if (preg_match('/^(:([][)(pd\\/ox\\\\|3<>s]|-[)d\\/p(]|\'\\()|;([])(pxd\\/o]|-\\)|_;)|[:;](\\)\\)|\\(\\()|\\\\o\\/|<3|=[])p\\/\\\\d(x]|d:|8\\)|-[_.]-|>:\\()$/i', $csword)) {
             $nick_obj->add_value($this->smileys[strtolower($csword)], 1);
             continue;
         }
         /**
          * Anything that does not begin with "www." or a http(s) scheme is ignored from
          * here on.
          */
         if (!preg_match('/^(www\\.|https?:\\/\\/)/i', $csword)) {
             continue;
         }
         /**
          * A line with something that looks like a URL in it, valid or not, is never
          * used as a quote.
          */
         $skipquote = true;
         $urldata = urltools::get_elements($csword);
         if ($urldata === false) {
             output::output('debug', __METHOD__ . '(): invalid url: \'' . $csword . '\' on line ' . $this->linenum);
             continue;
         }
         /**
          * Only URLs of at most 1024 characters in length are tracked.
          */
         if (strlen($urldata['url']) <= 1024) {
             $this->add_url($urldata, $time, $nick);
             $nick_obj->add_value('urls', 1);
         }
     }
     /**
      * Quotes and example lines may not contain a URL and are limited to 255
      * characters in length. This goes for all of the types below.
      */
     $quotable = !$skipquote && $line_length <= 255;
     if ($quotable) {
         $nick_obj->add_quote('quote', $line, $line_length);
     }
     /**
      * Uppercased lines have 2 or more characters, are equal to their uppercased
      * form and keep more than half of their length after the characters matched
      * by the expression below have been stripped.
      */
     if ($line_length >= 2 && mb_strtoupper($line, 'UTF-8') === $line) {
         $stripped = preg_replace('/[\\x21-\\x40\\x5B-\\x60\\x7B-\\x7E]|\\xC2[\\xA1-\\xBF]|\\xC3\\x97|\\xC3\\xB7|\\xEF\\xBF\\xBD/S', '', $line);
         if (mb_strlen($stripped, 'UTF-8') * 2 > $line_length) {
             $nick_obj->add_value('uppercased', 1);
             if ($quotable) {
                 $nick_obj->add_quote('ex_uppercased', $line, $line_length);
             }
         }
     }
     /**
      * A line ending with "!" is an exclamation, a line ending with "?" a question.
      */
     $ending = null;
     if (preg_match('/!$/', $line)) {
         $ending = 'exclamations';
     } elseif (preg_match('/\\?$/', $line)) {
         $ending = 'questions';
     }
     if ($ending !== null) {
         $nick_obj->add_value($ending, 1);
         if ($quotable) {
             $nick_obj->add_quote('ex_' . $ending, $line, $line_length);
         }
     }
 }