Пример #1
0
 /**
  * Locates the name/age pairs for the known people inside $this->text,
  * works out which HTML tag delimits one profile segment, parses every
  * segment and hands the result to importDirectorInfo().
  *
  * Reads:  $this->sets   (lists of name matches; each match is an array with
  *                        at least 'pos', 'id', 'name', 'match', 'person'),
  *         $this->text   (the document being scanned),
  *         $this->people (objects exposing ->id),
  *         $this->corp   (object exposing ->id).
  *
  * Side effects: any person whose id is not part of the final age set gets
  * the matching 'Director' Position relationship saved with is_current = 0;
  * every parsed profile is passed to importDirectorInfo().
  *
  * @return null Always null: either through one of the early exits or by
  *              falling off the end once the import has run.
  */
 private function findBasicInfo()
 {
     if (!$this->sets) {
         return null;
     }

     // Capture group 3 is a standalone two-digit number 20-99 ([2-9]\d).
     // The lookbehind rejects numbers preceded by . , $ or /, the lookahead
     // rejects dates (", 200x" / "199x"), percentages, decimals and
     // durations ("years with", "days", "months").
     $re = '/^([^<]*?<[^>]*>)*?[^<]*?(?<!([\\.,$\\/]))(\\b[2-9]\\d\\b)(?!((,\\s+200\\d|199\\d)|%|[,\\.]\\d|[-\\s]+([Yy]ears?\\s+(with|career)|[Dd]ays?|[Mm]onths?)\\b))/su';

     // Go through the sets of name matches and find age matches for each.
     $age_match_sets = array();
     foreach ($this->sets as $set) {
         $age_matches = array();
         $set_size = count($set);
         for ($i = 0; $i < $set_size; $i++) {
             // Search window: up to the next name match, or 2000 bytes after the last one.
             $len = $i == $set_size - 1 ? 2000 : $set[$i + 1]['pos'] - $set[$i]['pos'];
             if ($len > 100000) {
                 continue;
             }
             $str = substr($this->text, $set[$i]['pos'], $len);
             if (preg_match($re, $str, $match)) {
                 // Remember how many tags sit between name and age, and the last of them.
                 $n = preg_match_all('/<(\\p{L}+)[^>]*>/s', $match[0], $m, PREG_SET_ORDER);
                 $tag = 'empty';
                 if ($n > 0) {
                     $tag = $m[count($m) - 1][1];
                 }
                 $stripped = LsHtml::stripTags($match[0]);
                 if (strlen($stripped) < 2000) {
                     $age_matches[] = array('ind' => $i, 'age_match' => $match, 'age' => $match[3], 'name_match' => $set[$i], 'num_tags' => $n, 'tag' => $tag, 'len' => strlen($match[0]));
                 }
             }
         }
         $this->printDebug('count age matches is ' . count($age_matches));
         $age_match_sets[] = $age_matches;
     }

     // Find the best set (most unique names and ages). Matches whose 'ind'
     // values are at most 4 apart are treated as one run.
     $max = 0;
     $best = array(array('unique' => array(), 'set' => array()));
     foreach ($age_match_sets as $age_matches) {
         if (count($age_matches) < 2) {
             continue;
         }
         $unique = array($age_matches[0]['name_match']['id']);
         $temp = array($age_matches[0]);
         for ($i = 1; $i < count($age_matches); $i++) {
             if ($age_matches[$i]['ind'] - 4 <= $age_matches[$i - 1]['ind']) {
                 $temp[] = $age_matches[$i];
                 if (!in_array($age_matches[$i]['name_match']['id'], $unique)) {
                     $unique[] = $age_matches[$i]['name_match']['id'];
                 }
             } else {
                 // The run ended: rank it against the best one seen so far.
                 if (count($unique) > $max) {
                     $max = count($unique);
                     if (count(array_intersect($best[0]['unique'], $unique)) == 0 && count($best[0]['unique']) > 2) {
                         array_unshift($best, array('unique' => $unique, 'set' => $temp));
                     } else {
                         $best = array(array('unique' => $unique, 'set' => $temp));
                     }
                 } else {
                     if (count(array_intersect($best[0]['unique'], $unique)) == 0 && count($unique) > 2) {
                         $best[] = array('unique' => $unique, 'set' => $temp);
                     }
                 }
                 $unique = array($age_matches[$i]['name_match']['id']);
                 $temp = array($age_matches[$i]);
             }
         }
         // Rank the run that was still open when the list ended.
         if (count($unique) > $max) {
             $max = count($unique);
             if (count(array_intersect($best[0]['unique'], $unique)) == 0) {
                 array_unshift($best, array('unique' => $unique, 'set' => $temp));
             } else {
                 $best = array(array('unique' => $unique, 'set' => $temp));
             }
         }
     }
     $best = $best[0]['set'];

     // Find the tag all names have in common (if there is one).
     $tag_counts = array();
     foreach ($best as $b) {
         if (isset($tag_counts[$b['tag']])) {
             $tag_counts[$b['tag']]++;
         } else {
             $tag_counts[$b['tag']] = 1;
         }
         $this->printDebug($b['ind'] . '. ' . $b['name_match']['name'] . ' : ' . $b['age'] . ' : ' . strlen($b['age_match'][0]) . ' : ' . $b['num_tags'] . ' : ' . $b['tag']);
     }
     $tag = null;
     foreach ($tag_counts as $k => $v) {
         if ($v > 0.8 * count($best)) {
             $tag = $k;
             break;
         }
     }
     // When one tag dominates (> 80%), keep only the matches that use it.
     $age_set = array();
     if ($tag) {
         foreach ($best as $b) {
             if ($b['tag'] == $tag) {
                 $age_set[] = $b;
             }
         }
     } else {
         $age_set = $best;
     }
     $age_set = LsArray::multiSort($age_set, array('name_match', 'id'));

     // Find duplicates and determine the best match out of the pair/set.
     // $age_set is ordered by person id here, so duplicates are adjacent.
     $singles = array();
     $doubles = array();
     $num_tags = 0;
     $len = 0;
     for ($i = 0; $i < count($age_set); $i++) {
         $double = array($age_set[$i]);
         while ($i < count($age_set) - 1 && $double[0]['name_match']['id'] == $age_set[$i + 1]['name_match']['id']) {
             $double[] = $age_set[$i + 1];
             $i++;
         }
         if (count($double) == 1) {
             $singles[] = $age_set[$i];
             $num_tags += $age_set[$i]['num_tags'];
             $len += $age_set[$i]['len'];
         } else {
             $doubles[] = $double;
         }
     }
     if (count($singles) < 3) {
         // Too few unambiguous matches to build averages from: walk the
         // matches in document order, start a new group whenever a person
         // repeats, and keep the first group.
         $unique = array();
         $sets = array(array());
         $age_set = LsArray::multiSort($age_set, array('name_match', 'pos'));
         foreach ($age_set as $a) {
             if (!in_array($a['name_match']['id'], $unique)) {
                 $unique[] = $a['name_match']['id'];
                 $sets[count($sets) - 1][] = $a;
             } else {
                 $unique = array($a['name_match']['id']);
                 $sets[] = array($a);
             }
         }
         $age_set = $sets[0];
     } else {
         // Resolve each duplicate group by picking the candidate that looks
         // most like the unambiguous matches.
         $avg_len = $len / count($singles);
         $avg_tags = $num_tags / count($singles);
         foreach ($doubles as $double) {
             $pick = null;
             foreach ($double as $d) {
                 if ($pick === null) {
                     $pick = $d;
                     continue;
                 }
                 $pick_tag_dist = abs($avg_tags - $pick['num_tags']);
                 $cand_tag_dist = abs($avg_tags - $d['num_tags']);
                 if ($pick_tag_dist > $cand_tag_dist) {
                     $pick = $d;
                 } elseif ($pick_tag_dist == $cand_tag_dist && abs($avg_len - $pick['len']) >= abs($avg_len - $d['len'])) {
                     // Tag distance ties: the candidate wins when its length is
                     // at least as close to the average (an exact tie still
                     // favours the later candidate).
                     $pick = $d;
                 }
             }
             $singles[] = $pick;
         }
         $age_set = LsArray::multiSort($singles, array('name_match', 'pos'));
     }

     // Determine which directors were found, which weren't.
     $ids = array();
     foreach ($age_set as $a) {
         $ids[] = $a['name_match']['id'];
     }
     // NOTE(review): this runs before the "not enough names" bail-out below,
     // so relationships are flagged even when the parse is later abandoned.
     $category = null;
     foreach ($this->people as $p) {
         if (!in_array($p->id, $ids)) {
             if (!$category) {
                 // Same lookup for every person, so fetch it once on first need.
                 $category = Doctrine::getTable('RelationshipCategory')->findOneByName('Position');
             }
             $relationship = LsDoctrineQuery::create()->from('Relationship r')->where('r.entity1_id = ?', $p->id)->addWhere('r.entity2_id = ?', $this->corp->id)->addWhere('r.category_id = ?', $category->id)->addWhere('r.description1 = ?', 'Director')->fetchOne();
             if ($relationship) {
                 $relationship->is_current = 0;
                 $relationship->save();
             }
         }
     }
     if (count($age_set) < 0.5 * count($this->people)) {
         $this->printDebug('not enough names in age set:' . count($age_set) . ' vs. ' . count($this->people));
         return null;
     }

     // Figure out which tags surround name/age pairs. For every interior
     // entry, record the length of the string LsHtml::getStringInTag()
     // returns for each candidate tag.
     $tag_arr = array('table' => array(), 'tr' => array(), 'td' => array(), 'div' => array(), 'br' => array(), 'p' => array());
     for ($i = 1; $i < count($age_set) - 1; $i++) {
         $prev_pos = $age_set[$i - 1]['name_match']['pos'];
         $str = substr($this->text, $prev_pos, $age_set[$i + 1]['name_match']['pos'] - $prev_pos);
         foreach ($tag_arr as $tag_name => &$lengths) {
             $tag_str = LsHtml::getStringInTag($str, $tag_name, $age_set[$i]['name_match']['pos'] - $prev_pos);
             if (strlen($tag_str) > 0) {
                 $lengths[] = strlen($tag_str);
             }
         }
         // Drop the reference so later writes cannot reach into $tag_arr.
         unset($lengths);
     }
     arsort($tag_arr);
     $top_lengths = reset($tag_arr);
     if (count($top_lengths) == 0) {
         $this->printDebug('problems with enclosing tag detection');
         return null;
     }
     // reset() left the internal pointer on the first (top-ranked) entry.
     $splitter = key($tag_arr);

     // Count the splitter tags between consecutive names; the smallest count
     // is taken as the number of splitter tags per profile.
     $tag_counts = array();
     for ($i = 0; $i < count($age_set) - 1; $i++) {
         $str = substr($this->text, $age_set[$i]['name_match']['pos'], $age_set[$i + 1]['name_match']['pos'] - $age_set[$i]['name_match']['pos']);
         // str_ireplace is used only for its case-insensitive replacement count.
         str_ireplace('<' . $splitter, ' ', $str, $count);
         $tag_counts[] = $count;
     }
     sort($tag_counts);
     $ct = $tag_counts[0];
     if (!$ct) {
         return null;
     }

     $post_strlen = 0;
     $info_arr = array();
     $segments = array();
     $last_index = count($age_set) - 1;
     for ($i = 0; $i < count($age_set); $i++) {
         $a = $age_set[$i];
         $name_pos = $a['name_match']['pos'];
         // Segment = from the last opening splitter before the name to the
         // $ct-th closing splitter after it.
         // NOTE(review): assumes striposMulti returns a non-empty list of offsets - confirm.
         $matches = LsString::striposMulti($this->text, '</' . $splitter, $ct, $name_pos);
         $end = $matches[count($matches) - 1];
         $start = strripos(substr($this->text, 0, $name_pos), '<' . $splitter);
         $str = substr($this->text, $start, $end - $start);
         if ($i == $last_index && count($matches) > 1 && count($segments) > 0) {
             // No following name bounds the last segment, so also try ending
             // one closing tag earlier and keep whichever length is closer to
             // the average of the previous segments.
             $end = $matches[count($matches) - 2];
             $str2 = substr($this->text, $start, $end - $start);
             $avg = strlen(implode(' ', $segments)) / count($segments);
             if (abs(strlen($str2) - $avg) < abs(strlen($str) - $avg)) {
                 $str = $str2;
             }
         }
         $segments[] = $str;
         $info = $this->parseSegment($str, $name_pos - $start, $name_pos - $start + strlen($a['name_match']['match'][2][0]));
         $info = $this->parseBlurb($info, $a);

         // Looks to see if bio appears after the parsed segment.
         if ($i < $last_index) {
             $next_start = strripos(substr($this->text, 0, $age_set[$i + 1]['name_match']['pos']), '<' . $splitter);
             $post_str = substr($this->text, $end, $next_start - $end);
         } else {
             // Last profile: use the average trailing length of the others.
             // substr() needs an integer length.
             $avg_post_len = $last_index > 0 ? (int) ($post_strlen / $last_index) : 0;
             $post_str = substr($this->text, $end, $avg_post_len);
         }
         $post_strlen += strlen($post_str);
         $post_str = LsHtml::replaceFontStyleTags($post_str);
         $person = $a['name_match']['person'];
         $last = LsString::escapeStringForRegex($person->name_last);
         $info['post_blurb'] = '';
         // Collect the text nodes that mention the person's last name.
         if (preg_match_all('/>([^<]*' . $last . '[^<]*)</isu', $post_str, $blurb_matches)) {
             $post_blurb = implode(' ', $blurb_matches[1]);
             $post_blurb = trim(preg_replace('/\\s+/s', ' ', $post_blurb));
             if (strlen($post_blurb) > 40) {
                 $info['post_blurb'] = $post_blurb;
             }
         }
         $info_arr[] = $info;
     }

     // Count profiles whose trailing blurb is longer than the in-segment one,
     // and profiles that only have an unverified image.
     $ct = 0;
     $unv_ct = 0;
     foreach ($info_arr as $info) {
         if (strlen($info['post_blurb']) > strlen($info['blurb'])) {
             $ct++;
         }
         if ($info['img'] == null && $info['unverified_img'] != null) {
             $unv_ct++;
         }
     }

     // If most of the profile segments have images at the end, check to see
     // if they belong to the next profile segment.
     if ($unv_ct > count($age_set) - 3) {
         for ($i = 0; $i < count($age_set); $i++) {
             $len = strripos(substr($this->text, 0, $age_set[$i]['name_match']['pos']), '<' . $splitter);
             $tag_start = strripos(substr($this->text, 0, $len), '<img');
             $str = substr($this->text, $tag_start, 200);
             if (preg_match('/^<img[^>]+src=[\'"]([^\'"]+)[\'"]/is', $str, $match) == 1) {
                 // Store on the collected profile itself; $info here is only a
                 // leftover copy from the counting loop above.
                 $info_arr[$i]['img'] = $match[1];
             } else {
                 if ($i == 0) {
                     break;
                 }
             }
         }
     }

     // When the trailing blurb wins for more than 80% of the profiles, use it everywhere.
     $prefer_post_blurb = $ct > 0.8 * count($age_set);
     for ($i = 0; $i < count($info_arr); $i++) {
         if ($prefer_post_blurb) {
             $info_arr[$i]['blurb'] = $info_arr[$i]['post_blurb'];
             if (!$info_arr[$i]['since']) {
                 $info_arr[$i]['since'] = $this->getStartDate($info_arr[$i]['blurb']);
             }
         }
         $this->importDirectorInfo($info_arr[$i], $age_set[$i]);
         $this->printDebug("\n***");
     }
 }