Ejemplo n.º 1
0
 /**
  * Run a query and return the content of the top-ranked matching documents.
  *
  * Pipeline: split the query into terms, select the documents for those
  * terms, build the document vector list, assign a similarity value to each
  * entry, sort, cut to the first $topN and finally load the document contents.
  *
  * @param string $queryString        raw query text
  * @param string $simType            similarity measure; the original doc lists
  *                                   "innerProduct", "cosine" and "jaccard"
  * @param string $sortBy             sort key; the original doc lists
  *                                   "similarity" and "time"
  * @param int    $topN               passed to getTopNResult() to limit the list
  * @param int    &$actualResultCount out-parameter: number of matches before
  *                                   the top-N cut is applied
  * @return array list of results (original note: simVal, title, url, time,
  *               abstract per entry — TODO confirm against DocumentContent)
  */
 public function search($queryString, $simType, $sortBy, $topN, &$actualResultCount)
 {
     // Tokenise the query.
     $splitter = new WordSplit($queryString);
     $terms = $splitter->send_post();

     // Collect candidate documents and turn them into a vector list.
     $docsByTerm = $this->SelDocByQueryTerm($terms);
     $matches = $this->formDocsVectorList($docsByTerm);

     // Score and order the candidates (both helpers work on $matches itself).
     $this->assignSimValue($terms, $matches, $simType);
     $this->sortResultList($matches, $sortBy);

     // Report the full match count before truncating.
     $actualResultCount = count($matches);
     $topMatches = $this->getTopNResult($matches, $topN);

     // Resolve the real content for the top-N entries only.
     $contentLoader = new DocumentContent($topMatches);

     return $contentLoader->getDocumentContents();
 }
Ejemplo n.º 2
0
 /**
  * Split text into words using forward maximum matching against a dictionary.
  *
  * ASCII runs are cut at English punctuation (as decided by
  * self::is_en_punctuation()) and kept only when longer than one byte.
  * Non-ASCII text is consumed in 3-byte units: a unit recognised by
  * self::is_cn_punctuation() is skipped, otherwise the longest dictionary
  * entry of at most self::$maxWordLength units is taken. A unit that starts
  * no dictionary word is dropped.
  *
  * NOTE(review): the 3-byte stride assumes every non-ASCII character is a
  * 3-byte UTF-8 sequence (typical for CJK); other widths will misalign.
  * NOTE(review): the dictionary is cached in self::$DIC_HASH after the first
  * load, so a different $dict_file on a later call does not trigger a reload.
  *
  * @param string      $text      text to split
  * @param string|null $dict_file path of the dictionary file; defaults to
  *                               lex.hash.php next to this file
  * @return array list of extracted words
  */
 public static function split($text, $dict_file = null)
 {
     self::$dictFile = $dict_file ? $dict_file : __DIR__ . '/lex.hash.php';
     // Load the dictionary on first use only.
     if (self::$DIC_HASH == null) {
         self::$DIC_HASH = (include self::$dictFile);
     }
     $offset = 0;
     $length = strlen($text);
     $_result = array();
     while ($offset < $length) {
         // Basic characters and English.
         if (ord($text[$offset]) <= 127) {
             $w = '';
             // The bounds check must come first: reading $text[$offset] at
             // the end of the string raises an "Uninitialized string offset".
             while ($offset < $length && ($u = ord($text[$offset])) <= 127) {
                 if (self::is_en_punctuation($u)) {
                     $offset++;
                     break;
                 }
                 $w .= $text[$offset++];
             }
             // Single-byte tokens are discarded.
             if (strlen($w) > 1) {
                 $_result[] = $w;
             }
             continue;
         }
         // Chinese punctuation.
         $w = substr($text, $offset, 3);
         if (self::is_cn_punctuation($w)) {
             $offset += 3;
             continue;
         }
         // Forward maximum matching: try the longest candidate first.
         $matched = false;
         for ($i = self::$maxWordLength; $i > 0; $i--) {
             $w = substr($text, $offset, $i * 3);
             // isset() avoids an undefined-index notice for every candidate
             // that is not in the dictionary.
             if (isset(self::$DIC_HASH[$w]) && self::$DIC_HASH[$w] == 1) {
                 $offset += $i * 3;
                 $_result[] = $w;
                 $matched = true;
                 break;
             }
         }
         if (!$matched) {
             // Nothing in the dictionary starts here: skip one unit. Doing
             // this after the loop also guarantees progress when
             // self::$maxWordLength is smaller than 1.
             $offset += 3;
         }
     }
     return $_result;
 }
Ejemplo n.º 3
0
/**
 * Print the similarity of the two POSTed queries as an HTML fragment.
 *
 * Reads $_POST["Query1"] and $_POST["Query2"], splits each into terms with
 * WordSplit, asks SimCalculator for the inner product, cosine and Jaccard
 * values and echoes both queries followed by a result table.
 *
 * The query text is user input, so it is HTML-escaped before being echoed;
 * a missing or non-string field is treated as an empty query.
 *
 * @return void
 */
function computeTheSimValue()
{
    $query1 = (isset($_POST["Query1"]) && is_string($_POST["Query1"])) ? $_POST["Query1"] : '';
    $query2 = (isset($_POST["Query2"]) && is_string($_POST["Query2"])) ? $_POST["Query2"] : '';

    // Split both queries into terms (the splitter receives the raw text).
    $wordSplit = new WordSplit($query1);
    $result1 = $wordSplit->send_post();
    $wordSplit = new WordSplit($query2);
    $result2 = $wordSplit->send_post();

    echo '<div class="resultArea col-md-12" style="font-size: 30px;color: #ff0025">';
    echo 'Query1 : ' . htmlspecialchars($query1, ENT_QUOTES | ENT_SUBSTITUTE, 'UTF-8') . '<br />';
    echo 'Query2 : ' . htmlspecialchars($query2, ENT_QUOTES | ENT_SUBSTITUTE, 'UTF-8') . '<br />';
    echo '</div>';

    // The three values are filled in by the calculator
    // (presumably by reference — confirm against SimCalculator).
    $simcal = new SimCalculator();
    $innerProduct = "";
    $cosine = "";
    $jaccard = "";
    $simcal->getSimWithTwoTermArrays($result1, $result2, $innerProduct, $cosine, $jaccard);

    // Print the results.
    echo '<div class="resultArea col-md-12" style="font-size: 30px;color: #ff0025">';
    echo '<table class="table table-condensed">';
    echo '<tr>';
    echo '<th> Compute Type </th> <th> Value </th>';
    echo '</tr>';
    echo '<tr>';
    echo '<th> InnerProduct </th> <th>' . $innerProduct . ' </th>';
    echo '</tr>';
    echo '<tr>';
    echo '<th> Cosine </th> <th>' . $cosine . ' </th>';
    echo '</tr>';
    echo '<tr>';
    echo '<th> Jaccard </th> <th>' . $jaccard . ' </th>';
    echo '</tr>';
    echo '</table>';
    echo '</div>';
}
 /**
  * Main article-migration task: copies the legacy (老驼牛) articles into the
  * new article store.
  *
  * Reads every row of fiidee_article plus its content from fiidee_articleData
  * through self::$_SRC_DB, then for each article:
  *  - maps the category to a channel (falls back to channel 5),
  *  - skips it when the title has fewer than 4 characters or the content has
  *    fewer than 100 non-ASCII characters,
  *  - rewrites '/res/attachs/' to '/res/upload/' in thumbnail and content,
  *  - registers the thumbnail with the image service,
  *  - uses the source tags, or generates tags from the title when the source
  *    has none, keeping at most 5,
  *  - adds the article inside a transaction on the article service.
  *
  * Failures are printed with tprintError() and recorded via addErrorLog().
  *
  * @return void
  */
 private function transferArticle()
 {
     // Fetch the articles.
     $query = 'SELECT id,catid,title,kwords,bcontent,hits,author,tags,add_time,update_time,share_times,thumb_pic,trash,ischeck ' . 'FROM fiidee_article';
     $items = self::$_SRC_DB->getItems($query);
     if (empty($items)) {
         // Nothing to migrate; also avoids building an invalid "IN ()" clause.
         return;
     }
     // Fetch the article contents.
     $ids = array();
     foreach ($items as $__val) {
         $ids[] = $__val['id'];
     }
     $articleData = self::$_SRC_DB->getItems('SELECT itemid,content FROM fiidee_articleData WHERE itemid IN (' . implode(',', $ids) . ')');
     // Presumably re-keys the rows by itemid so they can be looked up per article.
     $articleData = ArrayUtils::changeArrayKey($articleData, 'itemid');
     // Migrate the articles.
     $articleService = Beans::get('article.article.service');
     $articleTagsService = Beans::get('article.tags.service');
     $imageService = Beans::get('image.image.service');
     // Target fields: userid,chanel_id,media_id,thumb,title,kwords,bcontent,author,tags,add_time,update_time,publish_time,share_times,hits,trash,ischeck
     foreach ($items as $value) {
         $data = array();
         $data['id'] = $value['id'];
         $data['userid'] = 0;
         // Categories without a mapping fall back to channel 5.
         $data['chanel_id'] = isset(self::$_CAT_TO_CHANEL[$value['catid']])
             ? intval(self::$_CAT_TO_CHANEL[$value['catid']])
             : 0;
         if ($data['chanel_id'] <= 0) {
             $data['chanel_id'] = 5;
         }
         if (mb_strlen($value['title'], 'UTF-8') < 4) {
             $message = "添加文章失败,源ID : {$value['id']}, 文章标题 : ‘{$value['title']}’ 长度太短";
             tprintError($message);
             $this->addErrorLog($message);
             continue;
         }
         $data['media_id'] = 0;
         $data['title'] = $value['title'];
         $data['kwords'] = $value['kwords'];
         $data['bcontent'] = $value['bcontent'];
         $data['author'] = $value['author'];
         $data['add_time'] = $value['add_time'];
         $data['update_time'] = $value['update_time'];
         $data['publish_time'] = $value['update_time'];
         $data['share_times'] = $value['share_times'];
         $data['hits'] = $value['hits'];
         $data['trash'] = $value['trash'];
         $data['ischeck'] = $value['ischeck'];
         // Migrate the thumbnail path.
         $data['thumb'] = str_replace('/res/attachs/', '/res/upload/', $value['thumb_pic']);
         // Migrate the content; an article without a content row is treated
         // as empty and rejected by the length check below.
         $content = isset($articleData[$value['id']]['content'])
             ? $articleData[$value['id']]['content']
             : '';
         // Skip articles whose body has fewer than 100 non-ASCII characters:
         // keep only the bytes above 127, then count them as UTF-8 characters.
         $__content = '';
         $__length = strlen($content);
         for ($i = 0; $i < $__length; $i++) {
             if (ord($content[$i]) > 127) {
                 $__content .= $content[$i];
             }
         }
         if (mb_strlen($__content, 'UTF-8') < 100) {
             $message = "添加文章失败,源ID : {$value['id']}, 文章标题 : ‘{$value['title']}’ 文章内容正文小于100";
             tprintError($message);
             $this->addErrorLog($message);
             continue;
         }
         $articleService->beginTransaction();
         $data['content'] = str_replace('/res/attachs/', '/res/upload/', $content);
         $this->getContentImage($value['id'], 0, $data['content']);
         // Save the thumbnail to the image space.
         $pathinfo = pathinfo($data['thumb']);
         $imgData = array('userid' => 0, 'media_id' => 0, 'aid' => $value['id'], 'url' => $data['thumb'], 'type' => 'image', 'filename' => $pathinfo['basename'], 'add_time' => time(), 'grabed' => 1);
         $imageService->add($imgData);
         // If the article has no tags, generate them from the title;
         // otherwise use the space-separated source tags.
         if (trim($value['tags']) == '') {
             $tags = \WordSplit::split($value['title']);
         } else {
             $tags = explode(' ', $value['tags']);
         }
         // Keep at most 5 tags.
         if (count($tags) > 5) {
             $tags = array_slice($tags, 0, 5);
         }
         $tagIds = array();
         foreach ($tags as $_value) {
             if (trim($_value) == '') {
                 continue;
             }
             $tagIds[] = $articleTagsService->add(array('name' => $_value));
         }
         // Add the article.
         $data['tags'] = implode(',', $tagIds);
         $result = $articleService->add($data);
         if ($result) {
             // Commit.
             $articleService->commit();
             tprintOk("添加文章成功! 源ID : {$value['id']}, 新ID : {$result}");
         } else {
             // Roll back.
             $articleService->rollback();
             $message = "添加文章失败,源ID : {$value['id']}";
             tprintError($message);
             $this->addErrorLog($message);
         }
     }
 }
 /**
  * Generate tags automatically from the submitted text.
  *
  * Reads the "data" request parameter, splits it into words with WordSplit
  * and answers with the comma-joined words, or with a failure result when
  * no word was produced.
  *
  * @param HttpRequest $request
  */
 public function fetchTags(HttpRequest $request)
 {
     $text = $request->getParameter('data', 'trim');

     // Make the word splitter available before using it.
     Loader::import('extend.word.WordSplit', IMPORT_CUSTOM);
     $tags = \WordSplit::split($text);

     if (empty($tags)) {
         AjaxResult::ajaxFailtureResult();
         return;
     }

     AjaxResult::ajaxResult('ok', implode(',', $tags));
 }