Example #1
1
/**
 * Builds the comma-separated keyword string for a thread.
 *
 * The list starts with the thread's filtered tags, then the plain-text title
 * of the thread prefix (when one is set), and - when post text is supplied -
 * is topped up with words from the title and post text, most frequent first.
 *
 * @param array  $threadinfo Thread record; 'taglist', 'prefixid' and 'title' are read.
 * @param string $pagetext   Raw (not HTML-escaped) post text; may be empty.
 *
 * @return string Keywords joined with ', '.
 */
function fetch_keywords_list($threadinfo, $pagetext = '')
{
    global $vbphrase, $vbulletin;

    require_once DIR . '/includes/functions_search.php';
    require_once DIR . '/includes/class_taggablecontent.php';

    $keywords = vB_Taggable_Content_Item::filter_tag_list($threadinfo['taglist'], $errors, false);

    if (!empty($threadinfo['prefixid'])) {
        $keywords[] = trim($vbphrase["prefix_{$threadinfo['prefixid']}_title_plain"]);
    }

    // Without post text there is nothing to mine for extra keywords.
    if (empty($pagetext)) {
        return implode(', ', $keywords);
    }

    // The title is already HTML-escaped while the post text is not, so the
    // title is unescaped before the two are indexed together.
    $indexable = fetch_postindex_text(unhtmlspecialchars($threadinfo['title']) . ' ' . $pagetext);
    $frequencies = array_count_values(split_string($indexable));
    arsort($frequencies);

    // Plain require (not require_once): the file is pulled in on every call so
    // that $badwords / $goodwords exist in this function's scope each time.
    require DIR . '/includes/searchwords.php';
    $badwords = array_merge($badwords, preg_split('/\\s+/s', $vbulletin->options['badwords'], -1, PREG_SPLIT_NO_EMPTY));

    foreach (array_keys($frequencies) as $candidate) {
        $candidate = trim($candidate);
        $lowered = vbstrtolower($candidate);

        if (in_array($lowered, $badwords)) {
            continue;
        }

        // Short words are only kept when they are on the good-word list.
        $too_short = vbstrlen($candidate) <= $vbulletin->options['minsearchlength'];
        if ($too_short && !in_array($lowered, $goodwords)) {
            continue;
        }

        $candidate = htmlspecialchars_uni($candidate);
        if (!in_array($candidate, $keywords)) {
            $keywords[] = $candidate;
        }

        // Stop once the list holds 50 entries.
        if (sizeof($keywords) >= 50) {
            break;
        }
    }

    return implode(', ', $keywords);
}
Example #2
0
 /**
  * Downloads the listing page and breaks its table into rows of cells.
  *
  * Every '<tr>...</tr>' chunk of the response is split into '<td' cells.
  * Cells 7 and 8 are link cells: each is replaced by a two-element array
  * holding the cell markup with its anchor tags removed, followed by the part
  * of the link's href that comes after 'proj_idDes='. The last '<tr>' chunk is
  * also stored in $this->page.
  *
  * A row dumped during development had nine cells (indexes 0-8), with the two
  * link cells at indexes 7 and 8.
  *
  * @return array One entry per '<tr>' chunk, each an array of cells.
  */
 public function getInfo()
 {
     $response = self::getRaw($this->action, $this->ref, $this->data);
     $rowChunks = parse_array($response['FILE'], '<tr>', '</tr>');
     $this->page = end($rowChunks);

     $rows = [];
     foreach ($rowChunks as $rowHtml) {
         $cells = parse_array($rowHtml, '<td', '</td>');

         // Both link cells get the same treatment: [cell without <a> tags, href tail].
         foreach ([7, 8] as $col) {
             $hrefTail = split_string(get_attribute($cells[$col], 'href'), 'proj_idDes=', AFTER, EXCL);
             $label = remove(remove($cells[$col], '<a', '>'), '</a', '>');
             $cells[$col] = [$label, $hrefTail];
         }

         $rows[] = $cells;
     }

     return $rows;
 }
Example #3
0
 /**
  * Crawls the wenku8 WAP "packshow" page of novel ids 1..1699 and stores the
  * name, last update date and download size of each one via the insert model.
  *
  * For every id the page is fetched, the novel name is cut out of the
  * '<card ... title="...TXT...">' tag, the last 'YYYY-M-D' date and the last
  * '<digits>K' size found in the page are taken, and one row is inserted.
  * The update date and an HTML line break are echoed for each id as progress
  * output. Finally the 'curl_result' view is loaded.
  *
  * @return void
  */
 public function index()
 {
     include "application/libraries/LIB_http.php";
     include "application/libraries/LIB_parse.php";
     $ref = "http://www.wenku8.cn";
     $this->load->model("insertmodel");
     $success = "Catch OK";
     for ($xx = 1; $xx < 1700; $xx++) {
         $target = 'http://www.wenku8.cn/wap/article/packshow.php?id=' . $xx . '&type=txtfull';
         $web_page = http_get($target, $ref);
         // A failed download may leave no 'FILE' entry; treat that as an empty page.
         $body = isset($web_page['FILE']) ? $web_page['FILE'] : '';

         //novel_name
         $meta_tag_array = parse_array($body, '<card', ">");
         // Spaces and hyphens are stripped from the tag before the title is cut out.
         $meta_tag_array = str_replace(array(" ", "-"), "", $meta_tag_array);
         // Pages without a <card> tag yield an empty name (replaced by the placeholder below).
         $card_tag = isset($meta_tag_array[0]) ? $meta_tag_array[0] : '';
         $meta_tag = split_string($card_tag, "title=\"", AFTER, EXCL);
         $novel_name = strip_tags(split_string($meta_tag, "TXT", BEFORE, EXCL));

         //update_time: last date found on the page. Reset on every pass so a page
         //without a date does not inherit the previous novel's value.
         $update_time = null;
         if (preg_match_all("/\\d{4}-\\d{1,2}-\\d{1,2}/", $body, $matches_array)) {
             $update_time = end($matches_array[0]);
         }
         echo $update_time;

         //size: last "<digits>K" token found on the page, reset per pass like the date.
         //The match already ends in "K", so nothing is appended (the former `. K`
         //referenced an undefined constant, which is a fatal error on PHP 8).
         $size = null;
         if (preg_match_all("/\\d+K/", $body, $get_array)) {
             $size = end($get_array[0]);
         }

         $data = array(
             'novel_id' => $xx,
             // Placeholder text (Chinese for "this novel does not exist") when no name was found.
             'novel_name' => empty($novel_name) ? '没有这本小说' : $novel_name,
             'update_time' => $update_time,
             'size' => $size,
         );
         echo "<br>";
         $this->insertmodel->insert_Novel($data);
     }
     // NOTE(review): the view receives a bare string here; CodeIgniter's loader
     // presumably expects an array or object of view variables - confirm whether
     // 'curl_result' actually gets $success.
     $this->load->view('curl_result', $success);
 }
Example #4
0
	/**
	 * Prepare the meta description from the article text if it's not set. See bug #30456
	 *
	 * When the description is empty or still equals the 'new_article' phrase, and
	 * page text is available, the description becomes the 10 most frequent index
	 * words of the page text, separated by single spaces.
	 */
	protected function prepareFields()
	{
		parent::prepareFields();

		if ((empty($this->set_fields['description']) OR $this->set_fields['description'] == (string) new vB_Phrase('vbcms', 'new_article'))
			AND !empty($this->type_set_fields['pagetext']))
		{
			require_once(DIR . '/includes/functions_databuild.php');

			$words = fetch_postindex_text($this->type_set_fields['pagetext']);

			$wordarray = split_string($words);
			$scores = array();
			foreach ($wordarray AS $word)
			{
				if (!is_index_word($word))
				{
					continue;
				}

				// Start the counter explicitly; incrementing a missing key raises
				// an undefined-index notice (a warning on PHP 8).
				if (isset($scores[$word]))
				{
					$scores[$word]++;
				}
				else
				{
					$scores[$word] = 1;
				}
			}

			// Sort scores, highest first, and keep the top 10 words
			arsort($scores, SORT_NUMERIC);
			$scores = array_slice($scores, 0, 10, true);

			$this->set_fields['description'] = trim(implode(' ', array_keys($scores)));
		}
	}
Example #5
0
     $verified_tag = $db->query_first_slave("\n\t\t\t\tSELECT tagid, tagtext\n\t\t\t\tFROM " . TABLE_PREFIX . "tag\n\t\t\t\tWHERE tagtext = '" . $db->escape_string(htmlspecialchars_uni($vbulletin->GPC['tag'])) . "'\n\t\t\t");
     if (!$verified_tag) {
         $errors[] = 'invalid_tag_specified';
     } else {
         $db->query_write("INSERT INTO " . TABLE_PREFIX . "tagsearch (tagid, dateline) VALUES (" . $verified_tag['tagid'] . ", " . TIMENOW . ")");
         $tag_join = "INNER JOIN " . TABLE_PREFIX . "tagthread AS tagthread ON (tagthread.tagid = {$verified_tag['tagid']} AND tagthread.threadid = thread.threadid)";
     }
 }
 if (empty($errors)) {
     // #############################################################################
     // ########################## START WORD QUERY LOGIC ###########################
     // #############################################################################
     if ($vbulletin->GPC['query'] and (!$vbulletin->options['fulltextsearch'] or $vbulletin->options['fulltextsearch'] and $vbulletin->GPC['searchtype'])) {
         $querysplit = $vbulletin->GPC['query'];
         // split string into separate words and back again, this will deal with MB languages without space delimiters
         $querysplit = implode(' ', split_string($querysplit));
         // #############################################################################
         // if we are doing a relevancy sort, use all AND and OR words as OR
         if ($vbulletin->GPC['sortby'] == 'rank') {
             $not = '';
             while (preg_match_all('# -(.*) #siU', " {$querysplit} ", $regs)) {
                 foreach ($regs[0] as $word) {
                     $not .= ' ' . trim($word);
                     $querysplit = trim(str_replace($word, ' ', " {$querysplit} "));
                 }
             }
             $querysplit = preg_replace('# (OR )*#si', ' OR ', $querysplit) . $not;
         }
         // #############################################################################
         // strip out common words from OR clauses pt1
         if (preg_match_all('#OR ([^\\s]+) #sU', "{$querysplit} ", $regs)) {
/**
 * Returns the portion of $string that lies between two delimiters.
 *
 * Implemented as two split_string() calls: first everything AFTER $start is
 * taken, then everything BEFORE $stop within that remainder.
 *
 * @param string $string Text to search.
 * @param string $start  Opening delimiter.
 * @param string $stop   Closing delimiter.
 * @param mixed  $type   Passed through to both split_string() calls (callers in
 *                       this code base pass EXCL or INCL).
 *
 * @return mixed Whatever the second split_string() call returns.
 */
function return_between($string, $start, $stop, $type)
{
    $after_start = split_string($string, $start, AFTER, $type);

    return split_string($after_start, $stop, BEFORE, $type);
}
Example #7
0
/**
 * Normalises post text into a lower-cased, space-separated string for indexing.
 *
 * BB code and HTML tags are removed, the text is re-joined with spaces between
 * the pieces returned by split_string(), a fixed list of regular-expression
 * clean-ups is applied in order, and the result is lower-cased and trimmed.
 *
 * @param	string	$text	Raw post text
 *
 * @return	string	Cleaned, lower-cased text
 */
function fetch_postindex_text($text)
{
	// pattern => replacement; preg_replace() applies them in this order
	static $rules = array(
		// allow through +- for boolean operators and strip colons that are not part of URLs
		'#[()"\'!\#{};<>]|\\\\|:(?!//)#s' => '',
		// punctuation runs followed by whitespace or a dot become a single space
		'#([.,?&/_]+)( |\.|\r|\n|\t)#s' => ' ',
		// remove leading +/- characters, keeping the word itself
		'#\s+(-+|\++)+([^\s]+)#si' => ' \2',
		// remove words containing asterisks
		'#(\s?\w*\*\w*)#s' => '',
		// whitespace to space
		'#[ \r\n\t]+#s' => ' ',
	);
	global $vbulletin;

	// strip bbcode, then make sure the words are delimited by spaces - there is
	// no guarantee that they are in the incoming text
	$text = implode(' ', split_string(strip_bbcode($text)));

	// pad with spaces and clean out HTML, as it's probably not going to be
	// indexed well anyway
	$text = strip_tags(" $text ");

	$text = preg_replace(array_keys($rules), array_values($rules), $text);

	// lower-casing happens last, on the cleaned text
	return trim(vbstrtolower($text));
}
Example #8
0
/**
 * Turns raw post text into the lower-cased, space-delimited form used for indexing.
 *
 * Steps, in order: strip BB code, re-join the pieces from split_string() with
 * spaces, pad with a space on each side, strip HTML tags, run the clean-up
 * regular expressions, then lower-case and trim.
 *
 * @param string $text Raw post text.
 *
 * @return string Cleaned text.
 */
function fetch_postindex_text($text)
{
    global $vbulletin;

    // Clean-up patterns and their replacements, matched up by position.
    static $patterns = array(
        '#[()"\'!\\#{};<>]|\\\\|:(?!//)#s',       // strip punctuation; colons survive only in front of "//"
        '#([.,?&/_]+)( |\\.|\\r|\\n|\\t)#s',      // punctuation run before whitespace or a dot
        '#\\s+(-+|\\++)+([^\\s]+)#si',            // leading +/- characters on a word
        '#(\\s?\\w*\\*\\w*)#s',                   // words containing asterisks
        '#[ \\r\\n\\t]+#s',                       // runs of whitespace
    );
    static $substitutes = array('', ' ', ' \\2', '', ' ');

    // Words are not guaranteed to be space-delimited, so split and re-join them.
    $spaced = implode(' ', split_string(strip_bbcode($text)));

    // HTML is unlikely to index well, so drop it before the regex pass.
    $plain = strip_tags(" {$spaced} ");

    return trim(vbstrtolower(preg_replace($patterns, $substitutes, $plain)));
}
include "application/libraries/LIB_parse.php";
include "application/libraries/LIB_thumbnail.php";
//
// Scrape the wenku8 WAP "packshow" page of novel ids 1..1799 and store each
// novel's id and name through the insert model.
$ref = "http://www.wenku8.cn";
$method = "GET";
$this->load->model("insertmodel");
for ($xx = 1; $xx < 1800; $xx++) {
    $target = 'http://www.wenku8.cn/wap/article/packshow.php?id=' . $xx . '&type=txtfull';
    $web_page = http_get($target, $ref);

    // The novel name sits in the title attribute of the <card> tag. Spaces and
    // hyphens are stripped first, so the tag looks like
    // <cardid="packshow.php"title="{novel name}TXT...">.
    $label = '<card';
    $meta_tag_array = parse_array($web_page['FILE'], $label, ">");
    $meta_tag_array = str_replace(array(" ", "-"), "", $meta_tag_array);

    // Pages without a <card> tag produce an empty name instead of an
    // undefined-offset notice.
    $meta_tag = isset($meta_tag_array[0]) ? $meta_tag_array[0] : '';
    $meta_tag = split_string($meta_tag, "title=\"", AFTER, EXCL);
    $novel_name = split_string($meta_tag, "TXT", BEFORE, EXCL);

    // One row per novel. This used to be wrapped in a loop bounded by
    // count($meta_tag); $meta_tag is a string, so that loop ran once on old PHP
    // versions and throws a TypeError on PHP 8.
    $data = array('novel_id' => $xx, 'novel_name' => $novel_name);
    $this->insertmodel->insert_Novel($data);
}
?>
	</div>
	<p class="footer">Page rendered in <strong>{elapsed_time}</strong> seconds</p>
</div>
</body>
</html>
Example #10
0
/**
 * Validates a list of context types and reports problems.
 *
 * Each piece returned by split_string($typestr) is accepted as a standard type
 * when it appears in the global $valid_types list, either as given or with the
 * 'urn:lti:context-type:ims/lis/' prefix. Other pieces are tolerated only when
 * they start with "urn:"; anything else gets a note keyed by the offending
 * value. A further note is appended when no standard type was found at all.
 *
 * @param string $typestr Type list to check.
 *
 * @return array Notes describing problems; empty when everything is fine.
 */
function check_types($typestr)
{
    global $valid_types;

    $notes = array();
    $standard_count = 0;

    foreach (split_string($typestr) as $type) {
        $is_standard = in_array($type, $valid_types)
            || in_array('urn:lti:context-type:ims/lis/' . $type, $valid_types);

        if ($is_standard) {
            $standard_count++;
            continue;
        }

        // Non-standard types are fine as long as they are spelled as a urn.
        if (strpos($type, "urn:") !== 0) {
            $notes[$type] = "Non-standard types must be fully-qualified urns";
        }
    }

    if ($standard_count === 0) {
        $notes[] = "Must include at least one standard type";
    }

    return $notes;
}
Example #11
0
 /**
  * Splits the text into chunks and writes a text file and a CSV word count
  * for each chunk.
  *
  * The text is collapsed and split, grouped into chunks by
  * split_on_spaces() using $this->spaces, optionally passed through
  * remove_junk() when $style is "clean", and each chunk is counted with
  * count_words(). Each chunk is then handed to write_txt() / write_csv()
  * together with a zero-padded label derived from its key. If any write
  * reports errors, $this->folder is removed and a notice is triggered.
  *
  * @param string $text  Text to chunk; overwritten with its space-collapsed form.
  * @param string $style Pass "clean" to run the chunks through remove_junk().
  *
  * @return array|null Collected errors, or null when nothing went wrong.
  */
 private function chunker(&$text, $style = "")
 {
     $errors = null;
     $text = collapse_spaces($text);
     $textarr = split_string($text);
     $chunksarr = split_on_spaces($textarr, $this->spaces);
     if ($style == "clean") {
         $chunksarr = remove_junk($chunksarr);
     }
     $chunkhashes = null;
     foreach ($chunksarr as $end => &$chunkarr) {
         $chunkhashes[$end] = count_words($chunkarr);
     }
     // Break the reference left behind by the loop above. Without this the
     // second loop, which reuses $chunkarr, overwrites the last chunk.
     unset($chunkarr);
     if (!$chunkhashes) {
         $errors[] = "Could not hash chunks. Huh.";
         trigger_error("Could not hash chunks. Huh.");
         return $errors;
     }
     // Labels are zero-padded to the width of (last key of the split text + 1).
     // array_keys() goes into a variable first because end() takes its
     // argument by reference.
     $keys = array_keys($textarr);
     $max = end($keys) + 1;
     $maxlen = strlen("{$max}");
     $pad = "%0{$maxlen}s";
     foreach ($chunksarr as $end => $chunkarr) {
         $endpad = $end + 1;
         $endpad = sprintf($pad, $endpad);
         $out = $this->write_txt($chunkarr, $endpad, $style);
         $out2 = $this->write_csv($chunkhashes[$end], $endpad, $style);
         // Collect only the results that actually report something.
         // array_merge() needs real arrays: a null argument made it return
         // null on older PHP (dropping every error) and throws on PHP 8.
         $failures = array();
         foreach (array($out, $out2) as $result) {
             if ($result) {
                 $failures = array_merge($failures, (array) $result);
             }
         }
         if ($failures) {
             // Same ordering as before: newest errors first.
             $errors = array_merge($failures, (array) $errors);
         }
     }
     if ($errors) {
         rrmdir($this->folder);
         trigger_error("Something in the chunking process went wrong.");
     }
     return $errors;
 }
Example #12
0
/**
 * Extracts the fields of one status element and stores them via insertDB().
 *
 * Reads the creation date, status id, text, source, reply-to ids and the user
 * block out of the XML fragment, expands the first http(s) URL found in the
 * text (if any), and passes everything to insertDB().
 *
 * @param string $currElement XML fragment describing a single status.
 *
 * @return void
 */
function crawlStatus($currElement)
{
    // Stays null when the status text contains no URL.
    $hyperlink = null;

    $createdAt = return_between($currElement, "<created_at>", "</created_at>", EXCL);
    //format the date to Database datetime type (for date based comparisons)
    $dtFormat = dateFormat($createdAt);

    // The status id is the first <id>...</id> that follows </created_at>.
    $tempsid = split_string($currElement, "</created_at>", AFTER, EXCL);
    $tempsid = split_string($tempsid, "</id>", BEFORE, EXCL);
    $sid = split_string($tempsid, "<id>", AFTER, EXCL);

    $text = return_between($currElement, "<text>", "</text>", EXCL);
    //these two calls are there to handle unicode characters or non-English text
    $text = utf8_to_unicode($text);
    $text = unicode_to_entities_preserving_ascii($text);

    //extract the first URL from the tweet, if present; tweets usually contain tiny urls, so it is expanded.
    //The result of preg_match() must be compared, not assigned: the former
    //`if ($do = true)` always ran and read $matches[0] even without a match.
    if (preg_match('@(https?://([-\\w\\.]+)+(:\\d+)?(/([\\w/_\\.]*(\\?\\S+)?)?)?)@', $text, $matches) === 1) {
        $hyperlink = expandTinyURL(htmlentities($matches[0]));
    }

    $src = return_between($currElement, "<source>", "</source>", EXCL);
    $src = strip_tags($src);

    //gathering reply to information, if the tweet is a reply
    $rts = return_between($currElement, "<in_reply_to_status_id>", "</in_reply_to_status_id>", EXCL);
    $rtu = return_between($currElement, "<in_reply_to_user_id>", "</in_reply_to_user_id>", EXCL);

    //the user block is passed on as the raw text between <user> and </user>
    $userprofile = return_between($currElement, "<user>", "</user>", EXCL);

    insertDB($sid, $text, $hyperlink, $dtFormat, $rts, $rtu, $src, $userprofile);
}
Example #13
0
/**
 * Returns the portion of $string that lies between two delimiters.
 *
 * When $start does not occur in $string (checked with strpos()) an empty
 * string is returned. Otherwise everything AFTER $start is taken with
 * split_string(), and from that remainder everything BEFORE $stop.
 *
 * @param string $string Text to search.
 * @param string $start  Opening delimiter.
 * @param string $stop   Closing delimiter.
 * @param mixed  $type   Passed through to both split_string() calls (callers in
 *                       this code base pass EXCL or INCL).
 *
 * @return mixed "" when $start is missing, otherwise the result of the second
 *               split_string() call.
 */
function return_between($string, $start, $stop, $type)
{
    // Guard added by SAH: nothing to extract when the opening delimiter is absent.
    $has_start = strpos($string, $start) !== false;
    if (!$has_start) {
        return "";
    }

    $after_start = split_string($string, $start, AFTER, $type);

    return split_string($after_start, $stop, BEFORE, $type);
}
Example #14
0
ANY USE OF THE SOFTWARE OR DOCUMENTATION.

The name and trademarks of copyright holders may NOT be used in advertising or publicity pertaining to the 
software without specific, written prior permission. Title to copyright in this software and any associated 
documentation will at all times remain with copyright holders.

Copyright 2007, Michael Schrenk

THIS SCRIPT IS FOR DEMONSTRATION PURPOSES ONLY! 
    It is not suitable for any use other than demonstrating 
    the concepts presented in Webbots, Spiders and Screen Scrapers. 
########################################################################
*/
?>



<?php 
# Demonstration of split_string() from LIB_parse: the same sample sentence is
# split around the delimiter "quick" in two different ways and each result is
# echoed as a line of HTML.
#PHP_LIBRARY_PATH: directory that holds the LIB_* library files, relative to this script
$PHP_LIBRARY_PATH = "../phplibs";
include "{$PHP_LIBRARY_PATH}/LIB_parse.php";
$string = "The quick brown fox";
echo "{$string} <br>\n";
# Parse what's before the delimiter, including the delimiter
$parsed_text = split_string($string, "quick", BEFORE, INCL);
// $parsed_text = "The quick"
echo "{$parsed_text}<br>\n";
# Parse what's after the delimiter, but don't include the delimiter
$parsed_text = split_string($string, "quick", AFTER, EXCL);
// $parsed_text = "brown fox"
// NOTE(review): presumably the result keeps the space that follows "quick" (" brown fox") - confirm against LIB_parse
echo "{$parsed_text}<br>\n";