/**
 * Add method.
 *
 * On POST, tries to extract text from the uploaded file (when the extractor
 * reports the file type as supported), stores the meeting date, attaches the
 * upload to the new entity and saves it.
 *
 * @return mixed The result of redirect() after a successful save; otherwise
 *               nothing is returned and the view is rendered with `minute` set.
 */
public function add()
{
    $minute = $this->Minutes->newEntity();

    if ($this->request->is('post')) {
        //perform content extraction
        $f = $this->request->data['uploaded_file'];
        $extractor = new \ContentExtractor($f['type']);
        if ($extractor->supported()) {
            try {
                $minute['content'] = $extractor->extract($f['tmp_name']);
            } catch (\Exception $e) {
                // Extraction is best-effort: a failure must not block the upload.
                // The class name is fully qualified on purpose: inside a namespace an
                // unqualified `Exception` resolves to a namespaced class and this
                // handler would never run.
                $minute['content'] = '';
            }
        }

        //set meeting date
        $date = new \DateTime($this->request->data['meeting_date']);
        $minute['meeting_date'] = $date;

        //process upload data (original author's note: "Always set to private" —
        //presumably enforced inside Upload::attachToEntity(); verify there)
        $ret = $this->Upload->attachToEntity($minute, $f);
        if ($ret['success'] && $this->Minutes->save($minute)) {
            $this->Flash->success(__('The minutes have been saved.'));

            return $this->redirect(['action' => 'index']);
        }

        //if we haven't been re-directed yet, we've failed
        $msg = $ret['message'];
        $this->Flash->error(__("The uploaded file could not be saved. Error message was: '{$msg}'"));
    }

    $this->set(compact('minute'));
    $this->set('_serialize', ['minute']);
}
/**
 * Builds the mobi file for the current page and returns the file's contents.
 *
 * When extraction is enabled, the encoded contents are run through
 * ContentExtractor, ImageDownloader is run over the extracted node if images
 * are enabled, and ContentsNormalizer produces the HTML that is written out.
 * Otherwise the raw contents ($this->rowContents) are written unchanged.
 *
 * @return string|false Contents of the file at dirBuilder->getMobiPath() as read
 *                      by file_get_contents(); false if that read fails.
 */
public function convertToKindleFile()
{
    $html = $this->rowContents;

    if ($this->isExtractEnabled) {
        $extractor = new ContentExtractor();
        $extractor->exec($this->encodedContents());

        if ($this->isImageEnabled) {
            $imgDownloader = new ImageDownloader(
                $extractor->getExtractedNode(),
                new Url($this->url),
                $this->dirBuilder
            );
            $imgDownloader->exec();
        }

        $normalizer = new ContentsNormalizer($this->url, $extractor->title, $extractor->getExtractedNode());
        $normalizer->exec();
        $html = $normalizer->getHtml();
    }

    // The return value of putContents() was never used, so it is no longer captured.
    $this->dirBuilder->putContents($html);

    // Only the base name of the mobi path is handed to the command.
    $mobiFileName = pathinfo($this->dirBuilder->getMobiPath(), PATHINFO_BASENAME);
    $command = KindleGenCommand::newInstance($this->dirBuilder->getContentsPath(), $mobiFileName);
    $command->exec();

    return file_get_contents($this->dirBuilder->getMobiPath());
}
/**
 * Debug helper test: runs ContentExtractor over the dom_document.html fixture
 * and dumps the intermediate results with d(). It makes no assertions.
 */
public function testDomDocument()
{
    $fixtureHtml = $this->loadDat('dom_document.html');
    // (disabled) $html = mb_convert_encoding($html, 'HTML-ENTITIES', 'UTF-8');

    $contentExtractor = new ContentExtractor();
    $contentExtractor->exec($fixtureHtml);

    $calculatedXpath = $contentExtractor->calculateXpath();
    $scannedText = $contentExtractor->scan($contentExtractor->getExtractedNode());

    d($contentExtractor->getExtractedNode()->nodeName);
    d($scannedText);
    d($calculatedXpath);

    // Unlabelled dumps of the extractor's public state.
    foreach (['params', 'text', 'title'] as $property) {
        d($contentExtractor->{$property});
    }

    // Labelled dumps, printed as "<property name>:<value>".
    foreach (['pancutuationCountAll', 'domCountAll', 'textLengthAll', 'textAll'] as $property) {
        d($property . ':' . $contentExtractor->{$property});
    }

    // Multibyte length sanity checks.
    d(mb_strlen('あ', 'utf-8'));
    d(mb_strlen('ほげ '));
}
/**
 * For every entry returned by getData(), runs ContentExtractor over the fixture
 * file of the same name and asserts that the calculated xpath equals one of the
 * entry's xpathCandidates. On a miss, diagnostic values are dumped with d() and
 * the scanned text is written to ./test2.txt before the assertion fails.
 */
public function testXpath()
{
    $fixtureDir = implode('/', [PATH_TEST, 'dat', 'ContentExtractor']);

    foreach ($this->getData() as $name => $testData) {
        // The file name is the key of the expected xpath entry.
        $fixturePath = $fixtureDir . '/' . $name;
        if (is_dir($fixturePath)) {
            continue;
        }

        $extractor = new ContentExtractor();
        $extractor->exec(file_get_contents($fixturePath));
        $xpath = $extractor->calculateXpath();
        $text = $extractor->scan($extractor->getExtractedNode());

        // Strict comparison against every candidate; any match counts as a hit.
        $matched = false;
        foreach ($testData->xpathCandidates as $candidate) {
            $matched = $matched || $candidate === $xpath;
        }

        if ($matched === false) {
            d($text);
            file_put_contents('./test2.txt', $text);
            d($extractor->getExtractedNode()->nodeName);
            d($xpath);
            d($testData->url);
            d($extractor->params);
            foreach (['pancutuationCountAll', 'domCountAll', 'textLengthAll', 'textAll'] as $property) {
                d($property . ':' . $extractor->{$property});
            }
            // (disabled) d('preProcessedInput:' . $extractor->preProcessedInput);
        }

        $this->assertEquals(true, $matched, $xpath . ' ' . $testData->url);
    }
}
if (!$debug_mode) { header('Expires: ' . gmdate('D, d M Y H:i:s', time() + (isset($options->cache_ttl) ? $options->cache_ttl : 10 * 60)) . ' GMT'); } ////////////////////////////////// // Set up HTTP agent ////////////////////////////////// $http = new HumbleHttpAgent(); $http->debug = $debug_mode; $http->userAgentMap = $options->user_agents; $http->headerOnlyTypes = array_keys($options->content_type_exc); $http->rewriteUrls = $options->rewrite_url; //$http->initCache($options->cache_dir, $options->cache_directory_level, $options->cache_cleanup, isset($options->http_cache_ttl) ? $options->http_cache_ttl : 12*60*60); ////////////////////////////////// // Set up Content Extractor ////////////////////////////////// $extractor = new ContentExtractor(dirname(__FILE__) . '/site_config/custom', dirname(__FILE__) . '/site_config/standard'); $extractor->debug = $debug_mode; SiteConfig::$debug = $debug_mode; SiteConfig::use_apc($options->apc); $extractor->fingerprints = $options->fingerprints; $extractor->allowedParsers = $options->allowed_parsers; //////////////////////////////// // Get RSS/Atom feed //////////////////////////////// if (!$html_only) { debug('--------'); debug("Attempting to process URL as feed"); // Send user agent header showing PHP (prevents a HTML response from feedburner) $http->userAgentDefault = HumbleHttpAgent::UA_PHP; // configure SimplePie HTTP extension class to use our HumbleHttpAgent instance SimplePie_HumbleHttpAgent::set_agent($http);
if ($valid_key) { header('Expires: ' . gmdate('D, d M Y H:i:s', time() + 60 * 10) . ' GMT'); } else { header('Expires: ' . gmdate('D, d M Y H:i:s', time() + 60 * 20) . ' GMT'); } ////////////////////////////////// // Set up HTTP agent ////////////////////////////////// $http = new HumbleHttpAgent(); $http->userAgentMap = $options->user_agents; $http->headerOnlyTypes = array_keys($options->content_type_exc); $http->rewriteUrls = $options->rewrite_url; ////////////////////////////////// // Set up Content Extractor ////////////////////////////////// $extractor = new ContentExtractor(dirname(__FILE__) . '/site_config/custom', dirname(__FILE__) . '/site_config/standard'); $extractor->fingerprints = $options->fingerprints; /* if ($options->caching) { $frontendOptions = array( 'lifetime' => 30*60, // cache lifetime of 30 minutes 'automatic_serialization' => true, 'write_control' => false, 'automatic_cleaning_factor' => $options->cache_cleanup, 'ignore_user_abort' => false ); $backendOptions = array( 'cache_dir' => $options->cache_dir.'/http-responses/', // directory where to put the cache files 'file_locking' => false, 'read_control' => true, 'read_control_type' => 'strlen',
<?php
// Team page ("Il Team"): pulls in the shared helpers and model files, creates the
// database handle and the content extractor, then emits the document head and the
// header container that includes the navbar partial.
include_once './generics.php';
include_once './models/doctors.php';
include_once './models/departments.php';
include_once './models/ambulatories.php';
include_once './content_extractor.php';
// NOTE(review): $lang is not assigned anywhere in this fragment — presumably it is
// defined by generics.php; confirm.
$db = new Database($lang);
// The extractor is pointed at the per-language team.xml file (presumably the page's
// text content — verify against ContentExtractor).
$content_extractor = new ContentExtractor("content/{$lang}/team.xml");
?> <!DOCTYPE html> <html> <head> <meta charset="UTF-8"> <meta name="viewport" content="width=device-width, initial-scale=1"> <meta http-equiv="X-UA-Compatible" content="IE=edge"> <link href='https://fonts.googleapis.com/css?family=Antic+Slab:400,600' rel='stylesheet' type='text/css'> <link href='https://fonts.googleapis.com/css?family=Source+Sans+Pro:400,600' rel='stylesheet' type='text/css'> <link rel="stylesheet" href="https://maxcdn.bootstrapcdn.com/font-awesome/4.5.0/css/font-awesome.min.css"> <link rel="stylesheet" type="text/css" href="./css/bootstrap.min.css"> <link rel="stylesheet" type="text/css" href="./css/layout.css"> <link rel="stylesheet" type="text/css" href="./css/team.css"> <title class="text-capitalize">Il Team</title> </head> <body> <div id="header_bg_image"> <?php include_once "./partials/_navbar.php"; ?> </div>
} } }
        return array($link_cnt, $word_cnt, $text_len, 'delete' => $delete);
    }

    /**
     * Converts a decimal numeric character reference (e.g. "&#20013;") into the
     * UTF-8 byte sequence for that code point.
     *
     * NOTE(review): the original read the number from `$str[0]`, which is only
     * correct when $str is an array whose first element is the reference —
     * presumably the match array of a preg_replace_callback() call; verify
     * against the caller. A plain string is also accepted here and is decoded as
     * a whole.
     *
     * @param array|string $str Match array (element 0 is the reference) or the
     *                          reference string itself.
     * @return string UTF-8 encoded character; U+FFFD for values above 0x10FFFF.
     */
    function decode($str)
    {
        $str = str_replace(array('&#', ';'), array('', ''), $str);
        $dec = intval(is_array($str) ? $str[0] : $str);

        // Initialise the result explicitly: appending to an undefined variable
        // raises a warning/notice.
        $utf = '';

        if ($dec < 0x80) {
            // 1 byte: 0xxxxxxx
            $utf .= chr($dec);
        } elseif ($dec < 0x800) {
            // 2 bytes: 110xxxxx 10xxxxxx
            $utf .= chr(0xC0 | ($dec >> 6));
            $utf .= chr(0x80 | ($dec & 0x3F));
        } elseif ($dec < 0x10000) {
            // 3 bytes: 1110xxxx 10xxxxxx 10xxxxxx
            $utf .= chr(0xE0 | ($dec >> 12));
            $utf .= chr(0x80 | (($dec >> 6) & 0x3F));
            $utf .= chr(0x80 | ($dec & 0x3F));
        } elseif ($dec <= 0x10FFFF) {
            // 4 bytes: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx (previously unhandled,
            // which produced invalid byte sequences for these code points).
            $utf .= chr(0xF0 | ($dec >> 18));
            $utf .= chr(0x80 | (($dec >> 12) & 0x3F));
            $utf .= chr(0x80 | (($dec >> 6) & 0x3F));
            $utf .= chr(0x80 | ($dec & 0x3F));
        } else {
            // Not a valid Unicode code point: emit U+FFFD REPLACEMENT CHARACTER.
            $utf .= "\xEF\xBF\xBD";
        }

        return $utf;
    }
}

/**************************** Simple usage example *****************************/
// NOTE(review): this runs a network fetch and echoes output whenever the file is
// loaded; kept unchanged here.
$html = file_get_contents('http://www.techweb.com.cn/news/2010-07-20/644329.shtml');
$extractor = new ContentExtractor();
$content = $extractor->extract($html);
echo $content;
$_req_options = array('proxyhost' => $proxy['host']); if (isset($proxy['auth'])) { $_req_options['proxyauth'] = $proxy['auth']; } } $http = new HumbleHttpAgent($_req_options); $http->debug = $debug_mode; $http->userAgentMap = $options->user_agents; $http->headerOnlyTypes = array_keys($options->content_type_exc); $http->rewriteUrls = $options->rewrite_url; unset($_req_options); } ////////////////////////////////// // Set up Content Extractor ////////////////////////////////// $extractor = new ContentExtractor(dirname(__FILE__) . '/site_config/custom', dirname(__FILE__) . '/site_config/standard'); $extractor->debug = $debug_mode; SiteConfig::$debug = $debug_mode; SiteConfig::use_apc($options->apc); $extractor->fingerprints = $options->fingerprints; $extractor->allowedParsers = $options->allowed_parsers; $extractor->parserOverride = $parser; if ($options->user_submitted_config && $user_submitted_config) { $extractor->setUserSubmittedConfig($user_submitted_config); } //////////////////////////////// // Get RSS/Atom feed //////////////////////////////// if ($accept !== 'html') { debug('--------'); debug("Attempting to process URL as feed");
////////////////////////////////// // Set Expires header ////////////////////////////////// if ($valid_key) { header('Expires: ' . gmdate('D, d M Y H:i:s', time() + 60 * 10) . ' GMT'); } else { header('Expires: ' . gmdate('D, d M Y H:i:s', time() + 60 * 20) . ' GMT'); } ////////////////////////////////// // Set up HTTP agent ////////////////////////////////// $http = new HumbleHttpAgent(); ////////////////////////////////// // Set up Content Extractor ////////////////////////////////// $extractor = new ContentExtractor(dirname(__FILE__) . '/site_config/custom', dirname(__FILE__) . '/site_config/standard'); /* if ($options->caching) { $frontendOptions = array( 'lifetime' => 30*60, // cache lifetime of 30 minutes 'automatic_serialization' => true, 'write_control' => false, 'automatic_cleaning_factor' => $options->cache_cleanup, 'ignore_user_abort' => false ); $backendOptions = array( 'cache_dir' => $options->cache_dir.'/http-responses/', // directory where to put the cache files 'file_locking' => false, 'read_control' => true, 'read_control_type' => 'strlen', 'hashed_directory_level' => $options->cache_directory_level,
/**
 * Fetches $permalink with a fresh ContentExtractor, extracts its content block
 * and returns that block imported into this extractor's readability DOM.
 *
 * Returns FALSE when the permalink is already listed in next_pages, when the
 * nesting depth (next_page_deep_count + 1) exceeds 3, when the page cannot be
 * fetched or processed, or when the extracted HTML is empty or cannot be loaded.
 *
 * @param string $permalink URL of the page to fetch.
 * @return DOMNode|false The imported content node, or FALSE on any failure.
 */
public function extractContentBlock($permalink)
{
    $extractor = new ContentExtractor(dirname(__FILE__) . '/site_config/custom', dirname(__FILE__) . '/site_config/standard');

    // Carry the visited-page list and depth counter over to the nested extractor.
    // The array is copied on assignment, so adding to it below does not modify
    // $this->next_pages.
    $extractor->next_page_deep_count = $this->next_page_deep_count + 1;
    $extractor->next_pages = $this->next_pages;
    if (in_array($permalink, $extractor->next_pages)) {
        return FALSE;
    }
    $extractor->next_pages[] = $permalink;
    if ($extractor->next_page_deep_count > 3) {
        return FALSE;
    }

    // The original first assigned $this->options->fingerprints and then
    // immediately overwrote it with this value; only the effective assignment
    // is kept.
    $extractor->fingerprints = $this->fingerprints;

    // NOTE(review): $elem is never used. It is kept only because the constructor
    // is not visible here and may have side effects — confirm and remove.
    $elem = new ContentExtractor($this->path, $this->fallback);

    // Fetch once. The original issued the same request twice (an unconditional
    // get() followed by the one in the condition below).
    $http = new HumbleHttpAgent();
    $content_block = null;
    if ($permalink && ($response = $http->get($permalink, true)) && ($response['status_code'] < 300 || $response['status_code'] > 400)) {
        $html = $response['body'];
        // remove strange things
        $html = str_replace('</[>', '', $html);
        $html = convert_to_utf8($html, $response['headers']);
        if (function_exists('mb_convert_encoding')) {
            // NOTE(review): the 'HTML-ENTITIES' target is deprecated as of PHP 8.2.
            $html = mb_convert_encoding($html, 'HTML-ENTITIES', "UTF-8");
        }
        if ($extractor->process($html, $permalink)) {
            $content_block = $extractor->getContent();
        }
    }

    // Fetch or extraction failed: previously this fell through and dereferenced
    // an undefined/null $content_block.
    if (!$content_block) {
        return FALSE;
    }

    $inner_html = (string) $content_block->innerHTML;
    if ($inner_html === '') {
        return FALSE;
    }

    // Loading the markup serves purely as a validity check; parser warnings for
    // real-world HTML are expected, hence the suppression.
    $doc = new DOMDocument();
    if (!@$doc->loadHTML($inner_html)) {
        return FALSE;
    }

    return $this->readability->dom->importNode($content_block, true);
}
<?php
// Department page: loads the department selected by the `ID` query parameter and
// emits the document head plus the opening header container.
include_once './generics.php';
include_once './content_extractor.php';
include_once './models/doctors.php';
include_once './models/departments.php';

// NOTE(review): $lang is not assigned in this fragment — presumably defined by
// generics.php; confirm.
$db = new Database($lang);

// `ID` is untrusted request input: accept only an integer and fall back to the
// default department (1) when it is missing or malformed.
$department_id = array_key_exists('ID', $_GET) ? filter_var($_GET['ID'], FILTER_VALIDATE_INT) : 1;
if ($department_id === false) {
    $department_id = 1;
}

$department = new Department($db);
$department->get_by_id($department_id);
$content_extractor = new ContentExtractor("content/{$lang}/departments/" . strtolower($department->acronim) . ".xml");
?>
<!DOCTYPE html>
<html>
<head>
    <meta charset="UTF-8">
    <meta name="viewport" content="width=device-width, initial-scale=1">
    <meta http-equiv="X-UA-Compatible" content="IE=edge">
    <link href='https://fonts.googleapis.com/css?family=Antic+Slab:400,600' rel='stylesheet' type='text/css'>
    <link href='https://fonts.googleapis.com/css?family=Source+Sans+Pro:400,600' rel='stylesheet' type='text/css'>
    <link rel="stylesheet" href="https://maxcdn.bootstrapcdn.com/font-awesome/4.5.0/css/font-awesome.min.css">
    <link rel="stylesheet" type="text/css" href="./css/bootstrap.min.css">
    <link rel="stylesheet" type="text/css" href="./css/layout.css">
    <link rel="stylesheet" type="text/css" href="./css/department.css">
    <?php
    // Values read from the department record are escaped before being placed in
    // attribute values or element text.
    echo "<link rel='stylesheet' type='text/css' href='./css/"
        . htmlspecialchars(strtolower((string) $department->acronim), ENT_QUOTES, 'UTF-8')
        . ".css'>\n\t<title> "
        . htmlspecialchars((string) $department->name, ENT_QUOTES, 'UTF-8')
        . "</title>";
    ?>
</head>
<body>
<?php
echo "<div id='header_bg_image' data-image='"
    . htmlspecialchars((string) $department->image, ENT_QUOTES, 'UTF-8')
    . "'>";