/**
  * Add method
  *
  * Creates a new Minutes entity. On POST it tries to extract text from the
  * uploaded file, sets the meeting date, attaches the upload through the
  * Upload component and saves the entity.
  *
  * @return mixed The value of redirect() after a successful save; otherwise
  *   nothing is returned and the view is rendered with the entity set.
  */
 public function add()
 {
     $minute = $this->Minutes->newEntity();
     if ($this->request->is('post')) {
         // Content extraction is best-effort: a failure leaves the content
         // empty instead of aborting the upload.
         $f = $this->request->data['uploaded_file'];
         $extractor = new \ContentExtractor($f['type']);
         if ($extractor->supported()) {
             try {
                 $minute['content'] = $extractor->extract($f['tmp_name']);
             } catch (\Exception $e) {
                 // Fully qualified on purpose: this method already writes
                 // \ContentExtractor and \DateTime with a leading backslash,
                 // which implies a namespaced file. An unqualified
                 // "Exception" would then name a class inside that namespace
                 // and this handler would never run.
                 $minute['content'] = '';
             }
         }
         // Set the meeting date from the submitted form value.
         $date = new \DateTime($this->request->data['meeting_date']);
         $minute['meeting_date'] = $date;
         // Attach the uploaded file to the entity via the Upload component.
         $ret = $this->Upload->attachToEntity($minute, $f);
         if ($ret['success'] && $this->Minutes->save($minute)) {
             $this->Flash->success(__('The minutes have been saved.'));
             return $this->redirect(['action' => 'index']);
         }
         // No redirect happened, so either the attach or the save failed.
         // The message key is read defensively because a failed save after a
         // successful attach may not provide one.
         $msg = isset($ret['message']) ? $ret['message'] : '';
         $this->Flash->error(__("The uploaded file could not be saved. Error message was: '{$msg}'"));
     }
     $this->set(compact('minute'));
     $this->set('_serialize', ['minute']);
 }
 /**
  * Builds the .mobi file for this page and returns its contents.
  *
  * Starts from $this->rowContents. When extraction is enabled, the encoded
  * contents are run through ContentExtractor, images are optionally
  * downloaded, and the normalizer's HTML replaces the raw contents. The HTML
  * is then written through the directory builder, KindleGenCommand is
  * executed, and the file at the builder's mobi path is read back.
  *
  * @return string|false Contents of the generated file, or false when
  *   file_get_contents() cannot read it.
  */
 public function convertToKindleFile()
 {
     $html = $this->rowContents;
     if ($this->isExtractEnabled) {
         $extractor = new ContentExtractor();
         $extractor->exec($this->encodedContents());
         if ($this->isImageEnabled) {
             $imgDownloader = new ImageDownloader($extractor->getExtractedNode(), new Url($this->url), $this->dirBuilder);
             $imgDownloader->exec();
         }
         $normalizer = new ContentsNormalizer($this->url, $extractor->title, $extractor->getExtractedNode());
         $normalizer->exec();
         $html = $normalizer->getHtml();
     }
     // The return value of putContents() was stored but never used, so the
     // call is kept only for its effect.
     $this->dirBuilder->putContents($html);
     $mobiFileName = pathinfo($this->dirBuilder->getMobiPath(), PATHINFO_BASENAME);
     $command = KindleGenCommand::newInstance($this->dirBuilder->getContentsPath(), $mobiFileName);
     $command->exec();
     return file_get_contents($this->dirBuilder->getMobiPath());
 }
    /**
     * Debugging aid rather than a real test: runs ContentExtractor over the
     * data loaded for 'dom_document.html' and dumps the results with d().
     * It makes no assertions.
     */
    public function testDomDocument()
    {
        $html = $this->loadDat('dom_document.html');
        // Disabled pre-conversion of the input to HTML entities:
        //		$html = mb_convert_encoding($html, 'HTML-ENTITIES', 'UTF-8');
        $extractor = new ContentExtractor();
        $extractor->exec($html);
        $xpath = $extractor->calculateXpath();
        $text = $extractor->scan($extractor->getExtractedNode());
        // Dump the selected node's name, the scan result and the computed XPath.
        d($extractor->getExtractedNode()->nodeName);
        d($text);
        d($xpath);
        // Dump the extractor's public state.
        d($extractor->params);
        d($extractor->text);
        d($extractor->title);
        // "pancutuationCountAll" is the property's actual spelling.
        d('pancutuationCountAll:' . $extractor->pancutuationCountAll);
        d('domCountAll:' . $extractor->domCountAll);
        d('textLengthAll:' . $extractor->textLengthAll);
        d('textAll:' . $extractor->textAll);
        // Character-count checks: one call with an explicit encoding, one with
        // the default encoding on a literal that spans two source lines (so it
        // includes the line break and the indentation whitespace).
        d(mb_strlen('あ', 'utf-8'));
        d(mb_strlen('ほげ
				'));
    }
 /**
  * Runs the extractor over each data file and asserts that the computed
  * XPath equals one of the candidates listed for that file. When none
  * matches, diagnostic details are dumped before the assertion fails.
  */
 public function testXpath()
 {
     $fixtureDir = PATH_TEST . '/dat/ContentExtractor';
     foreach ($this->getData() as $name => $testData) {
         // The data key is also the file name; its entry lists the accepted XPaths.
         $fixtureFile = $fixtureDir . '/' . $name;
         if (is_dir($fixtureFile)) {
             continue;
         }

         $markup = file_get_contents($fixtureFile);
         $extractor = new ContentExtractor();
         $extractor->exec($markup);
         $xpath = $extractor->calculateXpath();
         $text = $extractor->scan($extractor->getExtractedNode());

         // Strict comparison against every accepted candidate.
         $matched = false;
         foreach ($testData->xpathCandidates as $candidate) {
             if ($candidate === $xpath) {
                 $matched = true;
                 break;
             }
         }

         if (!$matched) {
             // Miss: dump everything needed to diagnose it and keep a copy
             // of the scanned text on disk.
             d($text);
             file_put_contents('./test2.txt', $text);
             d($extractor->getExtractedNode()->nodeName);
             d($xpath);
             d($testData->url);
             d($extractor->params);
             d('pancutuationCountAll:' . $extractor->pancutuationCountAll);
             d('domCountAll:' . $extractor->domCountAll);
             d('textLengthAll:' . $extractor->textLengthAll);
             d('textAll:' . $extractor->textAll);
         }

         $this->assertEquals(true, $matched, "{$xpath} {$testData->url}");
     }
 }
// Outside debug mode, send an Expires header. The lifetime is
// $options->cache_ttl when that option is set, otherwise 10 minutes.
if (!$debug_mode) {
    header(sprintf(
        'Expires: %s GMT',
        gmdate('D, d M Y H:i:s', time() + (isset($options->cache_ttl) ? $options->cache_ttl : 10 * 60))
    ));
}

// ---------------------------------------------------------------
// HTTP agent
// ---------------------------------------------------------------
$http = new HumbleHttpAgent();
$http->debug = $debug_mode;
$http->userAgentMap = $options->user_agents;
$http->headerOnlyTypes = array_keys($options->content_type_exc);
$http->rewriteUrls = $options->rewrite_url;
// HTTP response caching is currently switched off:
//$http->initCache($options->cache_dir, $options->cache_directory_level, $options->cache_cleanup, isset($options->http_cache_ttl) ? $options->http_cache_ttl : 12*60*60);

// ---------------------------------------------------------------
// Content extractor (custom site configs first, standard ones second)
// ---------------------------------------------------------------
$extractor = new ContentExtractor(__DIR__ . '/site_config/custom', __DIR__ . '/site_config/standard');
$extractor->debug = $debug_mode;
SiteConfig::$debug = $debug_mode;
SiteConfig::use_apc($options->apc);
$extractor->fingerprints = $options->fingerprints;
$extractor->allowedParsers = $options->allowed_parsers;
////////////////////////////////
// Get RSS/Atom feed
////////////////////////////////
// Skipped entirely when $html_only is set; otherwise the URL is first
// treated as a feed.
if (!$html_only) {
    debug('--------');
    debug("Attempting to process URL as feed");
    // Send a user agent header identifying PHP (prevents an HTML response from feedburner)
    $http->userAgentDefault = HumbleHttpAgent::UA_PHP;
    // Configure the SimplePie HTTP extension class to use our HumbleHttpAgent instance
    SimplePie_HumbleHttpAgent::set_agent($http);
// Expires header: 10 minutes ahead for requests with a valid key,
// 20 minutes ahead for everyone else.
header(sprintf(
    'Expires: %s GMT',
    gmdate('D, d M Y H:i:s', time() + 60 * ($valid_key ? 10 : 20))
));

// ---------------------------------------------------------------
// HTTP agent
// ---------------------------------------------------------------
$http = new HumbleHttpAgent();
$http->userAgentMap = $options->user_agents;
$http->headerOnlyTypes = array_keys($options->content_type_exc);
$http->rewriteUrls = $options->rewrite_url;

// ---------------------------------------------------------------
// Content extractor (custom site configs first, standard ones second)
// ---------------------------------------------------------------
$extractor = new ContentExtractor(__DIR__ . '/site_config/custom', __DIR__ . '/site_config/standard');
$extractor->fingerprints = $options->fingerprints;
/*
if ($options->caching) {
	$frontendOptions = array(
	   'lifetime' => 30*60, // cache lifetime of 30 minutes
	   'automatic_serialization' => true,
	   'write_control' => false,
	   'automatic_cleaning_factor' => $options->cache_cleanup,
	   'ignore_user_abort' => false
	); 
	$backendOptions = array(
		'cache_dir' => $options->cache_dir.'/http-responses/', // directory where to put the cache files
		'file_locking' => false,
		'read_control' => true,
		'read_control_type' => 'strlen',
Example #7
0
<?php

// Team page prologue: pulls in the shared helpers and model files, then
// creates the objects used by the markup below.
include_once './generics.php';
include_once './models/doctors.php';
include_once './models/departments.php';
include_once './models/ambulatories.php';
include_once './content_extractor.php';
// $lang is not assigned in this file — presumably provided by one of the
// includes above; verify.
$db = new Database($lang);
// The extractor is pointed at the per-language team.xml content file.
$content_extractor = new ContentExtractor("content/{$lang}/team.xml");
?>
<!DOCTYPE html>

<html>
<head>
	<meta charset="UTF-8">
	<meta name="viewport" content="width=device-width, initial-scale=1">
	<meta http-equiv="X-UA-Compatible" content="IE=edge">
	<link href='https://fonts.googleapis.com/css?family=Antic+Slab:400,600' rel='stylesheet' type='text/css'>
	<link href='https://fonts.googleapis.com/css?family=Source+Sans+Pro:400,600' rel='stylesheet' type='text/css'>
	<link rel="stylesheet" href="https://maxcdn.bootstrapcdn.com/font-awesome/4.5.0/css/font-awesome.min.css">
	<link rel="stylesheet" type="text/css" href="./css/bootstrap.min.css">
	<link rel="stylesheet" type="text/css" href="./css/layout.css">
	<link rel="stylesheet" type="text/css" href="./css/team.css">	
	<title class="text-capitalize">Il Team</title>
</head>
<body>
	<div id="header_bg_image">
		<?php 
include_once "./partials/_navbar.php";
?>
	</div>
Example #8
0
                }
            }
        }
        return array($link_cnt, $word_cnt, $text_len, 'delete' => $delete);
    }
    /**
     * Converts a decimal numeric character reference such as "&#20013;"
     * into the UTF-8 encoding of that code point.
     *
     * @param array|string $str Either the entity text itself, or an array
     *                          whose element 0 holds it (for example the
     *                          match array preg_replace_callback() passes
     *                          to its callback).
     * @return string The UTF-8 byte sequence, or U+FFFD (the replacement
     *                character) when the number is not a valid Unicode
     *                scalar value.
     */
    function decode($str)
    {
        // Accept both a match array and a plain entity string. Indexing a
        // plain string with [0] would only read its first character.
        if (is_array($str)) {
            $entity = isset($str[0]) ? $str[0] : '';
        } else {
            $entity = (string) $str;
        }
        $dec = intval(str_replace(array('&#', ';'), array('', ''), $entity));

        // Negative values, values beyond U+10FFFF and the surrogate range
        // cannot be represented in UTF-8.
        if ($dec < 0 || $dec > 0x10FFFF || ($dec >= 0xD800 && $dec <= 0xDFFF)) {
            return "\xEF\xBF\xBD";
        }
        if ($dec < 0x80) {
            // One byte: plain ASCII.
            return chr($dec);
        }
        if ($dec < 0x800) {
            // Two bytes: 110xxxxx 10xxxxxx.
            return chr(0xC0 | ($dec >> 6))
                . chr(0x80 | ($dec & 0x3F));
        }
        if ($dec < 0x10000) {
            // Three bytes: 1110xxxx 10xxxxxx 10xxxxxx.
            return chr(0xE0 | ($dec >> 12))
                . chr(0x80 | (($dec >> 6) & 0x3F))
                . chr(0x80 | ($dec & 0x3F));
        }
        // Four bytes: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx. Code points at or
        // above U+10000 previously fell into the three-byte branch and
        // produced an invalid sequence.
        return chr(0xF0 | ($dec >> 18))
            . chr(0x80 | (($dec >> 12) & 0x3F))
            . chr(0x80 | (($dec >> 6) & 0x3F))
            . chr(0x80 | ($dec & 0x3F));
    }
}
/****************************
	Simple usage example

	Downloads a sample article, extracts its content with
	ContentExtractor::extract() and prints the result.
*****************************/
$html = file_get_contents('http://www.techweb.com.cn/news/2010-07-20/644329.shtml');
if ($html === false) {
    // file_get_contents() returns false when the download fails; passing
    // that on to extract() would only hide the real problem.
    echo "Unable to download the sample page.\n";
} else {
    $extractor = new ContentExtractor();
    $content = $extractor->extract($html);
    echo $content;
}
        $_req_options = array('proxyhost' => $proxy['host']);
        if (isset($proxy['auth'])) {
            $_req_options['proxyauth'] = $proxy['auth'];
        }
    }
    $http = new HumbleHttpAgent($_req_options);
    $http->debug = $debug_mode;
    $http->userAgentMap = $options->user_agents;
    $http->headerOnlyTypes = array_keys($options->content_type_exc);
    $http->rewriteUrls = $options->rewrite_url;
    unset($_req_options);
}
// ---------------------------------------------------------------
// Content extractor (custom site configs first, standard ones second)
// ---------------------------------------------------------------
$extractor = new ContentExtractor(__DIR__ . '/site_config/custom', __DIR__ . '/site_config/standard');

// The extractor and the site-config loader share the same debug switch.
$extractor->debug = $debug_mode;
SiteConfig::$debug = $debug_mode;
SiteConfig::use_apc($options->apc);

$extractor->fingerprints = $options->fingerprints;
$extractor->allowedParsers = $options->allowed_parsers;
$extractor->parserOverride = $parser;

// A user-supplied site config is applied only when the feature is enabled
// and a config was actually submitted.
if ($options->user_submitted_config && $user_submitted_config) {
    $extractor->setUserSubmittedConfig($user_submitted_config);
}
////////////////////////////////
// Get RSS/Atom feed
////////////////////////////////
// Skipped when $accept is 'html'; otherwise the URL is first treated as a feed.
if ($accept !== 'html') {
    debug('--------');
    debug("Attempting to process URL as feed");
// ---------------------------------------------------------------
// Expires header: 10 minutes ahead for requests with a valid key,
// 20 minutes ahead for everyone else.
// ---------------------------------------------------------------
header(sprintf(
    'Expires: %s GMT',
    gmdate('D, d M Y H:i:s', time() + 60 * ($valid_key ? 10 : 20))
));

// ---------------------------------------------------------------
// HTTP agent
// ---------------------------------------------------------------
$http = new HumbleHttpAgent();

// ---------------------------------------------------------------
// Content extractor (custom site configs first, standard ones second)
// ---------------------------------------------------------------
$extractor = new ContentExtractor(__DIR__ . '/site_config/custom', __DIR__ . '/site_config/standard');
/*
if ($options->caching) {
	$frontendOptions = array(
	   'lifetime' => 30*60, // cache lifetime of 30 minutes
	   'automatic_serialization' => true,
	   'write_control' => false,
	   'automatic_cleaning_factor' => $options->cache_cleanup,
	   'ignore_user_abort' => false
	); 
	$backendOptions = array(
		'cache_dir' => $options->cache_dir.'/http-responses/', // directory where to put the cache files
		'file_locking' => false,
		'read_control' => true,
		'read_control_type' => 'strlen',
		'hashed_directory_level' => $options->cache_directory_level,
 /**
  * Fetches the page at $permalink, runs a fresh ContentExtractor over it and
  * returns the extracted content node imported into this instance's
  * readability DOM.
  *
  * The call ends early with false when the permalink is already listed in
  * next_pages or when the nesting counter exceeds 3.
  *
  * @param string $permalink URL of the page to fetch and extract.
  * @return DOMNode|false The imported content node, or false when the page
  *   was skipped, could not be fetched, yielded no content, or its HTML
  *   could not be parsed.
  */
 public function extractContentBlock($permalink)
 {
     $extractor = new ContentExtractor(dirname(__FILE__) . '/site_config/custom', dirname(__FILE__) . '/site_config/standard');
     $extractor->next_page_deep_count = $this->next_page_deep_count + 1;
     $extractor->next_pages = $this->next_pages;
     if (in_array($permalink, $extractor->next_pages)) {
         return false;
     }
     $extractor->next_pages[] = $permalink;
     if ($extractor->next_page_deep_count > 3) {
         return false;
     }
     // NOTE(review): this second instance is never used afterwards. It is
     // kept only in case the constructor has side effects callers rely on;
     // confirm and remove if it has none.
     $elem = new ContentExtractor($this->path, $this->fallback);
     // The earlier assignment from $this->options->fingerprints was dropped:
     // it was overwritten by this one before ever being read.
     $extractor->fingerprints = $this->fingerprints;

     // Stays null unless the fetch and the extraction both succeed, so the
     // code below never touches an undefined variable.
     $content_block = null;
     $http = new HumbleHttpAgent();
     // The page is requested once, here; previously it was also fetched
     // unconditionally just before this check, doubling the HTTP traffic.
     // NOTE(review): the status test also accepts codes above 400 (error
     // pages) while rejecting 300-400 — confirm this is intended.
     if ($permalink && ($response = $http->get($permalink, true)) && ($response['status_code'] < 300 || $response['status_code'] > 400)) {
         $html = $response['body'];
         // remove strange things
         $html = str_replace('</[>', '', $html);
         $html = convert_to_utf8($html, $response['headers']);
         if (function_exists('mb_convert_encoding')) {
             $html = mb_convert_encoding($html, 'HTML-ENTITIES', "UTF-8");
         }
         $extract_result = $extractor->process($html, $permalink);
         $content_block = $extract_result ? $extractor->getContent() : null;
     }
     if ($content_block === null) {
         return false;
     }

     $inner_html = (string) $content_block->innerHTML;
     if ($inner_html === '') {
         return false;
     }
     // loadHTML() serves purely as a parse check; warnings about malformed
     // markup are suppressed because only the boolean outcome matters here.
     $doc = new DOMDocument();
     if (!@$doc->loadHTML($inner_html)) {
         return false;
     }
     return $this->readability->dom->importNode($content_block, true);
 }
<?php

// Department page prologue: pulls in the shared helpers and model files,
// loads the requested department and prepares its content extractor.
include_once './generics.php';
include_once './content_extractor.php';
include_once './models/doctors.php';
include_once './models/departments.php';
// $lang is not assigned in this file — presumably provided by one of the
// includes above; verify.
$db = new Database($lang);
// The department comes from the "ID" query parameter and defaults to 1.
// NOTE(review): the raw request value is handed to get_by_id() unchanged —
// confirm that it is validated or bound safely there.
$department_id = array_key_exists('ID', $_GET) ? $_GET['ID'] : 1;
$department = new Department($db);
$department->get_by_id($department_id);
// The content file name is the lower-cased department acronym
// ("acronim" is the property's actual spelling).
$content_extractor = new ContentExtractor("content/{$lang}/departments/" . strtolower($department->acronim) . ".xml");
?>
<!DOCTYPE html>
<html>
<head>
	<meta charset="UTF-8">
	<meta name="viewport" content="width=device-width, initial-scale=1">
	<meta http-equiv="X-UA-Compatible" content="IE=edge">
	<link href='https://fonts.googleapis.com/css?family=Antic+Slab:400,600' rel='stylesheet' type='text/css'>
	<link href='https://fonts.googleapis.com/css?family=Source+Sans+Pro:400,600' rel='stylesheet' type='text/css'>
	<link rel="stylesheet" href="https://maxcdn.bootstrapcdn.com/font-awesome/4.5.0/css/font-awesome.min.css">
	<link rel="stylesheet" type="text/css" href="./css/bootstrap.min.css">
	<link rel="stylesheet" type="text/css" href="./css/layout.css">
	<link rel="stylesheet" type="text/css" href="./css/department.css">
	<?php 
// Emits the department-specific stylesheet link (named after the lower-cased
// acronym) and the page title.
// NOTE(review): both values are written into the markup without HTML
// escaping — confirm they can never contain markup or quotes.
echo "<link rel='stylesheet' type='text/css' href='./css/" . strtolower($department->acronim) . ".css'>\n\t<title> {$department->name}</title>";
?>
</head>
<body>
	<?php 
echo "<div id='header_bg_image' data-image='{$department->image}'>";