/**
  * Generates the Kindle file for this page and returns its contents.
  *
  * Starts from $this->rowContents. When $this->isExtractEnabled is set, the
  * result of $this->encodedContents() is run through ContentExtractor, the
  * extracted node is handed to ImageDownloader (only when
  * $this->isImageEnabled is set), and the HTML produced by
  * ContentsNormalizer replaces the starting contents.
  *
  * The resulting HTML is written through $this->dirBuilder->putContents(),
  * KindleGenCommand is executed for the contents path, and the file located
  * at $this->dirBuilder->getMobiPath() is read back.
  *
  * @return string|false Contents of the generated file, or false when
  *                      file_get_contents() cannot read it.
  */
 public function convertToKindleFile()
 {
     $html = $this->rowContents;
     if ($this->isExtractEnabled) {
         $extractor = new ContentExtractor();
         $extractor->exec($this->encodedContents());
         if ($this->isImageEnabled) {
             // Runs before normalization and receives the same extracted node
             // that the normalizer is given below.
             $imgDownloader = new ImageDownloader($extractor->getExtractedNode(), new Url($this->url), $this->dirBuilder);
             $imgDownloader->exec();
         }
         $normalizer = new ContentsNormalizer($this->url, $extractor->title, $extractor->getExtractedNode());
         $normalizer->exec();
         $html = $normalizer->getHtml();
     }
     // The result of putContents() was never read, so it is no longer captured.
     $this->dirBuilder->putContents($html);
     // Only the base name of the mobi path is passed to the command.
     $mobiFileName = pathinfo($this->dirBuilder->getMobiPath(), PATHINFO_BASENAME);
     $command = KindleGenCommand::newInstance($this->dirBuilder->getContentsPath(), $mobiFileName);
     $command->exec();
     return file_get_contents($this->dirBuilder->getMobiPath());
 }
    /**
     * Debug run of ContentExtractor against the 'dom_document.html' data file.
     *
     * Makes no assertions: every intermediate value is only passed to d().
     */
    public function testDomDocument()
    {
        // The data file is used exactly as loaded; the HTML-ENTITIES conversion
        // that used to be applied at this point is disabled.
        $fixtureHtml = $this->loadDat('dom_document.html');

        $extractor = new ContentExtractor();
        $extractor->exec($fixtureHtml);

        $xpath = $extractor->calculateXpath();
        $text = $extractor->scan($extractor->getExtractedNode());

        // Extraction result.
        d($extractor->getExtractedNode()->nodeName);
        d($text);
        d($xpath);

        // Extractor state.
        d($extractor->params);
        d($extractor->text);
        d($extractor->title);

        // Document-wide counters.
        d('pancutuationCountAll:' . $extractor->pancutuationCountAll);
        d('domCountAll:' . $extractor->domCountAll);
        d('textLengthAll:' . $extractor->textLengthAll);
        d('textAll:' . $extractor->textAll);

        // mb_strlen() probes: one call with an explicit encoding, one without.
        d(mb_strlen('あ', 'utf-8'));
        d(mb_strlen('ほげ
				'));
    }
 /**
  * Checks that the xpath calculated by ContentExtractor is one of the
  * expected candidates for every entry returned by $this->getData().
  *
  * Each key of getData() is used as a file name below
  * PATH_TEST/dat/ContentExtractor; keys that resolve to a directory are
  * skipped. When no candidate matches, the extractor state is dumped with
  * d() and the scanned text is written to ./test2.txt before the assertion
  * fails.
  */
 public function testXpath()
 {
     $datPath = implode('/', [PATH_TEST, 'dat', 'ContentExtractor']);
     foreach ($this->getData() as $name => $testData) {
         // The file name is the key under which the expected xpath is registered.
         $path = implode('/', [$datPath, $name]);
         if (is_dir($path)) {
             continue;
         }
         $html = file_get_contents($path);
         $extractor = new ContentExtractor();
         $extractor->exec($html);
         $xpath = $extractor->calculateXpath();
         $text = $extractor->scan($extractor->getExtractedNode());
         $hit = false;
         foreach ($testData->xpathCandidates as $xpathExpected) {
             if ($xpathExpected === $xpath) {
                 $hit = true;
                 // One matching candidate is enough; skip the remaining ones.
                 break;
             }
         }
         if (!$hit) {
             // Dump everything needed to diagnose the mismatch before failing.
             d($text);
             file_put_contents('./test2.txt', $text);
             d($extractor->getExtractedNode()->nodeName);
             d($xpath);
             d($testData->url);
             d($extractor->params);
             d('pancutuationCountAll:' . $extractor->pancutuationCountAll);
             d('domCountAll:' . $extractor->domCountAll);
             d('textLengthAll:' . $extractor->textLengthAll);
             d('textAll:' . $extractor->textAll);
         }
         // $hit is always a bool, so assertTrue states the intent without a loose comparison.
         $this->assertTrue($hit, $xpath . ' ' . $testData->url);
     }
 }