<?php
// Load clean_ex1.html with Tidy's "clean" option switched on, run the
// clean-and-repair pass over it, then print the resulting document.
$options = array("clean" => true);
$tidy = tidy_parse_file("clean_ex1.html", $options);
tidy_clean_repair($tidy);
echo $tidy;
<?php $tidy = tidy_parse_file("http://www.foodpro.huds.harvard.edu/foodpro/menu_items.asp?date=12-2-2009&type=30&meal=2", array("numeric-entities" => true, "output-xhtml" => true)); $tidy->cleanRepair(); $xhtml = (string) $tidy; $dom = simplexml_load_string($xhtml); $dom->registerXPathNamespace("xhtml", "http://www.w3.org/1999/xhtml"); $trs = $dom->xpath("//xhtml:form[@id='report_form']/xhtml:table/xhtml:tr"); unset($category); foreach ($trs as $tr) { // remember category if ($tr["class"] == "category") { $category = trim((string) $tr->td); } else { if (!isset($category)) { continue; } else { // get item $a = $tr->td->div->span->a; if (!($item = trim($a))) { continue; } // determine recipe if (!preg_match("/recipe=(\\d+)/", $a["href"], $matches)) { continue; } $recipe = $matches[1]; // INSERT INTO into items $sql = sprintf("INSERT IGNORE INTO items (recipe, item) VALUES('%s', '%s')", mysql_real_escape_string($recipe), mysql_real_escape_string($item)); mysql_query($sql); // INSERT INTO legend
<?php /* * dumpit5.php * * a command-line script which dumps the given HTML, PHP, ASP, XHTML, etc. * file as it is represented in the document model. * * NOTE: Only works with tidy for PHP 5+, for tidy in 4.3.x, see dumpit.php * * By: John Coggeshall <*****@*****.**> * * Usage: php dumpit5.php <filename> */ $tidy = tidy_parse_file($_SERVER['argv'][1]); /* Optionally you can do this here if you want to fix up the document */ /* $tidy->cleanRepair(); */ $tree = $tidy->root(); dump_tree($tree); echo "\n"; function node_type($type) { switch ($type) { case TIDY_NODETYPE_ROOT: return "Root Node"; case TIDY_NODETYPE_DOCTYPE: return "DocType Node"; case TIDY_NODETYPE_COMMENT: return "Comment Node"; case TIDY_NODETYPE_PROCINS: return "ProcIns Node";
/**
 * Parse a document with the tidy extension and convert its node tree.
 *
 * Terminates the script via die() when the tidy extension is not loaded or
 * when tidy cannot load the given file.
 *
 * @param string $fn File name handed to tidy_parse_file().
 * @return mixed Whatever $this->tidyTree() produces for the document's root node.
 */
function parseFile($fn)
{
    // Disabled pre-processing steps, kept for reference:
    // Save all '<' symbols
    //$doc = preg_replace("/<(?=[^a-zA-Z\/\!\?\%])/", '<', $doc);
    // Web documents shouldn't contains \x00 symbol
    //$doc = str_replace("\x00", '', $doc);
    // Opera6 bug workaround
    //$doc = str_replace("\xC0\xBC", '<', $doc);
    // UTF-7 encoding ASCII decode
    //$doc = $this->repackUTF7($doc);

    if (!extension_loaded('tidy')) {
        die("Add tidy extension to extension.ini");
    }

    $tree = tidy_parse_file($fn, array(), 'UTF8');

    // tidy_parse_file() yields false when the file cannot be loaded; stop with
    // a clear message instead of failing on ->root() of a non-object.
    if ($tree === false) {
        die("Cannot parse file: " . $fn);
    }

    return $this->tidyTree($tree->root()); // use tidy!!!!
}
<?php
// Parse intro2_ex1.html with Tidy's default settings, repair the document,
// and write out the contents of Tidy's output buffer.
$source = "intro2_ex1.html";
$tidy = tidy_parse_file($source);
tidy_clean_repair($tidy);
$output = tidy_get_output($tidy);
echo $output;
} if (!isset($get[$e])) { continue; } $key = strtolower(basename($url)); $files[$key] = array('url' => $url, 'referer' => $file); } dump_nodes($file, $child); } } $a = scandir($scandir); foreach ($a as $file) { if (!preg_match('/\\.html?/i', $file)) { continue; } $tidy = tidy_parse_file($scandir . '/' . $file); dump_nodes($scandir . $file, $tidy->root()); } $len = strlen($base) + 1; ksort($files); foreach ($files as $key => $value) { $url = $value['url']; $referer = $value['referer']; $file = substr($url, $len); #printf("file=%s url=%s\n", $file, $url); $zip = $dir . '/' . $file; if (!file_exists($zip)) { # fprintf(STDERR, "File not found: %s\n", $zip); continue; } $pathinfo = pathinfo($zip);
<?php /* Parse example.html with Tidy and hand its root node to dump_tree(). */ $tidy = tidy_parse_file("example.html"); /* Optionally you can do this here if you want to fix up the document */ /* $tidy->cleanRepair(); */ $tree = $tidy->root(); dump_tree($tree); echo "\n"; function node_type($type) { switch ($type) { case TIDY_NODETYPE_ROOT: return "Root Node"; case TIDY_NODETYPE_DOCTYPE: return "DocType Node"; case TIDY_NODETYPE_COMMENT: return "Comment Node"; case TIDY_NODETYPE_PROCINS: return "ProcIns Node"; case TIDY_NODETYPE_TEXT: return "Text Node"; case TIDY_NODETYPE_START: return "Start Node"; case TIDY_NODETYPE_END: return "End Node"; case TIDY_NODETYPE_STARTEND: return "Start/End Node"; case TIDY_NODETYPE_CDATA: return "CDATA Node"; case TIDY_NODETYPE_SECTION: return "Section Node";
/**
 * Return the markup of a feed page, downloading it into the temp directory
 * when no local copy exists yet.
 *
 * The remote URL is built from $file exactly as passed in (an empty $file
 * requests the feed root), whereas the local copy of the root is stored as
 * "index.html" inside a directory named after the MD5 of the feed URL.
 * As a side effect, $this->localPath is set to that directory.
 *
 * @param string $file Page path relative to the feed URL; '' for the root page.
 * @return string|null Tidy's XHTML rendering when $this->useTidy is set (as
 *                     reported by the tidy object's value property), otherwise
 *                     the raw file contents; the literal "Couldn't read file"
 *                     when the local copy cannot be read or parsed.
 */
private function getRawHtml($file = '')
{
    // Built before $file is defaulted: "index.html" names the local copy only.
    $url = $this->feedUrl . "/" . $file;

    if ($file == '') {
        $file = "index.html"; // just for local file, not url.
    }

    $path = md5($this->feedUrl);
    $local_file = $path . "/" . $file;
    $this->localPath = e_TEMP . $path . "/";

    if (!is_dir(e_TEMP . $path)) {
        mkdir(e_TEMP . $path, 0755);
    }

    if (!file_exists(e_TEMP . $local_file)) {
        e107::getFile()->getRemoteFile($url, $local_file); // downloads to e107_system/.../temp
    }

    if ($this->useTidy) {
        $options = array("output-xhtml" => true, "clean" => true);
        $parsed = tidy_parse_file(e_TEMP . $local_file, $options);

        // tidy_parse_file() returns false when the file cannot be loaded;
        // report that the same way as the non-tidy branch does.
        if ($parsed === false) {
            return "Couldn't read file";
        }

        return $parsed->value;
    }

    $html = file_get_contents(e_TEMP . $local_file);
    if (!$html) {
        return "Couldn't read file";
    }

    return $html;
}
<?php
// Tidy the 015.html file that sits next to this script, with the
// 'show-body-only' option enabled, then repair it and print the output.
$options = array('show-body-only' => true);
$tidy = tidy_parse_file(__DIR__ . "/015.html", $options);
tidy_clean_repair($tidy);
echo tidy_get_output($tidy);
#!/usr/bin/env php <?php // Web scrape Pilot/FlyingJ truckstop lat/lng for each store (rather than copy/paste 647 times) // Insert latlngs // constants and functions require "../includes/config.php"; $values = []; $ids = array(1, 2, 3, 4, 6, 8, 9, 11, 12, 13, 14, 15, 16, 17, 21, 23, 24, 26, 28, 29, 30, 31, 34, 35, 36, 37, 39, 40, 41, 43, 44, 46, 47, 48, 49, 50, 51, 52, 53, 56, 57, 58, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 71, 72, 73, 75, 76, 77, 79, 81, 82, 87, 88, 89, 90, 91, 92, 94, 95, 96, 97, 114, 118, 130, 131, 133, 134, 137, 140, 141, 144, 145, 146, 147, 149, 150, 151, 152, 154, 156, 157, 159, 163, 164, 165, 167, 168, 171, 174, 179, 180, 190, 192, 195, 196, 198, 199, 200, 206, 208, 209, 210, 211, 213, 219, 222, 224, 226, 231, 232, 233, 234, 236, 237, 238, 239, 240, 242, 243, 245, 247, 249, 252, 253, 254, 255, 256, 257, 258, 259, 260, 261, 265, 266, 267, 268, 270, 271, 274, 275, 278, 279, 280, 281, 282, 284, 285, 286, 287, 289, 290, 293, 294, 296, 297, 298, 299, 300, 301, 302, 303, 304, 305, 306, 307, 308, 309, 310, 311, 312, 313, 316, 317, 318, 319, 321, 322, 324, 326, 328, 329, 330, 331, 332, 335, 336, 337, 338, 339, 340, 341, 343, 346, 347, 348, 350, 351, 352, 353, 354, 356, 358, 360, 362, 363, 365, 366, 367, 368, 369, 370, 372, 373, 374, 375, 377, 378, 380, 381, 384, 385, 386, 387, 388, 390, 391, 392, 393, 394, 396, 398, 399, 402, 403, 404, 405, 406, 407, 408, 409, 411, 412, 413, 415, 416, 417, 420, 421, 422, 424, 425, 428, 429, 430, 431, 432, 433, 434, 435, 436, 437, 438, 439, 440, 441, 442, 443, 444, 445, 446, 447, 448, 449, 453, 454, 455, 457, 458, 459, 460, 467, 468, 469, 470, 471, 472, 473, 474, 475, 476, 477, 478, 481, 482, 483, 485, 486, 488, 489, 490, 491, 492, 493, 494, 495, 496, 497, 498, 500, 503, 504, 505, 506, 507, 508, 509, 510, 511, 512, 513, 514, 515, 516, 517, 518, 519, 520, 522, 523, 524, 525, 526, 528, 529, 530, 531, 532, 533, 534, 535, 536, 537, 538, 539, 540, 541, 542, 543, 544, 546, 547, 549, 550, 551, 553, 554, 555, 556, 557, 559, 
568, 571, 572, 575, 576, 579, 580, 581, 583, 584, 586, 589, 590, 592, 593, 594, 595, 596, 597, 599, 601, 602, 603, 604, 605, 606, 607, 608, 609, 610, 611, 612, 613, 614, 616, 617, 618, 619, 620, 621, 622, 623, 624, 625, 626, 627, 628, 630, 631, 632, 633, 634, 636, 637, 638, 639, 640, 641, 642, 643, 644, 645, 646, 647, 649, 650, 652, 653, 655, 656, 657, 658, 659, 660, 661, 662, 663, 664, 665, 666, 667, 668, 669, 671, 672, 673, 674, 675, 676, 677, 678, 682, 683, 684, 685, 686, 687, 688, 689, 690, 691, 692, 693, 694, 695, 696, 697, 698, 699, 700, 701, 702, 703, 704, 705, 706, 707, 708, 709, 710, 711, 712, 713, 714, 715, 716, 717, 718, 719, 720, 721, 722, 723, 724, 725, 726, 727, 728, 729, 730, 731, 732, 733, 734, 735, 736, 737, 738, 739, 740, 741, 742, 743, 744, 746, 747, 748, 749, 750, 752, 753, 754, 756, 758, 759, 760, 761, 762, 763, 764, 765, 768, 770, 772, 773, 774, 775, 777, 784, 871, 873, 874, 875, 876, 879, 880, 881, 882, 883, 884, 885, 886, 887, 888, 889, 890, 891, 892, 893, 894, 895, 896, 897, 898, 899, 900, 901, 902, 903, 904, 905, 906, 907, 908, 909, 910, 911, 912, 913, 914, 915, 916, 917, 918, 920, 922, 923, 924, 925, 931, 932, 934, 963, 964, 965, 966, 967, 968, 970, 1001, 1002, 1003, 1004, 1005, 1006, 1012, 1013, 1015, 1020, 1021, 1023, 1024, 1025, 1026, 1027, 1028, 1030, 1033, 1043, 1054, 1058, 4619, 4622, 4642, 4649, 4651, 4656, 8601, 8604, 8605, 8616, 8621, 8624, 8625, 8628, 8630, 8658); foreach ($ids as $id) { echo " \n "; // parse source if (!($tidy = tidy_parse_file("http://www.pilotflyingj.com/view-location?id={$id}", array("numeric-entities" => true, "output-xhtml" => true)))) { continue; } // convert to XHTML $tidy->cleanRepair(); $xhtml = (string) $tidy; // parse XHTML $dom = simplexml_load_string($xhtml); // register XHTML namespace $dom->registerXPathNamespace("xhtml", "http://www.w3.org/1999/xhtml"); // get store, lat/lng, and diesel price from paragraphs $paras = 
$dom->xpath("//xhtml:div[@id='indiv-location-store-info'][1]//xhtml:p[position()=1 or position()=6]"); foreach ($paras as $para) { $strong = trim((string) $para[0]->strong); if ($strong === "Store Number:") { $store_number = trim((string) $para); } if ($strong === "Coordinates:") { $coords = trim((string) $para); }
<?php
// Parse 005.html from this script's directory and print Tidy's output
// directly, without an explicit clean/repair call.
$path = __DIR__ . "/005.html";
$a = tidy_parse_file($path);
echo tidy_get_output($a);
<?php
/* Parse a new document */
$tidy = tidy_parse_file("http://www.coggeshall.org/");

/* Clean and repair the document. The tidy class names this method
   cleanRepair(); clean_repair() is not a method of the class. */
$tidy->cleanRepair();

/* Output the results */
echo $tidy;
<?php
// Tidy 016.html using the configuration file 016.tcfg; both are looked up
// in the directory of this script. The repaired output is then printed.
$dir = __DIR__;
$tidy = tidy_parse_file($dir . "/016.html", $dir . "/016.tcfg");
tidy_clean_repair($tidy);
echo tidy_get_output($tidy);
<?php
// Every option listed here is passed to Tidy with the value true.
$switches = array(
    "clean",
    "drop-proprietary-attributes",
    "drop-font-tags",
    "drop-empty-paras",
    "hide-comments",
    "join-classes",
    "join-styles",
);
$opts = array_fill_keys($switches, true);

// Parse php.html with those options, repair it, and print the result.
$tidy = tidy_parse_file("php.html", $opts);
tidy_clean_repair($tidy);
echo $tidy;
<?php
// Fetch http://www.php.net/ through Tidy with the 'output-xhtml' option set,
// run the clean-and-repair pass, and print the document.
$config = array('output-xhtml' => true);
$tidy = tidy_parse_file("http://www.php.net/", $config);
$tidy->cleanRepair();
echo $tidy;
<?php
/* Two documents: one read from a file, one built from an inline string */
$tidy1 = tidy_parse_file("myfile.html");
$tidy2 = tidy_parse_string("<HTML><B>Hello!</B>");

/* Run the clean-and-repair pass over each of them, file document first */
foreach (array($tidy1, $tidy2) as $doc) {
    tidy_clean_repair($doc);
}

/* Error buffer of the file document */
$errors = tidy_get_error_buffer($tidy1);

/* Output of the string document */
$output = tidy_get_output($tidy2);