/**
 * Convert an HTML document into XHTML-style markup.
 *
 * The input is pushed through a fixed sequence of clean-up passes; the order
 * of the passes is significant and must not be changed. Helpers whose result
 * is not assigned back are called purely for their effect on $html
 * (presumably they take it by reference -- confirm against their definitions).
 *
 * @param mixed $html Markup to convert (presumably a string; the helper
 *                    signatures are not visible from here).
 * @return mixed The markup after all passes, with the extracted styles
 *               re-inserted by insert_styles().
 */
function html2xhtml($html)
{
    // Elements that are run through close_tag(), in processing order.
    $tagsToClose = [
        "area", "base", "basefont", "br", "col", "embed", "frame",
        "hr", "img", "input", "isindex", "link", "meta", "param",
    ];

    // Attributes that are run through make_attr_value(), in processing order.
    $attrsToExpand = [
        "checked", "compact", "declare", "defer", "disabled", "ismap",
        "multiple", "nohref", "noresize", "noshade", "nowrap", "readonly",
        "selected",
    ];

    process_pagebreak_commands($html);

    // SCRIPT tags go first: their content may confuse the HTML-parsing
    // utilities that run further down.
    $html = process_script($html);

    // STYLE tags are removed for the same reason. They are kept in a
    // temporary variable and added back to the HEAD section at the end.
    $styles = process_style($html);

    // Turn HTML character references into their Unicode analogues.
    process_character_references($html);

    remove_comments($html);
    fix_attrs_spaces($html);

    $html = quote_attrs($html);
    $html = escape_attrs_entities($html);

    $html = lowercase_tags($html);
    $html = lowercase_closing_tags($html);
    $html = fix_closing_tags($html);

    foreach ($tagsToClose as $tagName) {
        $html = close_tag($tagName, $html);
    }

    foreach ($attrsToExpand as $attrName) {
        $html = make_attr_value($attrName, $html);
    }

    $html = process_html($html);
    $html = process_body($html);
    $html = process_head($html);
    $html = process_p($html);

    $html = escape_amp($html);
    $html = escape_lt($html);
    $html = escape_gt($html);
    $html = escape_textarea_content($html);

    // Structural passes; each one receives the literal 0 as second argument.
    process_tables($html, 0);
    process_lists($html, 0);
    process_deflists($html, 0);
    process_selects($html, 0);

    $html = fix_tags($html);
    $html = fix_attrs($html);

    // Put the styles extracted near the top back into the document.
    return insert_styles($html, $styles);
}
// Entry script: reads the `uri` query parameter, checks that it points at an
// Academia.edu page, downloads that page with cURL and hands the body to
// process_html(); failures are reported as an XML <response><error> element
// written to php://output.

// A missing or non-string parameter (e.g. ?uri[]=x) is treated as an empty
// string so it falls through to the regular "does not validate" response
// instead of raising a notice or passing an array to preg_match().
$uri = (isset($_GET['uri']) && is_string($_GET['uri'])) ? $_GET['uri'] : '';

//develop XML serialization
$writer = new XMLWriter();
$writer->openURI('php://output');
$writer->startDocument('1.0', 'UTF-8');
$writer->setIndent(true);
$writer->setIndentString(" ");

//validate URI
// The pattern is anchored at the start and both dots are escaped, so the
// request can only go to https://<letters>.academia.edu/<letters>... .
// Unanchored, any URL that merely contained such a substring (for example in
// its query string) would have been accepted and fetched.
if (preg_match('#^https://[a-z]+\.academia\.edu/[A-Za-z]+#', $uri)) {
    //initiate curl
    $ch = curl_init();
    curl_setopt($ch, CURLOPT_URL, $uri);
    curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);

    // Perform the request exactly once and test the response that is actually
    // passed on; calling curl_exec() again in the condition would repeat the
    // HTTP request and check a different response.
    $output = curl_exec($ch);
    if ($output !== false) {
        process_html($output, $writer);
    } else {
        $writer->startElement('response');
        $writer->writeElement('error', 'Unable to retrieve data from Academia.edu URI.');
        $writer->endElement();
    }
    curl_close($ch);
} else {
    $writer->startElement('response');
    $writer->writeElement('error', 'URI does not validate.');
    $writer->endElement();
}

function process_html($output, $writer) {
    //get creator metadata
    preg_match('/c\\.User\\.set_viewed\\((.*)\\);\\n/', $output, $matches);