/**
 * Run the addon's extract.tpl XSLT stylesheet over a retrieved resource and
 * store the result, converted to BBCode, as the body of the item.
 *
 * The stylesheet is rendered from the retriever's include/exclude rules plus
 * three URLs derived from the item permalink (page, directory and site root).
 * On success a "Retrieved <date>: [url=...]" footer is appended to the body and
 * the `item` row is updated.
 *
 * Nothing is changed (neither $item nor the database) when there is no
 * include rule, no resource data, the document or stylesheet cannot be
 * loaded, the transformation fails, or the result is empty.
 *
 * @param array $retriever Retriever record; reads ['id'] and
 *                         ['data']['include'] / ['data']['exclude'].
 * @param array $item      Item record, passed by reference; reads ['id'] and
 *                         ['plink'], writes ['body'] on success only.
 * @param array $resource  Retrieved resource; reads ['data'] and ['type'].
 *
 * @return void
 */
function retriever_apply_dom_filter($retriever, &$item, $resource) {
    logger('retriever_apply_dom_filter: applying XSLT to ' . $item['id'] . ' ' . $item['plink'], LOGGER_DEBUG);
    require_once 'include/html2bbcode.php';

    if (empty($retriever['data']['include'])) {
        return;
    }
    if (empty($resource['data'])) {
        logger('retriever_apply_dom_filter: no text to work with', LOGGER_NORMAL);
        return;
    }

    // NOTE(review): the detected encoding is only logged here, it is not
    // applied to the data before parsing - confirm whether that is intended.
    $encoding = retriever_get_encoding($resource);
    logger('@@@ item type ' . $resource['type'] . ' encoding ' . $encoding);

    $doc = new DOMDocument('1.0', 'utf-8');
    if (strpos($resource['type'], 'html') !== false) {
        // Warnings are suppressed for the HTML path only, as in the original.
        $loaded = @$doc->loadHTML($resource['data']);
    } else {
        $loaded = $doc->loadXML($resource['data']);
    }
    if (!$loaded) {
        logger('retriever_apply_dom_filter: unable to parse resource for item ' . $item['id'], LOGGER_NORMAL);
        return;
    }
    logger('@@@ actual encoding of document is ' . $doc->encoding);

    // Derive the site root and the directory of the page from the permalink.
    // parse_url() may return false or omit components, so read defensively.
    $components = parse_url($item['plink']);
    if (!is_array($components)) {
        $components = array();
    }
    $scheme = isset($components['scheme']) ? $components['scheme'] : '';
    $host = isset($components['host']) ? $components['host'] : '';
    $path = isset($components['path']) ? $components['path'] : '';

    $rooturl = $scheme . "://" . $host;
    if (isset($components['port'])) {
        // Keep an explicit port, otherwise the derived URLs point at the wrong server.
        $rooturl .= ':' . $components['port'];
    }

    // The base directory is everything up to and including the last slash of
    // the path. dirname() is not used because it yields "//" for top-level
    // pages and drops the last segment of paths that end in a slash.
    $lastslash = strrpos($path, '/');
    $dirpath = ($lastslash === false) ? '/' : substr($path, 0, $lastslash + 1);
    $dirurl = $rooturl . $dirpath;

    $params = array(
        '$include' => retriever_construct_xpath($retriever['data']['include']),
        '$exclude' => retriever_construct_xpath($retriever['data']['exclude']),
        '$pageurl' => $item['plink'],
        '$dirurl' => $dirurl,
        '$rooturl' => $rooturl
    );
    $extracter_template = get_markup_template('extract.tpl', 'addon/retriever/');
    $xslt = replace_macros($extracter_template, $params);

    $xmldoc = new DOMDocument();
    if (!is_string($xslt) || $xslt === '' || !$xmldoc->loadXML($xslt)) {
        logger('retriever_apply_dom_filter: unable to load extract XSLT template', LOGGER_NORMAL);
        return;
    }

    $xp = new XsltProcessor();
    if (!$xp->importStylesheet($xmldoc)) {
        logger('retriever_apply_dom_filter: unable to import extract XSLT template', LOGGER_NORMAL);
        return;
    }

    // transformToXML() returns false on error; do not hand that to html2bbcode().
    $transformed = $xp->transformToXML($doc);
    if (!is_string($transformed)) {
        logger('retriever_apply_dom_filter: failed to apply extract XSLT template', LOGGER_NORMAL);
        return;
    }

    // Build the new body in a local so the caller's item keeps its existing
    // body when the extraction produced nothing.
    $body = html2bbcode($transformed);
    if (!strlen($body)) {
        logger('retriever_apply_dom_filter retriever ' . $retriever['id'] . ' item ' . $item['id'] . ': output was empty', LOGGER_NORMAL);
        return;
    }

    $body .= "\n\n" . t('Retrieved') . ' ' . date("Y-m-d") . ': [url=';
    $body .= $item['plink'];
    $body .= ']' . $item['plink'] . '[/url]';
    $item['body'] = $body;

    q("UPDATE `item` SET `body` = '%s' WHERE `id` = %d", dbesc($item['body']), intval($item['id']));
}
/**
 * Filter a retrieved resource through the retriever's XSLT stylesheets and
 * store the result, converted to BBCode, as the body of the item.
 *
 * Up to three stylesheets are applied in order via retriever_apply_xslt_text():
 *   1. extract.tpl rendered with the retriever spec (only if an include rule is set),
 *   2. the retriever's custom XSLT (only if one is set),
 *   3. fix-urls.tpl rendered with the directory and root URL of the resource.
 * On success a "Retrieved <date>: [url=...]" footer is appended to the body and
 * the `item` row is updated.
 *
 * Nothing is changed (neither $item nor the database) when there is neither an
 * include rule nor a custom XSLT, no resource data, the document cannot be
 * loaded, any stylesheet step fails, or the result is empty.
 *
 * @param array $retriever Retriever record; reads ['id'] and ['data']
 *                         (notably ['include'] and ['customxslt']).
 * @param array $item      Item record, passed by reference; reads ['id'],
 *                         ['uri'], ['contact-id'] and ['plink'], writes
 *                         ['body'] on success only.
 * @param array $resource  Retrieved resource; reads ['data'], ['type'] and
 *                         ['redirect-url'].
 *
 * @return void
 */
function retriever_apply_dom_filter($retriever, &$item, $resource) {
    logger('retriever_apply_dom_filter: applying XSLT to ' . $item['id'] . ' ' . $item['uri'] . ' contact ' . $item['contact-id'], LOGGER_DEBUG);

    $has_include = !empty($retriever['data']['include']);
    $has_custom = !empty($retriever['data']['customxslt']);
    if (!$has_include && !$has_custom) {
        return;
    }
    if (empty($resource['data'])) {
        logger('retriever_apply_dom_filter: no text to work with', LOGGER_NORMAL);
        return;
    }

    // Turn non-ASCII characters into entities so the parser does not have to
    // guess the encoding.
    // NOTE(review): for non-HTML resources this may produce named entities
    // that plain XML does not define - confirm against real XML feeds.
    $encoding = retriever_get_encoding($resource);
    $content = mb_convert_encoding($resource['data'], 'HTML-ENTITIES', $encoding);
    if (!is_string($content) || $content === '') {
        logger('retriever_apply_dom_filter: unable to convert resource from encoding ' . $encoding, LOGGER_NORMAL);
        return;
    }

    $doc = new DOMDocument('1.0', 'UTF-8');
    if (strpos($resource['type'], 'html') !== false) {
        // Warnings are suppressed for the HTML path only, as in the original.
        $loaded = @$doc->loadHTML($content);
    } else {
        $loaded = $doc->loadXML($content);
    }
    if (!$loaded) {
        logger('retriever_apply_dom_filter: unable to parse resource for item ' . $item['id'], LOGGER_NORMAL);
        return;
    }

    // Check the result after every step: a failed step must not be fed into
    // the next stylesheet, and the log should name the step that failed.
    if ($has_include) {
        $extract_template = get_markup_template('extract.tpl', 'addon/retriever/');
        $extract_xslt = replace_macros($extract_template, array('$spec' => $retriever['data']));
        $doc = retriever_apply_xslt_text($extract_xslt, $doc);
        if (!$doc) {
            logger('retriever_apply_dom_filter: failed to apply extract XSLT template', LOGGER_NORMAL);
            return;
        }
    }
    if ($has_custom) {
        $doc = retriever_apply_xslt_text($retriever['data']['customxslt'], $doc);
        if (!$doc) {
            logger('retriever_apply_dom_filter: failed to apply custom XSLT template', LOGGER_NORMAL);
            return;
        }
    }

    // Derive the site root and page directory from the URL the resource was
    // actually fetched from; fall back to the permalink if none was recorded.
    $baseurl = !empty($resource['redirect-url']) ? $resource['redirect-url'] : $item['plink'];
    $components = parse_url($baseurl);
    if (!is_array($components)) {
        $components = array();
    }
    $scheme = isset($components['scheme']) ? $components['scheme'] : '';
    $host = isset($components['host']) ? $components['host'] : '';
    $path = isset($components['path']) ? $components['path'] : '';

    $rooturl = $scheme . "://" . $host;
    if (isset($components['port'])) {
        // Keep an explicit port, otherwise the fixed URLs point at the wrong server.
        $rooturl .= ':' . $components['port'];
    }

    // The base directory is everything up to and including the last slash of
    // the path. dirname() is not used because it yields "//" for top-level
    // pages and drops the last segment of paths that end in a slash.
    $lastslash = strrpos($path, '/');
    $dirpath = ($lastslash === false) ? '/' : substr($path, 0, $lastslash + 1);
    $dirurl = $rooturl . $dirpath;

    $params = array('$dirurl' => $dirurl, '$rooturl' => $rooturl);
    $fix_urls_template = get_markup_template('fix-urls.tpl', 'addon/retriever/');
    $fix_urls_xslt = replace_macros($fix_urls_template, $params);
    $doc = retriever_apply_xslt_text($fix_urls_xslt, $doc);
    if (!$doc) {
        logger('retriever_apply_dom_filter: failed to apply fix urls XSLT template', LOGGER_NORMAL);
        return;
    }

    // Build the new body in a local so the caller's item keeps its existing
    // body when the filters produced nothing.
    $body = html2bbcode($doc->saveXML());
    if (!strlen($body)) {
        logger('retriever_apply_dom_filter retriever ' . $retriever['id'] . ' item ' . $item['id'] . ': output was empty', LOGGER_NORMAL);
        return;
    }

    $body .= "\n\n" . t('Retrieved') . ' ' . date("Y-m-d") . ': [url=';
    $body .= $item['plink'];
    $body .= ']' . $item['plink'] . '[/url]';
    $item['body'] = $body;

    q("UPDATE `item` SET `body` = '%s' WHERE `id` = %d", dbesc($item['body']), intval($item['id']));
}