/**
 * Splits a document on "<story" markers and builds one item per story.
 *
 * Each returned item is an array with the keys "type" (always "xml_story"),
 * "title" (entity-decoded, control characters replaced by spaces), "url"
 * (the result of get_redirected_url() on the story's <url> value) and
 * "timestamp" (time() at the moment the item is built).
 *
 * Stories whose title cannot be extracted, or whose URL cannot be resolved,
 * are skipped.
 *
 * @param string $html Raw document text containing "<story" elements.
 * @return array List of item arrays; empty when no usable story is found.
 */
function parse_xml($html)
{
    $stories = explode("<story", $html);
    array_shift($stories); # discard everything before the first story marker

    $items = array();
    foreach ($stories as $story) {
        $title = extract_raw_tag($story, "title");
        if ($title === False) {
            # Reject here, before any cleaning: str_replace() always returns a
            # string, so testing the cleaned title against False (as the code
            # used to do) could never succeed. Bailing out early also avoids
            # the network lookup for a story that would be dropped anyway.
            continue;
        }
        # Decoded twice on purpose (kept from the original).
        # NOTE(review): presumably handles double-encoded entities - confirm.
        $title = html_decode($title);
        $title = html_decode($title);
        $title = replace_ctrl_chars($title, " ");
        # NOTE(review): search and replacement look identical here; this may
        # once have collapsed double spaces - confirm against history.
        $title = str_replace(" ", " ", $title);

        # NOTE(review): as written this replaces "&" with "&" (a no-op); it may
        # have been meant to decode an ampersand entity - confirm.
        $url = str_replace("&", "&", strip_ctrl_chars(extract_raw_tag($story, "url")));
        term_echo("*** raw story url: " . $url);

        $resolved = get_redirected_url($url);
        if ($resolved === False) {
            continue;
        }

        $items[] = array(
            "type" => "xml_story",
            "title" => $title,
            "url" => $resolved,
            "timestamp" => time(),
        );
    }
    return $items;
}
# TODO: INCORPORATE HYPERLINK IN SUMMARY TEXT
# TODO: USE #rss-bot FOR MASS TEST INPUTS BUT DON'T ACTUALLY SUBMIT TO SITE

require_once "lib.php";

# Positional command-line arguments. $trailing is treated as the URL to
# submit and $alias selects the mode; $dest and $nick are not used in the
# statements visible here.
$trailing = $argv[1];
$dest = $argv[2];
$nick = $argv[3];
$alias = $argv[4];

# Advert mode: announce the command in #soylent and stop.
if ($alias == "~submit-advert") {
    pm("#soylent", "*** to try automagically submitting a story to SoylentNews: ~submit <url>");
    return;
}

# No argument given: show usage and stop.
if ($trailing == "") {
    privmsg("usage: ~submit <url>");
    return;
}

# Follow redirects first so the download targets the final location.
$url = get_redirected_url($trailing);
if ($url === false) {
    privmsg("error: unable to download source (get_redirected_url)");
    return;
}

# Initialised before the call and read afterwards by wget().
# NOTE(review): presumably get_host_and_uri() fills these by reference - confirm in lib.php.
$port = 80;
$uri = "";
$host = "";
if (!get_host_and_uri($url, $host, $uri, $port)) {
    privmsg("error: unable to download source (get_host_and_uri)");
    return;
}

# Download the page, strip the headers and log its <title>.
$response = wget($host, $uri, $port);
$source_html = strip_headers($response);
$source_title = extract_raw_tag($source_html, "title");
term_echo($source_title);
/**
 * Resolves a redirect target against the URL that issued it.
 *
 * Targets that already carry a scheme are returned untouched. Otherwise the
 * scheme, host and (if present) port of $base_url are used:
 *   "//host/x"  keeps its own host and inherits the scheme,
 *   "/x"        replaces the whole path,
 *   "?q" / "#f" are appended to the base path,
 *   "x"         is placed in the directory of the base path.
 * If $base_url has no host there is nothing to resolve against and $location
 * is returned unchanged.
 *
 * @param string $base_url Absolute URL the redirect came from.
 * @param string $location Redirect target, absolute or relative.
 * @return string Resolved URL.
 */
function get_redirected_url_resolve($base_url, $location)
{
    if ($location == "") {
        return $location;
    }
    $target = parse_url($location);
    if (is_array($target) == True and isset($target["scheme"]) == True) {
        return $location;
    }
    $base = parse_url($base_url);
    if (is_array($base) == False or isset($base["host"]) == False) {
        return $location;
    }
    $scheme = "http";
    if (isset($base["scheme"]) == True) {
        $scheme = $base["scheme"];
    }
    if (substr($location, 0, 2) == "//") {
        return $scheme . ":" . $location;
    }
    $origin = $scheme . "://" . $base["host"];
    if (isset($base["port"]) == True) {
        $origin = $origin . ":" . $base["port"];
    }
    if ($location[0] == "/") {
        return $origin . $location;
    }
    $base_path = "/";
    if (isset($base["path"]) == True and $base["path"] != "") {
        $base_path = $base["path"];
    }
    if ($location[0] == "?" or $location[0] == "#") {
        return $origin . $base_path . $location;
    }
    $slash = strrpos($base_path, "/");
    $dir = "/";
    if ($slash !== False) {
        $dir = substr($base_path, 0, $slash + 1);
    }
    return $origin . $dir . $location;
}

/**
 * Follows HTTP "Location" redirects starting at $from_url.
 *
 * The function requests the URL with wget(), reads the "location" response
 * header and recurses until there is no further redirect, the same Location
 * value is seen twice in a row, or ten URLs have been visited. Redirects
 * announced through an HTML meta refresh tag are not followed.
 *
 * @param string       $from_url URL to start from.
 * @param array|string $url_list URLs visited so far (internal; "" on the first call).
 * @param string       $last_loc Location header of the previous hop (internal).
 * @param array|string $cookies  Pass an array (e.g. array()) to collect and replay
 *                               cookies per host; leave as "" to ignore cookies.
 * @return string|array|false The final URL when $cookies is not an array; an array
 *                            with the keys "url", "cookies" and "extra_headers" when
 *                            it is; False when the URL is empty, has no resolvable
 *                            host, or has already been visited twice.
 */
function get_redirected_url($from_url, $url_list = "", $last_loc = "", $cookies = "")
{
    $url = trim($from_url);
    if ($url == "") {
        term_echo("get_redirected_url: empty url");
        return False;
    }

    $comp = parse_url($url);
    if (isset($comp["host"]) == False and is_array($url_list) == True and count($url_list) > 0) {
        # Host-less URL: resolve it against the most recently visited URL, then
        # re-parse so the path, scheme and port below come from the resolved URL.
        # (Previously the stale components were reused, which lost the https
        # port and produced request URIs without a leading slash.)
        $url = get_redirected_url_resolve($url_list[count($url_list) - 1], $url);
        $comp = parse_url($url);
    }
    $host = "";
    if (isset($comp["host"]) == True) {
        $host = $comp["host"];
    }
    if ($host == "") {
        term_echo("get_redirected_url: redirect without host: " . $url);
        return False;
    }

    # Build the request URI from path, query and fragment.
    $uri = "/";
    if (isset($comp["path"]) == True) {
        $uri = $comp["path"];
    }
    if (isset($comp["query"]) == True and $comp["query"] != "") {
        $uri = $uri . "?" . $comp["query"];
    }
    if (isset($comp["fragment"]) == True and $comp["fragment"] != "") {
        $uri = $uri . "#" . $comp["fragment"];
    }

    # Only the scheme selects the port; an explicit port in the URL is not used.
    $port = 80;
    if (isset($comp["scheme"]) == True and $comp["scheme"] == "https") {
        $port = 443;
    }
    if ($uri == "") {
        term_echo("get_redirected_url: empty host or uri");
        return False;
    }

    # Replay cookies previously collected for this host.
    $extra_headers = "";
    if (isset($cookies[$host]) == True) {
        $cookie_strings = array();
        foreach ($cookies[$host] as $key => $value) {
            $cookie_strings[] = $key . "=" . $value;
        }
        $extra_headers = array();
        $extra_headers["Cookie"] = implode("; ", $cookie_strings);
    }

    # PHP source handed to wget() as a stop condition: more than 10000 bytes
    # read, or the response currently ends with "</head>".
    # NOTE(review): presumably evaluated inside wget() - confirm in lib.php.
    $breakcode = "return ((strlen(\$response)>10000) or (substr(\$response,strlen(\$response)-7)==\"</head>\"));";
    $response = wget($host, $uri, $port, ICEWEASEL_UA, $extra_headers, 10, $breakcode);

    # Remember cookies set by this response (only when the caller asked for it).
    if (is_array($cookies) == True) {
        $new_cookies = exec_get_cookies($response);
        for ($i = 0; $i < count($new_cookies); $i++) {
            $parts = explode("; ", $new_cookies[$i]);
            # Split on the first "=" only, so values may contain "=" themselves.
            $keyval = explode("=", $parts[0], 2);
            if (count($keyval) == 2) {
                $cookies[$host][$keyval[0]] = $keyval[1];
            }
        }
    }

    # Value returned whenever this URL turns out to be the final one.
    $result = $url;
    if (is_array($cookies) == True) {
        $result = array("url" => $url, "cookies" => $cookies, "extra_headers" => $extra_headers);
    }

    $loc_header = trim(exec_get_header($response, "location", False));
    $location = $loc_header;
    if ($location == "" or $location == $last_loc) {
        return $result;
    }

    # Resolve relative targets against scheme and host of the current URL.
    # (Previously "/path" was appended to the full current URL, path included.)
    $location = get_redirected_url_resolve($url, $location);

    $list = array();
    if (is_array($url_list) == True) {
        # Loop guard: count earlier visits of the current URL. (Previously each
        # entry was compared with the list itself, so the guard never fired.)
        $visits = count(array_keys($url_list, $url, True));
        if ($visits > 1) {
            term_echo("get_redirected_url: redirected url already been visited twice");
            return False;
        }
        $list = $url_list;
    }
    $list[] = $url;

    # Hop limit: stop following after ten visited URLs.
    if (count($list) >= 10) {
        return $result;
    }
    return get_redirected_url($location, $list, $loc_header, $cookies);
}
/**
 * Downloads the page at $url, derives a title and body text from it and posts
 * them as a story submission to dev.soylentnews.org.
 *
 * Progress and errors are reported through privmsg().
 *
 * @param string $url Address of the article to submit.
 * @return bool True when the site's response contains the expected
 *              confirmation text, False on any failure.
 */
function sn_submit($url)
{
    if ($url == "") {
        return False;
    }
    $url = get_redirected_url($url);
    if ($url === False) {
        privmsg("error: unable to download source (get_redirected_url)");
        return False;
    }
    $host = "";
    $uri = "";
    $port = 80;
    if (get_host_and_uri($url, $host, $uri, $port) == False) {
        privmsg("error: unable to download source (get_host_and_uri)");
        return False;
    }
    $response = wget($host, $uri, $port);
    # Check the download itself. The original repeated the get_host_and_uri()
    # test here, so a failed download was never reported as "(wget)".
    # NOTE(review): assumes wget() yields False or an empty string on failure - confirm in lib.php.
    if ($response === False or $response == "") {
        privmsg("error: unable to download source (wget)");
        return False;
    }
    $source_html = strip_headers($response);

    # --- Title: keep the part before the first site-name style delimiter. ---
    $source_title = extract_raw_tag($source_html, "title");
    if ($source_title !== False) {
        # Only scan a title that was actually extracted.
        $delimiters = array("--", "|", " - ", " : ", " — ", " • ");
        foreach ($delimiters as $delimiter) {
            $cut = strpos($source_title, $delimiter);
            if ($cut !== False) {
                $source_title = trim(substr($source_title, 0, $cut));
            }
        }
    }
    if ($source_title === False or $source_title == "") {
        privmsg("error: title not found or empty");
        return False;
    }
    $source_title = html_decode($source_title);
    $source_title = html_decode($source_title);

    # --- Description: required; "description" first, then "og:description". ---
    $description = extract_meta_content($source_html, "description");
    if ($description === False or $description == "") {
        $description = extract_meta_content($source_html, "og:description", "property");
        if ($description === False or $description == "") {
            privmsg("error: description meta content not found or empty");
            return False;
        }
    }

    # --- Body: collect qualifying <p> paragraphs, preferring <article>. ---
    $html = $source_html;
    $article = extract_raw_tag($html, "article");
    if ($article !== False) {
        $html = $article;
    }
    # Return values are ignored here.
    # NOTE(review): presumably strip_all_tag() edits $html by reference - confirm.
    strip_all_tag($html, "head");
    strip_all_tag($html, "script");
    strip_all_tag($html, "style");
    strip_all_tag($html, "strong");
    $html = strip_tags($html, "<p>");
    $html = lowercase_tags($html);

    $host_parts = explode(".", $host); # loop-invariant, so computed once
    $paragraphs = array();
    foreach (explode("<p", $html) as $chunk) {
        # Drop the remainder of the opening tag (everything up to the first ">").
        $pieces = explode(">", $chunk, 2);
        if (count($pieces) == 2) {
            $chunk = $pieces[1];
        }
        $chunk = strip_tags($chunk);
        $chunk = clean_text($chunk);

        # Skip paragraphs mentioning any host label longer than three characters.
        foreach ($host_parts as $label) {
            if (strlen($label) > 3 and strpos(strtolower($chunk), strtolower($label)) !== False) {
                continue 2;
            }
        }
        # NOTE(review): presumably filter() keeps only the listed characters,
        # so this skips paragraphs containing digits - confirm in lib.php.
        if (filter($chunk, "0123456789") != "") {
            continue;
        }
        # Keep only text ending in a full stop. The original followed this with
        # a loop trimming back to the last "."; after this test that loop could
        # never remove anything, so it has been dropped.
        if (strlen($chunk) > 1 and $chunk[strlen($chunk) - 1] != ".") {
            continue;
        }
        if (strlen($chunk) > 100) {
            $paragraphs[] = $chunk;
        }
    }
    $source_body = implode("\n\n", $paragraphs);
    $source_body = html_decode($source_body);
    $source_body = html_decode($source_body);
    if (trim($source_body) == "") {
        # No paragraph survived the filters: fall back to the meta description,
        # which the original fetched and validated but then discarded.
        $source_body = html_decode(html_decode($description));
    }

    # --- Submission: fetch the form for its reskey, then post the story. ---
    $submit_host = "dev.soylentnews.org";
    $submit_port = 443;
    $submit_uri = "/submit.pl";
    $response = wget($submit_host, $submit_uri, $submit_port, ICEWEASEL_UA);
    $html = strip_headers($response);
    $reskey = extract_text($html, "<input type=\"hidden\" id=\"reskey\" name=\"reskey\" value=\"", "\">");
    if ($reskey === False) {
        privmsg("error: unable to extract reskey");
        return False;
    }
    # NOTE(review): presumably the site rejects posts made too soon after the
    # reskey is issued - confirm before shortening this delay.
    sleep(25);

    $params = array();
    $params["reskey"] = $reskey;
    $params["name"] = get_bot_nick();
    $params["email"] = "";
    $params["subj"] = trim(substr($source_title, 0, 100));
    $params["primaryskid"] = "1";
    $params["tid"] = "6";
    $params["sub_type"] = "plain";
    $params["story"] = $source_body . "\n\n" . $url . "\n\n-- submitted from IRC";
    $params["op"] = "SubmitStory";
    $response = wpost($submit_host, $submit_uri, $submit_port, ICEWEASEL_UA, $params);

    # Reduce the reply to plain text and look for the confirmation sentence.
    $html = strip_headers($response);
    strip_all_tag($html, "head");
    strip_all_tag($html, "script");
    strip_all_tag($html, "style");
    strip_all_tag($html, "a");
    $html = strip_tags($html);
    $html = clean_text($html);
    if (strpos($html, "Perhaps you would like to enter an email address or a URL next time. Thanks for the submission.") !== False) {
        privmsg("submission successful - https://{$submit_host}/submit.pl?op=list");
        return True;
    }
    privmsg("error: something went wrong with your submission");
    return False;
}
/**
 * Announces the page titles of the links found in a message.
 *
 * All "http://" links are collected first, then all "https://" links; a link
 * ends at the next space. At most the first four collected links are looked
 * up. Each title is sent to $channel via pm() as one line of a small tree.
 *
 * @param string $trailing Message text to scan for links.
 * @param string $channel  Destination passed to pm().
 * @param mixed  $show_rd  When loosely equal to True, the redirected URL is
 *                         appended if it differs from the link as written.
 * @return void
 */
function title_privmsg($trailing, $channel, $show_rd)
{
    # Collect links scheme by scheme: every piece after an occurrence of the
    # scheme contributes its first space-delimited token.
    $links = array();
    foreach (array("http://", "https://") as $scheme) {
        $pieces = explode($scheme, $trailing);
        array_shift($pieces);
        foreach ($pieces as $piece) {
            $tokens = explode(" ", $piece);
            $links[] = $scheme . $tokens[0];
        }
    }

    $colour = chr(3);
    $out = array();
    foreach (array_slice($links, 0, 4) as $link) {
        $redirect_data = get_redirected_url($link, "", "", array());
        if ($redirect_data === False) {
            continue;
        }
        $rd_url = $redirect_data["url"];
        # INCORPORATED THE FOLLOWING CONDITION TO ACCOMMODATE ohmibod YOUTUBE TITLES
        if (strpos($rd_url, "youtube") !== False and $channel == "##anime-japanese") {
            continue;
        }
        $raw = get_raw_title($redirect_data);
        if ($raw === False) {
            term_echo("title: get_raw_title returned false");
            continue;
        }
        $def = translate("auto", "en", $raw);
        $msg = $colour . "13" . $raw . $colour;
        # Show the translation only when it is non-empty and differs from the title.
        if ($def != $raw and $def != "") {
            $msg = $msg . " [" . $colour . "04" . $def . $colour . "]";
        }
        if ($rd_url != $link and $show_rd == True) {
            $msg = $msg . " - " . $colour . "03" . $rd_url;
        }
        $out[] = $msg;
    }

    $last = count($out) - 1;
    if ($last < 0) {
        term_echo("title: no titles to output");
    }
    # The final line gets the closing branch glyph, all others the tee glyph.
    foreach ($out as $index => $line) {
        $prefix = ($index == $last) ? "└─ " : "├─ ";
        pm($channel, $prefix . $line);
    }
}
} } elseif (strtolower($trailing) == "off") { if ($bucket == "") { privmsg(" titles already disabled for " . chr(3) . "10{$dest}"); } else { unset_bucket("<exec_title_{$dest}>"); privmsg(" titles disabled for " . chr(3) . "10{$dest}"); } } elseif (strtolower($trailing) == "url on") { set_bucket("<exec_title_url_{$dest}>", "on"); privmsg(" enabled redirected url output for titles in " . chr(3) . "10{$dest}"); } elseif (strtolower($trailing) == "url off") { unset_bucket("<exec_title_url_{$dest}>"); privmsg(" disabled redirected url output for titles in " . chr(3) . "10{$dest}"); } else { $redirect_data = get_redirected_url($trailing, "", "", array()); if ($redirect_data === False) { term_echo(" title: get_redirected_url=false"); return; } $rd_url = $redirect_data["url"]; $raw = get_raw_title($redirect_data); if ($raw !== False) { $def = translate("auto", "en", $raw); $msg = chr(3) . "13" . $raw . chr(3); if ($def != $raw) { $msg = $msg . " [" . chr(3) . "04" . $def . chr(3) . "]"; } if ($rd_url != $trailing) { $msg = $msg . " - " . chr(3) . "03" . $rd_url; }