/**
 * Builds the shell command line that launches fups.php with the command-line PHP
 * binary (FUPS_CMDLINE_PHP_PATH) as a background process.
 *
 * Recognised keys of $params (all optional):
 *  - 'token'             : passed as `-t <token>`; additionally, stdout and stderr are
 *                          appended to the file named by make_errs_filename(<token>)
 *                          instead of stdout being discarded.
 *  - 'settings_filename' : passed as `-i <filename>`.
 *  - 'output_filename'   : passed as `-o <filename>`.
 *  - 'chained'           : when truthy, adds the `-c` flag.
 *  - 'quiet'             : when truthy, adds the `-q` flag.
 *
 * On Windows the command is prefixed with `start /b ` and stdout goes to NUL; on other
 * platforms stdout goes to /dev/null and the command is suffixed with `&`.
 *
 * Every value interpolated into the command (option values, the error-file path and
 * the script path) is passed through escapeshellarg(), so paths containing spaces or
 * shell metacharacters cannot break or alter the command.
 *
 * @param array $params Associative array of the options described above.
 * @return string The complete command line.
 */
function make_php_exec_cmd($params) {
    $is_windows = strtoupper(substr(PHP_OS, 0, 3)) === 'WIN';
    $prefix     = $is_windows ? 'start /b ' : '';
    $redirect   = $is_windows ? '1>NUL' : '1>/dev/null';
    $bg_token   = $is_windows ? '' : '&';

    $args = array();

    if (isset($params['token'])) {
        $args[] = '-t ' . escapeshellarg($params['token']);
        // With a token, keep all output: append stdout and stderr to the errors file.
        $errs_fname = make_errs_filename($params['token']);
        $redirect = ' 1>>' . escapeshellarg($errs_fname) . ' 2>&1';
    }

    // Options that carry a value, in the order they appear on the command line.
    $valued_options = array('settings_filename' => '-i', 'output_filename' => '-o');
    foreach ($valued_options as $key => $switch) {
        if (isset($params[$key])) {
            $args[] = $switch . ' ' . escapeshellarg($params[$key]);
        }
    }

    // Boolean flags: present only when the parameter is set and truthy.
    $flags = array('chained' => '-c', 'quiet' => '-q');
    foreach ($flags as $key => $switch) {
        if (!empty($params[$key])) {
            $args[] = $switch;
        }
    }

    // Prefer the absolute path of the script; fall back to a relative name if it
    // cannot be resolved.
    $fups_path = realpath(__DIR__ . '/fups.php');
    if ($fups_path === false) {
        $fups_path = 'fups.php';
    }

    return $prefix . FUPS_CMDLINE_PHP_PATH . ' -d max_execution_time=0 ' . escapeshellarg($fups_path) . ' ' . implode(' ', $args) . ' ' . $redirect . ' ' . $bg_token;
}
/**
 * Runs the scrape as a sequence of numbered stages tracked in $this->progress_level.
 *
 * The method first sets up the cookie file and the cURL handle and, where the forum
 * class reports support for the 'login' feature, logs in (unless this run was chained).
 * Each stage below then executes only if $this->progress_level equals its number, so a
 * run that starts with a higher level skips the stages before it:
 *
 *   0 - check_get_username() and initialise the search for the user's posts
 *   1 - scrape the search pages for the user's posts
 *   2 - sort the topics, and the posts within each topic
 *   3 - retrieve the content of each post
 *   4 - retrieve the thread author of each topic
 *   5 - report the posts whose content was not found
 *   6 - write the output and signal 'DONE'
 *   7 - when web-initiated, email the admin any recorded non-fatal errors
 *
 * After each stage, the method named 'hook_after__' followed by that stage's entry in
 * $this->progress_levels is invoked.
 *
 * Failure to initialise or configure cURL is reported through $this->exit_err().
 */
public function run() {
    // Only HTTP and HTTPS are permitted, both for requests and for redirects.
    $valid_protocols = CURLPROTO_HTTP | CURLPROTO_HTTPS;

    // The cookie filename is derived from the token for web-initiated runs and from
    // the settings filename otherwise.
    $this->cookie_filename = make_cookie_filename($this->web_initiated ? $this->token : $this->settings_filename);
    if ($this->dbg) {
        $this->write_err('Set cookie_filename to "' . $this->cookie_filename . '".');
    }
    if (!$this->was_chained) {
        @unlink($this->cookie_filename); // Ensure that any existing cookie file on commandline reruns doesn't mess with us.
    }

    $this->ch = curl_init();
    if ($this->ch === false) {
        $this->exit_err('Failed to initialise cURL.', __FILE__, __METHOD__, __LINE__);
    }

    // Redirects are not followed automatically, response headers are included in the
    // returned data, and cookies are both read from and written to the cookie file.
    $opts = array(CURLOPT_USERAGENT => FUPS_USER_AGENT, CURLOPT_FOLLOWLOCATION => false, CURLOPT_RETURNTRANSFER => true, CURLOPT_HEADER => true, CURLOPT_TIMEOUT => 20, CURLOPT_COOKIEJAR => $this->cookie_filename, CURLOPT_COOKIEFILE => $this->cookie_filename, CURLOPT_PROTOCOLS => $valid_protocols, CURLOPT_REDIR_PROTOCOLS => $valid_protocols);
    if (!curl_setopt_array($this->ch, $opts)) {
        $this->exit_err('Failed to set the following cURL options:' . PHP_EOL . var_export($opts, true), __FILE__, __METHOD__, __LINE__);
    }

    # Login if necessary
    if ($this->supports_feature('login')) {
        if ($this->was_chained) {
            if ($this->dbg) {
                $this->write_err('Not bothering to check whether to log in again, because we\'ve just chained.');
            }
        } else {
            $this->check_do_login();
        }
    }

    # Find all of the user's posts through the search feature
    if ($this->progress_level == 0) {
        if ($this->dbg) {
            $this->write_err('Entered progress level ' . $this->progress_level);
        }
        $this->check_get_username();
        $this->search_page_num = 1;
        $this->init_post_search_counter();
        $this->init_search_user_posts();
        // The hook name is taken before the level is advanced, so that it names the
        // stage that has just completed.
        $hook_method = 'hook_after__' . $this->progress_levels[$this->progress_level];
        $this->progress_level++;
        $this->{$hook_method}(); // hook_after__init_user_post_search();
    }

    if ($this->progress_level == 1) {
        if ($this->dbg) {
            $this->write_err('Entered progress level ' . $this->progress_level);
        }
        // This loop ends once progress_level is no longer 1. Nothing in this method
        // changes it inside the loop, so presumably one of the methods called here
        // advances it when the search results are exhausted - TODO confirm.
        do {
            $this->write_status('Scraping search page for posts starting from page #' . $this->search_page_num . '.');
            $num_posts_found = $this->find_author_posts_via_search_page();
            if ($this->dbg) {
                $this->write_err('Found ' . $num_posts_found . ' posts.');
            }
            $this->total_posts += $num_posts_found;
            $this->search_page_num++;
            $this->check_do_chain();
        } while ($this->progress_level == 1);
        // The level has already moved on by this point, hence the "- 1" to name the
        // hook of the stage that has just completed.
        $hook_method = 'hook_after__' . $this->progress_levels[$this->progress_level - 1];
        $this->{$hook_method}(); // hook_after__user_post_search();
    }

    # Sort topics and posts
    if ($this->progress_level == 2) {
        if ($this->dbg) {
            $this->write_err('Entered progress level ' . $this->progress_level);
        }
        $this->write_status('Sorting posts and topics prior to scraping posts\' content.');

        # Sort topics in ascending alphabetical order
        uasort($this->posts_data, 'cmp_topics_topic');

        # Sort posts within each topic into ascending timestamp order
        foreach ($this->posts_data as $topicid => $dummy) {
            // Bind by reference so that the sort is applied to the stored array
            // rather than to a copy.
            $posts =& $this->posts_data[$topicid]['posts'];
            uasort($posts, 'cmp_posts_date');
        }

        if ($this->dbg) {
            $this->write_err('SORTED POSTS::');
            foreach ($this->posts_data as $topicid => $topic) {
                $this->write_err("\tTopic: {$topic['topic']}\tTopic ID: {$topicid}");
                foreach ($topic['posts'] as $postid => $p) {
                    // NOTE(review): strftime() is deprecated as of PHP 8.1; this
                    // debug-only line will need a replacement on newer PHP versions.
                    $newts = strftime('%c', $p['timestamp']);
                    $this->write_err("\t\tTime: {$newts} ({$p['ts']}); Post ID: {$postid}");
                }
            }
        }

        $this->write_status('Finished sorting posts and topics. Now scraping contents of ' . $this->total_posts . ' posts.');
        $hook_method = 'hook_after__' . $this->progress_levels[$this->progress_level];
        $this->progress_level++;
        $this->{$hook_method}(); // hook_after__topic_post_sort();
    }

    # Retrieve the contents of all of the user's posts
    if ($this->progress_level == 3) {
        if ($this->dbg) {
            $this->write_err('Entered progress level ' . $this->progress_level);
        }
        # If the current topic ID is already set, then we are continuing after having chained.
        // When resuming, topics are skipped until the recorded one is reached; that
        // topic itself is processed again, and the content check below passes over
        // the posts in it that already have content.
        $go = is_null($this->current_topic_id);
        foreach ($this->posts_data as $topicid => $dummy) {
            if (!$go && $this->current_topic_id == $topicid) {
                $go = true;
            }
            if ($go) {
                $this->current_topic_id = $topicid;
                $t =& $this->posts_data[$topicid];
                $posts =& $t['posts'];
                // Keep making passes over the topic's posts until a pass finds no
                // post that still needs its content retrieved.
                $done = false;
                while (!$done) {
                    $done = true;
                    foreach ($posts as $postid => $dummy2) {
                        $p =& $posts[$postid];
                        // NOTE(review): this loop only terminates if get_post_contents()
                        // either gives the post non-null content or records it in
                        // posts_not_found - confirm. The comparison is a loose "== null",
                        // so an empty string also counts as "no content".
                        if ($p['content'] == null && !isset($this->posts_not_found[$postid])) {
                            $this->get_post_contents($t['forumid'], $topicid, $postid);
                            $this->write_status('Retrieved ' . $this->num_posts_retrieved . ' of ' . $this->total_posts . ' posts.');
                            $done = false;
                        }
                        $this->check_do_chain();
                    }
                }
            }
        }
        $this->current_topic_id = null; # Reset this for progress level 4
        $hook_method = 'hook_after__' . $this->progress_levels[$this->progress_level];
        $this->progress_level++;
        $this->{$hook_method}(); // hook_after__posts_retrieval();
    }

    # Extract per-thread information: thread author and forum
    if ($this->progress_level == 4) {
        if ($this->dbg) {
            $this->write_err('Entered progress level ' . $this->progress_level);
        }
        # If the current topic ID is already set, then we are continuing after having chained.
        // Unlike stage 3, the recorded topic is itself skipped on resume: the ID is
        // only recorded (further down) after its topic has been fully handled.
        $go = is_null($this->current_topic_id);
        $total_threads = count($this->posts_data);
        foreach ($this->posts_data as $topicid => $dummy) {
            if (!$go) {
                if ($this->current_topic_id == $topicid) {
                    $go = true;
                }
            } else {
                $topic =& $this->posts_data[$topicid];
                $url = $this->get_topic_url($topic['forumid'], $topicid);
                $this->set_url($url);
                $html = $this->do_send();
                if (!$this->skins_preg_match('thread_author', $html, $matches)) {
                    $this->write_and_record_err_admin("Error: couldn't find a match for the author of the thread with topic id '{$topicid}'. The URL of the page is <" . $url . '>.', __FILE__, __METHOD__, __LINE__, $html);
                    // Placeholder author so that the topic still has a 'startedby' entry.
                    $topic['startedby'] = '???';
                } else {
                    $topic['startedby'] = $matches[1];
                    if ($this->dbg) {
                        $this->write_err("Added author of '{$topic['startedby']}' for topic id '{$topicid}'.");
                    }
                    $this->num_thread_infos_retrieved++;
                    $this->write_status('Retrieved author and topic name for ' . $this->num_thread_infos_retrieved . ' of ' . $total_threads . ' threads.');
                }
                $this->current_topic_id = $topicid;
                $this->check_do_chain();
            }
        }
        $hook_method = 'hook_after__' . $this->progress_levels[$this->progress_level];
        $this->progress_level++;
        $this->{$hook_method}(); // hook_after__extract_per_thread_info();
    }

    # Warn about missing posts
    if ($this->progress_level == 5) {
        if ($this->dbg) {
            $this->write_err('Entered progress level ' . $this->progress_level);
        }
        if ($this->posts_not_found) {
            $this->write_err(PHP_EOL . PHP_EOL . PHP_EOL . "The contents of the following posts were not found::" . PHP_EOL . PHP_EOL . PHP_EOL);
            foreach ($this->posts_not_found as $postid => $dummy) {
                // On success, find_post() yields the post, its topic and the topic ID
                // (unpacked by the list() below); a falsy result means no match.
                $a = $this->find_post($postid);
                if ($a == false) {
                    $this->write_err("\tError: failed to find post with ID '{$postid}' in internal data.");
                } else {
                    list($p, $t, $topicid) = $a;
                    $this->write_err("\t{$p['posttitle']} ({$t['topic']}; {$p['timestamp']}; {$t['forum']}; forumid: {$t['forumid']}; topicid: {$topicid}; postid: {$postid}; " . $this->get_post_url($t['forumid'], $topicid, $postid) . ')');
                }
            }
        }
        $hook_method = 'hook_after__' . $this->progress_levels[$this->progress_level];
        $this->progress_level++;
        $this->{$hook_method}(); // hook_after__handle_missing_posts();
    }

    # Write output
    if ($this->progress_level == 6) {
        if ($this->dbg) {
            $this->write_err('Entered progress level ' . $this->progress_level);
        }
        $this->write_status('Writing output.');

        # Write all output variants
        $this->write_output();

        # Signal that we are done
        $this->write_status('DONE');

        $hook_method = 'hook_after__' . $this->progress_levels[$this->progress_level];
        $this->progress_level++;
        $this->{$hook_method}(); // hook_after__write_output();
    }

    # Potentially send an admin email re non-fatal errors.
    if ($this->progress_level == 7) {
        if ($this->dbg) {
            $this->write_err('Entered progress level ' . $this->progress_level);
        }
        if ($this->web_initiated) {
            $errs = file_get_contents(make_errs_filename($this->token));
            // Disable error messages because if there are no errors then this file
            // won't exist - we want to avoid an error message telling us as much.
            $errs_admin = @file_get_contents(make_errs_admin_filename($this->token));
            // An email is only sent if at least one of the two files had content.
            if ($errs || $errs_admin) {
                $err_msg = '';
                if ($errs) {
                    // Cap the amount of error-file text included in the email, and
                    // say so in the message when truncation happened.
                    $len = strlen($errs);
                    $trunc_msg = '';
                    if ($len > FUPS_MAX_ERROR_FILE_EMAIL_LENGTH) {
                        $errs = substr($errs, 0, FUPS_MAX_ERROR_FILE_EMAIL_LENGTH);
                        $trunc_msg = ' (truncated from ' . number_format($len) . ' bytes to ' . number_format(FUPS_MAX_ERROR_FILE_EMAIL_LENGTH) . ' bytes)';
                    }
                    // No need to include the settings and classname if admin error info exists too,
                    // because settings and classname are already included each time the admin error
                    // file is appended to.
                    if (!$errs_admin) {
                        $settings_msg = static::get_settings_msg_s(static::get_settings_str());
                        $classname_msg = static::get_classname_msg_s(get_class($this));
                        $err_msg .= $settings_msg . PHP_EOL . PHP_EOL . $classname_msg . PHP_EOL;
                    }
                    $err_msg .= 'The following non-fatal errors were recorded in the error file' . $trunc_msg . ':' . PHP_EOL . PHP_EOL . $errs . PHP_EOL;
                }
                if ($errs_admin) {
                    // Separate the two sections when both are present.
                    if ($errs) {
                        $err_msg .= PHP_EOL . PHP_EOL;
                    }
                    // Same capping as above, with the admin-file limit.
                    $len = strlen($errs_admin);
                    $trunc_msg = '';
                    if ($len > FUPS_MAX_ADMIN_FILE_EMAIL_LENGTH) {
                        $errs_admin = substr($errs_admin, 0, FUPS_MAX_ADMIN_FILE_EMAIL_LENGTH);
                        $trunc_msg = ' (truncated from ' . number_format($len) . ' bytes to ' . number_format(FUPS_MAX_ADMIN_FILE_EMAIL_LENGTH) . ' bytes)';
                    }
                    $err_msg .= 'The following extended non-fatal error messages were recorded in the admin error file' . $trunc_msg . ':' . PHP_EOL . PHP_EOL . $errs_admin . PHP_EOL;
                }
                static::send_err_mail_to_admin_s($err_msg, $this->token, false);
            }
        }
        $hook_method = 'hook_after__' . $this->progress_levels[$this->progress_level];
        $this->progress_level++;
        $this->{$hook_method}(); // hook_after__check_send_non_fatal_err_email();
    }
}
// Tail of the error handling for writing the serialization file.
                        $file_errs .= ' ';
                    }
                    $file_errs .= 'Error: unable to write to the serialization file.';
                }

                // Launch the background scraping process for this token.
                $cmd = make_php_exec_cmd(array('token' => $token));
                if (!try_run_bg_proc($cmd)) {
                    // NOTE(review): the raw command line is embedded, unescaped, in this
                    // HTML message - confirm that showing it to the user is intended.
                    $err = 'Apologies, the server encountered a technical error: it was unable to initiate the background process to perform the task of scraping, sorting and finally presenting your posts. The command used was:<br />' . PHP_EOL . '<br />' . PHP_EOL . $cmd . '<br />' . PHP_EOL . '<br />' . PHP_EOL . 'You might like to try again or <a href="' . FUPS_CONTACT_URL . '">contact me</a> about this error.';
                }
            }
        }
    }
} else {
    // Here the token comes from the query string. The per-token filenames are only
    // derived once validate_token() accepts it (presumably validate_token() reports
    // a rejection through $err - TODO confirm).
    $token = $_GET['token'];
    if (validate_token($token, $err)) {
        $status_filename = make_status_filename($token);
        $errs_filename = make_errs_filename($token);
        $errs_admin_filename = make_errs_admin_filename($token);
    }
}

if (!$err) {
    // A failed filemtime() is taken to mean that the status file does not exist.
    $ts = @filemtime($status_filename);
    if ($ts === false) {
        $err = 'The status file for your FUPS process with token "' . $token . '" does not exist - possibly because you have already deleted it.';
    }
    // Warnings are suppressed on these reads: any of them yields false if its file
    // cannot be read, including the status read when the check above has failed.
    $status = @file_get_contents($status_filename);
    $errs = @file_get_contents($errs_filename);
    $errs_admin = @file_get_contents($errs_admin_filename);
}

$head_extra = '';
if (!$err) {
    global $fups_url_run, $fups_url_homepage;
// Delete every output file listed in the output-info JSON file for this token.
// NOTE(review): $token is used here before the validate_token() call further down -
// confirm that it has already been validated before this point is reached.
    $op_info_filename = make_output_info_filename($token);
    if (is_file($op_info_filename)) {
        $output_info = json_decode(file_get_contents($op_info_filename), true);
        if (is_array($output_info)) {
            $output_dir = null;
            foreach ($output_info as $opv) {
                // The quoted file path doubles as the file's display name; a missing
                // output file is not reported as an error (last argument false).
                try_delete_file($opv['filepath'], '"' . $opv['filepath'] . '"', false, $err, $num_files_deleted, false);
                $output_dir = dirname($opv['filepath']);
            }
            // Only the directory of the last listed file is removed, with warnings
            // suppressed. NOTE(review): $output_dir is still null here when
            // $output_info is an empty array.
            @rmdir($output_dir);
        }
    }

    // Delete the remaining per-token files. The third argument marks a file as
    // sensitive; calls with a trailing false do not report a missing file.
    if (validate_token($token, $err)) {
        try_delete_file(make_settings_filename($token), 'settings', true, $err, $num_files_deleted);
        try_delete_file(make_status_filename($token), 'status', false, $err, $num_files_deleted);
        try_delete_file(make_errs_filename($token), 'error', false, $err, $num_files_deleted);
        try_delete_file(make_errs_admin_filename($token), 'errors (admin)', false, $err, $num_files_deleted, false);
        try_delete_file(make_output_info_filename($token), 'output info', false, $err, $num_files_deleted, false);
        try_delete_file(make_serialize_filename($token), 'serialisation', true, $err, $num_files_deleted);
        try_delete_file(make_cookie_filename($token), 'cookie', true, $err, $num_files_deleted, false);
        try_delete_file(make_cancellation_filename($token), 'cancellation', true, $err, $num_files_deleted, false);
    }
}

function try_delete_file($filename, $name, $sensitive, &$err, &$num_files_deleted, $add_err_if_file_not_present = true) {
    global $fups_url_homepage;
    if (!is_file($filename)) {
        // A missing file is only reported when the caller asked for that. The message
        // is appended to any error text already accumulated in $err.
        if ($add_err_if_file_not_present) {
            $err .= $err ? ' Another' : 'An';
            $err .= ' error occurred: the ' . $name . ' file does not exist on disk; possibly you have already deleted it or it was never created in the first place.';
        }