forked from lairdshaw/fups
/
common.php
367 lines (318 loc) · 15.3 KB
/
common.php
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
<?php
/*
* FUPS: Forum user-post scraper. An extensible PHP framework for scraping and
* outputting the posts of a specified user from a specified forum/board
* running supported forum software. Can be run as either a web app or a
* commandline script.
*
* Copyright (C) 2013-2015 Laird Shaw.
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU Affero General Public License as
* published by the Free Software Foundation, either version 3 of the
* License, or (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Affero General Public License for more details.
*
* You should have received a copy of the GNU Affero General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*
*/
/* File : common.php.
* Description: Contains defines and functions shared between FUPS scripts.
*/
// These are not defines because we want it to be possible to override them
// in settings.php, and this is also why they occur before the require of
// settings.php.
//
// $fups_url_base is the directory part of the request path (query string
// removed) and always ends in a slash when a REQUEST_URI is present; it is
// the empty string otherwise (e.g. no web request is being served).
if (isset($_SERVER['REQUEST_URI'])) {
	list($tmp) = explode('?', $_SERVER['REQUEST_URI'], 2);
	if (substr($tmp, -1) == '/') {
		// The request already names a directory: use it as-is.
		$fups_url_base = $tmp;
	} else {
		// Cut at the last slash instead of calling dirname(): URL paths always
		// use forward slashes, whereas dirname() follows the host OS's
		// filesystem conventions and can hand back a backslash for a
		// root-level path on Windows. This also copes with an empty path
		// without indexing past the start of the string.
		$fups_url_base = rtrim(substr($tmp, 0, (int)strrpos($tmp, '/')), '/').'/';
	}
} else $fups_url_base = '';
$fups_url_homepage = $fups_url_base;
$fups_url_ajax_get_status = $fups_url_base.'ajax-get-status.php';
$fups_url_cancel = $fups_url_base.'cancel.php';
$fups_url_delete_files = $fups_url_base.'delete-files.php';
$fups_url_enter_options = $fups_url_base.'enter-options.php';
$fups_url_notify_email_address = $fups_url_base.'notify-email-address.php';
$fups_url_run = $fups_url_base.'run.php';
require_once __DIR__.'/settings.php';
// Before anything else, confirm that FUPS_CMDLINE_PHP_PATH can be run.
// Under Windows this matters a great deal: handing an invalid command to
// "start" (see make_php_exec_cmd() below) pops up an invisible error message
// and the request then hangs until PHP times out.
// On Windows a custom batch file performs the check; everywhere else a
// UNIX-like "which" command is assumed to be present.
if (strtoupper(substr(PHP_OS, 0, 3)) === 'WIN') {
	$cmd = __DIR__.'\\would_run.bat';
} else {
	$cmd = 'which';
}
$cmd .= ' '.escapeshellarg(FUPS_CMDLINE_PHP_PATH);
exec($cmd, $dummy, $res);
// A non-zero exit status from the probe means the path is not runnable.
if ($res !== 0) {
	$msg  = 'Fatal error: The value defined in settings.php for FUPS_CMDLINE_PHP_PATH, "';
	$msg .= FUPS_CMDLINE_PHP_PATH;
	$msg .= '", does not appear to be runnable given the current working directory and your path. Exiting.';
	exit($msg);
}
// Status-file terminators: get_failed_done_cancelled() below tests whether
// the status text ends with one of these three strings.
define('FUPS_DONE_STR' , 'DONE' );
define('FUPS_FAILED_STR' , 'EXITING' );
define('FUPS_CANCELLED_STR' , 'CANCELLED');
// Not referenced in this file. NOTE(review): presumably the maximum number of
// tries at generating an unused token - confirm against the caller.
define('FUPS_MAX_TOKEN_ATTEMPTS' , 10);
// Not referenced in this file. NOTE(review): presumably a fallback duration
// in seconds for one link of a chained FUPS run - confirm against fups.php.
define('FUPS_FALLBACK_FUPS_CHAIN_DURATION', 1200);
// Escapes $html for inclusion in an HTML page and turns each newline into
// "<br />" followed by a newline.
//
// Quotes are escaped as well as <, > and &. The original passed ENT_SUBSTITUTE
// on its own, which carries no quote flag and so left quotes unescaped.
// ENT_SUBSTITUTE (where this PHP version defines it) makes invalid byte
// sequences come out as a replacement character instead of emptying the
// whole string.
function format_html($html) {
	$flags = ENT_QUOTES;
	if (defined('ENT_SUBSTITUTE')) {
		$flags |= ENT_SUBSTITUTE;
	}
	return str_replace("\n", "<br />\n", htmlspecialchars((string)$html, $flags));
}
// Returns the path FUPS_DATADIR.'<token>.cancel.txt'. $token is used
// verbatim (it is not sanitised here).
function make_cancellation_filename($token) {
	$basename = $token.'.cancel.txt';
	return FUPS_DATADIR.$basename;
}
// Returns the path of the cookie file kept in FUPS_DATADIR for the given
// token or settings filename. The argument is passed through
// sanitise_filename() before being used as the file's base name.
function make_cookie_filename($token_or_settings_filename) {
	$safe_name = sanitise_filename($token_or_settings_filename);
	return FUPS_DATADIR.$safe_name.'.cookies.txt';
}
// Returns the path FUPS_DATADIR.'<token>.errs.txt'. make_php_exec_cmd()
// redirects the background process's output to this file. $token is used
// verbatim.
function make_errs_filename($token) {
	return sprintf('%s%s.errs.txt', FUPS_DATADIR, $token);
}
// Returns the path FUPS_DATADIR.'<token>.errs.admin.txt'. $token is used
// verbatim.
function make_errs_admin_filename($token) {
	$suffix = '.errs.admin.txt';
	return FUPS_DATADIR.$token.$suffix;
}
// Returns the output directory name for $token, ending in a slash.
//   $for_web  - when true the name is rooted at FUPS_OUTPUTDIR_WEB,
//               otherwise at FUPS_OUTPUTDIR.
//   $appendix - optional text appended directly after the token.
function make_output_dirname($token, $for_web = false, $appendix = '') {
	if ($for_web) {
		$root = FUPS_OUTPUTDIR_WEB;
	} else {
		$root = FUPS_OUTPUTDIR;
	}
	return $root.$token.$appendix.'/';
}
// Returns the name of an output file inside $output_dirname: the fixed stem
// "fups.output" followed by $appendix.
// $output_dirname must end in a slash (none is added here).
function make_output_filename($output_dirname, $appendix) {
	return implode('', array($output_dirname, 'fups.output', $appendix));
}
// Returns the path FUPS_DATADIR.'<token>.output-info.json'.
// output_update_html() reads this file and decodes it as JSON. $token is
// used verbatim.
function make_output_info_filename($token) {
	$basename = "{$token}.output-info.json";
	return FUPS_DATADIR.$basename;
}
// Builds the shell command line that runs fups.php in the background via
// the commandline PHP binary named by FUPS_CMDLINE_PHP_PATH.
//
// Recognised keys of $params (all optional):
//   'token'             - passed as "-t <token>"; also causes stdout and
//                         stderr to be appended to the token's errors file
//                         (see make_errs_filename()) instead of discarded.
//   'settings_filename' - passed as "-i <file>".
//   'output_filename'   - passed as "-o <file>".
//   'chained'           - when truthy, adds "-c".
//   'quiet'             - when truthy, adds "-q".
//
// Returns the command as a string. On Windows it is prefixed with
// "start /b " and has no trailing "&"; elsewhere it ends in "&".
//
// Every argument, the script path and the redirect target are escaped with
// escapeshellarg() so that paths containing spaces or shell metacharacters
// cannot break the command. FUPS_CMDLINE_PHP_PATH itself is deliberately
// left unquoted: Windows' "start" treats a leading quoted argument as a
// window title.
function make_php_exec_cmd($params) {
	$is_windows = (strtoupper(substr(PHP_OS, 0, 3)) === 'WIN');
	$prefix   = $is_windows ? 'start /b ' : '';
	$redirect = $is_windows ? '1>NUL' : '1>/dev/null';
	$bg_token = $is_windows ? '' : '&';

	$args = array();
	if (isset($params['token'])) {
		$args[] = '-t '.escapeshellarg($params['token']);
		$redirect = '1>>'.escapeshellarg(make_errs_filename($params['token'])).' 2>&1';
	}
	if (isset($params['settings_filename'])) {
		$args[] = '-i '.escapeshellarg($params['settings_filename']);
	}
	if (isset($params['output_filename'])) {
		$args[] = '-o '.escapeshellarg($params['output_filename']);
	}
	if (!empty($params['chained'])) {
		$args[] = '-c';
	}
	if (!empty($params['quiet'])) {
		$args[] = '-q';
	}

	// Prefer the absolute path of fups.php; fall back to a bare relative
	// name if it cannot be resolved.
	$fups_path = realpath(__DIR__.'/fups.php');
	if ($fups_path === false) {
		$fups_path = 'fups.php';
	}

	return $prefix.FUPS_CMDLINE_PHP_PATH.' -d max_execution_time=0 '.escapeshellarg($fups_path).' '.implode(' ', $args).' '.$redirect.' '.$bg_token;
}
// Launches $cmd through popen() and reports whether the launch succeeded.
// Returns true if popen() yielded a handle and false otherwise.
//
// The handle is closed only when it exists: calling pclose() on popen()'s
// false return value is an error.
function try_run_bg_proc($cmd) {
	$handle = popen($cmd, 'r');
	if ($handle === false) {
		return false;
	}
	pclose($handle);
	return true;
}
// Returns $filename with every byte other than A-Z, a-z, 0-9, "_", "-" and
// "." replaced by "_", and then every ".." replaced by "__".
//
// Should preg_replace() fail (it returns null on error), the same filtering
// is done byte by byte instead of falling back to the unsanitised input, so
// the function never returns characters outside the allowed set.
function sanitise_filename($filename) {
	$sanitised = preg_replace('/[^A-Za-z0-9_\-\.]/', '_', $filename);
	if ($sanitised === null) {
		$allowed = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789_-.';
		$raw = (string)$filename;
		$sanitised = '';
		for ($i = 0, $n = strlen($raw); $i < $n; $i++) {
			$sanitised .= (strpos($allowed, $raw[$i]) !== false ? $raw[$i] : '_');
		}
	}
	return str_replace('..', '__', $sanitised);
}
// Returns the path of the ".serialize.txt" file kept in FUPS_DATADIR for
// the given token or settings filename. The argument is passed through
// sanitise_filename() before being used as the file's base name.
function make_serialize_filename($token_or_settings_filename) {
	$safe_name = sanitise_filename($token_or_settings_filename);
	return FUPS_DATADIR.$safe_name.'.serialize.txt';
}
// Returns the path FUPS_DATADIR.'<token>.settings.txt'. $token is used
// verbatim.
function make_settings_filename($token) {
	return sprintf('%s%s.settings.txt', FUPS_DATADIR, $token);
}
// Returns the path FUPS_DATADIR.'<token>.status.txt'. $token is used
// verbatim.
function make_status_filename($token) {
	$basename = "{$token}.status.txt";
	return FUPS_DATADIR.$basename;
}
// Checks that $token is exactly 32 characters long and consists only of the
// characters 0-9 and a-z.
//
// Returns true when the token is well-formed, false otherwise. $err is set
// by reference: to the empty string on success, or to a message saying
// whether the length or a character was at fault.
function validate_token($token, &$err) {
	$permitted = '0123456789abcdefghijklmnopqrstuvwxyz';

	if (strlen($token) != 32) {
		$err = 'A fatal error occurred: token is malformed (length).';
		return false;
	}
	// strspn() measures the leading run of permitted characters; anything
	// short of the full 32 means a disallowed character is present.
	if (strspn($token, $permitted) < 32) {
		$err = 'A fatal error occurred: token is malformed (character).';
		return false;
	}
	$err = '';
	return true;
}
// Classifies a status string by its ending. Each of the three by-reference
// outputs is set to true when $status ends with the corresponding marker
// (FUPS_FAILED_STR, FUPS_DONE_STR, FUPS_CANCELLED_STR) and to false
// otherwise. Nothing is returned.
function get_failed_done_cancelled($status, &$done, &$cancelled, &$failed) {
	$ends_with = function ($marker) use ($status) {
		$tail = substr($status, -strlen($marker));
		return $tail == $marker;
	};
	$failed    = $ends_with(FUPS_FAILED_STR);
	$done      = $ends_with(FUPS_DONE_STR);
	$cancelled = $ends_with(FUPS_CANCELLED_STR);
}
// Echoes the HTML paragraphs inviting the user to delete the files stored
// for their request, including a link to the delete-files page for $token.
//   $token       - URL-encoded and HTML-escaped before being placed in the
//                  link's query string.
//   $had_success - when true, an extra paragraph is emitted telling the
//                  user to save their results before deleting.
// FUPS_ROUTINE_DELETION_POLICY is echoed as-is at the end of the first
// paragraph.
function show_delete($token, $had_success = false) {
	global $fups_url_delete_files;

	echo '<p>For your privacy, you might wish to delete from this web server all session and output files associated with this request, especially if you have supplied a login username and password (files that store your username and password details are not publicly visible, but it is wise to delete them anyway).', FUPS_ROUTINE_DELETION_POLICY, "</p>\n";

	if ($had_success) {
		echo '<p>Be sure to do this only <strong>after</strong> you have clicked the above "View result" link, and saved the contents at that page, because they will no longer be accessible after clicking the following link.</p>', "\n";
	}

	$token_param = htmlspecialchars(urlencode($token));
	echo '<p><a href="', $fups_url_delete_files, '?token=', $token_param, '">Delete all files</a> associated with your scrape from my web server - this includes your settings, including your password if you entered one.</p>', "\n";
}
// Echoes the HTML fragment that reports on a scraping run: its status,
// then (depending on the flags) the list of output files, a cancellation
// notice, a failure form, or progress/cancel links, followed by any error
// text.
//
//   $token      - the run's token; HTML-escaped (and URL-encoded inside
//                 query strings) everywhere it is emitted.
//   $status     - status text, shown HTML-escaped.
//   $done, $cancelled, $failed - which terminal state (if any) applies;
//                 they are tested in that order.
//   $err        - if truthy, only this message is shown and the function
//                 returns straight away.
//   $errs       - error text shown to the user (truncated to
//                 FUPS_MAX_ERROR_FILE_EMAIL_LENGTH bytes).
//   $errs_admin - extended error text, shown (initially hidden) only when
//                 $errs is truthy and the run is done or failed; truncated
//                 to FUPS_MAX_ADMIN_FILE_EMAIL_LENGTH bytes.
//   $ajax       - when truthy, links carry "ajax=yes" and the wording
//                 assumes the page updates itself.
//
// Nothing is returned; all output is echoed. Reads $_GET['last_status'] to
// detect a run whose status has not changed since the previous check.
function output_update_html($token, $status, $done, $cancelled, $failed, $err, $errs, $errs_admin = false, $ajax = false) {
	global $fups_url_cancel, $fups_url_notify_email_address, $fups_url_run;

	if ($err) {
		?>
<div class="fups_error"><?php echo format_html($err); ?></div>
<?php
		return;
	}

	// Escape the token once, for each context it is emitted in, so that every
	// use below is consistently safe even if a caller passes an unvalidated
	// token.
	$token_html = htmlspecialchars($token);
	$token_url  = htmlspecialchars(urlencode($token));
	?>
<h3>Status</h3>
<div id="fups_div_status">
<?php echo htmlspecialchars($status); ?>
</div>
<?php
	if ($done) {
		// Read and decode the list of output files, treating an unreadable
		// file, undecodable JSON, a non-array value or an empty list alike
		// as "could not decode".
		$info_fname  = make_output_info_filename($token);
		$json        = is_readable($info_fname) ? file_get_contents($info_fname) : false;
		$output_info = ($json === false ? null : json_decode($json, true));
		if (!is_array($output_info) || !$output_info) {
			?>
<p>We are sorry, but an unexpected error occurred. The scraping process completed, and the output was created, however we were unable to decode the list of output files. Please feel free to <a href="<?php echo FUPS_CONTACT_URL; ?>">contact me</a> for help with accessing your output files, quoting token "<?php echo $token_html; ?>".</p>
<?php
		} else {
			?>
<p>Success! Your posts were retrieved and the output is ready. The following output files are available:</p>
<table style="border-collapse: collapse;">
<tr><th style="border: 1px solid black;">Description</th><th style="border: 1px solid black;">View/download file (opens in a new window)</th><th style="border: 1px solid black;">File size</th></tr>
<?php
			// NOTE(review): 'description' is echoed unescaped, as before, in case
			// it intentionally carries markup - confirm against the code that
			// writes the output-info file and escape it if it is plain text.
			foreach ($output_info as $opv) {
				?>
<tr><td style="border: 1px solid black;"><?php echo $opv['description']; ?></td><td style="border: 1px solid black;"><a target="_blank" href="<?php echo htmlspecialchars($opv['url']); ?>">View/download file</a></td><td style="border: 1px solid black;"><?php echo number_format($opv['size']).' bytes'; ?></td></tr>
<?php
			}
			?>
</table>
<p>If you're wondering what to do next, here are some possible steps:</p>
<ol>
<li>Click on the "View/download file" link beside the HTML file which is sorted according to your preference. This will open up a new window/tab for that file. Switch to this window/tab if necessary, and then save the page, e.g. in Firefox click the "File" menu option and under that click "Save Page As". Select the directory/folder and filename you wish to save this output as (remember this location for the next step).</li>
<li>Start up a word processor such as LibreOffice/OpenOffice or Microsoft Word. Open up in that word processor the HTML file that you saved in the previous step, e.g. click the "File" menu option and under that click "Open", then select the file you saved in the previous step. You are now free to edit the file as you like. You can now (if you so desire) save the file in a friendlier format than HTML, a format such as your editor's default format, e.g. in LibreOffice, click the "File" menu option and then click "Save As" or "Export", and choose the format you desire.</li>
</ol>
<?php
			show_delete($token, true);
		}
	} else if ($cancelled) {
		?>
<p>Cancelled by your request.</p>
<?php
		show_delete($token, false);
	} else if ($failed) {
		?>
<p>The script appears to have exited due to an error; the error message is shown below. I have been notified of this error by email; if you would like me to get back to you if/when I have fixed the error, then please enter your email address into the following box and press the button to notify me of it.</p>
<div>
<form method="post" action="<?php echo $fups_url_notify_email_address; ?>">
<input type="hidden" name="token" value="<?php echo $token_html; ?>" />
<label for="email_address.id">Your contact email address:</label><br />
<input type="text" name="email_address" id="email_address.id" /><br />
<label for="message.id">Any message you'd like to include (leaving this blank is fine):</label><br />
<textarea rows="5" cols="80" name="message" id="message.id"></textarea><br />
<input type="submit" value="Notify the FUPS maintainer" />
</form>
</div>
<p>Alternatively, feel free to retry or to <a href="<?php echo FUPS_CONTACT_URL; ?>">contact me</a> manually about this error, quoting your run token of "<?php echo $token_html; ?>".</p>
<?php
		show_delete($token, false);
	} else {
		// Still running. If the status equals the one the previous page load
		// reported, progress is assumed to have stalled.
		$same_status = (isset($_GET['last_status']) && $status == $_GET['last_status']);
		$refresh_url = $fups_url_run.'?token='.$token_url.($same_status ? '&last_status='.htmlspecialchars(urlencode($status)) : '').($ajax ? '&ajax=yes' : '');
		?>
<p>
<a href="<?php echo $refresh_url; ?>"><?php echo ($ajax ? 'Refresh page' : 'Check progress'); ?></a><?php if ($ajax): echo ' (it should not be necessary to click this link unless something goes wrong)'; endif; ?>.
<?php if ($same_status) { ?>
(It appears that progress has halted unexpectedly - the current status is the same as the previous status. It is likely that an error has caused the process to exit before finishing. We are sorry about this failure. In case you want to be sure that progress has indeed halted, you are welcome to click the preceding link, but otherwise, this page will no longer automatically refresh.)
<?php
			show_delete($token, false);
		} else { ?>
<?php echo (!$ajax ? 'Your browser should automatically refresh this page every '.FUPS_META_REDIRECT_DELAY.' seconds or so to update progress, but if you don\'t want to wait, you\'re welcome to click the link. ' : ''); ?>If you have changed your mind about wanting to run this script through to the end, <strong>please</strong> click this <a href="<?php echo $fups_url_cancel; ?>?token=<?php echo $token_url.($ajax ? '&ajax=yes' : ''); ?>">cancel</a> link rather than just closing this page - clicking the cancel link will free up the resources (in particular a background process) associated with your task.
<?php } ?>
</p>
<?php
	}

	$paren_msg_will_be_emailed = '(Unless a mailing error occurs, these will be emailed to me as-is if/when FUPS finishes running, with your token, "'.$token_html.'", included in the email\'s subject)';
	$paren_msg_emailed = '(Unless a mailing error occurred, these have been emailed to me as-is, with your token, "'.$token_html.'", included in the email\'s subject)';
	if ($errs) {
		?>
<h3>Errors</h3>
<p><?php echo ($done || $failed ? $paren_msg_emailed : $paren_msg_will_be_emailed); ?></p>
<?php
		$len = strlen($errs);
		if ($len > FUPS_MAX_ERROR_FILE_EMAIL_LENGTH) {
			$errs = substr($errs, 0, FUPS_MAX_ERROR_FILE_EMAIL_LENGTH);
			$trunc_msg = '[Truncated from '.number_format($len).' bytes to '.number_format(FUPS_MAX_ERROR_FILE_EMAIL_LENGTH).' bytes]';
			?>
<p><?php echo $trunc_msg; ?></p>
<?php
		}
		?>
<div class="fups_error">
<?php echo format_html($errs); ?>
</div>
<?php
		if ($errs_admin && ($done || $failed)) {
			// The toggle_ext_errs() Javascript function below is defined in run.php
			?>
<p><a href="javascript:toggle_ext_errs();">Show/hide extended error messages</a> <?php echo $paren_msg_emailed; ?></p>
<div id="id_ext_err" style="display: none;">
<h3>Extended error messages</h3>
<?php
			$len = strlen($errs_admin);
			if ($len > FUPS_MAX_ADMIN_FILE_EMAIL_LENGTH) {
				$errs_admin = substr($errs_admin, 0, FUPS_MAX_ADMIN_FILE_EMAIL_LENGTH);
				$trunc_msg = '[Truncated from '.number_format($len).' bytes to '.number_format(FUPS_MAX_ADMIN_FILE_EMAIL_LENGTH).' bytes]';
				?>
<p><?php echo $trunc_msg; ?></p>
<?php
			}
			?>
<div class="fups_error"><?php echo format_html($errs_admin); ?></div>
</div>
<?php }
	}
}
// Utility functions follow
// Helper function for arrays_combos().
//
// Recursively walks $arrays starting at index $depth. For each item of
// $arrays[$depth] it stores the item in $combo[$depth]; at the last index the
// completed $combo is appended to $combos (passed by reference), otherwise
// the function recurses into the next index.
//
// $arrays is expected to be indexed 0..n-1. If there is no entry at $depth
// (in particular when $arrays is empty) nothing is appended, rather than
// indexing an undefined offset and iterating over null.
function get_arrays_combos_r($arrays, $depth, $combo, &$combos) {
	if (!array_key_exists($depth, $arrays)) {
		return;
	}
	$last_depth = count($arrays) - 1;
	foreach ($arrays[$depth] as $item) {
		$combo[$depth] = $item;
		if ($depth == $last_depth) {
			$combos[] = $combo;
		} else {
			get_arrays_combos_r($arrays, $depth + 1, $combo, $combos);
		}
	}
}
// Builds every combination that can be formed by picking exactly one
// element from each of the arrays in $arrays, and returns them as an array
// of arrays. The work is delegated to get_arrays_combos_r(), which fills the
// result by reference.
function arrays_combos($arrays) {
	$combinations = array();
	// Start from a combination with one (null) slot per input array.
	$blank_combo = array_fill(0, count($arrays), null);
	get_arrays_combos_r($arrays, 0, $blank_combo, $combinations);
	return $combinations;
}