/**
 * Page handler: look up a submitted article URL, scrape it if it is not
 * already in the database, and render the outcome.
 *
 * Variables handed to template():
 *   urls_form   - the ArticleURLsForm built from $_GET
 *   return_code - first value returned by scrape_ScrapeURL() (presumably the
 *                 scraper's exit status), or null when no scrape was attempted
 *   raw_output  - second value returned by scrape_ScrapeURL(), or '' when no
 *                 scrape was attempted
 *   summary     - 'FAILED', 'ARTICLE SCRAPED', 'NOT SCRAPED' or
 *                 'ALREADY IN DATABASE'
 *   art         - result of fetch_art() for the article, or null when no
 *                 article id could be determined
 */
function view() {
    $art = null;
    $return_code = null;
    $raw_output = '';
    // Remains 'FAILED' when the form does not validate.
    $summary = 'FAILED';

    $urls_form = new ArticleURLsForm($_GET);
    if ($urls_form->is_valid()) {
        $url = $urls_form->cleaned_data['url'];

        // already got it?
        $art_id = article_find($url);
        if (is_null($art_id)) {
            // Not in the database yet: run the scraper and take the first
            // article it reports, if any.
            list($return_code, $raw_output) = scrape_ScrapeURL($url);
            $scraped = scrape_ParseOutput($raw_output);
            if (sizeof($scraped) > 0) {
                $art_id = $scraped[0]['id'];
                $summary = 'ARTICLE SCRAPED';
            } else {
                $summary = 'NOT SCRAPED';
            }
        } else {
            $summary = 'ALREADY IN DATABASE';
        }

        if (!is_null($art_id)) {
            $art = fetch_art($art_id);
        }
    }

    $v = array(
        'urls_form' => $urls_form,
        'return_code' => $return_code,
        'raw_output' => $raw_output,
        'summary' => $summary,
        'art' => $art,
    );
    template($v);
}
/**
 * Run the scraper against $this->url and record the first article it reports.
 *
 * When an expected journo is set, its ref is passed on to scrape_ScrapeURL();
 * otherwise null is passed. The scraper output is only parsed when the
 * scraper's first return value is 0. update_status() is called in every case.
 *
 * @return mixed the second value returned by scrape_ScrapeURL() (the
 *               scraper's output text)
 */
public function scrape() {
    $expected_ref = is_null($this->expected_journo)
        ? null
        : $this->expected_journo->ref;

    list($ret, $txt) = scrape_ScrapeURL($this->url, $expected_ref);

    if ($ret == 0) {
        // scraper ran
        $arts = scrape_ParseOutput($txt);
        if (sizeof($arts) > 0) {
            // scraped at least one article; only the first one is kept
            $this->set_article($arts[0]);
        }
    }

    $this->update_status();
    return $txt;
}
/**
 * Work out the state of an article submission for the given journo.
 *
 * Reads 'url' and 'prev_url' from the request (unless $blank), finds the
 * article in the database or scrapes it, then checks that the article is
 * attributed to $journo.
 *
 * On return $this->state is one of:
 *   'initial'         - $blank was set, or neither url nor prev_url was given
 *   'bad_url'         - is_sane_article_url() returned a message, which is
 *                       stored in $this->errs['url']
 *   'scrape_failed'   - the scraper returned non-zero or yielded no articles
 *   'journo_mismatch' - the article exists but is not attributed to $journo
 *   'done'            - the article exists and is attributed to $journo
 *
 * For the last two states $this->article holds the article row plus a
 * 'journos' entry listing the attributed journos.
 *
 * @param array $journo journo record; its 'ref' and 'id' entries are read
 * @param bool  $blank  when TRUE, request variables are not read and the
 *                      object starts in the 'initial' state
 */
function __construct($journo, $blank = FALSE) {
    $this->journo = $journo;

    if (!$blank) {
        $this->url = get_http_var('url', '');
        if ($this->url) {
            $this->url = clean_url($this->url);
        }
        // so we can detect if url is changed
        $this->prev_url = get_http_var('prev_url', '');
    }

    // Short-circuit matters: when $blank is set, url/prev_url were never read.
    if ($blank || (!$this->url && !$this->prev_url)) {
        $this->state = 'initial';
        return;
    }

    $msg = is_sane_article_url($this->url);
    if (!is_null($msg)) {
        $this->errs['url'] = $msg;
        $this->state = 'bad_url';
        return;
    }

    // article already in DB?
    $art_id = article_find($this->url);
    if (is_null($art_id)) {
        // nope - try and scrape it
        list($ret, $txt) = scrape_ScrapeURL($this->url, $this->journo['ref']);

        // Output is only parsed when the scraper returned 0; a non-zero
        // return and an empty result are reported identically.
        $arts = ($ret == 0) ? scrape_ParseOutput($txt) : array();
        if (sizeof($arts) < 1) {
            $this->errs['error_message'] = "Journa<i>listed</i> had problems reading this article";
            $this->state = 'scrape_failed';
            $this->_register_error();
            return;
        }

        // Parsed entries are read elsewhere in this file as records keyed by
        // 'id' (see view()), so take the id out of the record rather than
        // binding the whole record to the id=? placeholders below. A bare id
        // is still accepted as-is.
        $first = $arts[0];
        $art_id = is_array($first) ? $first['id'] : $first;
    }

    // if we get this far, $art_id will be set
    // fetch some basic details about the article
    $art = db_getRow("SELECT id,title,permalink,pubdate,srcorg FROM article WHERE id=?", $art_id);

    $sql = <<<EOT
SELECT j.id,j.prettyname,j.ref
    FROM (journo j INNER JOIN journo_attr attr ON attr.journo_id=j.id)
    WHERE attr.article_id=?
EOT;
    $journos = db_getAll($sql, $art_id);
    $art['journos'] = $journos;
    $this->article = $art;

    // attributed to the expected journo?
    $got_expected_journo = FALSE;
    foreach ($journos as $j) {
        if ($j['id'] == $this->journo['id']) {
            $got_expected_journo = TRUE;
            break;
        }
    }

    if ($got_expected_journo) {
        // all is well.
        $this->state = 'done';
        return;
    }

    // Article is known, but the expected journo is not among its bylines.
    $this->errs['error_message'] = "Journa<i>listed</i> had trouble reading the byline";
    $this->state = 'journo_mismatch';
    $this->_register_error();
}