Exemple #1
0
/**
 * Resolve the submitted article URL to an article and render the result.
 *
 * The URL is taken from an ArticleURLsForm built from $_GET. If the form
 * validates, the article is looked up with article_find(); when it is not
 * found, the URL is passed to the scraper and the scraper output is parsed.
 *
 * The template receives:
 *   - urls_form:   the form object
 *   - return_code: scraper return code, or null if the scraper was not run
 *   - raw_output:  raw scraper output, or '' if the scraper was not run
 *   - summary:     'FAILED' (form not valid), 'ALREADY IN DATABASE',
 *                  'ARTICLE SCRAPED' or 'NOT SCRAPED'
 *   - art:         result of fetch_art() for the article, or null
 */
function view()
{
    $art = null;
    $return_code = null;
    $raw_output = '';
    $summary = 'FAILED';
    $urls_form = new ArticleURLsForm($_GET);

    if ($urls_form->is_valid()) {
        $url = $urls_form->cleaned_data['url'];

        // already got it?
        $art_id = article_find($url);
        if (!is_null($art_id)) {
            $summary = 'ALREADY IN DATABASE';
        } else {
            // Not known yet: run the scraper. The output is parsed whatever
            // the return code was; the code is only reported to the template.
            list($return_code, $raw_output) = scrape_ScrapeURL($url);
            $scraped = scrape_ParseOutput($raw_output);
            if (count($scraped) > 0) {
                // Only the first parsed article is used.
                $art_id = $scraped[0]['id'];
                $summary = 'ARTICLE SCRAPED';
            } else {
                $summary = 'NOT SCRAPED';
            }
        }

        if (!is_null($art_id)) {
            $art = fetch_art($art_id);
        }
    }

    template(array(
        'urls_form' => $urls_form,
        'return_code' => $return_code,
        'raw_output' => $raw_output,
        'summary' => $summary,
        'art' => $art,
    ));
}
 /**
  * Run the scraper on $this->url and record the first article it yields.
  *
  * When an expected journo is attached, its ref is passed to the scraper.
  * If the scraper returns 0 and at least one article is parsed from its
  * output, the first one is handed to set_article(). update_status() is
  * called in every case.
  *
  * @return mixed the raw scraper output, as returned by scrape_ScrapeURL()
  */
 public function scrape()
 {
     // Only pass a journo ref along when an expected journo is attached.
     $journo_ref = is_null($this->expected_journo) ? null : $this->expected_journo->ref;

     list($exit_code, $output) = scrape_ScrapeURL($this->url, $journo_ref);

     // The output is parsed only when the scraper returned 0.
     $articles = ($exit_code == 0) ? scrape_ParseOutput($output) : array();
     if (count($articles) > 0) {
         // scraped at least one article; only the first is kept
         $this->set_article($articles[0]);
     }

     $this->update_status();
     return $output;
 }
Exemple #3
0
    /**
     * Read the submitted article URL from the request and try to resolve it
     * to an article attributed to the given journo.
     *
     * On return $this->state is one of:
     *   - 'initial':         $blank was set, or neither url nor prev_url was submitted
     *   - 'bad_url':         is_sane_article_url() rejected the URL ($this->errs['url'] set)
     *   - 'scrape_failed':   the scraper returned non-zero or yielded no article
     *   - 'journo_mismatch': the article was found but is not attributed to $journo
     *   - 'done':            the article was found and is attributed to $journo
     *
     * For the last two states $this->article holds the article row plus a
     * 'journos' list of the journos attributed to it.
     *
     * @param array $journo journo record; the 'id' and 'ref' keys are read here
     * @param bool  $blank  when true, do not read the request at all
     */
    function __construct($journo, $blank = FALSE)
    {
        $this->journo = $journo;
        if (!$blank) {
            $this->url = get_http_var('url', '');
            if ($this->url) {
                $this->url = clean_url($this->url);
            }
            // so we can detect if url is changed
            $this->prev_url = get_http_var('prev_url', '');
        }
        // Explicit grouping: && binds tighter than ||, so this is what the
        // unparenthesised form always meant. $blank short-circuits first, so
        // the url properties are not read when the request was skipped.
        if ($blank || (!$this->url && !$this->prev_url)) {
            $this->state = 'initial';
            return;
        }
        $msg = is_sane_article_url($this->url);
        if (!is_null($msg)) {
            $this->errs['url'] = $msg;
            $this->state = 'bad_url';
            return;
        }
        // article already in DB?
        $art_id = article_find($this->url);
        if (is_null($art_id)) {
            // nope - try and scrape it
            list($ret, $txt) = scrape_ScrapeURL($this->url, $this->journo['ref']);
            // Parse the output only when the scraper returned 0; a non-zero
            // return code and an empty parse result are reported identically.
            $arts = ($ret == 0) ? scrape_ParseOutput($txt) : array();
            if (count($arts) < 1) {
                $this->errs['error_message'] = "Journa<i>listed</i> had problems reading this article";
                $this->state = 'scrape_failed';
                $this->_register_error();
                return;
            }
            // The queries below need a scalar article id. Accept either an
            // article record (array with an 'id' key) or a bare id as the
            // parsed entry, so a whole record is never bound as the id.
            // NOTE(review): confirm which shape scrape_ParseOutput() returns.
            $first = $arts[0];
            $art_id = is_array($first) ? $first['id'] : $first;
        }
        // if we get this far, $art_id will be set
        // fetch some basic details about the article
        $art = db_getRow("SELECT id,title,permalink,pubdate,srcorg FROM article WHERE id=?", $art_id);
        $sql = <<<EOT
            SELECT j.id,j.prettyname,j.ref
                FROM (journo j INNER JOIN journo_attr attr ON attr.journo_id=j.id)
                WHERE attr.article_id=?
EOT;
        $journos = db_getAll($sql, $art_id);
        $art['journos'] = $journos;
        $this->article = $art;
        // attributed to the expected journo?
        $got_expected_journo = FALSE;
        foreach ($journos as $j) {
            if ($j['id'] == $this->journo['id']) {
                $got_expected_journo = TRUE;
                break;
            }
        }
        if ($got_expected_journo) {
            // all is well.
            $this->state = 'done';
            return;
        }
        // Article exists but is not attributed to this journo. No
        // error_message is set for this state; the error is only registered.
        $this->state = 'journo_mismatch';
        $this->_register_error();
    }