/**
 * Parses one raw search-result HTML page into a list of result entries.
 *
 * Every <li> element is converted with dom2array_full(). Elements whose
 * converted array has fewer than two entries, or whose 'class' is not 'g',
 * are skipped. Each remaining element yields one result array with the keys:
 *   - 'title': text content of the h3 > a element (null when absent)
 *   - 'url'  : the link href starting at its first "http" (false when the
 *              href is missing or contains no "http")
 *   - 'host' : host component of 'url' (null when it cannot be determined)
 *   - 'desc' : tag-stripped text following the "<span class='st'>" marker
 *              in the content extracted by getContent() ('' when absent)
 *
 * Side effects: sets the global $process_result to "PROCESS_SUCCESS_MORE"
 * when a further result page is detected and to "PROCESS_SUCCESS_LAST"
 * otherwise; emits progress markers through verbose() and echoes "!" for
 * links containing "interstitial".
 *
 * @param string $htmdata Raw HTML of the result page.
 * @param int    $page    Index of the page being parsed; the next page is
 *                        looked up as "start=" . (($page + 1) * page size).
 *
 * @return array List of result arrays as described above.
 */
function process_raw($htmdata, $page)
{
    global $process_result; // receives the outcome status of this call
    global $test_100_resultpage;
    global $NL;

    $html = (string) $htmdata;

    $dom = new DOMDocument();
    $dom->strictErrorChecking = false;
    $dom->preserveWhiteSpace = true;
    if ($html !== '') {
        // Real-world markup is rarely valid; collect libxml diagnostics
        // internally instead of letting them surface as PHP warnings.
        // Empty input is skipped because loadHTML() rejects it on PHP 8.
        $previousSetting = libxml_use_internal_errors(true);
        $dom->loadHTML($html);
        libxml_clear_errors();
        libxml_use_internal_errors($previousSetting);
    }

    // The description is cut out of the raw markup by string search rather
    // than via DOM because the surrounding layout changes frequently.
    $descMarker = "<span class='st'>";

    $results = array();
    foreach ($dom->getElementsByTagName('li') as $list) {
        $ar = dom2array_full($list);
        if (!is_array($ar) || count($ar) < 2) {
            // skipping advertisements
            verbose("s");
            continue;
        }
        if (!isset($ar['class']) || $ar['class'] != 'g') {
            // skipping non-search result entries
            verbose("x");
            continue;
        }

        // The second <div> below the list item is handed to getContent().
        $div = $list->getElementsByTagName('div')->item(1);
        $cont = null;
        getContent($cont, $div);

        // Read the link data defensively: any level may be missing.
        $anchor = array();
        if (isset($ar['h3']['a']) && is_array($ar['h3']['a'])) {
            $anchor = $ar['h3']['a'];
        }
        $title = isset($anchor['textContent']) ? $anchor['textContent'] : null;
        $href = isset($anchor['@attributes']['href']) ? (string) $anchor['@attributes']['href'] : '';

        // Drop anything in front of the actual target URL.
        $url = strstr($href, "http");
        if (strpos($href, "interstitial") !== false) {
            echo "!";
        }

        $host = null;
        if ($url !== false) {
            $urlParts = parse_url($url);
            if (is_array($urlParts) && isset($urlParts['host'])) {
                $host = $urlParts['host'];
            }
        }

        $desc = '';
        $descStart = is_string($cont) ? strstr($cont, $descMarker) : false;
        if ($descStart !== false) {
            $desc = strip_tags((string) substr($descStart, strlen($descMarker)));
        }

        // 2012-2013 addon: entries that carry a table and have no usable
        // title are reported as embedded objects.
        if (isset($ar['table']) && strlen((string) $title) < 2) {
            $title = "embedded object";
            $url = "embedded object";
        }

        verbose("r");
        flush();
        $results[] = array(
            'title' => $title,
            'url' => $url,
            'host' => $host,
            'desc' => $desc,
        );
    }
    verbose(" !{$NL}");

    // Analyze if more results are available (next page): either an explicit
    // "Next" link or a link pointing at the following result offset.
    $next = strpos($html, "Next</a>") !== false;
    if (!$next) {
        $pageSize = $test_100_resultpage ? 100 : 10;
        $needstart = ($page + 1) * $pageSize;
        $next = strpos($html, "start={$needstart}") !== false;
    }

    $process_result = $next ? "PROCESS_SUCCESS_MORE" : "PROCESS_SUCCESS_LAST";

    return $results;
}
 $dom = new domDocument();
 $dom->strictErrorChecking = false;
 $dom->preserveWhiteSpace = true;
 @$dom->loadHTML($htmdata);
 $lists = $dom->getElementsByTagName('li');
 $num = 0;
 foreach ($lists as $list) {
     unset($ar);
     unset($divs);
     unset($div);
     unset($cont);
     unset($result);
     unset($tmp);
     $result['main_keyword'] = $main_keyword;
     $result['sub_keyword'] = $keyword;
     $ar = dom2array_full($list);
     if (count($ar) < 2) {
         echo "S";
         continue;
         // skipping advertisement and similar spam
     }
     if (!isset($ar['class']) || $ar['class'] != 'g') {
         echo "?";
         continue;
         // skipping non-search results
     }
     // adaption to new google layout
     //if ($num==2)var_dump($ar);
     //if ($num==3)var_dump($ar);
     if (isset($ar['div'][1])) {
         $ar['div'] =& $ar['div'][0];
Beispiel #3
0
 /**
  * Loads the 'action_content_detail' object referenced by
  * $this->mInfo['action_content_detail_key'] and copies its fields into
  * $this->mInfo.
  *
  * On a validated response the following keys are written:
  *   - 'recommended_subject' / 'recommended_content': the '#text' of the
  *     matching response field, or "" when the field is empty
  *   - 'subject_editable' / 'content_editable': false only when the matching
  *     'Fixed_*' field is present with the text 'true'
  *
  * When the object is not valid or has no detail key, an error is stored in
  * $this->mErrors['diasalsa_load'] and nothing is requested. When the
  * response fails validateXMLRequest(), $this->mInfo is left untouched.
  */
 function loadSalsaActionContentDetail()
 {
     // Nothing can be loaded without a valid object and a detail key.
     if (!$this->isValid() || empty($this->mInfo['action_content_detail_key'])) {
         $this->mErrors['diasalsa_load'] = tra('Error attemped to load data from DIA for an invalid Action.');
         return;
     }

     $serviceUrl = $this->getServiceUrl('load');
     $request = array(
         'object' => 'action_content_detail',
         'key' => $this->mInfo['action_content_detail_key'],
     );
     $response = $this->curlExec($serviceUrl, $request);
     $document = $this->string2XML($response);

     if (!$this->validateXMLRequest($document)) {
         return;
     }

     // Only the first <item> of the response is evaluated.
     $fields = dom2array_full($document->getElementsByTagName('item')->item(0));

     if (!empty($fields['Recommended_Subject'])) {
         $this->mInfo['recommended_subject'] = $fields['Recommended_Subject']['#text'];
     } else {
         $this->mInfo['recommended_subject'] = "";
     }

     if (!empty($fields['Recommended_Content'])) {
         $this->mInfo['recommended_content'] = $fields['Recommended_Content']['#text'];
     } else {
         $this->mInfo['recommended_content'] = "";
     }

     // The response reports "fixed" flags; they are stored inverted as
     // "editable" flags because affirming checkboxes are easier to understand
     // than negating ones.
     $this->mInfo['subject_editable'] = empty($fields['Fixed_Subject']) || $fields['Fixed_Subject']['#text'] != 'true';
     $this->mInfo['content_editable'] = empty($fields['Fixed_Content']) || $fields['Fixed_Content']['#text'] != 'true';
 }