/**
 * Returns the element that is $index siblings after $HTML_Anchor.
 *
 * The anchor only has to expose a next_sibling() method (as the DOM nodes
 * used by the callers in this file do). Index 1 is the immediate next
 * sibling, 2 the one after that, and so on.
 *
 * Unlike the former recursive version, this one never calls next_sibling()
 * on a non-object: if the anchor is not an object, or the sibling chain ends
 * before $index steps were taken, null is returned instead of raising a
 * fatal "call to a member function on null/string" error.
 *
 * @param object|null $HTML_Anchor Node to start from.
 * @param int         $index       1-based sibling offset.
 *
 * @return mixed The sibling node, or null when $index is below 1 (or not a
 *               whole number) or fewer than $index siblings exist.
 */
function getNextSibling($HTML_Anchor, $index)
{
    $node = $HTML_Anchor;

    // Take all steps but the last one; bail out as soon as the chain ends.
    for ($remaining = $index; $remaining > 1; $remaining--) {
        if (!is_object($node)) {
            return null;
        }
        $node = $node->next_sibling();
    }

    // $remaining is exactly 1 only for a valid, whole-numbered index >= 1.
    if ($remaining != 1 || !is_object($node)) {
        return null;
    }

    return $node->next_sibling();
}
/**
 * Splits the HTML of an AfD discussion into its parts and stores them on $this->afd.
 *
 * For every `div.afd` element found in $this->givenHTML:
 *  1. The first <p> is taken as the closing result. Following siblings are
 *     scanned (at most up to index 8) for the block whose text contains
 *     "talk", "watch" and "views"; that block is kept as plainlinks_Html.
 *  2. The result HTML is cut after its first "(UTC)"; whatever follows is
 *     kept as endResult_ExtraNote when it is at least 6 characters long.
 *  3. The main comment is made of the siblings after the tool-link block,
 *     up to and including the first sibling that contains "(UTC)". If that
 *     sibling contains several "(UTC)" markers, only the part up to the
 *     first marker belongs to the main comment; the part up to the last
 *     marker is prepended to the other comments and flag_error is "check".
 *  4. The other comments are the following siblings up to (excluding) the
 *     one containing "No further edits should be made to this page"; parts
 *     returned by find_innder_dd() for that last sibling are appended until
 *     the returned part contains the notice.
 *  5. The pieces are written to $this->afd and
 *     updateAFD_withoutAFDHTML_byAFDID() is called.
 *
 * Progress is appended to $GLOBALS['log'], which is echoed and cleared at the
 * end. Exceptions are caught and echoed.
 *
 * Fixes compared with the previous implementation:
 *  - ereg() (removed in PHP 7.0) is replaced by strpos()/preg_match().
 *  - The main comment is assembled in document order; formerly the
 *    accumulator was overwritten by each new sibling and siblings 2..N-1
 *    were appended afterwards, which dropped sibling 1 and scrambled the
 *    order whenever the comment spanned several siblings.
 *  - The other-comment loop was guarded by `$sibling_index == 30`, a value
 *    that loop never changes; running out of siblings now raises an
 *    Exception instead of looping/crashing.
 *  - A missing result paragraph or tool-link block raises an Exception that
 *    the catch below reports, instead of a fatal error on a non-object.
 *  - The find_innder_dd() loop stops when the helper returns nothing, since
 *    removing an empty part can never make progress.
 *
 * @return void
 */
function parseAFDContent()
{
    $GLOBALS['log'] .= "<br/> <span class='startCall'> ****************** Call ParseAFD->parseAFDContent() <a target='_blank' <a href='GetAFDListbyDebateDateListID.php?DebateDateListID=" . $this->afd->debateDateListID . "#" . $this->afd->AFDTitleID . "'>" . $this->afd->AFDTitle . "</a> </span>";

    $utcPattern = "/\\((UTC)\\)/";
    $utcLength = 5; // strlen("(UTC)"): offset of a match + 5 is the end of the marker
    $closingNotice = "No further edits should be made to this page";

    // Plain text of a node, or '' when the sibling lookup produced no node.
    $plainTextOf = function ($node) {
        return is_object($node) ? (string) $node->plaintext : '';
    };
    // True when the node text mentions all three tool links.
    $isToolLinkBlock = function ($node) use ($plainTextOf) {
        $text = $plainTextOf($node);
        return strpos($text, "talk") !== false
            && strpos($text, "watch") !== false
            && strpos($text, "views") !== false;
    };
    // True when the tag-stripped HTML contains the closing notice.
    $hasClosingNotice = function ($html) use ($closingNotice) {
        return strpos(trim(strip_tags((string) $html)), $closingNotice) !== false;
    };

    try {
        $dom = new simple_html_dom();
        $dom->load($this->givenHTML, false);

        //main section
        foreach ($dom->find('div.afd') as $div_afd) {
            //get the result of discussion
            $resultParagraph = $div_afd->find("p", 0);
            if (!is_object($resultParagraph)) {
                throw new Exception('No result paragraph found inside div.afd');
            }
            $flag_error = "";
            $result_2 = (strpos($plainTextOf($resultParagraph), "UTC)") !== false) ? 1 : 0;

            // Look for the talk/watch/views block among the following siblings.
            $container_nourlexpansion = "";
            $container_sibiling_index = 1;
            $candidate = getNextSibling($resultParagraph, $container_sibiling_index); //1
            while (!$isToolLinkBlock($candidate)) {
                // Once a lookup yields no node there are no further siblings to try.
                $exhausted = !is_object($candidate);
                if (!$exhausted) {
                    ++$container_sibiling_index;
                    $candidate = getNextSibling($resultParagraph, $container_sibiling_index); //2..n
                }
                if ($exhausted || $container_sibiling_index > 7) {
                    $GLOBALS['log'] .= "<br/><span class='bad'>Error in MainParsing (container_sibiling_index={$container_sibiling_index})</span>";
                    break;
                }
            }
            if ($isToolLinkBlock($candidate)) {
                $container_nourlexpansion = $candidate;
            }

            //Add extraNote to endResult as well as fix the bug of endResult
            $endResult_Html = (string) $resultParagraph;
            for ($endresult_i = 1; $endresult_i < $container_sibiling_index - 1; $endresult_i++) {
                $endResult_Html .= getNextSibling($resultParagraph, $endresult_i);
            }
            $endResult_ExtraNote = "";
            if (preg_match($utcPattern, $endResult_Html, $firstUtc, PREG_OFFSET_CAPTURE)) {
                $result_2 = 1;
                $cut = $firstUtc[0][1] + $utcLength;
                $endResult_ExtraNote = substr($endResult_Html, $cut);
                $endResult_Html = substr($endResult_Html, 0, $cut);
            }
            // such as . or '' might be in the end that would be removed.
            if (strlen(trim($endResult_ExtraNote)) < 6) {
                $endResult_ExtraNote = "";
            }

            $afdLinks = " debateDateListID=<a target='_blank' href='getHtmlByID.php?id=" . $this->afd->debateDateListID . "'>" . $this->afd->debateDateListID . "</a>, AFDID=<a target='_blank' href='getAFDHtmlByID.php?id=" . $this->afd->AFDID . "'>" . $this->afd->AFDID . "</a>";
            if ($result_2 == 0) {
                $this->parsed_resultError++;
                $GLOBALS['log'] .= "<div class='bad'> Result failed. (parsedError = True)</div>" . $afdLinks;
                $parse_e_result_s = 0;
                $parse_e_result_e = 0;
            } else {
                $GLOBALS['log'] .= "<div class='good'> Result Matched. </div>" . $afdLinks;
                $parse_e_result_s = 1;
                $parse_e_result_e = 1;
            }

            //check end result now
            $GLOBALS['log'] .= "<div class='percentage'>After Result Sibiling index: {$container_sibiling_index} </div>";
            $GLOBALS['log'] .= "<div class='percentage'> endResult_Html:</div> {$endResult_Html}";
            $GLOBALS['log'] .= "<div class='percentage'> endResult_ExtraNote:</div> {$endResult_ExtraNote}";

            // Everything below is located relative to the tool-link block.
            if (!is_object($container_nourlexpansion)) {
                throw new Exception('Tool-link block (talk/watch/views) not found after the result paragraph');
            }

            //Main Comment Section
            $mainComment_Html = "";
            $otherComment_Html_pre = "";
            $sibling_index = 0;
            do {
                ++$sibling_index;
                $sibling = getNextSibling($container_nourlexpansion, $sibling_index);
                if (!is_object($sibling)) {
                    throw new Exception("No sibling containing (UTC) found for the main comment (sibling_index={$sibling_index})");
                }
                $siblingHtml = (string) $sibling;
                $utcCount = preg_match_all($utcPattern, $siblingHtml, $utcMatches, PREG_OFFSET_CAPTURE);

                if ($utcCount > 1) {
                    // Several signatures in one sibling: the main comment ends at the
                    // first marker, the text up to the last marker goes to the other
                    // comments (text after the last marker is discarded, as before).
                    $firstEnd = $utcMatches[0][0][1] + $utcLength;
                    $lastEnd = $utcMatches[0][$utcCount - 1][1] + $utcLength;
                    $mainComment_Html .= substr($siblingHtml, 0, $firstEnd);
                    $otherComment_Html_pre .= substr($siblingHtml, $firstEnd, $lastEnd - $firstEnd);
                    $flag_error = 'check';
                } else {
                    $mainComment_Html .= $siblingHtml;
                }

                // Same limit as before: the marker has to show up before sibling 10.
                if ($sibling_index == 10) {
                    throw new Exception('There is record in DB!');
                }
            } while ($utcCount == 0);

            if ($sibling_index > 1) {
                echo "<div class='newFunction'> from 2 to {$sibling_index} for (sibling_index) </div>";
            }

            // No extra note is extracted for the main comment; the field is stored empty.
            $mainComment_ExtraNote = "";

            //other Comment Section
            $otherComment_Html = "";
            $extra_i = 0;
            $sibling_index_otherComment = $sibling_index + 1;
            $firstOther = getNextSibling($container_nourlexpansion, $sibling_index_otherComment); //1
            if (!is_object($firstOther)) {
                throw new Exception("Closing notice not found: no sibling at index {$sibling_index_otherComment}");
            }

            $closingIsFirst = $hasClosingNotice($firstOther);
            if (!$closingIsFirst) {
                // other comment is not empty
                $otherComment_Html .= $firstOther;
                while (true) {
                    $otherComment_Html_next = getNextSibling($container_nourlexpansion, $sibling_index_otherComment + 1); //2
                    if (!is_object($otherComment_Html_next)) {
                        throw new Exception("Closing notice not found: siblings ended after index {$sibling_index_otherComment}");
                    }
                    if ($hasClosingNotice($otherComment_Html_next)) {
                        break;
                    }
                    ++$sibling_index_otherComment;
                    $otherComment_Html .= $otherComment_Html_next;
                }
            } else {
                // The very first sibling already carries the notice.
                $otherComment_Html_next = (string) $firstOther;
            }

            //add extra note before the very end of other comment ( before "No further edit should be made to this page")
            $otherComment_Html_next_dd = find_innder_dd($otherComment_Html_next);
            while (!$hasClosingNotice($otherComment_Html_next_dd)) {
                // An empty part cannot be removed from the remainder, so the
                // helper would keep returning it forever.
                if ($otherComment_Html_next_dd === null || $otherComment_Html_next_dd === false || $otherComment_Html_next_dd === '') {
                    break;
                }
                $extra_i++;
                $otherComment_Html .= $otherComment_Html_next_dd;
                $otherComment_Html_next = str_replace($otherComment_Html_next_dd, "", $otherComment_Html_next);
                $otherComment_Html_next_dd = find_innder_dd($otherComment_Html_next);
            }

            // Empty only when the notice came first and no inner part was added.
            $flag_otherComment_empty = ($closingIsFirst && $extra_i == 0) ? 1 : 0;
            $otherComment_Html = $otherComment_Html_pre . $otherComment_Html;

            $GLOBALS['log'] .= "<div class='percentage'> mainComment_Html(sibling_index={$sibling_index}):</div>{$mainComment_Html}";
            $GLOBALS['log'] .= "<div class='percentage'> mainComment_ExtraNote:</div>{$mainComment_ExtraNote}";
            $GLOBALS['log'] .= str_replace("background-color: #F3F9FF;", " ", $this->givenHTML);
            $GLOBALS['log'] .= "<div class='percentage'>extra_i:{$extra_i} for AFDID:" . $this->givenAFDID . "</div>, <a target='_blank' href='https://en.wikipedia.org{$this->afdURL}'>https://en.wikipedia.org{$this->afdURL}</a>";
            $GLOBALS['log'] .= "<div class='percentage'>otherComment_Html:</div><br/>(flag_otherComment_empty={$flag_otherComment_empty})<br/> {$otherComment_Html} <hr/>";
            $GLOBALS['log'] .= "<div class='percentage'> otherComment_Html_next:</div><br/> {$otherComment_Html_next} ";

            //storeInformation
            $this->afd->flag_error = $flag_error;
            $this->afd->flag_otherComment_empty = $flag_otherComment_empty;
            $this->afd->endResult_Html = $endResult_Html;
            $this->afd->endResult_ExtraNote = $endResult_ExtraNote;
            $this->afd->mainComment_Html = $mainComment_Html;
            $this->afd->mainComment_ExtraNote = $mainComment_ExtraNote;
            $this->afd->otherComment_Html = $otherComment_Html;
            $this->afd->plainlinks_Html = $container_nourlexpansion;
            $this->afd->parse_e_result_s = $parse_e_result_s;
            $this->afd->parse_e_result_e = $parse_e_result_e;
            $this->afd->updateAFD_withoutAFDHTML_byAFDID();
        }
    } catch (Exception $e) {
        echo '<br/>Caught exception: ', $e->getMessage(), "\n";
    }

    $GLOBALS['log'] .= "<br/><span class='endCall'>**** End Called ParseAFD->parseAFDContent()*******************</span>";
    echo $GLOBALS['log'];
    $GLOBALS['log'] = "";
    flush();
}
/**
 * Extracts every AfD entry from a debate-date page and registers it.
 *
 * For each `div.afd` element in $this->givenHTML the method
 *  - checks whether the text of the first <p> starts with "The" and ends
 *    with "UTC)" (each failed check increments $this->parsedError),
 *  - scans the following siblings (at most up to index 8) for the block
 *    whose text contains "talk", "watch" and "views",
 *  - appends siblings 2 .. (block index - 1) to the result HTML,
 *  - reads title id, article URL, title, link class and AfD URL from the
 *    `span.nourlexpansion` element inside that block,
 *  - builds an AFD object, passes it to debateDate->addNewAFDByTitle() and
 *    increments $this->totalParsed.
 * The log collected in $GLOBALS['log'] is echoed and cleared after every
 * entry. Finally debateDate->updateTotalAFDContent() receives the number of
 * `div.afd` elements seen. Exceptions are caught and echoed.
 *
 * Fixes compared with the previous implementation:
 *  - ereg() (removed in PHP 7.0) is replaced by preg_match()/strpos().
 *  - The progress figure is a real percentage (it used to print the plain
 *    ratio followed by "%") and no longer divides by zero.
 *  - Entries without a result paragraph, tool-link block, title span or
 *    article link are logged and skipped instead of causing a fatal error
 *    through a method call on a non-object.
 *  - The never-read $flag_error bookkeeping was removed.
 *
 * @param int $totalCalculated Expected number of entries; only used for the
 *                             progress figure in the log.
 *
 * @return void
 */
function parseContent($totalCalculated)
{
    $GLOBALS['log'] .= "<br/> <span class='startCall'> ****************** Call ParseDebateDate->parseContent() <a target='_blank' <a href='getHtmlByID.php?id=" . $this->debateDate->crawlerID . "'>" . $this->debateDate->url . "</a> </span>";

    // Plain text of a node, or '' when the sibling lookup produced no node.
    $plainTextOf = function ($node) {
        return is_object($node) ? (string) $node->plaintext : '';
    };
    // True when the node text mentions all three tool links.
    $isToolLinkBlock = function ($node) use ($plainTextOf) {
        $text = $plainTextOf($node);
        return strpos($text, "talk") !== false
            && strpos($text, "watch") !== false
            && strpos($text, "views") !== false;
    };
    // Emits and resets the log collected so far.
    $flushLog = function () {
        echo $GLOBALS['log'];
        $GLOBALS['log'] = "";
        flush();
    };

    try {
        $dom = new simple_html_dom();
        $dom->load($this->givenHTML, false);
        $i = 0;

        //main section
        foreach ($dom->find('div.afd') as $div_afd) {
            $i++;

            //get the result of discussion
            $resultParagraph = $div_afd->find("p", 0);
            $endResult_Html = $resultParagraph;

            $percentDone = ($totalCalculated > 0) ? round($i / $totalCalculated * 100, 2) : 0;
            $GLOBALS['log'] .= "<hr/>{$i} out of {$totalCalculated} (<span class='good'>" . $percentDone . "%</span>) For ";
            $GLOBALS['log'] .= "<a target='_blank' <a href='getHtmlByID.php?id=" . $this->debateDate->crawlerID . "'>" . $this->debateDate->url . "</a>";
            $GLOBALS['log'] .= ", crawlerID =" . $this->debateDate->crawlerID . ", and debateDateListID =" . $this->debateDate->debateDateListID . "<br/> {$endResult_Html}";

            if (!is_object($resultParagraph)) {
                $GLOBALS['log'] .= "<br/><span class='bad'>{$i}. No result paragraph found; entry skipped.</span>";
                $flushLog();
                continue;
            }

            // \z keeps the end-of-string meaning the former POSIX "$" had.
            $resultText = $plainTextOf($resultParagraph);
            $startMatches = preg_match('/^The/', $resultText) === 1;
            $endMatches = preg_match('/UTC\)\z/', $resultText) === 1;

            if ($startMatches) {
                $GLOBALS['log'] .= "<br/><span class='good'>{$i}. Result Matched.</span>";
                $parse_e_result_s = 1;
            } else {
                $GLOBALS['log'] .= "<br/><span class='bad'>{$i}. Result did Not match. (parsedError = {$this->parsedError})</span>";
                $this->parsedError++;
                $parse_e_result_s = 0;
            }
            if ($endMatches) {
                $GLOBALS['log'] .= "<br/><span class='good'>{$i}. Result_end Matched.</span>";
                $parse_e_result_e = 1;
            } else {
                $GLOBALS['log'] .= "<br/><span class='bad'>{$i}. Result_end failed. (parsedError = {$this->parsedError})</span>";
                $this->parsedError++;
                $parse_e_result_e = 0;
            }

            // Look for the talk/watch/views block among the following siblings.
            $container_nourlexpansion = "";
            $container_sibiling_index = 1;
            $candidate = getNextSibling($resultParagraph, $container_sibiling_index); //1
            while (!$isToolLinkBlock($candidate)) {
                // Once a lookup yields no node there are no further siblings to try.
                $exhausted = !is_object($candidate);
                if (!$exhausted) {
                    ++$container_sibiling_index;
                    $candidate = getNextSibling($resultParagraph, $container_sibiling_index); //2..n
                }
                if ($exhausted || $container_sibiling_index > 7) {
                    $GLOBALS['log'] .= "<br/><span class='bad'>Error in MainParsing (container_sibiling_index={$container_sibiling_index})</span>";
                    break;
                }
            }
            if ($isToolLinkBlock($candidate)) {
                $container_nourlexpansion = $candidate; //1..n
                $GLOBALS['log'] .= "<div class='percentage'>After Result Sibiling index: {$container_sibiling_index} </div>";
            }

            for ($result_i = 2; $result_i < $container_sibiling_index; $result_i++) {
                $endResult_Html .= getNextSibling($resultParagraph, $result_i);
            }

            // The title span and its link are required for everything that follows.
            $titleSpan = is_object($container_nourlexpansion) ? $container_nourlexpansion->find('span.nourlexpansion', 0) : null;
            $articleLink = is_object($titleSpan) ? $titleSpan->find('a', 0) : null;
            if (!is_object($articleLink)) {
                $GLOBALS['log'] .= "<br/><span class='bad'>{$i}. Title link (span.nourlexpansion) not found; entry skipped.</span>";
                $flushLog();
                continue;
            }

            $titleAnchor = $titleSpan->prev_sibling();
            $AFDTitleID = is_object($titleAnchor) ? $titleAnchor->id : null;
            $articleURL = $articleLink->href;
            $AFDTitle = $articleLink->plaintext;

            // A class on the article link is recorded as the deleted-article flag.
            if (!empty($articleLink->class)) {
                $flag_deletedArticle = $articleLink->class;
                $GLOBALS['log'] .= "{$flag_deletedArticle}<br/>";
                $flag_articleURL_Working = 0;
            } else {
                $flag_articleURL_Working = 1;
                $flag_deletedArticle = "";
            }

            $afdLink = $titleSpan->next_sibling();
            $AFDURL = is_object($afdLink) ? $afdLink->href : null;

            //storeInformation
            $afd = new AFD($AFDTitle, $this->debateDate->debateDateListID, $this->debateDate->conn);
            $afd->AFDTitleID = $AFDTitleID;
            $afd->articleURL = $articleURL;
            $afd->flag_deletedArticle = $flag_deletedArticle;
            $afd->flag_articleURL_Working = $flag_articleURL_Working;
            $afd->AFDURL = $AFDURL;
            $afd->AFDHTML = $div_afd;
            $afd->endResult_Html = $endResult_Html;
            $afd->parse_e_result_s = $parse_e_result_s;
            $afd->parse_e_result_e = $parse_e_result_e;
            $this->debateDate->addNewAFDByTitle($afd);
            $this->totalParsed++;

            $GLOBALS['log'] .= "\"<span class='percentage'>{$AFDTitle}</span>\" crawlerID=" . $this->debateDate->crawlerID . ", afd->debateDateListID={$afd->debateDateListID} , afd->AFDID= <a target='_blank' href='getAFDHtmlByID.php?id=" . $afd->AFDID . "'>" . $afd->AFDID . "</a> <br/>";
            $GLOBALS['log'] .= "<br/>" . $div_afd->plaintext;
            $flushLog();
        }

        $this->debateDate->updateTotalAFDContent($i);
    } catch (Exception $e) {
        echo '<br/>Caught exception: ', $e->getMessage(), "\n";
    }
}