Example #1
0
 /**
  * Synchronise the Index table with the rows collected in BuilderIndex and
  * store a per-category list of affected URLs in $this->report.
  *
  * Rows of both tables are matched by their `url`:
  *  - URL in both, same content      -> listed under 'unchanged'
  *  - URL in both, different content -> Index row overwritten, listed under 'update'
  *  - URL only in BuilderIndex       -> new Index row, listed under 'new'
  *  - URL only in Index              -> Index row deleted, listed under 'delete'
  * Afterwards every Index row whose `content` or `title` is an empty string
  * is deleted and listed under 'delete_issue'.
  *
  * @throws Exception if BuilderIndex contains no rows.
  */
 public function finish()
 {
     $builder = BuilderIndex::find()->indexBy('url')->asArray()->all();
     $index = Index::find()->asArray()->indexBy('url')->all();
     if (count($builder) === 0) {
         // Abort before touching Index: with no builder rows, every Index row would be deleted below.
         throw new Exception('The array index have length 0, stop script exec.');
     }
     $compare = ['new' => [], 'update' => [], 'delete' => [], 'delete_issue' => [], 'unchanged' => []];
     foreach ($builder as $url => $page) {
         if (isset($index[$url])) {
             // Page exists in the index. Compare as strings with ===, because loose ==
             // treats numeric-looking strings such as '1e3' and '1000' as equal and
             // would hide a real content change.
             if ((string) $index[$url]['content'] === (string) $page['content']) {
                 $compare['unchanged'][] = $url;
             } else {
                 $compare['update'][] = $url;
                 $update = Index::findOne(['url' => $url]);
                 $update->attributes = $page;
                 $update->last_update = time();
                 // NOTE(review): save(false) presumably skips validation (Yii2 ActiveRecord) - confirm.
                 $update->save(false);
             }
             // Whatever is left in $index after this loop was not seen by the builder.
             unset($index[$url]);
         } else {
             $compare['new'][] = $url;
             // Read the clock once so both timestamps of a new row are identical.
             $now = time();
             $insert = new Index();
             $insert->attributes = $page;
             $insert->added_to_index = $now;
             $insert->last_update = $now;
             $insert->save(false);
         }
     }
     // Delete the index rows whose URL no longer appears in the builder index.
     foreach ($index as $deleteUrl => $deletePage) {
         $compare['delete'][] = $deleteUrl;
         $model = Index::findOne($deletePage['id']);
         $model->delete(false);
     }
     // Delete rows with empty content or an empty title.
     foreach (Index::find()->where(['=', 'content', ''])->orWhere(['=', 'title', ''])->all() as $page) {
         $compare['delete_issue'][] = $page->url;
         $page->delete(false);
     }
     $this->report = $compare;
 }
Example #2
0
 /**
  * Synchronise the Index table with the non-duplicate rows collected in
  * BuilderIndex, reporting every affected URL through $this->addLog().
  *
  * Only BuilderIndex rows with `is_dublication` = 0 are considered. Rows of
  * both tables are matched by their `url`:
  *  - URL in both, same content      -> logged as 'unchanged'
  *  - URL in both, different content -> Index row overwritten, logged as 'update'
  *  - URL only in BuilderIndex       -> new Index row, logged as 'new'
  *  - URL only in Index              -> Index row deleted, logged as 'delete'
  * Afterwards every Index row whose `content` or `title` is an empty string
  * is deleted and logged as 'delete_issue'.
  *
  * @throws Exception if the filtered BuilderIndex query returns no rows.
  */
 public function finish()
 {
     $builder = BuilderIndex::find()->where(['is_dublication' => 0])->indexBy('url')->asArray()->all();
     $index = Index::find()->asArray()->indexBy('url')->all();
     if (count($builder) === 0) {
         // Abort before touching Index: with no builder rows, every Index row would be deleted below.
         throw new Exception('The crawler have not found any results. Wrong base url? Or set a rule which tracks all urls?');
     }
     foreach ($builder as $url => $page) {
         if (isset($index[$url])) {
             // Page exists in the index. Compare as strings with ===, because loose ==
             // treats numeric-looking strings such as '1e3' and '1000' as equal and
             // would hide a real content change.
             if ((string) $index[$url]['content'] === (string) $page['content']) {
                 $this->addLog('unchanged', $url);
             } else {
                 $this->addLog('update', $url);
                 $update = Index::findOne(['url' => $url]);
                 $update->attributes = $page;
                 $update->last_update = time();
                 // NOTE(review): save(false) presumably skips validation (Yii2 ActiveRecord) - confirm.
                 $update->save(false);
             }
             // Whatever is left in $index after this loop was not seen by the builder.
             unset($index[$url]);
         } else {
             $this->addLog('new', $url);
             // Read the clock once so both timestamps of a new row are identical.
             $now = time();
             $insert = new Index();
             $insert->attributes = $page;
             $insert->added_to_index = $now;
             $insert->last_update = $now;
             $insert->save(false);
         }
     }
     // Delete the index rows whose URL no longer appears in the builder index.
     foreach ($index as $deleteUrl => $deletePage) {
         // Log the URL being removed, not the stale $url left over from the loop above.
         $this->addLog('delete', $deleteUrl);
         $model = Index::findOne($deletePage['id']);
         $model->delete(false);
     }
     // Delete rows with empty content or an empty title.
     foreach (Index::find()->where(['=', 'content', ''])->orWhere(['=', 'title', ''])->all() as $page) {
         // Log the URL of the row being removed, not the stale $url from the first loop.
         $this->addLog('delete_issue', $page->url);
         $page->delete(false);
     }
 }