/**
 * Synchronise the Index table with the rows currently held in BuilderIndex.
 *
 * Both tables are loaded as plain arrays keyed by URL and then compared:
 *  - URL in both, same content      -> recorded as "unchanged"
 *  - URL in both, different content -> Index row is overwritten ("update")
 *  - URL only in BuilderIndex       -> new Index row is created ("new")
 *  - URL only in Index              -> Index row is removed ("delete")
 * Finally every Index row with an empty content or an empty title is removed
 * ("delete_issue"). The collected URL lists are stored in $this->report.
 *
 * @throws Exception when BuilderIndex contains no rows at all.
 */
public function finish()
{
    $builder = BuilderIndex::find()->indexBy('url')->asArray()->all();
    $index = Index::find()->asArray()->indexBy('url')->all();

    if (count($builder) === 0) {
        // The throw already aborts the method; no exit statement is needed.
        throw new Exception('The array index have length 0, stop script exec.');
    }

    $compare = [
        'new' => [],
        'update' => [],
        'delete' => [],
        'delete_issue' => [],
        'unchanged' => [],
    ];

    foreach ($builder as $url => $page) {
        if (isset($index[$url])) {
            // The page already exists in the index.
            // Strict comparison: with "==" two numeric-looking strings such as
            // "1e3" and "1000" compare equal and a real change would be missed.
            if ($index[$url]['content'] === $page['content']) {
                $compare['unchanged'][] = $url;
            } else {
                $compare['update'][] = $url;
                $update = Index::findOne(['url' => $url]);
                // findOne() yields null when the row vanished after the
                // snapshot above was taken; there is nothing to update then.
                if ($update !== null) {
                    $update->attributes = $page;
                    $update->last_update = time();
                    $update->save(false);
                }
            }
            // Whatever is left in $index after this loop was not seen by the builder.
            unset($index[$url]);
        } else {
            $compare['new'][] = $url;
            $insert = new Index();
            $insert->attributes = $page;
            $insert->added_to_index = time();
            $insert->last_update = time();
            $insert->save(false);
        }
    }

    // Remove index entries whose URL was not present in the builder result.
    foreach ($index as $deleteUrl => $deletePage) {
        $compare['delete'][] = $deleteUrl;
        $model = Index::findOne($deletePage['id']);
        // Guard against a row that has already been removed elsewhere.
        if ($model !== null) {
            $model->delete(false);
        }
    }

    // Remove index entries that ended up with an empty content or an empty title.
    $broken = Index::find()->where(['=', 'content', ''])->orWhere(['=', 'title', ''])->all();
    foreach ($broken as $page) {
        $compare['delete_issue'][] = $page->url;
        $page->delete(false);
    }

    $this->report = $compare;
}
/**
 * Synchronise the Index table with the rows currently held in BuilderIndex.
 *
 * Only BuilderIndex rows with is_dublication = 0 are considered. Both tables
 * are loaded as plain arrays keyed by URL and then compared:
 *  - URL in both, same content      -> logged as "unchanged"
 *  - URL in both, different content -> Index row is overwritten ("update")
 *  - URL only in BuilderIndex       -> new Index row is created ("new")
 *  - URL only in Index              -> Index row is removed ("delete")
 * Finally every Index row with an empty content or an empty title is removed
 * ("delete_issue"). Each decision is reported through $this->addLog().
 *
 * @throws Exception when no matching BuilderIndex rows exist.
 */
public function finish()
{
    $builder = BuilderIndex::find()->where(['is_dublication' => 0])->indexBy('url')->asArray()->all();
    $index = Index::find()->asArray()->indexBy('url')->all();

    if (count($builder) === 0) {
        // The throw already aborts the method; no exit statement is needed.
        throw new Exception('The crawler have not found any results. Wrong base url? Or set a rule which tracks all urls?');
    }

    foreach ($builder as $url => $page) {
        if (isset($index[$url])) {
            // The page already exists in the index.
            // Strict comparison: with "==" two numeric-looking strings such as
            // "1e3" and "1000" compare equal and a real change would be missed.
            if ($index[$url]['content'] === $page['content']) {
                $this->addLog('unchanged', $url);
            } else {
                $this->addLog('update', $url);
                $update = Index::findOne(['url' => $url]);
                // findOne() yields null when the row vanished after the
                // snapshot above was taken; there is nothing to update then.
                if ($update !== null) {
                    $update->attributes = $page;
                    $update->last_update = time();
                    $update->save(false);
                }
            }
            // Whatever is left in $index after this loop was not seen by the builder.
            unset($index[$url]);
        } else {
            $this->addLog('new', $url);
            $insert = new Index();
            $insert->attributes = $page;
            $insert->added_to_index = time();
            $insert->last_update = time();
            $insert->save(false);
        }
    }

    // Remove index entries whose URL was not present in the builder result.
    foreach ($index as $deleteUrl => $deletePage) {
        // Log the URL that is actually being deleted, not the stale $url left
        // over from the loop above.
        $this->addLog('delete', $deleteUrl);
        $model = Index::findOne($deletePage['id']);
        // Guard against a row that has already been removed elsewhere.
        if ($model !== null) {
            $model->delete(false);
        }
    }

    // Remove index entries that ended up with an empty content or an empty title.
    $broken = Index::find()->where(['=', 'content', ''])->orWhere(['=', 'title', ''])->all();
    foreach ($broken as $brokenPage) {
        // Log the URL of the row being removed, not the stale $url.
        $this->addLog('delete_issue', $brokenPage->url);
        $brokenPage->delete(false);
    }
}