/**
 * Removes Lucene index documents that belong to entities flagged as deleted.
 *
 * Exits immediately when safeToRun() returns a falsy value. After the purge
 * it prints memory usage, index size and the elapsed run time.
 *
 * @param array $arguments Task arguments (not read by this method).
 * @param array $options   Task options; 'debug_mode' enables per-entity output.
 */
protected function execute($arguments = array(), $options = array())
{
    if (!$this->safeToRun()) {
        print "Process already running!\n";
        die;
    }

    $timer = sfTimerManager::getTimer('execute');

    $databaseManager = new sfDatabaseManager($this->configuration);
    $databaseManager->initialize($this->configuration);

    $index = EntityTable::getLuceneIndex();

    // Array-hydrated rows of every entity marked as deleted.
    $deletedEntities = LsDoctrineQuery::create()
        ->from('Entity e')
        ->where('e.is_deleted = ?', true)
        ->setHydrationMode(Doctrine::HYDRATE_ARRAY)
        ->execute();

    foreach ($deletedEntities as $row) {
        // Index documents are looked up through their "key" field, which holds the entity id.
        $matches = $index->find('key:' . $row['id']);

        if (!$matches) {
            continue;
        }

        if ($options['debug_mode']) {
            printf("Deleting index for Entity %s\n", $row['id']);
        }

        foreach ($matches as $match) {
            $index->delete($match->id);
        }
    }

    printf("Memory used: %s\n", LsNumber::makeBytesReadable(memory_get_usage()));
    printf("Index size: %s\n", $index->count());

    $timer->addTime();
    printf("Run time: %s\n", $timer->getElapsedTime());
    sfTimerManager::clearTimers();
}
/**
 * Imports every URL returned by getUrls(), using one transaction per URL.
 *
 * Flow:
 *  - load the URL list and the list options (name, description, fields);
 *  - for each URL: begin a transaction, call import(), commit, then store the
 *    next refresh timestamp under the 'refesh_time' meta key (sic - the
 *    misspelled key is what gets persisted, so it must not be "corrected").
 *
 * The URL whose loop key equals $this->limit is imported but deliberately not
 * committed; its transaction is rolled back before the loop stops.
 * In test mode nothing is committed and no meta data is written.
 *
 * @throws Exception When setListOptions() leaves a required list option unset,
 *                   or when anything fails while a URL is processed (the
 *                   transaction is rolled back before the exception is rethrown).
 */
public function execute()
{
    // A failure inside getUrls() simply propagates; catching it only to rethrow added nothing.
    $this->urls = $this->getUrls();

    if (!$this->urls) {
        $this->printDebug("Could not retrieve list of URLS");
    }

    $this->setListOptions();

    if ($this->list_name == null || $this->list_description == null || $this->list_fields == null) {
        throw new Exception('setListOptions must define: list_name, list_description, list_fields');
    }

    $this->setList($this->list_name, $this->list_description, $this->list_fields);

    // Truthiness test instead of count(): the value may be null/false at this
    // point, and count() on a non-countable raises a TypeError on PHP 8.
    if (!$this->urls) {
        $this->printDebug('No URLs found');

        return;
    }

    foreach ($this->urls as $count => $url) {
        try {
            $this->db->beginTransaction();

            $this->printDebug("\n***** Searching *****");
            $this->printDebug("Memory used: " . LsNumber::makeBytesReadable(memory_get_usage()));
            $this->printDebug("Now: " . date('l jS \\of F Y h:i:s A'));

            $urlkey = md5($url);

            /*
            Disabled refresh-time check, kept for reference:

            if ($this->hasMeta($urlkey, 'refesh_time') && time() < (int)$this->getMeta($urlkey, 'refesh_time') )
            {
                $this->printDebug("Refresh time: " . date('l jS \of F Y h:i:s A', (int)$this->getMeta($urlkey, 'refesh_time') ) );
                $this->printDebug("Already scraped; skipping");
                $this->db->rollback();
                continue;
            }
            */

            $this->import($url);

            if ($this->limit === $count) {
                // This URL is never committed; close its transaction explicitly
                // instead of leaving it open after the loop.
                $this->db->rollback();
                break;
            }

            if ($this->testMode) {
                // NOTE(review): the transaction is left open here, exactly as before.
                continue;
            }

            $this->db->commit();

            // Timestamp after which this URL may be scraped again.
            $refreshAt = time() + $this->refreshDays * 24 * 60 * 60;
            $this->saveMeta($urlkey, 'refesh_time', $refreshAt);

            $this->printDebug("OK");
        } catch (Exception $e) {
            // Something went wrong: undo this URL's work and let the caller see the error.
            $this->db->rollback();
            throw $e;
        }
    }
}
/**
 * Optimizes the entity Lucene index and prints memory, size and timing figures.
 *
 * Exits immediately when safeToRun() returns a falsy value.
 *
 * @param array $arguments Task arguments (not read by this method).
 * @param array $options   Task options (not read by this method).
 */
protected function execute($arguments = array(), $options = array())
{
    if (!$this->safeToRun()) {
        print "Process already running!\n";
        die;
    }

    $timer = sfTimerManager::getTimer('execute');

    $index = EntityTable::getLuceneIndex();
    $index->optimize();

    $memoryUsed = LsNumber::makeBytesReadable(memory_get_usage());
    printf("Memory used: %s\n", $memoryUsed);
    printf("Index size: %s\n", $index->count());

    $timer->addTime();
    printf("Run time: %s\n", $timer->getElapsedTime());
    sfTimerManager::clearTimers();
}
/**
 * Runs import() for every entity returned by getEntitiesByExtension('Org').
 *
 * A transaction is begun for each entity. The loop stops once the loop key
 * equals $this->limit (the entity at that key is still imported first).
 *
 * NOTE(review): the commit and the refresh/last-scraped bookkeeping are
 * commented out, so as written no transaction started here is ever committed
 * by this method - confirm whether that is intentional.
 *
 * @throws Exception Rethrows any exception raised while processing an entity,
 *                   after rolling back the transaction.
 */
public function execute()
{
    $entities = $this->getEntitiesByExtension('Org')->execute();

    if (!$entities->count()) {
        $this->printDebug('No entities found on database');

        return;
    }

    foreach ($entities as $position => $entity) {
        try {
            $this->db->beginTransaction();

            $this->printDebug("\n***** Searching entity: " . $entity->getName() . " *****");
            $this->printDebug("Memory used: " . LsNumber::makeBytesReadable(memory_get_usage()));
            $this->printDebug("Now: " . date('l jS \\of F Y h:i:s A'));

            /*
            Disabled refresh-time check, kept for reference:

            if ($this->hasMeta($entity->id, 'refesh_time') && time() < (int)$this->getMeta($entity->id, 'refesh_time') && !$this->forceScaper)
            {
                $this->printDebug("Refresh time: " . date('l jS \of F Y h:i:s A', (int)$this->getMeta($entity->id, 'refesh_time') ) );
                $this->printDebug($entity->name . " already scraped; skipping");
                $this->db->rollback();
                continue;
            }
            */

            $this->import($entity);

            if ($this->limit === $position) {
                break;
            }

            /*
            Disabled test-mode skip, commit and bookkeeping, kept for reference:

            if ($this->testMode) { continue; }

            $this->db->commit();
            die();

            $refresh_days = time() + ($this->refreshDays * 24 * 60 * 60);
            $last_scraped = time();
            $this->saveMeta($entity->id, 'refesh_time', $refresh_days);
            $this->saveMeta($entity->id, 'last_scraped', $last_scraped);
            $this->printDebug( $entity->name . ": OK");
            */
        } catch (Exception $e) {
            // Undo the current entity's work before propagating the failure.
            $this->db->rollback();
            throw $e;
        }
    }
}
/**
 * Fetches donations for each person returned by getPersonsQuery().
 *
 * Exits when safeToRun('fec') returns a falsy value. A person whose 'scraped'
 * meta value is set and truthy is skipped unless $this->forceScraper is
 * truthy. After a successful run the person is flagged as scraped and, when
 * no specific entity id was requested, recorded as the last one processed in
 * the 'first_round' meta namespace.
 *
 * The loop stops once the loop key equals $this->limit (that person's
 * donations are still fetched, but no meta data is written for them).
 * In test mode no meta data is written.
 *
 * Exceptions raised while processing a person propagate unchanged.
 */
public function execute()
{
    if (!$this->safeToRun('fec')) {
        $this->printDebug('script already running');
        die;
    }

    // NOTE(review): the timer is started and stopped back-to-back, its elapsed
    // time is stored in $this->_time, and the timer is started again - the
    // purpose is not evident from this method.
    $this->beginTimer();
    $this->stopTimer();
    $this->_time = $this->timer->getElapsedTime();
    $this->beginTimer();

    $persons = $this->getPersonsQuery($this->entity_id)->execute();

    if (!$persons->count()) {
        $this->printDebug('No persons found on database');

        return;
    }

    foreach ($persons as $position => $person) {
        $this->temp_postal = array();

        $this->printDebug("\n***** Searching person: " . $person->getName() . " *****");
        $this->printDebug("Memory used: " . LsNumber::makeBytesReadable(memory_get_usage()));
        $this->printDebug("Now: " . date('l jS \\of F Y h:i:s A'));

        $alreadyScraped = $this->hasMeta($person->id, 'scraped') && $this->getMeta($person->id, 'scraped');

        if ($alreadyScraped && !$this->forceScraper) {
            $this->printDebug($person->name . " already scraped; skipping");
            continue;
        }

        $this->getDonations($person);

        if ($this->limit === $position) {
            break;
        }

        if ($this->testMode) {
            continue;
        }

        $this->saveMeta($person->id, 'scraped', 1);

        if (!$this->entity_id) {
            $this->saveMeta('first_round', 'last_processed', $person->id);
        }

        $this->printDebug($person->name . ": OK");
    }
}
/**
 * Runs getBusinessWeek() for a batch of Person / BusinessPerson entities.
 *
 * Exits when safeToRun('education') returns a falsy value. The batch is capped
 * at $this->_limit rows and, when a 'last_processed' value exists in the
 * 'first_round' meta namespace, restricted to entities with a greater id.
 *
 * Each person is handled in its own transaction. After the commit the person
 * is flagged as scraped and stored as the last one processed.
 *
 * The person whose loop key equals $this->limit is processed but not
 * committed; the transaction is rolled back before the loop stops.
 * In test mode nothing is committed and no meta data is written.
 *
 * @throws Exception Rethrows any exception raised while processing a person,
 *                   after rolling back the transaction.
 */
public function execute()
{
    if (!$this->safeToRun('education')) {
        $this->printDebug('script already running');
        die;
    }

    $q = EntityTable::getByExtensionQuery(array('Person', 'BusinessPerson'))->limit($this->_limit);

    if ($this->hasMeta('first_round', 'last_processed')) {
        // Resume after the last person handled by a previous run.
        $q->addWhere('e.id > ?', $this->getMeta('first_round', 'last_processed'));
    }

    $people = $q->execute();

    foreach ($people as $key => $person) {
        try {
            $this->db->beginTransaction();

            $this->printDebug("\n***** Searching " . $person->name . " *****");
            $this->printDebug("Memory used: " . LsNumber::makeBytesReadable(memory_get_usage()));
            $this->printDebug("Now: " . date('l jS \\of F Y h:i:s A'));

            $this->getBusinessWeek($person);

            if ($this->limit === $key) {
                // This person is never committed; close the transaction
                // explicitly instead of leaving it open after the loop.
                $this->db->rollback();
                break;
            }

            if ($this->testMode) {
                // NOTE(review): the transaction is left open here, exactly as before.
                continue;
            }

            $this->db->commit();

            $this->saveMeta($person->id, 'scraped', 1);
            $this->saveMeta('first_round', 'last_processed', $person->id);

            $this->printDebug("OK");
        } catch (Exception $e) {
            // Undo the current person's work before propagating the failure.
            $this->db->rollback();
            throw $e;
        }
    }
}
/**
 * Adds not-yet-indexed, non-deleted entities to the Lucene index.
 *
 * The "key" field of the last document in the index is used as the highest
 * entity id already indexed; only entities with a greater id are fetched.
 * NOTE(review): this presumes documents were appended in ascending id order -
 * confirm against the indexing code.
 *
 * Exits immediately when safeToRun() returns a falsy value. Afterwards prints
 * memory usage, index size and the elapsed run time.
 *
 * @param array $arguments Task arguments (not read by this method).
 * @param array $options   Task options; reads 'index_file', 'offset', 'limit'
 *                         and 'debug_mode'.
 *
 * @throws Exception When the index is non-empty but its last document cannot be loaded.
 */
protected function execute($arguments = array(), $options = array())
{
    if (!$this->safeToRun()) {
        print "Process already running!\n";
        die;
    }

    $timer = sfTimerManager::getTimer('execute');

    $databaseManager = new sfDatabaseManager($this->configuration);
    $databaseManager->initialize($this->configuration);

    $index = EntityTable::getLuceneIndex($options['index_file']);
    $index->setMergeFactor(200);
    $index->setMaxBufferedDocs(20);

    // Work out the id of the last entity that made it into the index.
    $maxEntityId = 0;
    $documentCount = $index->count();

    if ($documentCount) {
        $lastDoc = $index->getDocument($documentCount - 1);

        if (!$lastDoc) {
            throw new Exception("Can't find last document in index");
        }

        $maxEntityId = $lastDoc->key;
    }

    // Non-deleted entities with a greater id, joined with their context-less aliases.
    $q = LsDoctrineQuery::create()
        ->from('Entity e')
        ->leftJoin('e.Alias a')
        ->where('e.id > ? AND e.is_deleted = ?', array($maxEntityId, false))
        ->andWhere('a.context IS NULL')
        ->offset($options['offset'])
        ->limit($options['limit'])
        ->orderBy('e.id ASC');

    foreach ($q->fetchArray() as $entity) {
        // The third argument is the batch-mode flag.
        $wasIndexed = EntityTable::updateLuceneIndex($entity, $index, true);

        if ($options['debug_mode']) {
            $format = $wasIndexed ? "Indexed entity with ID %s\n" : "Skipped entity with ID %s\n";
            printf($format, $entity['id']);
        }
    }

    printf("Memory used: %s\n", LsNumber::makeBytesReadable(memory_get_usage()));
    printf("Index size: %s\n", $index->count());

    $timer->addTime();
    printf("Run time: %s\n", $timer->getElapsedTime());
    sfTimerManager::clearTimers();
}
/**
 * Imports the schools returned by getSchoolList(), resuming where a previous
 * run stopped.
 *
 * The resume offset is read from the 'position' value in the
 * 'current_postion' meta namespace (sic - the misspelled name is what is
 * persisted, so it must not be "corrected") and written back after every
 * committed school.
 *
 * Each school is handled in its own transaction. The loop stops once
 * $this->limit schools have been handled in this run; the school in progress
 * at that moment is rolled back. In test mode every school is imported and
 * rolled back, and the stored position is left untouched.
 *
 * Assumes getSchoolList() returns a zero-based, gap-free array - TODO confirm.
 *
 * @throws Exception Rethrows any exception raised while processing a school,
 *                   after rolling back the transaction.
 */
public function execute()
{
    $schools = $this->getSchoolList();
    $total = count($schools);

    $position = 0;

    if ($this->hasMeta('current_postion', 'position')) {
        // Cast so a null/empty stored value cannot end up as the array offset.
        $position = (int) $this->getMeta('current_postion', 'position');

        if ($position) {
            $this->printDebug("Resuming scraping. Starting at position: " . $position);
        }
    }

    $count = 0;

    // Strictly less than: the last valid offset is $total - 1.
    while ($position < $total) {
        $school = $schools[$position];

        try {
            $this->db->beginTransaction();

            $this->printDebug("\n***** Searching *****");
            $this->printDebug("Memory used: " . LsNumber::makeBytesReadable(memory_get_usage()));
            $this->printDebug("Now: " . date('l jS \\of F Y h:i:s A'));

            $this->import($school);

            if ($this->limit === $count) {
                // This school is never committed; close the transaction
                // explicitly instead of leaving it open after the loop.
                $this->db->rollback();
                break;
            }

            if ($this->testMode) {
                // Discard the work but still move on to the next school;
                // without advancing, the same school would be imported forever.
                $this->db->rollback();
                $position++;
                $count++;
                continue;
            }

            $this->db->commit();

            $position++;
            $count++;

            $this->saveMeta('current_postion', 'position', $position);
            $this->printDebug("OK");
        } catch (Exception $e) {
            // Undo the current school's work before propagating the failure.
            $this->db->rollback();
            throw $e;
        }
    }
}
/**
 * Runs getFedSpendingData() for every organization returned by getBusinessQuery().
 *
 * Exits when safeToRun('fedspending') returns a falsy value. After each
 * organization its id is stored as 'last_processed' in the meta namespace
 * named by $this->_round. In test mode no meta data is written.
 *
 * The script terminates (die) as soon as $this->_count reaches
 * $this->_filing_limit; the organization being handled at that moment is not
 * recorded as processed. That check is skipped in test mode.
 */
public function execute()
{
    if (!$this->safeToRun('fedspending')) {
        $this->printDebug('script already running');
        die;
    }

    $orgs = $this->getBusinessQuery()->execute();

    if (!$orgs->count()) {
        $this->printDebug("No businesses found on database");

        return;
    }

    foreach ($orgs as $org) {
        $this->printDebug("\n***** Searching Organization: " . $org->getName() . " *****");
        $this->printDebug("Memory used: " . LsNumber::makeBytesReadable(memory_get_usage()));
        $this->printDebug("Now: " . date('l jS \\of F Y h:i:s A'));

        /*
        Disabled refresh-time check, kept for reference:

        if ($this->hasMeta($org->id, 'refresh_time') && time() < (int)$this->getMeta($org->id, 'refresh_time') )
        {
            $this->printDebug("Refresh time: " . date('l jS \of F Y h:i:s A', (int)$this->getMeta($org->id, 'refresh_time') ) );
            $this->printDebug($org->name . " already scraped; skipping");
            //$this->db->rollback();
            //continue;
        }
        */

        $this->getFedSpendingData($org);

        if ($this->testMode) {
            continue;
        }

        if ($this->_count >= $this->_filing_limit) {
            $this->printDebug('filing limit reached');
            die;
        }

        $this->saveMeta($this->_round, 'last_processed', $org->id);
        $this->printDebug($org->name . ": OK");
    }
}