/**
 * Removes Lucene index documents that belong to entities flagged as deleted.
 *
 * Terminates the process right away when safeToRun() returns false. When the
 * purge is finished, memory usage, index size and elapsed time are printed.
 *
 * @param array $arguments task arguments (not read by this method)
 * @param array $options   task options; 'debug_mode' turns on per-entity output
 */
protected function execute($arguments = array(), $options = array())
 {
     if (!$this->safeToRun()) {
         print "Process already running!\n";
         exit;
     }
     $stopwatch = sfTimerManager::getTimer('execute');
     $dbManager = new sfDatabaseManager($this->configuration);
     $dbManager->initialize($this->configuration);

     $luceneIndex = EntityTable::getLuceneIndex();

     // Fetch deleted entities as plain arrays; only the id is used below.
     $deletedEntities = LsDoctrineQuery::create()
         ->from('Entity e')
         ->where('e.is_deleted = ?', true)
         ->setHydrationMode(Doctrine::HYDRATE_ARRAY)
         ->execute();

     foreach ($deletedEntities as $row) {
         $matches = $luceneIndex->find('key:' . $row['id']);
         if (!$matches) {
             // Nothing indexed under this entity id.
             continue;
         }
         if ($options['debug_mode']) {
             printf("Deleting index for Entity %s\n", $row['id']);
         }
         foreach ($matches as $match) {
             $luceneIndex->delete($match->id);
         }
     }

     // Run summary.
     printf("Memory used: %s\n", LsNumber::makeBytesReadable(memory_get_usage()));
     printf("Index size: %s\n", $luceneIndex->count());
     $stopwatch->addTime();
     printf("Run time: %s\n", $stopwatch->getElapsedTime());
     sfTimerManager::clearTimers();
 }
示例#2
0
 /**
  * Imports every URL returned by getUrls() into the list described by
  * setListOptions().
  *
  * Each URL is handled inside its own database transaction. After a commit the
  * URL's 'refesh_time' meta entry is set to the current time plus refreshDays
  * days.
  *
  * @throws Exception when setListOptions() leaves list_name, list_description
  *                   or list_fields unset, or when handling a URL fails (the
  *                   transaction is rolled back before the rethrow)
  */
 public function execute()
 {
     $this->urls = $this->getUrls();
     if (!$this->urls) {
         // Only logged; list setup below still runs.
         $this->printDebug("Could not retrieve list of URLS");
     }

     $this->setListOptions();
     $optionsIncomplete = $this->list_name == null
         || $this->list_description == null
         || $this->list_fields == null;
     if ($optionsIncomplete) {
         throw new Exception('setListOptions must define: list_name, list_description, list_fields');
     }
     $this->setList($this->list_name, $this->list_description, $this->list_fields);

     if (!count($this->urls)) {
         $this->printDebug('No URLs found');
         return;
     }

     foreach ($this->urls as $position => $url) {
         try {
             $this->db->beginTransaction();
             $this->printDebug("\n***** Searching  *****");
             $this->printDebug("Memory used: " . LsNumber::makeBytesReadable(memory_get_usage()));
             $this->printDebug("Now: " . date('l jS \\of F Y h:i:s A'));
             // Meta entries for a URL are keyed by its md5 hash.
             $urlHash = md5($url);
             // NOTE: a check that skipped URLs whose 'refesh_time' meta lies in the
             // future used to sit here; it is currently disabled.
             $this->import($url);
             if ($this->limit === $position) {
                 // Leaves the loop before the commit below, so this URL's
                 // transaction is neither committed nor rolled back here.
                 break;
             }
             if ($this->testMode) {
                 // Test mode skips the commit and the refresh-time bookkeeping.
                 continue;
             }
             $this->db->commit();
             $refreshAt = time() + $this->refreshDays * 24 * 60 * 60;
             $this->saveMeta($urlHash, 'refesh_time', $refreshAt);
             $this->printDebug("OK");
         } catch (Exception $error) {
             // Undo the work for this URL, then let the caller see the failure.
             $this->db->rollback();
             throw $error;
         }
     }
 }
 /**
  * Optimizes the entity Lucene index and prints memory usage, index size and
  * elapsed time.
  *
  * Terminates the process right away when safeToRun() returns false.
  *
  * @param array $arguments task arguments (not read by this method)
  * @param array $options   task options (not read by this method)
  */
 protected function execute($arguments = array(), $options = array())
 {
     if (!$this->safeToRun()) {
         print "Process already running!\n";
         exit;
     }
     $stopwatch = sfTimerManager::getTimer('execute');

     $luceneIndex = EntityTable::getLuceneIndex();
     $luceneIndex->optimize();

     // Run summary.
     $memoryUsed = LsNumber::makeBytesReadable(memory_get_usage());
     printf("Memory used: %s\n", $memoryUsed);
     printf("Index size: %s\n", $luceneIndex->count());
     $stopwatch->addTime();
     printf("Run time: %s\n", $stopwatch->getElapsedTime());
     sfTimerManager::clearTimers();
 }
示例#4
0
 /**
  * Runs import() for every entity returned by getEntitiesByExtension('Org').
  *
  * A transaction is opened per entity and rolled back if an exception is
  * raised. NOTE: the commit, the test-mode check and the 'refesh_time' /
  * 'last_scraped' meta bookkeeping are all disabled in this version, so
  * nothing is committed by this method itself.
  *
  * @throws Exception rethrown after rollback when handling an entity fails
  */
 public function execute()
 {
     $orgs = $this->getEntitiesByExtension('Org')->execute();
     if (!$orgs->count()) {
         $this->printDebug('No entities found on database');
         return;
     }

     foreach ($orgs as $position => $org) {
         try {
             $this->db->beginTransaction();
             $this->printDebug("\n***** Searching entity: " . $org->getName() . " *****");
             $this->printDebug("Memory used: " . LsNumber::makeBytesReadable(memory_get_usage()));
             $this->printDebug("Now: " . date('l jS \\of F Y h:i:s A'));
             // NOTE: a check that skipped entities whose 'refesh_time' meta lies in
             // the future (unless forceScaper was set) used to sit here; disabled.
             $this->import($org);
             if ($this->limit === $position) {
                 break;
             }
             // NOTE: the commit and the saving of 'refesh_time' / 'last_scraped'
             // meta that used to follow are disabled.
         } catch (Exception $failure) {
             // Undo the work for this entity, then let the caller see the failure.
             $this->db->rollback();
             throw $failure;
         }
     }
 }
 /**
  * Fetches donations via getDonations() for every person returned by
  * getPersonsQuery($this->entity_id) and marks each one as scraped.
  *
  * Terminates the process when safeToRun('fec') returns false. Persons whose
  * 'scraped' meta is already truthy are skipped unless forceScraper is set.
  * When no entity_id is set, the id of the last handled person is stored
  * under ('first_round', 'last_processed').
  */
 public function execute()
 {
     if (!$this->safeToRun('fec')) {
         $this->printDebug('script already running');
         exit;
     }

     // The timer is started and stopped immediately, its reading is stored in
     // _time, and the timer is then started again for the actual run.
     $this->beginTimer();
     $this->stopTimer();
     $this->_time = $this->timer->getElapsedTime();
     $this->beginTimer();

     $persons = $this->getPersonsQuery($this->entity_id)->execute();
     if (!$persons->count()) {
         $this->printDebug('No persons found on database');
         return;
     }

     foreach ($persons as $position => $person) {
         $this->temp_postal = array();
         $this->printDebug("\n***** Searching person: " . $person->getName() . " *****");
         $this->printDebug("Memory used: " . LsNumber::makeBytesReadable(memory_get_usage()));
         $this->printDebug("Now: " . date('l jS \\of F Y h:i:s A'));

         $alreadyScraped = $this->hasMeta($person->id, 'scraped') && $this->getMeta($person->id, 'scraped');
         if ($alreadyScraped && $this->forceScraper != true) {
             $this->printDebug($person->name . " already scraped; skipping");
             continue;
         }

         $this->getDonations($person);
         if ($this->limit === $position) {
             // Stops before this person is marked as scraped.
             break;
         }
         if ($this->testMode) {
             // Test mode records no progress.
             continue;
         }

         $this->saveMeta($person->id, 'scraped', 1);
         if (!$this->entity_id) {
             $this->saveMeta('first_round', 'last_processed', $person->id);
         }
         $this->printDebug($person->name . ": OK");
     }
 }
 /**
  * Runs getBusinessWeek() for Person / BusinessPerson entities and records
  * progress so a later run can resume after the last handled id.
  *
  * Terminates the process when safeToRun('education') returns false. The
  * query is capped at $this->_limit rows and, when a
  * ('first_round', 'last_processed') meta entry exists, restricted to ids
  * greater than it. Each person is handled in its own transaction; after the
  * commit the person is flagged 'scraped' and becomes the new resume point.
  *
  * NOTE(review): the loop stops on $this->limit while the query is capped by
  * $this->_limit - confirm both properties are intended.
  * NOTE(review): the limit and test-mode exits leave the current transaction
  * open (no commit, no rollback) - confirm that is acceptable for $this->db.
  *
  * @throws Exception rethrown after rollback when handling a person fails
  */
 public function execute()
 {
     if (!$this->safeToRun('education')) {
         $this->printDebug('script already running');
         die;
     }
     $q = EntityTable::getByExtensionQuery(array('Person', 'BusinessPerson'))->limit($this->_limit);
     if ($this->hasMeta('first_round', 'last_processed')) {
         // Resume after the last person handled by a previous run.
         $q->addWhere('e.id > ?', $this->getMeta('first_round', 'last_processed'));
     }
     $people = $q->execute();
     foreach ($people as $key => $person) {
         try {
             $this->db->beginTransaction();
             $this->printDebug("\n***** Searching " . $person->name . " *****");
             $this->printDebug("Memory used: " . LsNumber::makeBytesReadable(memory_get_usage()));
             $this->printDebug("Now: " . date('l jS \\of F Y h:i:s A'));
             $this->getBusinessWeek($person);
             if ($this->limit === $key) {
                 break;
             }
             if ($this->testMode) {
                 // Test mode neither commits nor records progress.
                 continue;
             }
             $this->db->commit();
             $this->saveMeta($person->id, 'scraped', 1);
             $this->saveMeta('first_round', 'last_processed', $person->id);
             $this->printDebug("OK");
         } catch (Exception $e) {
             // Undo the work for this person, then let the caller see the failure.
             $this->db->rollback();
             throw $e;
         }
     }
 }
 /**
  * Adds entities to the Lucene index that are not indexed yet.
  *
  * The 'key' of the last document in the index is taken as the highest entity
  * id indexed so far (0 for an empty index); non-deleted entities with a
  * greater id are then passed to EntityTable::updateLuceneIndex() in
  * ascending id order. Terminates the process when safeToRun() returns false.
  *
  * @param array $arguments task arguments (not read by this method)
  * @param array $options   task options; reads 'index_file', 'offset',
  *                         'limit' and 'debug_mode'
  *
  * @throws Exception when the index is non-empty but its last document cannot
  *                   be loaded
  */
 protected function execute($arguments = array(), $options = array())
 {
     if (!$this->safeToRun()) {
         print "Process already running!\n";
         exit;
     }
     $stopwatch = sfTimerManager::getTimer('execute');
     $dbManager = new sfDatabaseManager($this->configuration);
     $dbManager->initialize($this->configuration);

     $index = EntityTable::getLuceneIndex($options['index_file']);
     $index->setMergeFactor(200);
     $index->setMaxBufferedDocs(20);

     // Work out the id of the last-indexed entity.
     $maxEntityId = 0;
     $documentCount = $index->count();
     if ($documentCount) {
         $lastDoc = $index->getDocument($documentCount - 1);
         if (!$lastDoc) {
             throw new Exception("Can't find last document in index");
         }
         $maxEntityId = $lastDoc->key;
     }

     // Non-deleted entities with greater ids, joined to aliases without context.
     $pending = LsDoctrineQuery::create()
         ->from('Entity e')
         ->leftJoin('e.Alias a')
         ->where('e.id > ? AND e.is_deleted = ?', array($maxEntityId, false))
         ->andWhere('a.context IS NULL')
         ->offset($options['offset'])
         ->limit($options['limit'])
         ->orderBy('e.id ASC');

     foreach ($pending->fetchArray() as $entity) {
         // Third argument switches updateLuceneIndex() to batch mode.
         $wasIndexed = EntityTable::updateLuceneIndex($entity, $index, true);
         if ($options['debug_mode']) {
             $format = $wasIndexed ? "Indexed entity with ID %s\n" : "Skipped entity with ID %s\n";
             printf($format, $entity['id']);
         }
     }

     // Run summary.
     printf("Memory used: %s\n", LsNumber::makeBytesReadable(memory_get_usage()));
     printf("Index size: %s\n", $index->count());
     $stopwatch->addTime();
     printf("Run time: %s\n", $stopwatch->getElapsedTime());
     sfTimerManager::clearTimers();
 }
示例#8
0
 /**
  * Imports the schools returned by getSchoolList() one by one, resuming from
  * the position stored under the ('current_postion', 'position') meta entry.
  *
  * Each school is imported inside its own transaction. After a commit the next
  * position is persisted so an interrupted run can pick up where it stopped.
  * In test mode, and for the school on which the limit is reached, the
  * transaction is rolled back and no position is persisted.
  *
  * Fixes over the previous version:
  *  - the loop bound was `<=`, which read one element past the end of the list;
  *  - test mode never advanced the position, so the loop never terminated and
  *    opened a new transaction on every pass;
  *  - a falsy stored position overwrote the default 0 with a non-integer.
  *
  * @throws Exception rethrown after rollback when importing a school fails
  */
 public function execute()
 {
     $schools = $this->getSchoolList();
     $position = 0;
     if ($this->hasMeta('current_postion', 'position')) {
         // The meta key spelling is kept as stored so existing progress is still found.
         $storedPosition = (int) $this->getMeta('current_postion', 'position');
         if ($storedPosition) {
             $position = $storedPosition;
             $this->printDebug("Resuming scraping. Starting at position: " . $position);
         }
     }
     $total = count($schools);
     $count = 0;
     while ($position < $total) {
         $school = $schools[$position];
         try {
             $this->db->beginTransaction();
             $this->printDebug("\n***** Searching  *****");
             $this->printDebug("Memory used: " . LsNumber::makeBytesReadable(memory_get_usage()));
             $this->printDebug("Now: " . date('l jS \\of F Y h:i:s A'));
             $this->import($school);
             if ($this->limit === $count) {
                 // This school was never committed before either; discard its
                 // work explicitly instead of leaving the transaction open.
                 $this->db->rollback();
                 break;
             }
             if ($this->testMode) {
                 // Dry run: discard the work but still move on to the next school.
                 $this->db->rollback();
                 $position++;
                 $count++;
                 continue;
             }
             $this->db->commit();
             $position++;
             $count++;
             $this->saveMeta('current_postion', 'position', $position);
             $this->printDebug("OK");
         } catch (Exception $e) {
             // Undo the work for this school, then let the caller see the failure.
             $this->db->rollback();
             throw $e;
         }
     }
 }
 /**
  * Runs getFedSpendingData() for every organization returned by
  * getBusinessQuery() and records the last handled organization id under
  * ($this->_round, 'last_processed').
  *
  * Terminates the process when safeToRun('fedspending') returns false, and
  * again once $this->_count reaches $this->_filing_limit. In test mode no
  * progress is recorded and the filing limit is not checked.
  *
  * The unused per-iteration refresh-time calculation was removed; nothing
  * ever read its result.
  */
 public function execute()
 {
     if (!$this->safeToRun('fedspending')) {
         $this->printDebug('script already running');
         die;
     }
     $orgs = $this->getBusinessQuery()->execute();
     if (!$orgs->count()) {
         $this->printDebug("No businesses found on database");
         return;
     }
     foreach ($orgs as $org) {
         $this->printDebug("\n***** Searching Organization: " . $org->getName() . " *****");
         $this->printDebug("Memory used: " . LsNumber::makeBytesReadable(memory_get_usage()));
         $this->printDebug("Now: " . date('l jS \\of F Y h:i:s A'));
         // NOTE: a 'refresh_time' based "already scraped" notice used to sit
         // here; it is currently disabled.
         $this->getFedSpendingData($org);
         if ($this->testMode) {
             continue;
         }
         if ($this->_count >= $this->_filing_limit) {
             // Stops before last_processed is updated for this organization.
             $this->printDebug('filing limit reached');
             die;
         }
         $this->saveMeta($this->_round, 'last_processed', $org->id);
         $this->printDebug($org->name . ": OK");
     }
 }