/**
 * Render the "Analyse des releases" page.
 *
 * Validates the ingest key sent with the request, runs a Scrapper on the
 * most recent releases whose id_categorie is '0' and renders the outcome
 * (release name, category label, fiche id) as a table in the view.
 *
 * @return void
 *
 * @throws Error Code 3003 when the "key" request parameter is missing or
 *               differs from the configured "ingestkey".
 */
public function procede()
{
    if (!$this->oRequest->existParam('key')) {
        throw new Error('Vous devez renseigner la clé.', 3003);
    }

    // Compare as strings with a strict operator: a loose "!=" treats numeric
    // look-alikes such as "1e3" and "1000" as the same key.
    $sGivenKey = (string) $this->oRequest->getParam('key', 'string');
    $sExpectedKey = (string) Config::get('ingestkey');
    if ($sGivenKey !== $sExpectedKey) {
        throw new Error('La clé est invalide.', 3003);
    }

    // Page title.
    $this->oView->addData('titre', 'Analyse des releases');

    // Fetch the 10 most recent releases (highest ids first) whose
    // id_categorie is '0', with their tag ids and torrent dates joined by ';'.
    $oMysqli = Database::getInstance();
    $sSqlRequest = "SELECT r.*, \r\n (SELECT GROUP_CONCAT(t.id_regex ORDER BY t.id_regex SEPARATOR ';') FROM tks_tags t WHERE t.id_release = r.id) AS tags,\r\n (SELECT GROUP_CONCAT(f.date ORDER BY f.date SEPARATOR ';') FROM tks_torrents f WHERE f.id_release = r.id) AS dates\t\r\n FROM tks_releases r \r\n WHERE r.id_categorie = '0' \r\n ORDER BY r.id DESC \r\n LIMIT 10";

    // NOTE(review): the query result is used unchecked — confirm that
    // Database raises on failure rather than returning false.
    $oResults = $oMysqli->query($sSqlRequest);

    $oTable = new TableGenerator();
    $oTable->setId(md5('Scrapper'));
    $oTable->addColumn('Release');
    $oTable->addColumn('Catégorie');
    $oTable->addColumn('ID Fiche');

    $aCategories = Categorie::getCategoriesSelect();

    while ($aResult = $oResults->fetch_assoc()) {
        $oScrapper = new Scrapper($aResult['name'], $aResult['id']);
        $oScrapper->procede();

        // Guard against a category id that is absent from the select list
        // instead of reading an undefined index.
        $mCategorie = $oScrapper->getCategorie();
        $sCategorie = isset($aCategories[$mCategorie]) ? $aCategories[$mCategorie] : '';

        $oTable->addLine(array($aResult['name'], $sCategorie, $oScrapper->getFiche()));
    }

    $oTable->setBottom('');
    $oTable->create();

    $this->oView->addData('content', $oTable->getCode());
    $this->oView->Create();
}
/**
 * Collect one cover image per tracked newspaper from the portal.ejes.com
 * "tapas-del-dia" page.
 *
 * Every link inside the "tapitas" list whose normalised text (accents
 * stripped, spaces removed) contains one of the tracked titles is followed,
 * and the src of the first <img> on the linked page that has one is kept.
 *
 * @return array Map of normalised link text => image src. Empty when the
 *               index page cannot be downloaded.
 */
function ejes()
{
    $ret = array();

    $index = file_get_contents('http://portal.ejes.com/tapas-del-dia/');
    if ($index === false) {
        return $ret;
    }

    $s = new Scrapper($index);
    $tapas = array('Nacion', 'Herald', 'Pagina', 'Tiempo', 'Pais', 'Clarin', 'Cronista', 'Ambito', 'BAE', 'Perfil', 'Economista', 'Estadista');

    foreach ($s->query('//ul[@class="tapitas"]//a') as $a) {
        // The normalised name depends only on the link, so compute it once
        // rather than once per tracked title.
        $diario = str_replace(" ", "", stripAccents($a->text()));

        // Already resolved: downloading the page again could not change it.
        if (isset($ret[$diario])) {
            continue;
        }

        $tracked = false;
        foreach ($tapas as $tapa) {
            if (strpos($diario, $tapa) !== false) {
                $tracked = true;
                break;
            }
        }
        if (!$tracked) {
            continue;
        }

        // One download per link, however many tracked titles match it.
        $page = file_get_contents($a->attr('href'));
        if ($page === false) {
            continue;
        }

        $t = new Scrapper($page);
        foreach ($t->query('//img') as $img) {
            $ret[$diario] = $img->attr('src');
            // Stop at the first image that actually yields a src.
            if (isset($ret[$diario])) {
                break;
            }
        }
    }

    return $ret;
}
/**
 * Get the OMSZ radar map image.
 *
 * Asks Scrapper::omsz() for the remote image URL and hands it, together
 * with the map type and file extension, to
 * get_local_current_map_image_url().
 *
 * @link http://www.met.hu/en/idojaras/aktualis_idojaras/radar/
 *
 * @access public
 *
 * @return string $map Full URL of image.
 */
public function omsz_radar()
{
    $remote_url = Scrapper::omsz();

    return $this->get_local_current_map_image_url(
        array(
            'type' => 'omsz',
            'extension' => '.jpg',
            'remote_image_url' => $remote_url,
        )
    );
}
/**
 * Return the text of the first item description from the fivefilters.org
 * full-text feed built for the given URL.
 *
 * @param string $url Address passed (URL-encoded) to the full-text service.
 *
 * @return mixed Result of text() on the //item/description node.
 */
function get_content($url)
{
    $feed_url = "http://ftr.fivefilters.org/makefulltextfeed.php?url=" . urlencode($url);
    $feed = new Scrapper($feed_url, array('xml'));
    $description = $feed->node('//item/description');

    return $description->text();
}
/**
 * Get forecast info from SerbianMeteo.
 *
 * When Scrapper::serbianmeteo() returns a truthy value, adds formatted
 * 'w3c_time' and 'human_time' entries derived from 'time' and runs the
 * title and description through to_cyrillic() / format_forecast().
 * A falsy scraper result is returned untouched.
 *
 * @link http://serbianmeteo.com/
 *
 * @access public
 *
 * @return array $forecast Title and content of forecast.
 */
public function serbianmeteo_forecast()
{
    $forecast = Scrapper::serbianmeteo();

    // Nothing scraped: hand the falsy value back as-is.
    if (!$forecast) {
        return $forecast;
    }

    // Timestamps, machine- and human-readable.
    $forecast['w3c_time'] = date(DATE_W3C, $forecast['time']);
    $forecast['human_time'] = date_i18n('j. F Y. у H:i', $forecast['time']);

    // Text fields.
    $forecast['title'] = $this->to_cyrillic($forecast['title']);
    $forecast['description'] = $this->format_forecast($forecast['description']);

    return $forecast;
}
/**
 * Retrieve the LIPN publications, members and teams and assemble them into
 * a DB_Datas container.
 *
 * Echoes HTML headings and per-request timing information while running.
 *
 * @return DB_Datas Container filled with users, teams, userTeams,
 *                  university, laboratory, publications and userPublications.
 */
public function ProcessScrappingSite()
{
    // ##### Data retrieval #####
    echo "<h1> Recuperation des donnees </h1>";
    $time_all = microtime(true);

    $time_start = microtime(true);
    $Publications = parent::RetrieveDatas($this->urlPublications);
    $time_end = microtime(true);
    $time = round($time_end - $time_start, 2);
    echo "Temps Recuperation Publications: {$time} secondes <br>";

    $time_start = microtime(true);
    $Membres = parent::RetrieveDatas($this->urlMembres);
    $time_end = microtime(true);
    $time = round($time_end - $time_start, 2);
    echo "Temps Recuperation Membres: {$time} secondes <br>";

    $time_start = microtime(true);
    $Equipe = parent::RetrieveDatas($this->urlEquipe);
    $time_end = microtime(true);
    $time = round($time_end - $time_start, 2);
    echo "Temps Recuperation Equipe: {$time} secondes <br>";

    $time = round($time_end - $time_all, 2);
    echo "<br>Temps total : {$time} secondes <br>";

    // ##### Data processing #####
    echo "<h1> Traitement des donnees </h1>";
    $finalDatas = new DB_Datas();

    ## Members and teams ##
    $indice = 0;
    $indiceTeam = 0;
    $indiceUserTeam = 0;
    foreach ($Membres['results']['collection1'] as $membre) {
        // The first element is effectively empty; it is skipped but still
        // counted so that user ids keep following the collection index.
        if ($indice > 0) {
            // Trim first so surrounding whitespace cannot yield an empty
            // leading name part.
            $nomPrenom = explode(" ", trim($membre['Nom']['text']));
            $nom = $nomPrenom[0];
            $prenom = count($nomPrenom) > 1 ? $nomPrenom[1] : "";

            $fonction = $membre['Profession'];
            $equipe = $membre['Equipe'];
            $descriptionEquipe = "";
            $url_site = "http://www-lipn.univ-paris13.fr/~{$this->formatStringUniforme($nom)}/";
            $telephone = $membre['Téléphone'];
            $email = $this->CorrectionEmail($membre["Email"]);
            $bureau = $membre['Salle'];
            $age = "";
            $idUser = $this->idLaboratory + 100 * $indice;

            if ($equipe != "") {
                $idTeam = $this->idLaboratory + 100 * ($indiceTeam + 1);
                $idTeamContained = $this->getIdElementByName($equipe, $finalDatas->teams);
                if ($idTeamContained == -1) {
                    // Team not added yet: register it and link the user to it.
                    $finalDatas->teams[$indiceTeam] = new Team($idTeam, $equipe, $descriptionEquipe, $this->idLaboratory);
                    $finalDatas->userTeams[$indiceUserTeam] = new UserTeam($idTeam, $idUser);
                    $indiceTeam++;
                } else {
                    $finalDatas->userTeams[$indiceUserTeam] = new UserTeam($idTeamContained, $idUser);
                }
                $indiceUserTeam++;
            }

            $finalDatas->users[$indice] = new User($idUser, $prenom, $nom, $age, $url_site, $telephone, $email, $bureau, $fonction);
        }
        $indice++;
    }

    $finalDatas->university[0] = new University($this->idUniversity, "Paris 13", "99 Avenue J.B. Clément - 93430 Villetaneuse");
    $finalDatas->laboratory[0] = new Laboratory($this->idLaboratory, "LIPN", "Laboratoire d'Informatique de l'université Paris-Nord", $this->idUniversity);

    ## Publications ##
    $indice = 0;
    $indiceUserPub = 0;
    foreach ($Publications['response']['docs'] as $publication) {
        $publicationName = $publication['title_s'][0];
        $date_publication = $publication['producedDate_s'];
        $authors = $publication['authFullName_s'];
        $description = "";
        $url = $publication['uri_s'];
        $idPublication = $this->idLaboratory + 100 * ($indice + 1);

        $finalDatas->publications[$indice] = new Publication($idPublication, $publicationName, $description, $url, $date_publication);

        foreach ($authors as $author) {
            // Same guard as for members: a single-word author name has no
            // second part to read.
            $splitedName = explode(" ", trim($author));
            $authorFirstName = $splitedName[0];
            $authorLastName = count($splitedName) > 1 ? $splitedName[1] : "";

            $idUserExisting = $this->getIdElementByFirstNameAndLastName($authorFirstName, $authorLastName, $finalDatas->users);
            if ($idUserExisting != -1) {
                $finalDatas->userPublications[$indiceUserPub++] = new UserPublication($idPublication, $idUserExisting);
            }
        }
        $indice++;
    }

    ## Team descriptions ##
    foreach ($Equipe['results']['collection1'] as $equipe) {
        $indexExistingTeam = $this->getIndexElementByName($equipe['Nom']['text'], $finalDatas->teams);
        if ($indexExistingTeam != -1) {
            $finalDatas->teams[$indexExistingTeam]->Description = $equipe['Description'];
        }
    }

    return $finalDatas;
}
$t = time() - 7 * 24 * 60 * 60; $y = date("Y", $t); $m = date("m", $t); $d = date("d", $t); $t1 = "{$d}/{$m}/{$y}"; $t = time(); $y = date("Y", $t); $m = date("m", $t); $d = date("d", $t); $t2 = "{$d}/{$m}/{$y}"; $s = new Scrapper("{$url}/resultados.html", array('post' => array('search' => $nombre, 'desde' => $t1, 'hasta' => $t2))); $primero = true; foreach ($s->query('//*[@id="seccion-arriba"]/article//a') as $a) { $link = $url . $a->attr('href'); $title = $a->text(); $t = new Scrapper($link); $author = $t->node('//li[@class="autor"]')->text(); if (strpos($author, $nombre) !== false) { $content = $t->node('//*[@id="header-noticia"]/h3')->html(); $content .= $t->node('//figure[@id="foto"]')->html(); $content .= $t->node('//div[@itemprop="articleBody"]')->html(); $content = preg_replace('#(?:<br\\s*/?>\\s*?)+#', '</p><p>', $content); $content = preg_replace('#src="/#', 'src="' . $url . '/', $content); $content = preg_replace('#href="/#', 'href="' . $url . '/', $content); $arts[] = new Article($link, $title, $author, $content); } } } return $arts; }; $get_content = function (&$art) {
<?php

/**
 * Entry script that can be called from the command line.
 *
 * Builds a Scrapper for one Sainsbury's category page and prints the array
 * returned by createJsonArray() as pretty-printed JSON between <pre> tags.
 */
require_once 'include/class.scraper.php';

// Category page to scrape.
$url = "http://www.sainsburys.co.uk/webapp/wcs/stores/servlet/CategoryDisplay?msg=&langId=44&categoryId=185749&storeId=10151&krypto=XkNJuRN9KLq6n6uH5Mz8vPJfKTPApwG8ucakhoeAQfAFnb5qGoUjeNIcE37XJqsLFKgqJpn0KSE2%0A4jWWsyGCqL9MrFzaSCurcTmEROPW2THpmiThWdfCBMkVMSAzC8evzTjPcnXztC7gZGu6swc%2BYE%2Bg%0Ar8dazzckCO9eiVbSKD7I%2BqoK45FoyfB5vK58kXkI%2FokZVqWZgIcEc7yXITiZeunS4A409YSjTfwF%0A9Y8J5255LV9jsLZBkMAnB%2Fl6zy53JRXhDEg%2BB7w7Lls7d16DjbsU0i4zzK4W15E%2BSTHddb4%3D#langId=44&storeId=10151&catalogId=10137&categoryId=185749&parent_category_rn=12518&top_category=12518&pageSize=20&orderBy=FAVOURITES_FIRST&searchTerm=&beginIndex=0&hideFilters=true";

$scrapper = new Scrapper($url);
$scraped = $scrapper->createJsonArray();

// Emit the opening tag, the JSON document and the closing tag in order.
echo '<pre>', json_encode($scraped, JSON_PRETTY_PRINT), "</pre>";
/**
 * Build the <img> tag for the lanacion.com.py cover matching the date of
 * $this->t.
 *
 * Scans the article links of the "tapa" category page for the first href
 * containing "YYYY/MM/DD", opens that page and reads the data-lazy-src
 * attribute of the image inside the single-content block.
 *
 * @return string|null HTML snippet, or null when no link matches the date.
 */
function py_lanacion()
{
    $datePath = date("Y/m/d", $this->t);
    $listing = new Scrapper('http://www.lanacion.com.py/category/tapa/', array('silence'));

    foreach ($listing->query('//article//a') as $link) {
        $href = $link->attr('href');
        if (strpos($href, $datePath) === false) {
            continue;
        }

        // First link for the requested date: read its cover image and stop.
        $page = new Scrapper($href, array('silence'));
        $img = $page->node('//div[@class="newsstand-blog-single-content"]/p//img')->attr('data-lazy-src');

        return '<img src="' . $img . '" style="width:100%;"><br>';
    }

    return null;
}
/**
 * Retrieve the L2TI members, publications and teams and assemble them into
 * a DB_Datas container.
 *
 * Echoes HTML headings, per-request timing information and a debug listing
 * of the collected teams while running.
 *
 * @return DB_Datas Container filled with users, teams, userTeams,
 *                  university, laboratory, publications and userPublications.
 */
public function ProcessScrappingSite()
{
    // ##### Data retrieval #####
    echo "<h1> Recuperation des donnees </h1>";
    $time_all = microtime(true);

    $time_start = microtime(true);
    $Membres = parent::RetrieveDatas($this->urlMembres);
    $time_end = microtime(true);
    $time = round($time_end - $time_start, 2);
    echo "Temps Recuperation Membres: {$time} secondes <br>";

    // NOTE(review): this total is printed before the publications and teams
    // are retrieved further down, so it only covers the members request.
    $time = round($time_end - $time_all, 2);
    echo "<br>Temps total : {$time} secondes <br>";

    // ##### Data processing #####
    echo "<h1> Traitement des donnees </h1>";
    $finalDatas = new DB_Datas();

    ## Members and teams ##
    $indice = 0;
    $indiceTeam = 0;
    $indiceUserTeam = 0;
    foreach ($Membres['results']['collection1'] as $membre) {
        // The first element is effectively empty; it is skipped but still
        // counted so that user ids keep following the collection index.
        if ($indice > 0) {
            // Trim first so surrounding whitespace cannot yield an empty
            // leading name part; a single-word name has no second part.
            $nomPrenom = explode(" ", trim($membre['Nom']['text']));
            $nom = $nomPrenom[0];
            $prenom = count($nomPrenom) > 1 ? $nomPrenom[1] : "";

            $fonction = $membre['Profession'];
            $equipe = $membre['Equipe'];
            $descriptionEquipe = "";
            $url_site = $membre['url'];
            $telephone = $membre['Téléphone'];
            $email = $membre['Email'];
            $bureau = $membre['Salle'];
            $age = "";

            if ($equipe != "") {
                $idTeamContained = $this->getIdElementByName($equipe, $finalDatas->teams);
                if ($idTeamContained == -1) {
                    // Team not added yet: register it and link the user to it.
                    $finalDatas->teams[$indiceTeam] = new Team($indiceTeam, $equipe, $descriptionEquipe, $this->idLaboratory);
                    $finalDatas->userTeams[$indiceUserTeam] = new UserTeam($indiceTeam, $indice);
                    $indiceTeam++;
                } else {
                    $finalDatas->userTeams[$indiceUserTeam] = new UserTeam($idTeamContained, $indice);
                }
                $indiceUserTeam++;
            }

            $finalDatas->users[$indice] = new User($indice, $prenom, $nom, $age, $url_site, $telephone, $email, $bureau, $fonction);
        }
        $indice++;
    }

    $finalDatas->university[0] = new University($this->idUniversity, "Paris 13", "99 Avenue J.B. Clément - 93430 Villetaneuse");
    $finalDatas->laboratory[0] = new Laboratory($this->idLaboratory, "L2TI", "Laboratoire de traitement et transport de l'information ", $this->idUniversity);

    ## Publications ##
    $time_start = microtime(true);
    $Publications = parent::RetrieveDatas($this->urlPublications);
    $time_end = microtime(true);
    $time = round($time_end - $time_start, 2);
    echo "Temps Recuperation Publications: {$time} secondes <br>";

    $indice = 0;
    $indiceUserPub = 0;
    foreach ($Publications['response']['docs'] as $publication) {
        $publicationName = $publication['title_s'][0];
        $date_publication = $publication['producedDate_s'];
        $authors = $publication['authFullName_s'];
        $description = "";
        $url = $publication['uri_s'];

        $finalDatas->publications[$indice] = new Publication($indice, $publicationName, $description, $url, $date_publication);

        foreach ($authors as $author) {
            // A single-word author name has no second part to read.
            $splitedName = explode(" ", trim($author));
            $authorFirstName = $splitedName[0];
            $authorLastName = count($splitedName) > 1 ? $splitedName[1] : "";

            $idUserExisting = $this->getIdElementByFirstNameAndLastName($authorFirstName, $authorLastName, $finalDatas->users);
            if ($idUserExisting != -1) {
                $finalDatas->userPublications[$indiceUserPub++] = new UserPublication($indice, $idUserExisting);
            }
        }
        $indice++;
    }

    ## Team descriptions ##
    $time_start = microtime(true);
    $Equipe = parent::RetrieveDatas($this->urlEquipe);
    $time_end = microtime(true);
    $time = round($time_end - $time_start, 2);
    echo "Temps Recuperation Equipe: {$time} secondes <br>";

    foreach ($Equipe['results']['collection1'] as $equipe) {
        $indexExistingTeam = $this->getIndexElementByName($equipe['Nom']['text'], $finalDatas->teams);
        if ($indexExistingTeam != -1) {
            $finalDatas->teams[$indexExistingTeam]->Description = $equipe['Description'];
        }
    }

    ##### Test output: list the collected teams #####
    $indice = 0;
    foreach ($finalDatas->teams as $team) {
        echo "Team No {$indice} : { Id = '{$team->Id}', Name = '{$team->Name}', Description = '{$team->Description}', LaboratoryId = '{$team->LaboratoryId}' } <br>";
        $indice++;
    }

    return $finalDatas;
}