/**
 * Scrapes the initiative's course pages and creates the courses/offerings
 * that are not in the database yet.
 *
 * Works in two passes:
 *  1. Download every page returned by getListOfCoursePages() and collect the
 *     course details (name, short name, video, instructors, description,
 *     start/end date, url). Pages without an 'h2.offering_dates_date'
 *     element are treated as self paced and ignored.
 *  2. For every collected course, look the course and its offering up by
 *     short name and build the missing ones. Existing courses and offerings
 *     are left untouched (updates are ignored).
 *
 * Objects are only built when doCreate() is true and only persisted when
 * doModify() is true.
 *
 * @return array Offering objects built during this run
 */
public function scrape()
{
    $em = $this->getManager();
    // Offerings built during this run
    $offerings = array();

    $this->out("Scraping " . $this->initiative->getName());

    // Step 1: Getting a list of course URLs
    $this->out("Getting a list of course pages");
    $urls = $this->getListOfCoursePages();
    $urlsCount = count($urls);

    // Step 2: Go through the pages and gather the details of each course
    $this->out("Number of courses found: {$urlsCount}");
    $this->out("Gathering details about each course");
    $courseDetails = array();
    foreach ($urls as $url) {
        if (!$url) {
            continue;
        }

        // file_get_contents() returns false on failure; do not feed that to the parser
        $html = file_get_contents(self::BASE_URL . $url);
        if ($html === false) {
            $this->out("Could not download course page: {$url}");
            continue;
        }
        $this->domParser->load($html);

        // Ignore self paced
        $startDateNode = $this->domParser->find('h2.offering_dates_date', 0);
        if (!$startDateNode) {
            $this->domParser->clear();
            continue;
        }
        $endDateNode = $this->domParser->find('h2.offering_dates_date', 1);

        $titleNode = $this->domParser->find('h1.page-title', 0);
        if (!$titleNode) {
            $this->out("No course title found on page: {$url}");
            $this->domParser->clear();
            continue;
        }

        // Get name and shortName; the title is expected to look like "Course Name (ShortName)"
        $nameString = $titleNode->plaintext;
        $openBracketPosition = strpos($nameString, '(');
        $closeBracketPosition = strpos($nameString, ')');
        $hasShortName = $openBracketPosition !== false
            && $closeBracketPosition !== false
            && $closeBracketPosition > $openBracketPosition;

        if ($hasShortName) {
            $name = rtrim(substr($nameString, 0, $openBracketPosition));
            $shortName = substr(
                $nameString,
                $openBracketPosition + 1,
                $closeBracketPosition - $openBracketPosition - 1
            );
        } else {
            // Without this branch strpos() === false would make substr() drop
            // the last character of the name
            $name = trim($nameString);
            $shortName = null;
        }

        // Known title that needs a fixed name/short name. The truncated
        // spelling is what the previous parsing produced, so both are accepted
        // and the stored short name stays the same.
        if ($name === 'Introduction to Nursing in Healthcar'
            || ($shortName === null && $name === 'Introduction to Nursing in Healthcare')
        ) {
            $name = 'Introduction to Nursing in Healthcare';
            $shortName = 'IntroNur';
        }

        if ($shortName === null) {
            $this->out("Could not determine the short name for '{$name}' ({$url})");
            $this->domParser->clear();
            continue;
        }

        $courseDetail = array();
        $courseDetail['name'] = $name;
        $courseDetail['shortName'] = $shortName;

        // Get the video id from the embed url
        // eg. //www.youtube.com/embed/Bw8HkjGQb3U?wmode=opaque&rel=0&showinfo=0
        $courseDetail['video'] = null;
        $playerNode = $this->domParser->find('iframe.media-youtube-player', 0);
        if ($playerNode && preg_match('~/embed/([^?&/#]+)~', (string) $playerNode->src, $matches)) {
            $courseDetail['video'] = 'http://www.youtube.com/watch?v=' . $matches[1];
        }

        // The tagline is split on ' & ' into individual instructor names
        $taglineNode = $this->domParser->find('div[id=subject-teacher-tagline]', 0);
        $instructors = $taglineNode ? trim($taglineNode->plaintext) : '';
        // Remove the leading 'by', but only when it is really there
        $instructors = preg_replace('/^by\s+/i', '', $instructors);
        $courseDetail['instructors'] = ($instructors === '') ? array() : explode(' & ', $instructors);

        $bodyNode = $this->domParser->find('div.offering_body', 0);
        $courseDetail['desc'] = $bodyNode ? $bodyNode->plaintext : null;

        $courseDetail['start_date'] = $startDateNode->plaintext;
        $courseDetail['end_date'] = $endDateNode ? $endDateNode->plaintext : null;
        $courseDetail['url'] = $url;

        $courseDetails[] = $courseDetail;
        $this->domParser->clear();
    }

    $this->out(count($courseDetails) . ' course pages found');

    // Default stream
    $stream = $this->dbHelper->getStreamBySlug('business');
    $this->out("Default stream is " . $stream->getName());

    foreach ($courseDetails as $courseDetail) {
        /**
         * Taking a shortcut here. Check if a course is created or not. If it isn't create the
         * course, offering, etc. Updates are ignored
         * TODO: Not take a shortcut
         */
        $courseShortName = 'open2study_' . $courseDetail['shortName'];
        $pageUrl = self::BASE_URL . $courseDetail['url'];

        $course = $this->dbHelper->getCourseByShortName($courseShortName);
        if (!$course) {
            // Build a course object only when the course is not in the database
            $course = new Course();
            $course->setShortName($courseShortName);
            $course->setInitiative($this->initiative);
            $course->setName($courseDetail['name']);
            $course->setDescription($courseDetail['desc']);
            $course->setStream($stream); // Default to Business
            if ($courseDetail['video'] !== null) {
                $course->setVideoIntro($courseDetail['video']);
            }
            $course->setUrl($pageUrl);

            if ($this->doCreate()) {
                // New course
                $this->out("NEW COURSE - " . $course->getName());
                if ($this->doModify()) {
                    foreach ($courseDetail['instructors'] as $instructor) {
                        $course->addInstructor($this->dbHelper->createInstructorIfNotExists($instructor));
                    }
                    $em->persist($course);
                    $em->flush();
                }
            }
        }

        // Check if offering exists; existing offerings are not updated
        $shortName = $this->getOfferingShortName($courseDetail);
        if ($this->dbHelper->getOfferingByShortName($shortName)) {
            continue;
        }

        // Check if create offering is on. Nothing was built, so nothing is
        // added to the result (previously a null entry was appended here).
        if (!$this->doCreate()) {
            continue;
        }

        // createFromFormat() returns false when the text is not a d/m/Y date
        $startDate = is_string($courseDetail['start_date'])
            ? \DateTime::createFromFormat("d/m/Y", trim($courseDetail['start_date']))
            : false;
        $endDate = is_string($courseDetail['end_date'])
            ? \DateTime::createFromFormat("d/m/Y", trim($courseDetail['end_date']))
            : false;
        if (!$startDate || !$endDate) {
            $this->out("OFFERING {$courseDetail['name']} skipped - could not parse start/end date");
            continue;
        }

        $offering = new Offering();
        $offering->setCourse($course);
        $offering->setStartDate($startDate);
        $offering->setEndDate($endDate);
        $offering->setStatus(Offering::START_DATES_KNOWN);
        $offering->setLength(4);
        $offering->setShortName($shortName);
        $offering->setUrl($pageUrl);
        if ($courseDetail['video'] !== null) {
            $offering->setVideoIntro($courseDetail['video']);
        }
        $offering->setSearchDesc($courseDetail['desc']);
        $offering->setCreated(new \DateTime());

        if ($this->doModify()) {
            try {
                $em->persist($offering);
                $em->flush();
                $this->out("OFFERING {$courseDetail['name']} created");
            } catch (\Exception $e) {
                // Report the reason instead of dropping it
                $this->out("OFFERING {$courseDetail['name']} creation FAILED: " . $e->getMessage());
            }
        }

        $offerings[] = $offering;
    }

    return $offerings;
}