/**
 * Scrapes every course page of the initiative and creates the courses and
 * offerings that are not in the database yet.
 *
 * Works in two passes: first each course page is downloaded and parsed into
 * a plain array of details, then a Course/Offering pair is built for every
 * parsed page. Existing courses are reused and existing offerings are
 * skipped; updates to existing records are not performed.
 *
 * New records are only built when doCreate() is true and only written to
 * the database when doModify() is true.
 *
 * @return array Offering objects built during this run. An offering is
 *               included even when it was not persisted (doModify() off)
 *               or when persisting it failed.
 */
public function scrape()
{
    $em = $this->getManager();

    // Offerings built during this run
    $offerings = array();

    $this->out("Scraping " . $this->initiative->getName());

    // Step 1: Getting a list of course URLs
    $this->out("Getting a list of course pages");
    $urls = $this->getListOfCoursePages();
    $urlsCount = count($urls);

    // Step 2: Go through each page and collect the course details
    $this->out("Number of courses found: {$urlsCount}");
    $this->out("Gathering details about each course");
    $courseDetails = array();
    foreach ($urls as $url) {
        if (!$url) {
            continue;
        }

        // A failed download used to be handed to the parser as `false`;
        // report it and move on to the next page instead.
        $html = file_get_contents(self::BASE_URL . $url);
        if ($html === false) {
            $this->out("Could not download course page " . self::BASE_URL . $url);
            continue;
        }

        $courseDetail = array();
        $this->domParser->load($html);

        // Ignore self paced: pages without an offering date block are skipped
        if (!$this->domParser->find('h2.offering_dates_date', 0)) {
            $this->domParser->clear();
            continue;
        }

        // Get name and shortName from a title of the form "Name (ShortName)"
        $nameString = $this->domParser->find('h1.page-title', 0)->plaintext;
        $openBracketPosition = strpos($nameString, '(');
        $closeBracketPosition = strpos($nameString, ')');
        // "- 1" drops the space in front of the opening bracket.
        // NOTE(review): strpos() returns false when the title has no
        // brackets, which turns the length into -1 and cuts off the last
        // character of the name - presumably the reason for the special
        // case right below. Confirm before generalising.
        $courseDetail['name'] = substr($nameString, 0, $openBracketPosition - 1);
        $courseDetail['shortName'] = substr(
            $nameString,
            $openBracketPosition + 1,
            $closeBracketPosition - $openBracketPosition - 1
        );
        if ($courseDetail['name'] === 'Introduction to Nursing in Healthcar') {
            $courseDetail['name'] = 'Introduction to Nursing in Healthcare';
            $courseDetail['shortName'] = 'IntroNur';
        }

        // Get the video id from the url
        // eg. www.youtube.com/embed/Bw8HkjGQb3U?wmode=opaque&rel=0&showinfo=0
        // The id is read from a fixed offset of the 'http://'-prefixed
        // iframe src up to the first '?'.
        $youtubeIdPosition = 31;
        $video = 'http://' . $this->domParser->find('iframe.media-youtube-player', 0)->src;
        $questionMarkPosition = strpos($video, '?');
        $courseDetail['video'] = 'http://www.youtube.com/watch?v='
            . substr($video, $youtubeIdPosition, $questionMarkPosition - $youtubeIdPosition);

        // Instructor tagline: strip the first three characters (the 'by'
        // prefix) and split the remaining names on ' & '
        $instructors = trim($this->domParser->find('div[id=subject-teacher-tagline]', 0)->plaintext);
        $instructors = substr($instructors, 3);
        $courseDetail['instructors'] = explode(' & ', $instructors);

        $courseDetail['desc'] = $this->domParser->find('div.offering_body', 0)->plaintext;
        // The first date block is used as start date, the second as end date
        $courseDetail['start_date'] = $this->domParser->find('h2.offering_dates_date', 0)->plaintext;
        $courseDetail['end_date'] = $this->domParser->find('h2.offering_dates_date', 1)->plaintext;
        $courseDetail['url'] = $url;

        $courseDetails[] = $courseDetail;
        $this->domParser->clear();
    }

    $this->out(count($courseDetails) . ' course pages found');

    // Default stream
    $stream = $this->dbHelper->getStreamBySlug('business');
    $this->out("Default stream is " . $stream->getName());

    foreach ($courseDetails as $courseDetail) {
        /**
         * Taking a shortcut here. Check if a course is created or not. If it isn't create the
         * course, offering, etc. Updates are ignored
         * TODO: Not take a shortcut
         */

        // Build a course object
        $course = new Course();
        $courseShortName = 'open2study_' . $courseDetail['shortName'];
        $course->setShortName($courseShortName);
        $course->setInitiative($this->initiative);
        $course->setName($courseDetail['name']);
        $course->setDescription($courseDetail['desc']);
        $course->setStream($stream); // Default to Business
        $course->setVideoIntro($courseDetail['video']);
        $course->setUrl(self::BASE_URL . $courseDetail['url']);

        $dbCourse = $this->dbHelper->getCourseByShortName($courseShortName);
        if ($dbCourse) {
            // Course already exists: attach the offering to the stored one
            $course = $dbCourse;
        } elseif ($this->doCreate()) {
            // New course
            $this->out("NEW COURSE - " . $course->getName());
            if ($this->doModify()) {
                foreach ($courseDetail['instructors'] as $instructor) {
                    $course->addInstructor($this->dbHelper->createInstructorIfNotExists($instructor));
                }
                $em->persist($course);
                $em->flush();
            }
        }

        // Check if offering exists; existing offerings are left untouched
        $shortName = $this->getOfferingShortName($courseDetail);
        $offering = $this->dbHelper->getOfferingByShortName($shortName);
        if ($offering) {
            continue;
        }

        // Check if create offering is on. Nothing is added to the result
        // here: at this point there is no offering object, and the old code
        // appended the empty lookup result (null) to the returned list.
        if (!$this->doCreate()) {
            continue;
        }

        $offering = new Offering();
        $offering->setCourse($course);
        $offering->setStartDate(\DateTime::createFromFormat("d/m/Y", $courseDetail['start_date']));
        $offering->setEndDate(\DateTime::createFromFormat("d/m/Y", $courseDetail['end_date']));
        $offering->setStatus(Offering::START_DATES_KNOWN);
        $offering->setLength(4);
        $offering->setShortName($shortName);
        $offering->setUrl(self::BASE_URL . $courseDetail['url']);
        $offering->setVideoIntro($courseDetail['video']);
        $offering->setSearchDesc($courseDetail['desc']);
        $offering->setCreated(new \DateTime());

        if ($this->doModify()) {
            // Best effort: a failed insert is reported and the run continues
            try {
                $em->persist($offering);
                $em->flush();
                $this->out("OFFERING {$courseDetail['name']} created");
            } catch (\Exception $e) {
                $this->out("OFFERING {$courseDetail['name']} creation FAILED");
            }
        }

        $offerings[] = $offering;
    }

    return $offerings;
}
/**
 * Builds a Course entity for an on-demand course from a decoded API response.
 *
 * Reads the first entry of $data['elements'] (slug, name, description,
 * primaryLanguageCodes, isVerificationEnabled, id) plus the linked
 * 'partners.v1' and 'instructors.v1' lists. The syllabus and the course
 * length are fetched with two additional HTTP requests; when either request
 * fails or returns nothing usable, that piece is simply left unset.
 *
 * The course is only built here, it is not persisted.
 *
 * @param array $data Decoded API response describing one course
 *
 * @return Course
 */
private function getOnDemandCourse($data = array())
{
    $dbLanguageMap = $this->dbHelper->getLanguageMap();
    $element = $data['elements'][0];
    $slug = $element['slug'];

    $course = new Course();
    // Short name is capped at 49 characters
    $course->setShortName(substr('coursera_' . $slug, 0, 49));
    $course->setInitiative($this->initiative);
    $course->setName($element['name']);
    $course->setDescription($element['description']);
    $course->setLongDescription(nl2br($element['description']));
    $course->setStream($this->dbHelper->getStreamBySlug('cs')); // Default to Computer Science
    $course->setUrl('https://www.coursera.org/learn/' . $slug);

    // Resolve the language in two steps: API code -> language name -> DB
    // language. Both lookups are guarded so that an unknown or missing code
    // ends up in the "not found" message instead of raising an
    // undefined-index error on the static map.
    $languageCode = isset($element['primaryLanguageCodes'][0]) ? $element['primaryLanguageCodes'][0] : null;
    $lang = ($languageCode !== null && isset(self::$languageMap[$languageCode]))
        ? self::$languageMap[$languageCode]
        : null;
    if ($lang !== null && isset($dbLanguageMap[$lang])) {
        $course->setLanguage($dbLanguageMap[$lang]);
    } else {
        $this->out("Language not found " . $languageCode);
    }

    $course->setCertificate(false);
    $course->setVerifiedCertificate($element['isVerificationEnabled']);

    // Add the university
    $partners = isset($data['linked']['partners.v1']) ? $data['linked']['partners.v1'] : array();
    foreach ($partners as $university) {
        $ins = new Institution();
        $ins->setName($university['name']);
        $ins->setIsUniversity(true);
        $ins->setSlug($university['shortName']);
        $course->addInstitution($this->dbHelper->createInstitutionIfNotExists($ins));
    }

    // Add the instructors; fall back to "first last" when no full name is given
    $instructors = isset($data['linked']['instructors.v1']) ? $data['linked']['instructors.v1'] : array();
    foreach ($instructors as $courseraInstructor) {
        if (!empty($courseraInstructor['fullName'])) {
            $insName = $courseraInstructor['fullName'];
        } else {
            $insName = $courseraInstructor['firstName'] . ' ' . $courseraInstructor['lastName'];
        }
        $course->addInstructor($this->dbHelper->createInstructorIfNotExists($insName));
    }

    // Get Course Details like Syllabus and length
    $detailsJson = file_get_contents(sprintf(self::ONDEMAND_OPENCOURSE_API, $slug));
    $courseDetails = ($detailsJson === false) ? null : json_decode($detailsJson, true);
    if (!empty($courseDetails)) {
        $modules = array();
        if (isset($courseDetails['courseMaterial']['elements'])
            && is_array($courseDetails['courseMaterial']['elements'])
        ) {
            $modules = $courseDetails['courseMaterial']['elements'];
        }

        // One "<b>name</b><br/>description" paragraph per course material element
        $syllabus = '';
        foreach ($modules as $item) {
            $syllabus .= "<b>{$item['name']}</b><br/>{$item['description']}<br/><br/>";
        }
        $course->setSyllabus($syllabus);
    }

    // Calculate the length of the course: sum of the weeks of all periods
    // of the default schedule
    $scheduleJson = file_get_contents(sprintf(self::ONDEMAND_COURSE_SCHEDULE, $element['id']));
    $schedule = ($scheduleJson === false) ? null : json_decode($scheduleJson, true);
    if (!empty($schedule)) {
        $periods = array();
        if (isset($schedule['elements'][0]['defaultSchedule']['periods'])
            && is_array($schedule['elements'][0]['defaultSchedule']['periods'])
        ) {
            $periods = $schedule['elements'][0]['defaultSchedule']['periods'];
        }

        $length = 0;
        foreach ($periods as $period) {
            if (isset($period['numberOfWeeks'])) {
                $length += $period['numberOfWeeks'];
            }
        }
        // A length of zero means "unknown" and is not stored
        if ($length > 0) {
            $course->setLength($length);
        }
    }

    return $course;
}
/**
 * Builds a Course entity for an on-demand course from a decoded API response.
 *
 * Reads the first entry of $data['elements'] (slug, name, description,
 * primaryLanguageCodes, isVerificationEnabled) plus the linked
 * 'partners.v1' and 'instructors.v1' lists.
 *
 * The course is only built here, it is not persisted.
 *
 * @param array $data Decoded API response describing one course
 *
 * @return Course
 */
private function getOnDemandCourse($data = array())
{
    $dbLanguageMap = $this->dbHelper->getLanguageMap();
    $element = $data['elements'][0];
    $slug = $element['slug'];

    $course = new Course();
    // Short name is capped at 49 characters
    $course->setShortName(substr('coursera_' . $slug, 0, 49));
    $course->setInitiative($this->initiative);
    $course->setName($element['name']);
    $course->setDescription($element['description']);
    $course->setLongDescription(nl2br($element['description']));
    $course->setStream($this->dbHelper->getStreamBySlug('cs')); // Default to Computer Science
    $course->setUrl('https://www.coursera.org/learn/' . $slug);

    // Resolve the language in two steps: API code -> language name -> DB
    // language. Both lookups are guarded so that an unknown or missing code
    // ends up in the "not found" message instead of raising an
    // undefined-index error on the static map.
    $languageCode = isset($element['primaryLanguageCodes'][0]) ? $element['primaryLanguageCodes'][0] : null;
    $lang = ($languageCode !== null && isset(self::$languageMap[$languageCode]))
        ? self::$languageMap[$languageCode]
        : null;
    if ($lang !== null && isset($dbLanguageMap[$lang])) {
        $course->setLanguage($dbLanguageMap[$lang]);
    } else {
        $this->out("Language not found " . $languageCode);
    }

    $course->setCertificate(false);
    $course->setVerifiedCertificate($element['isVerificationEnabled']);

    // Add the university
    $partners = isset($data['linked']['partners.v1']) ? $data['linked']['partners.v1'] : array();
    foreach ($partners as $university) {
        $ins = new Institution();
        $ins->setName($university['name']);
        $ins->setIsUniversity(true);
        $ins->setSlug($university['shortName']);
        $course->addInstitution($this->dbHelper->createInstitutionIfNotExists($ins));
    }

    // Add the instructors; fall back to "first last" when no full name is given
    $instructors = isset($data['linked']['instructors.v1']) ? $data['linked']['instructors.v1'] : array();
    foreach ($instructors as $courseraInstructor) {
        if (!empty($courseraInstructor['fullName'])) {
            $insName = $courseraInstructor['fullName'];
        } else {
            $insName = $courseraInstructor['firstName'] . ' ' . $courseraInstructor['lastName'];
        }
        $course->addInstructor($this->dbHelper->createInstructorIfNotExists($insName));
    }

    return $course;
}