Created
November 22, 2012 22:48
-
-
Save soulcyon/4133209 to your computer and use it in GitHub Desktop.
NJIT Course Schedule Scraper
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
<?php | |
/** | |
* NJIT Course Schedule Scraper (11/21/2012) | |
* | |
* Using http://courseschedules.njit.edu/, this PHP scraper uses cURL to build a large
* associative array of all courses. The user may restrict the scrape to particular subjects
* and/or courses through the onlySubject and onlyCourse methods. The resulting array is keyed
* by subject, then by course, then by section: each course entry stores its name and
* description, and each section entry stores the section data shown in the example below.
* | |
* Example: | |
* $scraper = new NJITScrape("2013s", true); | |
* $scraper->onlySubject("CS"); | |
* $scraper->onlyCourse("490"); | |
* $courseData = $scraper->start(); | |
* | |
* JSON Representation of $courseData will look like this: | |
* { | |
* "CS": { | |
* "name": "Computer Science", | |
* "490": { | |
* "name": "DESIGN IN SOFTWARE ENGR", | |
* "description": "Prerequisite: senior standing or departmental approval. This course | |
* focuses on the methodology for developing software systems. Methods and techniques | |
* for functional requirements analysis and specifications, design, coding, testing and | |
* proving integration and maintenance are discussed.", | |
* "102": { | |
* "call": 21239, | |
* "comments": "", | |
* "days": "F:600PM - 905PM", | |
* "rooms": "CKB207", | |
* "status": "Open", | |
* "maxStudents": 36, | |
* "currentStudents": 28, | |
* "instructor": "Nicholson Theod", | |
* "instructorId": "THEO", | |
* "credits": 3 | |
* } | |
* } | |
* } | |
* } | |
* | |
* @author Sashank Tadepalli <[email protected]> | |
* @license Creative Commons Attribution 3.0 Unported License. | |
* @version 1.0 | |
* @link http://dijjit.com/php/njit-course-schedule-scraper/ | |
*/ | |
// SimpleHTMLDOM - Web Scraping Utility | |
// Please download from http://simplehtmldom.sourceforge.net/ | |
require_once "simplehtmldom.php"; | |
/**
 * Scrapes the NJIT course schedule site for one semester and returns the result as a
 * nested array keyed by subject id, then course id, then section id.
 *
 * Relies on the SimpleHTMLDOM functions file_get_html() / str_get_html() being loaded by
 * the including file, and on the cURL extension for the paginated and per-course requests.
 */
class NJITScrape {
    /** Base URL every request is built on. */
    private $page = "http://courseschedules.njit.edu/index.aspx";
    /** Semester code sent as the "semester" query parameter (e.g. "2013s"). */
    private $semester;
    /** Subject currently being scraped. */
    private $subjectId;
    /** Substring filter for subject ids; "" disables filtering. */
    private $onlySubjectId = "";
    private $subjectName;
    /** Course currently being scraped. */
    private $courseId;
    /** Substring filter for course ids; "" disables filtering. */
    private $onlyCourseId = "";
    private $courseName;
    /** Accumulated result returned by start(). */
    private $data = array();
    /** When true, progress is echoed as HTML and flushed after every step. */
    private $FLUSH_DEBUG = true;

    /**
     * @param string $startSemester Semester code to scrape.
     * @param bool   $flush         Whether to echo and flush progress output.
     */
    public function __construct($startSemester, $flush = true){
        // A full scrape issues one request per course, so lift the execution time limit.
        set_time_limit(0);
        $this->FLUSH_DEBUG = (bool) $flush;
        $this->semester = $startSemester;
    }

    /**
     * Runs the scrape and returns the collected data.
     *
     * @return array subjectId => array("name" => ..., courseId => array("name" => ...,
     *               "description" => ..., sectionId => section fields))
     * @throws RuntimeException When the semester index page cannot be loaded.
     */
    public function start(){
        // Start from a clean slate so calling start() twice does not merge two runs.
        $this->data = array();
        $this->debug("<h1>" . $this->escape($this->semester) . "</h1>");
        $this->scrapeSemester();
        return $this->data;
    }

    /**
     * Restricts the scrape to subjects whose id contains $subjectId.
     *
     * @param string $subjectId Substring to look for; "" removes the restriction.
     */
    public function onlySubject($subjectId){
        $this->onlySubjectId = $subjectId;
    }

    /**
     * Restricts the scrape to courses whose id contains $courseId.
     *
     * @param string $courseId Substring to look for; "" removes the restriction.
     */
    public function onlyCourse($courseId){
        $this->onlyCourseId = $courseId;
    }

    /**
     * Reads the subject list from the semester index page and scrapes each subject that
     * passes the subject filter.
     */
    private function scrapeSemester(){
        $html = file_get_html($this->buildUrl(array("semester" => $this->semester)));
        if( !$html ){
            throw new RuntimeException("Unable to load the subject list for semester " . $this->semester);
        }

        // Copy what we need out of the DOM first so it can be released before the
        // (long-running) per-subject scraping starts.
        $subjects = array();
        foreach( $html->find("span a") as $ele ){
            $label = $this->splitLabel($ele->plaintext);
            // Links that are not of the form "ID - Name" are not subject links.
            if( $label === null || $label[0] === "" ){
                continue;
            }
            $subjects[] = $label;
        }
        $html->clear();
        unset($html);

        foreach( $subjects as $subject ){
            if( !$this->passesFilter($subject[0], $this->onlySubjectId) ){
                continue;
            }
            $this->subjectId = $subject[0];
            $this->subjectName = $subject[1];
            $this->data[$this->subjectId] = array("name" => $this->subjectName);
            $this->debug($this->escape($this->subjectName) . "<br />");
            $this->scrapeSubject();
        }
    }

    /**
     * Collects every course of the current subject (following the pagination postbacks)
     * and scrapes the sections of each course that passes the course filter.
     */
    private function scrapeSubject(){
        $url = $this->buildUrl(array(
            "semester" => $this->semester,
            "subjectID" => $this->subjectId
        ));
        $html = file_get_html($url);
        if( !$html ){
            // Same best-effort policy as failed POST requests: skip and carry on.
            return;
        }

        $courses = $this->extractCourses($html);

        // Pagination links are counted and halved; an odd count is rounded up so the
        // number of requested pages matches the original "count / 2 + 1" loop bound.
        $linkCount = count($html->find(".pagination_wrapper a"));
        $pagecount = (int) ceil($linkCount / 2) + 1;
        $vs = $this->hiddenField($html, "#__VIEWSTATE");
        $evt = $this->hiddenField($html, "#__EVENTVALIDATION");
        $html->clear();
        unset($html);

        // Without both hidden fields the postback cannot be reproduced, so only the
        // first page is used.
        if( $vs === null || $evt === null ){
            $pagecount = 1;
        }

        // Page 1 is the GET response above; every further page is requested by posting
        // the grid's paging event back to the same URL.
        for( $pageNo = 2; $pageNo <= $pagecount; $pageNo++ ){
            $pageHTML = $this->postRequest($url, array(
                "__EVENTARGUMENT" => "Page\$" . $pageNo,
                "__EVENTTARGET" => "ctl10\$GridView1",
                "__VIEWSTATE" => $vs,
                "__VIEWSTATEENCRYPTED" => "",
                "__EVENTVALIDATION" => $evt,
                "__LASTFOCUS" => "",
                "ctl10\$ddlSemester" => $this->semester
            ));
            if( !$pageHTML ){
                continue;
            }
            $pageDom = str_get_html($pageHTML);
            if( !$pageDom ){
                continue;
            }
            $courses = array_merge($courses, $this->extractCourses($pageDom));
            $pageDom->clear();
            unset($pageDom);
        }

        foreach( $courses as $course ){
            if( !$this->passesFilter($course[0], $this->onlyCourseId) ){
                continue;
            }
            $this->courseId = $course[0];
            $this->courseName = $course[1];
            $this->data[$this->subjectId][$this->courseId] = array("name" => $this->courseName);

            $courseUrl = $this->buildUrl(array(
                "semester" => $this->semester,
                "subjectID" => $this->subjectId,
                "course" => $this->courseId
            ));
            // An empty field list makes postRequest() issue a plain GET.
            $sectionHTML = $this->postRequest($courseUrl, array());
            if( !$sectionHTML ){
                continue;
            }
            $this->debug($this->escape($this->courseName) . "<br />");
            $this->scrapeSections($sectionHTML);
        }
    }

    /**
     * Parses a course page: stores the course description and one entry per section row
     * under the current subject/course.
     *
     * @param string $sectionHTML Raw HTML of the course page.
     */
    private function scrapeSections($sectionHTML){
        $html = str_get_html($sectionHTML);
        if( !$html ){
            $this->data[$this->subjectId][$this->courseId]["description"] = "";
            return;
        }

        $descNodes = $html->find("#ctl10_lblCourseDesc");
        $description = (count($descNodes) == 0) ? "" : $descNodes[0]->plaintext;
        $this->data[$this->subjectId][$this->courseId]["description"] = trim($description);

        foreach( $html->find("#ctl10_gv_sectionTable .sectionRow") as $row ){
            $cells = $row->find("td");
            // Cells 0..8 are read below; shorter rows cannot be a complete section.
            if( count($cells) < 9 ){
                continue;
            }

            // The days cell optionally carries the comments after a fixed marker.
            $days = explode("Section Comments:", $cells[2]->plaintext);
            $instructor = trim($cells[7]->plaintext);
            $links = $cells[7]->find("a");
            if( count($links) == 0 ){
                // No directory link: fall back to the displayed instructor name.
                $insId = $instructor;
            } else {
                $insId = trim(str_replace(
                    "https://directory.njit.edu/PersDetails.aspx?persid=", "", $links[0]->href
                ));
            }

            $tempData = array();
            $tempData["call"] = intval(trim(str_replace("View Book Info", "", $cells[1]->plaintext)));
            $tempData["comments"] = (count($days) > 1) ? trim($days[1]) : "";
            $tempData["days"] = trim($days[0]);
            $tempData["rooms"] = trim($cells[3]->plaintext);
            $tempData["status"] = trim($cells[4]->plaintext);
            $tempData["maxStudents"] = intval(trim($cells[5]->plaintext));
            $tempData["currentStudents"] = intval(trim($cells[6]->plaintext));
            $tempData["instructor"] = $instructor;
            $tempData["instructorId"] = $insId;
            $tempData["credits"] = intval(trim($cells[8]->plaintext));

            $this->data[$this->subjectId][$this->courseId][trim($cells[0]->plaintext)] = $tempData;
        }
        $html->clear();
        unset($html);
    }

    /**
     * Fetches $url with cURL, as a POST when $post is non-empty and as a GET otherwise.
     *
     * @param string $url  Absolute URL to request.
     * @param array  $post Form fields to send; an empty array means no request body.
     * @return string|false Response body, or false on transport failure or non-200 status.
     */
    private function postRequest($url, $post){
        $ch = curl_init();
        if( $ch === false ){
            return false;
        }
        curl_setopt($ch, CURLOPT_URL, $url);
        if( !empty($post) ){
            curl_setopt($ch, CURLOPT_POST, true);
            // Explicit "&" so a customised arg_separator.output cannot corrupt the body.
            curl_setopt($ch, CURLOPT_POSTFIELDS, http_build_query($post, "", "&"));
        }
        curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
        $result = curl_exec($ch);
        $status = (int) curl_getinfo($ch, CURLINFO_HTTP_CODE);
        curl_close($ch);
        if( $result === false || $status !== 200 ){
            return false;
        }
        return $result;
    }

    /** Builds a request URL on the schedule page with URL-encoded query parameters. */
    private function buildUrl($params){
        return $this->page . "?" . http_build_query($params, "", "&");
    }

    /**
     * Splits "ID - Name" at the first hyphen only, so hyphens inside the name survive.
     *
     * @return array|null array(id, name), or null when $text contains no hyphen.
     */
    private function splitLabel($text){
        $parts = explode("-", $text, 2);
        if( count($parts) < 2 ){
            return null;
        }
        return array(trim($parts[0]), trim($parts[1]));
    }

    /** True when no filter is set or $value contains $filter as a substring. */
    private function passesFilter($value, $filter){
        $filter = (string) $filter;
        // Compared against "" rather than empty() so a filter of "0" is honoured.
        return $filter === "" || strpos($value, $filter) !== false;
    }

    /**
     * Reads the (id, name) pair of every course listed in a subject page.
     *
     * @param object $html Parsed subject page.
     * @return array List of array(courseId, courseName).
     */
    private function extractCourses($html){
        $ids = array();
        foreach( $html->find(".courseName strong") as $ele ){
            // The bold text is the subject id followed by the course id.
            $ids[] = trim(str_replace($this->subjectId, "", $ele->plaintext));
        }

        $names = array();
        foreach( $html->find(".courseName") as $ele ){
            // The title is whatever sits between the first hyphen and the first "(".
            $label = explode("-", $ele->plaintext, 2);
            $title = isset($label[1]) ? explode("(", $label[1]) : array("");
            $names[] = trim($title[0]);
        }

        // Pair ids and names per page so a mismatch on one page cannot shift the
        // names of courses found on other pages.
        $courses = array();
        foreach( $ids as $i => $id ){
            $courses[] = array($id, isset($names[$i]) ? $names[$i] : "");
        }
        return $courses;
    }

    /**
     * Returns the trimmed value attribute of the first element matching $selector,
     * or null when the page has no such element.
     */
    private function hiddenField($html, $selector){
        $nodes = $html->find($selector);
        return (count($nodes) > 0) ? trim($nodes[0]->value) : null;
    }

    /**
     * Escapes text for the HTML progress output. Existing entities are left as they are
     * (double_encode = false) so already-encoded text is not encoded a second time.
     */
    private function escape($text){
        return htmlspecialchars((string) $text, ENT_QUOTES | ENT_SUBSTITUTE, "UTF-8", false);
    }

    /** Echoes a progress fragment and pushes it to the client when debugging is on. */
    private function debug($html){
        if( !$this->FLUSH_DEBUG ){
            return;
        }
        echo $html;
        // ob_flush() raises a notice when no output buffer is active.
        if( ob_get_level() > 0 ){
            ob_flush();
        }
        flush();
    }
}
?> |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment