Created
April 14, 2014 19:01
-
-
Save joshblack/10674623 to your computer and use it in GitHub Desktop.
Parse the UF Course Registrar page for course data
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
var Crawler = require("crawler").Crawler, | |
fs = require('graceful-fs'), | |
courseDictionary = require('./courseDictionary.json'), | |
buildingDictionary = require('./buildingDictionary.json'), | |
i = 1, | |
request = 0; | |
/**
 * Returns the first element of `entries` accepted by `isMatch`, or undefined
 * when no element is accepted. A plain loop is used (rather than forEach) so
 * the search really stops at the first hit.
 */
function findFirst(entries, isMatch) {
    var k;
    for (k = 0; k < entries.length; k++) {
        if (isMatch(entries[k])) {
            return entries[k];
        }
    }
    return undefined;
}

/**
 * Reads one table row and collects the cell values used to build meetings.
 *
 * @param {Function} $ - the selector function handed to the crawler callback
 * @param {Object} row - the <tr> element to read
 * @returns {Object} the values found, keyed by courseCode, courseNum,
 *     sectionNum, creditsNum, meetingDays, periodNum, buildingCode, roomNum,
 *     courseTitle and instructors. A key is left undefined when its cell is
 *     missing or skipped; meetingDays is undefined when any day is 'TBA' or
 *     empty.
 */
function parseCourseRow($, row) {
    var fields = {};

    $(row).children().each(function(cellIndex, cell) {
        var $cell = $(cell),
            courseLink,
            course,
            days,
            hasUnusableDay;

        // Help columns and header cells carry no course data
        if ($cell.hasClass('colhelp') || $cell.prop('tagName') === "TH") {
            return;
        }

        switch (cellIndex) {
            case 0: // Course code: first three characters are the department, the rest the number
                courseLink = $cell.children('a').html();
                // An empty selection yields undefined or null depending on the
                // selector implementation; treat both as "no course link"
                if (courseLink !== undefined && courseLink !== null) {
                    course = courseLink.replace(/\s+/g, ''); // strip all white space
                    fields.courseCode = course.substring(0, 3);
                    fields.courseNum = course.substring(3);
                }
                break;
            case 5: // Section number
                fields.sectionNum = $.trim($cell.children('b').html());
                break;
            case 6: // Amount of credits
                fields.creditsNum = $.trim($cell.html());
                break;
            case 7: // Meeting days, one array entry per day
                days = $.trim($cell.html()).split(' ');
                // A 'TBA' or empty entry invalidates the whole list so the row
                // cannot be added to the database
                hasUnusableDay = days.some(function(day) {
                    return day === 'TBA' || day === '';
                });
                fields.meetingDays = hasUnusableDay ? undefined : days;
                break;
            case 8: // Period
                fields.periodNum = $cell.html();
                break;
            case 9: // Building code
                fields.buildingCode = $cell.html();
                break;
            case 10: // Room number
                fields.roomNum = $cell.html();
                break;
            case 12: // Course title
                fields.courseTitle = $.trim($cell.children('a').html());
                break;
            case 13: // Instructor names, multiple names joined with ' and '
                fields.instructors = $.trim($cell.html().replace(/\n/g, '').replace(/<br[^>]*>/gi, ' and '));
                break;
            default:
                break;
        }
    });

    return fields;
}

/** Formats one course record ([deptCode, courseNumber, ..., year]) as a PHP array literal. */
function formatCourseRecord(data) {
    return "['deptCode' => '" + data[0] +
        "', 'courseNumber' => '" + data[1] +
        "', 'sectionNumber' => '" + data[2] +
        "', 'credits' => '" + data[3] +
        "', 'instructor' => '" + data[4] +
        "', 'courseTitle' => '" + data[5] +
        "', 'semester' => '" + data[6] +
        "', 'year' => " + data[7] + ']';
}

/** Formats one meeting record ([courseId, buildingId, room, day, period]) as a PHP array literal. */
function formatMeetingRecord(data) {
    return "['course_id' => '" + data[0] +
        "', 'building_id' => '" + data[1] +
        "', 'roomNumber' => '" + data[2] +
        "', 'meetingDay' => '" + data[3] +
        "', 'period' => '" + data[4] + '\']';
}

/**
 * Appends every record, each followed by ",\n", to `fileName` with a single
 * write so the records of one page stay together and in order, then calls
 * `onAppended` once per record. Does nothing when there are no records.
 * A write error is rethrown, as before.
 */
function appendRecords(fileName, records, onAppended) {
    var payload;

    if (records.length === 0) {
        return;
    }

    payload = records.map(function(record) {
        return record + ',\n';
    }).join('');

    fs.appendFile(fileName, payload, function(err) {
        if (err) throw err;
        records.forEach(function(record) {
            onAppended(record);
        });
    });
}

var c = new Crawler({
    "maxConnections": 1, // Set to one so we don't run into concurrency issues when it comes to
                         // writing to our result files

    // This will be called for each crawled page
    "callback": function(error, result, $) {
        var rowCourseData = [],
            rowMeetingData = [],
            // Ids of the section the current continuation rows belong to:
            // [courseId, buildingId], or undefined when that section was not stored
            parentIds;

        request++;

        // Without a parsed document there is nothing to read; calling $ would throw
        if (error || !$) {
            console.error('Skipping page, request failed: ' + error);
            return;
        }

        $('center table tr').each(function(rowIndex, row) {
            var fields = parseCourseRow($, row),
                days = fields.meetingDays,
                courseEntry,
                buildingEntry;

            // A row with its own course code starts a new section, so rows that
            // follow must no longer inherit ids from an earlier section
            if (fields.courseCode !== undefined) {
                parentIds = undefined;
            }

            // Throw out rows where meetingDays is undefined, we don't want those
            if (days === undefined) {
                return;
            }

            if (fields.courseCode === undefined) {
                // Continuation row: it has no course code of its own, so the
                // course and building ids come from the section it continues.
                // NOTE(review): the row's own building code is not looked up,
                // the parent's building id is reused -- confirm this is intended.
                if (parentIds !== undefined) {
                    days.forEach(function(day) {
                        rowMeetingData.push([
                            parentIds[0],
                            parentIds[1],
                            fields.roomNum,
                            day,
                            fields.periodNum
                        ]);
                    });
                }
                return;
            }

            // Course records are currently not collected; re-enable the line
            // below to write them to results.txt as well.
            // rowCourseData.push([fields.courseCode, fields.courseNum, fields.sectionNum, fields.creditsNum, fields.instructors, fields.courseTitle, 'spring', '2014']);

            // Find the course id and the building id for this section. Neither
            // lookup depends on the meeting day, so each is done once per row.
            courseEntry = findFirst(courseDictionary.data, function(entry) {
                return entry.deptCode === fields.courseCode &&
                    entry.courseNumber === fields.courseNum &&
                    entry.sectionNumber === fields.sectionNum;
            });
            buildingEntry = findFirst(buildingDictionary.data, function(entry) {
                return entry.buildingCode === fields.buildingCode;
            });

            if (courseEntry === undefined || buildingEntry === undefined) {
                return;
            }

            // We found everything we need! Make one meeting per day
            days.forEach(function(day) {
                rowMeetingData.push([courseEntry.id, buildingEntry.id, fields.roomNum, day, fields.periodNum]);
            });
            parentIds = [courseEntry.id, buildingEntry.id];
        });

        appendRecords('results.txt', rowCourseData.map(formatCourseRecord), function(data) {
            // Characters 16-19 and 41-45 of the formatted record hold the
            // department code and the start of the course number
            console.log('The data for ' + data.substring(16, 19) + data.substring(41, 45) + ' was appended to the file!');
        });

        appendRecords('meetingsData.txt', rowMeetingData.map(formatMeetingRecord), function() {
            console.log('The data for meeting #' + i + ' was appended to the file!');
            i++;
        });
    }
});
// Queue one URL per department page of the 201401 schedule. Every page lives
// under the same directory and ends in ".htm", so only the page names are listed.
c.queue([
    'accounts', 'advertis', 'aframstu', 'afrstudi', 'agribioe', 'agriedco',
    'agriopma', 'agriture', 'agronomy', 'animalsc', 'anthropo', 'applphys',
    'architec', 'arthisto', 'astronom',
    'bibozobi', 'bibozobo', 'bibozozo', 'biomedeg', 'biostati', 'business',
    'chemical', 'chemistr', 'civcseng', 'classicc', 'classicg', 'classicl',
    'clinicap', 'computer', 'construc',
    'denodiag', 'desconpl', 'digworld',
    'economic', 'educahdo', 'educasep', 'educattl', 'electric', 'engingen',
    'englishs', 'entomolo', 'envglohe', 'environm', 'envrhort', 'epidemio',
    'european',
    'famscien', 'finances', 'finearts', 'firstyrf', 'fishsfrc', 'flexlear',
    'foodreso', 'foodscie', 'forresco',
    'geograph', 'geomatic', 'geoscien',
    'healthed', 'healthop', 'healthpr', 'healthsa', 'historys', 'honorspr',
    'horticul',
    'industri', 'informat', 'innovati', 'interdis', 'interior',
    'jewishst', 'journali',
    'landscap',
    'langaaaa', 'langakan', 'langamha', 'langarab', 'langchin', 'langczec',
    'langdutc', 'langfren', 'langgerm', 'langhait', 'langhebr', 'langital',
    'langjapa', 'langpoli', 'langruss', 'langswah', 'langviet', 'langwolo',
    'langyoru',
    'latiname', 'lawschoo', 'lawtaxat', 'linguist',
    'manageme', 'marketin', 'masscomm', 'material', 'mathemat', 'mechaero',
    'mediaaaa', 'medianat', 'medianes', 'medibioc', 'medicomm', 'mediemrg',
    'medigene', 'medimole', 'medineur', 'medineus', 'mediobst', 'mediopht',
    'mediortr', 'mediotol', 'medipath', 'medipedi', 'mediphas', 'mediphys',
    'medipsyc', 'mediradi', 'mediraon', 'medisurg', 'medivals',
    'microbio', 'miliafor', 'miliarmy', 'milinavy', 'musicapp',
    'natresou', 'nuclearr', 'nursinga', 'nursingh', 'nursingw',
    'occupati',
    'packagsc', 'pestmana', 'pharcchm', 'pharceop', 'pharcets', 'phardyna',
    'pharprac', 'philosop', 'physical', 'physicss', 'plantpat', 'politica',
    'psycholo', 'pubhealt', 'publicre',
    'rehbsci2', 'religion',
    'soccrimi', 'socsocio', 'soilwatr', 'spaporpo', 'spaporsp', 'speechlh',
    'statisti',
    'telecomm', 'theadanc', 'tourismr',
    'urbanreg',
    'veterina',
    'wildlife', 'womenstu', 'writprog', 'writtenc'
].map(function(pageName) {
    return 'http://registrar.ufl.edu/soc/201401/all/' + pageName + '.htm';
}));
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment