Scrape every course at LU in less than 3 seconds
// Gotta go fast
package main

import (
    "fmt"
    "net/http"
    "time"

    "github.com/bentranter/chalk"
    "github.com/yhat/scrape"
    "golang.org/x/net/html"
)
var urls = []string{
    "http://timetable.lakeheadu.ca/2015FW_UG_TBAY/anth.html",
    "http://timetable.lakeheadu.ca/2015FW_UG_TBAY/apbi.html",
    "http://timetable.lakeheadu.ca/2015FW_UG_TBAY/biol.html",
    "http://timetable.lakeheadu.ca/2015FW_UG_TBAY/busi.html",
    "http://timetable.lakeheadu.ca/2015FW_UG_TBAY/chem.html",
    "http://timetable.lakeheadu.ca/2015FW_UG_TBAY/clas.html",
    "http://timetable.lakeheadu.ca/2015FW_UG_TBAY/comp.html",
    "http://timetable.lakeheadu.ca/2015FW_UG_TBAY/crim.html",
    "http://timetable.lakeheadu.ca/2015FW_UG_TBAY/econ.html",
    "http://timetable.lakeheadu.ca/2015FW_UG_TBAY/educ.html",
    "http://timetable.lakeheadu.ca/2015FW_UG_TBAY/engi.html",
    "http://timetable.lakeheadu.ca/2015FW_UG_TBAY/finn.html",
    "http://timetable.lakeheadu.ca/2015FW_UG_TBAY/fren.html",
    "http://timetable.lakeheadu.ca/2015FW_UG_TBAY/geoa.html",
    "http://timetable.lakeheadu.ca/2015FW_UG_TBAY/geog.html",
    "http://timetable.lakeheadu.ca/2015FW_UG_TBAY/geol.html",
    "http://timetable.lakeheadu.ca/2015FW_UG_TBAY/gero.html",
    "http://timetable.lakeheadu.ca/2015FW_UG_TBAY/gsci.html",
    "http://timetable.lakeheadu.ca/2015FW_UG_TBAY/hist.html",
    "http://timetable.lakeheadu.ca/2015FW_UG_TBAY/indi.html",
    "http://timetable.lakeheadu.ca/2015FW_UG_TBAY/intd.html",
    "http://timetable.lakeheadu.ca/2015FW_UG_TBAY/ital.html",
    "http://timetable.lakeheadu.ca/2015FW_UG_TBAY/kine.html",
    "http://timetable.lakeheadu.ca/2015FW_UG_TBAY/lang.html",
    "http://timetable.lakeheadu.ca/2015FW_UG_TBAY/laws.html",
    "http://timetable.lakeheadu.ca/2015FW_UG_TBAY/ling.html",
    "http://timetable.lakeheadu.ca/2015FW_UG_TBAY/math.html",
    "http://timetable.lakeheadu.ca/2015FW_UG_TBAY/mdst.html",
    "http://timetable.lakeheadu.ca/2015FW_UG_TBAY/meds.html",
    "http://timetable.lakeheadu.ca/2015FW_UG_TBAY/musi.html",
    "http://timetable.lakeheadu.ca/2015FW_UG_TBAY/nacc.html",
    "http://timetable.lakeheadu.ca/2015FW_UG_TBAY/nort.html",
    "http://timetable.lakeheadu.ca/2015FW_UG_TBAY/nrmt.html",
    "http://timetable.lakeheadu.ca/2015FW_UG_TBAY/nurs.html",
    "http://timetable.lakeheadu.ca/2015FW_UG_TBAY/ojib.html",
    "http://timetable.lakeheadu.ca/2015FW_UG_TBAY/outd.html",
    "http://timetable.lakeheadu.ca/2015FW_UG_TBAY/phil.html",
    "http://timetable.lakeheadu.ca/2015FW_UG_TBAY/phys.html",
    "http://timetable.lakeheadu.ca/2015FW_UG_TBAY/poli.html",
    "http://timetable.lakeheadu.ca/2015FW_UG_TBAY/psyc.html",
    "http://timetable.lakeheadu.ca/2015FW_UG_TBAY/reli.html",
    "http://timetable.lakeheadu.ca/2015FW_UG_TBAY/soci.html",
    "http://timetable.lakeheadu.ca/2015FW_UG_TBAY/sowk.html",
    "http://timetable.lakeheadu.ca/2015FW_UG_TBAY/span.html",
    "http://timetable.lakeheadu.ca/2015FW_UG_TBAY/visu.html",
    "http://timetable.lakeheadu.ca/2015FW_UG_TBAY/wate.html",
    "http://timetable.lakeheadu.ca/2015FW_UG_TBAY/wome.html",
}
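// Note: every URL above follows the same pattern, so the slice could also be
// generated from the department codes instead of being written out by hand.
// A sketch (buildURLs is not part of the original gist):
//
//    func buildURLs(codes []string) []string {
//        urls := make([]string, 0, len(codes))
//        for _, c := range codes {
//            urls = append(urls, "http://timetable.lakeheadu.ca/2015FW_UG_TBAY/"+c+".html")
//        }
//        return urls
//    }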
// HTTPResponse holds the URL that was fetched, the response,
// and any error that occurred along the way.
type HTTPResponse struct {
    url      string
    response *http.Response
    err      error
}

// Scrape parses the response body and prints the course data
// from Lakehead's timetable pages.
func Scrape(resp *HTTPResponse) {
    root, err := html.Parse(resp.response.Body)
    if err != nil {
        fmt.Println("Error:", err)
        return
    }
    // Yhat's package expects atomic values for tags, see
    // https://godoc.org/golang.org/x/net/html/atom if you
    // need a different tag.
    data := scrape.FindAll(root, scrape.ByTag(0x10502))
    for _, match := range data {
        fmt.Println(scrape.Text(match))
    }
}
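// Note: the raw value 0x10502 above could be swapped for one of the named
// constants in golang.org/x/net/html/atom (which would need to be added to
// the imports). Which constant 0x10502 corresponds to depends on the markup
// of the timetable pages; atom.Table below is only an illustration:
//
//    data := scrape.FindAll(root, scrape.ByTag(atom.Table))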
func asyncHTTPGet(urls []string) []*HTTPResponse {
    start := time.Now()
    ch := make(chan *HTTPResponse)
    responses := []*HTTPResponse{}
    for _, url := range urls {
        go func(url string) {
            fmt.Printf("Fetching %s\n", url)
            resp, err := http.Get(url)
            if err != nil {
                fmt.Println("Error:", err)
                ch <- &HTTPResponse{url, nil, err}
            } else {
                ch <- &HTTPResponse{url, resp, err}
            }
        }(url)
    }
    for {
        select {
        case r := <-ch:
            fmt.Printf("%s | %s was fetched\n", time.Since(start), r.url)
            // Only scrape responses that were actually fetched, and close
            // the body as soon as we're done with it.
            if r.err == nil {
                Scrape(r)
                r.response.Body.Close()
            }
            responses = append(responses, r)
            if len(responses) == len(urls) {
                return responses
            }
        }
    }
}
func main() {
    fmt.Println(chalk.Blue("*** ASYNC ***"))
    results := asyncHTTPGet(urls)
    for _, result := range results {
        if result.err != nil {
            fmt.Printf("%s failed: %s\n", result.url, result.err)
            continue
        }
        fmt.Printf("%s status, %s\n", result.url, result.response.Status)
    }
}
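For comparison, the same fan-out can be written with a sync.WaitGroup instead of counting responses off a channel. The sketch below is only an alternative to asyncHTTPGet, not part of the original gist: it assumes it lives in the same file (it reuses the HTTPResponse struct and would need "sync" added to the import block), and fetchAll is a made-up name.

// fetchAll starts one goroutine per URL and waits for all of them to finish
// with a sync.WaitGroup, guarding the shared slice with a mutex.
func fetchAll(urls []string) []*HTTPResponse {
    var (
        wg        sync.WaitGroup
        mu        sync.Mutex
        responses []*HTTPResponse
    )
    for _, url := range urls {
        wg.Add(1)
        go func(url string) {
            defer wg.Done()
            resp, err := http.Get(url)
            if err != nil {
                fmt.Println("Error:", err)
            }
            mu.Lock()
            responses = append(responses, &HTTPResponse{url, resp, err})
            mu.Unlock()
        }(url)
    }
    wg.Wait()
    return responses
}

The channel-and-counter loop in asyncHTTPGet works as well; the WaitGroup version just makes "done when every goroutine has finished" explicit rather than inferring it from len(responses) == len(urls).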