Skip to content

Instantly share code, notes, and snippets.

@bentranter
Created September 3, 2015 18:47
Show Gist options
  • Save bentranter/41289cf7790d32a3d8f5 to your computer and use it in GitHub Desktop.
Save bentranter/41289cf7790d32a3d8f5 to your computer and use it in GitHub Desktop.
Scrape every course at LU in less than 3 seconds
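// Gotta go fast: this program fetches every Lakehead University timetable
// page concurrently and prints the scraped contents of each one.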
package main
import (
"fmt"
"net/http"
"time"
"github.com/bentranter/chalk"
"github.com/yhat/scrape"
"golang.org/x/net/html"
)
// timetableBase is the common prefix shared by every department page of the
// 2015 fall/winter undergraduate timetable listed in urls.
const timetableBase = "http://timetable.lakeheadu.ca/2015FW_UG_TBAY/"

// urls lists the timetable pages to fetch, one per department code.
var urls = []string{
	timetableBase + "anth.html",
	timetableBase + "apbi.html",
	timetableBase + "biol.html",
	timetableBase + "busi.html",
	timetableBase + "chem.html",
	timetableBase + "clas.html",
	timetableBase + "comp.html",
	timetableBase + "crim.html",
	timetableBase + "econ.html",
	timetableBase + "educ.html",
	timetableBase + "engi.html",
	timetableBase + "finn.html",
	timetableBase + "fren.html",
	timetableBase + "geoa.html",
	timetableBase + "geog.html",
	timetableBase + "geol.html",
	timetableBase + "gero.html",
	timetableBase + "gsci.html",
	timetableBase + "hist.html",
	timetableBase + "indi.html",
	timetableBase + "intd.html",
	timetableBase + "ital.html",
	timetableBase + "kine.html",
	timetableBase + "lang.html",
	timetableBase + "laws.html",
	timetableBase + "ling.html",
	timetableBase + "math.html",
	timetableBase + "mdst.html",
	timetableBase + "meds.html",
	timetableBase + "musi.html",
	timetableBase + "nacc.html",
	timetableBase + "nort.html",
	timetableBase + "nrmt.html",
	timetableBase + "nurs.html",
	timetableBase + "ojib.html",
	timetableBase + "outd.html",
	timetableBase + "phil.html",
	timetableBase + "phys.html",
	timetableBase + "poli.html",
	timetableBase + "psyc.html",
	timetableBase + "reli.html",
	timetableBase + "soci.html",
	timetableBase + "sowk.html",
	timetableBase + "span.html",
	timetableBase + "visu.html",
	timetableBase + "wate.html",
	timetableBase + "wome.html",
}
// HTTPResponse bundles the outcome of a single HTTP GET: the URL that was
// requested together with the response or the error it produced.
//
// The field order matters: asyncHTTPGet builds values with positional
// composite literals.
type HTTPResponse struct {
// url is the address that was requested.
url string
// response is the value returned by http.Get; asyncHTTPGet stores nil here
// when the request failed.
response *http.Response
// err is the error returned by http.Get, nil on success.
err error
}
// Scrape parses the HTML body held by resp and prints the text of every
// element matching the hard-coded tag to standard output, one per line.
//
// A nil resp, or one without a response or body (as produced by a failed
// fetch), is skipped silently instead of causing a nil-pointer panic; the
// fetch error itself is reported by the code that performed the request.
// Parse errors are printed and end the call. Scrape does not close the body;
// that remains the caller's responsibility.
func Scrape(resp *HTTPResponse) {
	if resp == nil || resp.response == nil || resp.response.Body == nil {
		return
	}

	root, err := html.Parse(resp.response.Body)
	if err != nil {
		fmt.Println("Error :", err)
		return
	}

	// Yhat's package expects atomic values for tags, see
	// https://godoc.org/golang.org/x/net/html/atom if you
	// need a different tag.
	// NOTE(review): 0x10502 is a raw atom value; presumably it should be the
	// named constant from the atom package. Confirm which tag is intended and
	// whether the numeric value is stable across x/net versions.
	for _, match := range scrape.FindAll(root, scrape.ByTag(0x10502)) {
		fmt.Println(scrape.Text(match))
	}
}
// asyncHTTPGet fetches every URL in urls concurrently, scrapes each successful
// response as soon as it arrives, and returns one *HTTPResponse per URL in
// completion order (not input order).
//
// A failed request yields an entry with a nil response and a non-nil err; such
// entries are reported but neither scraped nor closed. Bodies of successful
// responses are closed before the function returns, so callers may read
// metadata such as Status but must not read Body. An empty urls slice returns
// an empty result immediately instead of blocking forever.
func asyncHTTPGet(urls []string) []*HTTPResponse {
	start := time.Now()
	ch := make(chan *HTTPResponse)
	responses := make([]*HTTPResponse, 0, len(urls))

	for _, url := range urls {
		go func(url string) {
			fmt.Printf("Fetching %s \n", url)
			resp, err := http.Get(url)
			if err != nil {
				fmt.Println("Error: ", err)
				// Store a nil response so consumers can rely on
				// "response is nil whenever err is set".
				ch <- &HTTPResponse{url, nil, err}
				return
			}
			ch <- &HTTPResponse{url, resp, nil}
		}(url)
	}

	// Each goroutine sends exactly one result, so receiving len(urls) times
	// collects everything and lets every goroutine finish.
	for range urls {
		r := <-ch
		responses = append(responses, r)

		if r.err != nil || r.response == nil {
			fmt.Printf("%s | %s failed\n", time.Since(start), r.url)
			continue
		}

		fmt.Printf("%s | %s was fetched\n", time.Since(start), r.url)
		Scrape(r)
		// Close right away rather than deferring inside the loop; the close
		// error is ignored because the body has already been consumed.
		r.response.Body.Close()
	}

	return responses
}
// main prints a banner, fetches and scrapes every page listed in urls, and
// then prints one summary line per URL: the HTTP status for successful
// requests, or the error for failed ones (whose response is nil and must not
// be dereferenced).
func main() {
	fmt.Println(chalk.Blue("*** ASYNC ***"))

	for _, result := range asyncHTTPGet(urls) {
		if result.err != nil || result.response == nil {
			fmt.Printf("%s failed, %v\n", result.url, result.err)
			continue
		}
		fmt.Printf("%s status, %s\n", result.url, result.response.Status)
	}
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment