Skip to content

Instantly share code, notes, and snippets.

@naquad
Created March 16, 2014 01:02
Show Gist options
  • Save naquad/9576773 to your computer and use it in GitHub Desktop.
Save naquad/9576773 to your computer and use it in GitHub Desktop.
package main
import (
"fmt"
"net/http"
"net/url"
"io"
"github.com/PuerkitoBio/goquery"
)
// Some handy type aliases for the downloader pipeline (original note:
// "some types for convenience").
// DownloaderChannel carries work items between the parse handlers and
// the Worker goroutines.
type DownloaderChannel chan *Request
// ProcessFunc is a response handler; it may enqueue follow-up requests
// on the channel it is given.
type ProcessFunc func(DownloaderChannel, *http.Response)
// Request is a "subclass" of http.Request that drags its processor func
// around. This could be done with a separate structure, but the author
// wanted fewer objects flying around: less typing, smaller code.
type Request struct {
*http.Request
Processor ProcessFunc
}
// NewRequest builds a *Request the same way http.NewRequest would,
// additionally attaching the ProcessFunc that will handle the response.
func NewRequest(method, url string, body io.Reader, proc ProcessFunc) (*Request, error) {
	inner, err := http.NewRequest(method, url, body)
	if err != nil {
		return nil, err
	}
	r := &Request{Request: inner, Processor: proc}
	return r, nil
}
// Get is a shorthand for NewRequest("GET", url, nil, proc) that handles
// the error itself: it is printed and nil is returned instead.
// NOTE(review): callers pipe this result straight into the work channel,
// so a nil ends up in the channel on a bad URL — verify consumers
// tolerate nil requests.
func Get(url string, proc ProcessFunc) *Request {
	r, err := NewRequest("GET", url, nil, proc)
	if err != nil {
		fmt.Println(err)
		return nil
	}
	return r
}
// absolutizes relative URL with responses URL, that is
// joins response URL with passed url
func RelativeURL(res *http.Response, target string) (string) {
url_parsed, err := url.Parse(target)
if err != nil {
fmt.Println(err)
return ""
}
return res.Request.URL.ResolveReference(url_parsed).String()
}
// parseFront parses the front page: each main-navigation link is turned
// into a request and queued, to be handled by parseCategory.
func parseFront(work DownloaderChannel, res *http.Response) {
	doc, err := goquery.NewDocumentFromResponse(res)
	if err != nil {
		fmt.Println(err)
		return
	}
	doc.Find(".main-nav li a[href]").Each(func(_ int, link *goquery.Selection) {
		href, _ := link.Attr("href")
		// Category links get processed by parseCategory.
		work <- Get(RelativeURL(res, href), parseCategory)
	})
}
// parseCategory parses a category page and queues every subcategory
// link (product lists) for parseList.
func parseCategory(work DownloaderChannel, res *http.Response) {
	doc, err := goquery.NewDocumentFromResponse(res)
	if err != nil {
		fmt.Println(err)
		return
	}
	// Subcategory pages = product lists.
	doc.Find("#category-map a").Each(func(_ int, link *goquery.Selection) {
		href, _ := link.Attr("href")
		work <- Get(RelativeURL(res, href), parseList)
	})
}
// parseList handles one page of a product listing: it queues the next
// pager page (when present, handled again by parseList) and every
// product detail link on this page (handled by parseProduct).
func parseList(work DownloaderChannel, res *http.Response) {
	doc, err := goquery.NewDocumentFromResponse(res)
	if err != nil {
		fmt.Println(err)
		return
	}
	// Follow the "next page" pager link, if the page has one.
	if next := doc.Find("#listing .pager-bottom .next a[href]").First(); next.Size() > 0 {
		href, _ := next.Attr("href")
		work <- Get(RelativeURL(res, href), parseList)
	}
	// Queue every product on this page.
	doc.Find("#listing .offers article[id^='item-'] h2 a").Each(func(_ int, item *goquery.Selection) {
		href, _ := item.Attr("href")
		work <- Get(RelativeURL(res, href), parseProduct)
	})
}
// parseProduct is the terminal handler: it just prints the product URL.
// Fix: the other handlers hand the response to
// goquery.NewDocumentFromResponse, which closes the body for them; here
// nothing consumed the response, so the body was leaked on every
// product page. Close it explicitly.
func parseProduct(work DownloaderChannel, res *http.Response) {
	defer res.Body.Close()
	fmt.Println(res.Request.URL)
}
// Worker pulls requests off the channel, performs them with a shared
// client, and hands each response to the request's Processor (which may
// enqueue more work on the same channel).
// Fix: Get returns nil when request construction fails, and the parse
// handlers send its result unconditionally — dereferencing req.URL on a
// nil *Request panicked the worker. Skip nil items instead.
func Worker(work DownloaderChannel) {
	client := &http.Client{}
	for req := range work {
		if req == nil {
			// Get already printed the construction error; nothing to do.
			continue
		}
		fmt.Println("Getting ", req.URL)
		resp, err := client.Do(req.Request)
		if err != nil {
			fmt.Println("Error while processing: ", err)
		} else {
			req.Processor(work, resp)
		}
		fmt.Println("done")
	}
}
// N is the number of concurrent downloader goroutines.
const N = 30
// main starts N workers sharing one unbuffered channel, seeds it with
// the site front page, then blocks forever on select{} so the crawl
// keeps running.
// NOTE(review): the workers both receive from and send to the same
// unbuffered channel; if all N workers block on a send at the same time
// the program deadlocks — confirm N is large enough for the fan-out, or
// add buffering / an external queue.
func main(){
work := make(DownloaderChannel)
for i := 0; i < N; i++ {
go Worker(work)
}
work <- Get("http://www.aukro.ua", parseFront)
select {}
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment