package main

import (
    "fmt"
    "io"
    "net/http"
    "net/url"

    "github.com/PuerkitoBio/goquery"
)
// a couple of named types for convenience
type DownloaderChannel chan *Request
type ProcessFunc func(DownloaderChannel, *http.Response)

// Request is a "subclass" of http.Request that drags its processor func around.
// This could be done with a separate structure, but in the end I don't want
// too many objects flying around: less typing, smaller code.
type Request struct {
    *http.Request
    Processor ProcessFunc
}
// NewRequest creates a new request. Same as http.NewRequest, except it also
// accepts a ProcessFunc.
func NewRequest(method, url string, body io.Reader, proc ProcessFunc) (*Request, error) {
    req, err := http.NewRequest(method, url, body)
    if err != nil {
        return nil, err
    }

    return &Request{
        Request:   req,
        Processor: proc,
    }, nil
}
// Get is a shorthand for GET requests that logs the error and returns nil on failure.
func Get(url string, proc ProcessFunc) *Request {
    req, err := NewRequest("GET", url, nil, proc)
    if err != nil {
        fmt.Println(err)
        return nil
    }

    return req
}
// RelativeURL absolutizes a possibly relative URL by resolving it against the
// URL of the response it was extracted from.
func RelativeURL(res *http.Response, target string) string {
    parsed, err := url.Parse(target)
    if err != nil {
        fmt.Println(err)
        return ""
    }

    return res.Request.URL.ResolveReference(parsed).String()
}
// parseFront parses the front page: every link in the main navigation is a
// category page and gets enqueued for parseCategory.
func parseFront(work DownloaderChannel, res *http.Response) {
    doc, err := goquery.NewDocumentFromResponse(res)
    if err != nil {
        fmt.Println(err)
        return
    }

    doc.Find(".main-nav li a[href]").Each(func(_ int, elem *goquery.Selection) {
        attr, _ := elem.Attr("href")
        // the collected links are parsed with parseCategory
        work <- Get(RelativeURL(res, attr), parseCategory)
    })
}
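// parseCategory walks the category map and enqueues every subcategory page
// (i.e. product list) for parseList.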
func parseCategory(work DownloaderChannel, res *http.Response) {
    doc, err := goquery.NewDocumentFromResponse(res)
    if err != nil {
        fmt.Println(err)
        return
    }

    // subcategory pages = product lists
    doc.Find("#category-map a").Each(func(_ int, elem *goquery.Selection) {
        attr, _ := elem.Attr("href")
        work <- Get(RelativeURL(res, attr), parseList)
    })
}
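// parseList scans a product listing: it follows the pagination and enqueues
// every product page for parseProduct.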
func parseList(work DownloaderChannel, res *http.Response) {
    doc, err := goquery.NewDocumentFromResponse(res)
    if err != nil {
        fmt.Println(err)
        return
    }

    // next page
    elem := doc.Find("#listing .pager-bottom .next a[href]").First()
    if elem.Size() > 0 {
        attr, _ := elem.Attr("href")
        work <- Get(RelativeURL(res, attr), parseList)
    }

    // products
    doc.Find("#listing .offers article[id^='item-'] h2 a").Each(func(_ int, elem *goquery.Selection) {
        attr, _ := elem.Attr("href")
        work <- Get(RelativeURL(res, attr), parseProduct)
    })
}
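// parseProduct is the final step; for now it only prints the product URL.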
func parseProduct(work DownloaderChannel, res *http.Response) {
    fmt.Println(res.Request.URL)
    // the goquery-based processors close the body via NewDocumentFromResponse;
    // do it explicitly here so the connection is released
    res.Body.Close()
}
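// Worker pulls requests off the shared channel, performs them, and hands the
// response to the request's Processor, which may in turn enqueue more requests.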
func Worker(work DownloaderChannel) {
    client := &http.Client{}

    for req := range work {
        if req == nil { // Get returns nil on error
            continue
        }

        fmt.Println("Getting ", req.URL)
        resp, err := client.Do(req.Request)
        if err != nil {
            fmt.Println("Error while processing: ", err)
        } else {
            req.Processor(work, resp)
        }

        fmt.Println("done")
    }
}
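// N is the number of concurrent download workers.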
const N = 30
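// main starts the worker pool, seeds it with the front page and then blocks
// forever. Note that the channel is unbuffered and processors send new requests
// back into it, so the crawl can stall if every worker is blocked on a send.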
func main() {
    work := make(DownloaderChannel)

    for i := 0; i < N; i++ {
        go Worker(work)
    }

    work <- Get("http://www.aukro.ua", parseFront)

    select {}
}