Crawl JobStreet job listings using channels, colly, and goquery.
package main

import (
	"encoding/json"
	"fmt"
	"log"
	"net/http"
	"os"
	"strconv"
	"sync"
	"time"

	"github.com/PuerkitoBio/goquery"
	"github.com/cenkalti/backoff"
	"github.com/gocolly/colly"
)
const webPage = "https://www.jobstreet.vn/t%C3%ACmvi%E1%BB%87c"

// Job holds the fields extracted for a single job posting.
type Job struct {
	Title     string `json:"title"`
	Company   string `json:"company"`
	Location  string `json:"location"`
	Descript  string `json:"descript"`
	Url       string `json:"url"`
	Site      string `json:"site"`
	CreatedAt string `json:"created_at"`
}

// Jobs is the accumulated crawl result written to jobstreet.json.
type Jobs struct {
	List      []Job `json:"jobs"`
	TotalJobs int   `json:"total_jobs"`
}

const (
	// maxRetry bounds both the backoff interval and the total time spent
	// retrying a failed request in Get.
	maxRetry = 3 * time.Minute
)
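// get performs a single HTTP GET request to url and returns the raw response.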
func get(url string) (*http.Response, error) {
	req, err := http.NewRequest("GET", url, nil)
	if err != nil {
		return nil, err
	}
	client := &http.Client{}
	resp, err := client.Do(req)
	if err != nil {
		return nil, err
	}
	return resp, nil
}
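// Get retries get with exponential backoff, capping both the retry interval
// and the total elapsed time at maxRetry.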
func Get(url string) (*http.Response, error) {
	var err error
	var resp *http.Response
	bo := backoff.NewExponentialBackOff()
	bo.MaxInterval = maxRetry
	bo.MaxElapsedTime = maxRetry
	for {
		resp, err = get(url)
		if err == nil {
			break
		}
		d := bo.NextBackOff()
		if d == backoff.Stop {
			break
		}
		time.Sleep(d)
	}
	if err != nil {
		return nil, err
	}
	return resp, nil
}
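// crawlJobStreet fans in search-result URLs from two producer goroutines over
// the pipe channel, then extracts job details once the channel is closed.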
func crawlJobStreet() {
	var urls []string
	pipe := make(chan string)
	done := make(chan bool)

	// Consumer: collect URLs until the producers close the channel, then crawl them.
	go func() {
		for {
			url, more := <-pipe
			if more {
				fmt.Println("Received url", url)
				urls = append(urls, url)
				fmt.Println("Appended url, total", len(urls))
			} else {
				fmt.Println("Received all urls", len(urls))
				extractInfoJob(urls)
				done <- true
				return
			}
		}
	}()

	// Producers: both goroutines send search URLs into pipe; the channel is
	// closed once both have finished.
	var wg sync.WaitGroup
	wg.Add(2)
	go getUrlByProvince(pipe, &wg)
	go getUrlByCategory(pipe, &wg)
	go func() {
		wg.Wait()
		close(pipe)
	}()

	<-done
}
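// extractInfoJob visits every search-result URL, scrapes each job card and
// its detail page, and writes the accumulated result to jobstreet.json.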
func extractInfoJob(urls []string) error {
	var jobs Jobs
	var job Job

	c := colly.NewCollector(
		// colly.Async(true),
	)
	// c.Limit(&colly.LimitRule{
	// 	Parallelism: 2,
	// })
	c.SetRequestTimeout(120 * time.Second)

	// A cloned collector visits the job detail pages, so the detail handlers
	// are registered once instead of once per job card.
	detail := c.Clone()

	for _, col := range []*colly.Collector{c, detail} {
		col.OnRequest(func(r *colly.Request) {
			fmt.Println("Visiting", r.URL)
		})
		col.OnError(func(r *colly.Response, err error) {
			fmt.Println(err)
		})
	}

	detail.OnHTML("div[class=heading-xsmall]", func(e *colly.HTMLElement) {
		job.Site = e.ChildText("span.site")
		job.CreatedAt = e.ChildText("span.listed-date")
	})
	detail.OnHTML("div[class=-desktop-no-padding-top]", func(e *colly.HTMLElement) {
		job.Descript = e.Text
	})

	c.OnHTML(".jobresults .job-card", func(e *colly.HTMLElement) {
		job = Job{
			Url:      "https://www.jobstreet.vn" + e.ChildAttr("h3.job-title > a", "href"),
			Title:    e.ChildText("h3.job-title > a"),
			Company:  e.ChildText("span.job-company"),
			Location: e.ChildText("span.job-location"),
		}
		// The collectors run synchronously, so the detail handlers above
		// fill in job before Visit returns.
		detail.Visit(e.Request.AbsoluteURL(job.Url))

		// Listings re-posted from TopCV carry no description.
		if job.Site == "TopCV" {
			job.Descript = ""
		}

		jobs.TotalJobs++
		jobs.List = append(jobs.List, job)

		dataBytes, errMarshal := json.Marshal(jobs)
		if errMarshal != nil {
			fmt.Println(errMarshal)
		}
		os.WriteFile("jobstreet.json", dataBytes, 0700)
	})

	for _, url := range urls {
		c.Visit(url)
	}
	// c.Wait()
	return nil
}
// getUrlByProvince sends every search-result page URL grouped by province into pipe.
func getUrlByProvince(pipe chan<- string, wg *sync.WaitGroup) error {
	defer wg.Done()
	doc, err := getNewDocument(webPage)
	if err != nil {
		return err
	}
	// Get all search urls by province
	doc.Find("div[id=browse-locations] a[href]").Each(func(index int, province *goquery.Selection) {
		href, _ := province.Attr("href")
		urlProvince := fmt.Sprintf("https://www.jobstreet.vn%s", href)
		// Get total page count of each url by province
		totalPage, err := getTotalPage(urlProvince)
		if err != nil {
			fmt.Println(err)
		}
		// Send every result page of this province into pipe
		for page := 1; page <= totalPage; page++ {
			urlProvinceByPage := fmt.Sprintf("%s?p=%d", urlProvince, page)
			pipe <- urlProvinceByPage
		}
	})
	return nil
}
// getUrlByCategory sends every search-result page URL grouped by category into pipe.
func getUrlByCategory(pipe chan<- string, wg *sync.WaitGroup) error {
	defer wg.Done()
	doc, err := getNewDocument(webPage)
	if err != nil {
		return err
	}
	// Get all search urls by category
	doc.Find("div[id=browse-categories] a[href]").Each(func(index int, category *goquery.Selection) {
		href, _ := category.Attr("href")
		urlCategory := fmt.Sprintf("https://www.jobstreet.vn%s", href)
		docChild, err := getNewDocument(urlCategory)
		if err != nil {
			fmt.Println(err)
			return
		}
		// Get all search urls by category child
		docChild.Find("div[id=browse-keywords] a[href]").Each(func(index int, key *goquery.Selection) {
			href, _ := key.Attr("href")
			urlCategoryChild := fmt.Sprintf("https://www.jobstreet.vn%s", href)
			// Get total page count of each url by category child
			totalPage, err := getTotalPage(urlCategoryChild)
			if err != nil {
				fmt.Println(err)
			}
			// Send every result page of this child category into pipe
			for page := 1; page <= totalPage; page++ {
				urlCategoryChildByPage := fmt.Sprintf("%s?p=%d", urlCategoryChild, page)
				pipe <- urlCategoryChildByPage
			}
		})
	})
	return nil
}
// getTotalPage returns the number of search-result pages for url.
func getTotalPage(url string) (int, error) {
	var totalPage int
	doc, err := getNewDocument(url)
	if err != nil {
		return 0, err
	}
	pageStr := doc.Find("div.search-results-count strong:last-child").Text()
	if pageStr != "" {
		totalPage, err = strconv.Atoi(pageStr)
		if err != nil {
			return 0, err
		}
	}
	return totalPage, nil
}
// getNewDocument fetches url (with retries via Get) and parses the body into a goquery document.
func getNewDocument(url string) (*goquery.Document, error) {
	resp, err := Get(url)
	if err != nil {
		return nil, err
	}
	defer resp.Body.Close()
	if resp.StatusCode != 200 {
		log.Fatalf("status code error: %d %s", resp.StatusCode, resp.Status)
	}
	doc, err := goquery.NewDocumentFromReader(resp.Body)
	if err != nil {
		return nil, err
	}
	return doc, nil
}
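// schedule re-runs the crawler selected by index on every tick of timeSchedule.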
func schedule(timeSchedule time.Duration, index int) {
	ticker := time.NewTicker(timeSchedule)
	defer ticker.Stop()
	for range ticker.C {
		switch index {
		case 1:
			crawlJobStreet()
		}
	}
}

func main() {
	crawlJobStreet()
	// Re-crawl every 24 hours. schedule is called without `go` so that main
	// blocks here instead of exiting right after the first crawl.
	schedule(24*time.Hour, 1)
}
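
For reference, the channel fan-in that crawlJobStreet is built around, reduced to a minimal standalone sketch. The produce helper and its values are illustrative only and not part of the gist:

package main

import (
	"fmt"
	"sync"
)

// produce stands in for getUrlByProvince / getUrlByCategory: it sends a few
// values into out and signals the WaitGroup when done.
func produce(prefix string, out chan<- string, wg *sync.WaitGroup) {
	defer wg.Done()
	for i := 1; i <= 3; i++ {
		out <- fmt.Sprintf("%s-%d", prefix, i)
	}
}

func main() {
	pipe := make(chan string)
	done := make(chan bool)

	// Consumer: collect everything until the channel is closed.
	var urls []string
	go func() {
		for url := range pipe {
			urls = append(urls, url)
		}
		fmt.Println("received", len(urls), "urls")
		done <- true
	}()

	// Two producers feeding the same channel.
	var wg sync.WaitGroup
	wg.Add(2)
	go produce("province", pipe, &wg)
	go produce("category", pipe, &wg)

	// Close the channel only after both producers have finished.
	go func() {
		wg.Wait()
		close(pipe)
	}()

	<-done
}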