Crawl JobStreet job listings using channels, colly, and goquery.
package main

import (
	"encoding/json"
	"fmt"
	"log"
	"net/http"
	"os"
	"strconv"
	"sync"
	"time"

	"github.com/PuerkitoBio/goquery"
	"github.com/cenkalti/backoff"
	"github.com/gocolly/colly"
)
const webPage = "https://www.jobstreet.vn/t%C3%ACmvi%E1%BB%87c"

// Job holds the fields extracted for a single job posting.
type Job struct {
	Title     string `json:"title"`
	Company   string `json:"company"`
	Location  string `json:"location"`
	Descript  string `json:"descript"`
	Url       string `json:"url"`
	Site      string `json:"site"`
	CreatedAt string `json:"created_at"`
}

// Jobs is the accumulated crawl result written to jobstreet.json.
type Jobs struct {
	List      []Job `json:"jobs"`
	TotalJobs int   `json:"total_jobs"`
}

const (
	// maxRetry bounds both the backoff interval and the total time spent
	// retrying a failed request in Get.
	maxRetry = 3 * time.Minute
)
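// get performs a single HTTP GET request to url and returns the raw response.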
func get(url string) (*http.Response, error) {
	req, err := http.NewRequest("GET", url, nil)
	if err != nil {
		return nil, err
	}
	client := &http.Client{}
	resp, err := client.Do(req)
	if err != nil {
		return nil, err
	}
	return resp, nil
}
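// Get retries get with exponential backoff, capping both the retry interval
// and the total elapsed time at maxRetry.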
func Get(url string) (*http.Response, error) {
	var err error
	var resp *http.Response
	bo := backoff.NewExponentialBackOff()
	bo.MaxInterval = maxRetry
	bo.MaxElapsedTime = maxRetry
	for {
		resp, err = get(url)
		if err == nil {
			break
		}
		d := bo.NextBackOff()
		if d == backoff.Stop {
			break
		}
		time.Sleep(d)
	}
	if err != nil {
		return nil, err
	}
	return resp, nil
}
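// crawlJobStreet fans in search-result URLs from two producer goroutines over
// the pipe channel, then extracts job details once the channel is closed.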
func crawlJobStreet() {
	var urls []string
	pipe := make(chan string)
	done := make(chan bool)

	// Consumer: collect URLs until the producers close the channel, then crawl them.
	go func() {
		for {
			url, more := <-pipe
			if more {
				fmt.Println("Received url", url)
				urls = append(urls, url)
				fmt.Println("Appended url, total", len(urls))
			} else {
				fmt.Println("Received all urls", len(urls))
				extractInfoJob(urls)
				done <- true
				return
			}
		}
	}()

	// Producers: both goroutines send search URLs into pipe; the channel is
	// closed once both have finished.
	var wg sync.WaitGroup
	wg.Add(2)
	go getUrlByProvince(pipe, &wg)
	go getUrlByCategory(pipe, &wg)
	go func() {
		wg.Wait()
		close(pipe)
	}()

	<-done
}
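// extractInfoJob visits every search-result URL, scrapes each job card and
// its detail page, and writes the accumulated result to jobstreet.json.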
func extractInfoJob(urls []string) error {
	var jobs Jobs
	var job Job

	c := colly.NewCollector(
		// colly.Async(true),
	)
	// c.Limit(&colly.LimitRule{
	// 	Parallelism: 2,
	// })
	c.SetRequestTimeout(120 * time.Second)

	// A cloned collector visits the job detail pages, so the detail handlers
	// are registered once instead of once per job card.
	detail := c.Clone()

	for _, col := range []*colly.Collector{c, detail} {
		col.OnRequest(func(r *colly.Request) {
			fmt.Println("Visiting", r.URL)
		})
		col.OnError(func(r *colly.Response, err error) {
			fmt.Println(err)
		})
	}

	detail.OnHTML("div[class=heading-xsmall]", func(e *colly.HTMLElement) {
		job.Site = e.ChildText("span.site")
		job.CreatedAt = e.ChildText("span.listed-date")
	})
	detail.OnHTML("div[class=-desktop-no-padding-top]", func(e *colly.HTMLElement) {
		job.Descript = e.Text
	})

	c.OnHTML(".jobresults .job-card", func(e *colly.HTMLElement) {
		job = Job{
			Url:      "https://www.jobstreet.vn" + e.ChildAttr("h3.job-title > a", "href"),
			Title:    e.ChildText("h3.job-title > a"),
			Company:  e.ChildText("span.job-company"),
			Location: e.ChildText("span.job-location"),
		}
		// The collectors run synchronously, so the detail handlers above
		// fill in job before Visit returns.
		detail.Visit(e.Request.AbsoluteURL(job.Url))

		// Listings re-posted from TopCV carry no description.
		if job.Site == "TopCV" {
			job.Descript = ""
		}

		jobs.TotalJobs++
		jobs.List = append(jobs.List, job)

		dataBytes, errMarshal := json.Marshal(jobs)
		if errMarshal != nil {
			fmt.Println(errMarshal)
		}
		os.WriteFile("jobstreet.json", dataBytes, 0700)
	})

	for _, url := range urls {
		c.Visit(url)
	}
	// c.Wait()
	return nil
}
// getUrlByProvince sends every search-result page URL grouped by province into pipe.
func getUrlByProvince(pipe chan<- string, wg *sync.WaitGroup) error {
	defer wg.Done()
	doc, err := getNewDocument(webPage)
	if err != nil {
		return err
	}
	// Get all search urls by province
	doc.Find("div[id=browse-locations] a[href]").Each(func(index int, province *goquery.Selection) {
		href, _ := province.Attr("href")
		urlProvince := fmt.Sprintf("https://www.jobstreet.vn%s", href)
		// Get total page count of each url by province
		totalPage, err := getTotalPage(urlProvince)
		if err != nil {
			fmt.Println(err)
		}
		// Send every result page of this province into pipe
		for page := 1; page <= totalPage; page++ {
			urlProvinceByPage := fmt.Sprintf("%s?p=%d", urlProvince, page)
			pipe <- urlProvinceByPage
		}
	})
	return nil
}
// getUrlByCategory sends every search-result page URL grouped by category into pipe.
func getUrlByCategory(pipe chan<- string, wg *sync.WaitGroup) error {
	defer wg.Done()
	doc, err := getNewDocument(webPage)
	if err != nil {
		return err
	}
	// Get all search urls by category
	doc.Find("div[id=browse-categories] a[href]").Each(func(index int, category *goquery.Selection) {
		href, _ := category.Attr("href")
		urlCategory := fmt.Sprintf("https://www.jobstreet.vn%s", href)
		docChild, err := getNewDocument(urlCategory)
		if err != nil {
			fmt.Println(err)
			return
		}
		// Get all search urls by category child
		docChild.Find("div[id=browse-keywords] a[href]").Each(func(index int, key *goquery.Selection) {
			href, _ := key.Attr("href")
			urlCategoryChild := fmt.Sprintf("https://www.jobstreet.vn%s", href)
			// Get total page count of each url by category child
			totalPage, err := getTotalPage(urlCategoryChild)
			if err != nil {
				fmt.Println(err)
			}
			// Send every result page of this child category into pipe
			for page := 1; page <= totalPage; page++ {
				urlCategoryChildByPage := fmt.Sprintf("%s?p=%d", urlCategoryChild, page)
				pipe <- urlCategoryChildByPage
			}
		})
	})
	return nil
}
// getTotalPage returns the number of search-result pages for url.
func getTotalPage(url string) (int, error) {
	var totalPage int
	doc, err := getNewDocument(url)
	if err != nil {
		return 0, err
	}
	pageStr := doc.Find("div.search-results-count strong:last-child").Text()
	if pageStr != "" {
		totalPage, err = strconv.Atoi(pageStr)
		if err != nil {
			return 0, err
		}
	}
	return totalPage, nil
}
// getNewDocument fetches url (with retries via Get) and parses the body into a goquery document.
func getNewDocument(url string) (*goquery.Document, error) {
	resp, err := Get(url)
	if err != nil {
		return nil, err
	}
	defer resp.Body.Close()
	if resp.StatusCode != 200 {
		log.Fatalf("status code error: %d %s", resp.StatusCode, resp.Status)
	}
	doc, err := goquery.NewDocumentFromReader(resp.Body)
	if err != nil {
		return nil, err
	}
	return doc, nil
}
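// schedule re-runs the crawler selected by index on every tick of timeSchedule.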
func schedule(timeSchedule time.Duration, index int) {
	ticker := time.NewTicker(timeSchedule)
	defer ticker.Stop()
	for range ticker.C {
		switch index {
		case 1:
			crawlJobStreet()
		}
	}
}

func main() {
	crawlJobStreet()
	// Re-crawl every 24 hours. schedule is called without `go` so that main
	// blocks here instead of exiting right after the first crawl.
	schedule(24*time.Hour, 1)
}
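
For reference, the channel fan-in that crawlJobStreet is built around, reduced to a minimal standalone sketch. The produce helper and its values are illustrative only and not part of the gist:

package main

import (
	"fmt"
	"sync"
)

// produce stands in for getUrlByProvince / getUrlByCategory: it sends a few
// values into out and signals the WaitGroup when done.
func produce(prefix string, out chan<- string, wg *sync.WaitGroup) {
	defer wg.Done()
	for i := 1; i <= 3; i++ {
		out <- fmt.Sprintf("%s-%d", prefix, i)
	}
}

func main() {
	pipe := make(chan string)
	done := make(chan bool)

	// Consumer: collect everything until the channel is closed.
	var urls []string
	go func() {
		for url := range pipe {
			urls = append(urls, url)
		}
		fmt.Println("received", len(urls), "urls")
		done <- true
	}()

	// Two producers feeding the same channel.
	var wg sync.WaitGroup
	wg.Add(2)
	go produce("province", pipe, &wg)
	go produce("category", pipe, &wg)

	// Close the channel only after both producers have finished.
	go func() {
		wg.Wait()
		close(pipe)
	}()

	<-done
}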