Last active
January 8, 2022 08:48
-
-
Save toannd96/e4d7ddcac5abaee5d7ab557900a43bc1 to your computer and use it in GitHub Desktop.
crawl masothue.com
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
package main | |
import ( | |
"encoding/json" | |
"fmt" | |
"net/http" | |
"os" | |
"sync" | |
"time" | |
"github.com/PuerkitoBio/goquery" | |
"github.com/cenkalti/backoff" | |
) | |
// Crawler configuration.
const (
	// basePath is the scheme and host that every crawled URL is built from.
	basePath = "https://www.masothue.com"

	// TypeCompanyPath is a site path for the company-type lookup index.
	// NOTE(review): not referenced by the code visible in this file.
	TypeCompanyPath = "/tra-cuu-ma-so-thue-theo-loai-hinh-doanh-nghiep"

	// TypeBusinessPath is the site path of the business-line index page that
	// getUrl starts crawling from.
	TypeBusinessPath = "/tra-cuu-ma-so-thue-theo-nganh-nghe"

	// fileName is the file the collected result is written to as JSON.
	fileName = "masothue.json"

	// maxRetry bounds the HTTP retry policy in Get: it is used both as the
	// largest single wait and as the total time budget for retries.
	maxRetry = 3 * time.Minute
)
type CompanyInfo struct { | |
Name string `json:"name"` | |
TaxInfo map[string]string `json:"tax_info"` | |
Business []BusinessInfo `json:"business_info"` | |
} | |
// BusinessInfo is one row of a company's business table.
type BusinessInfo struct {
	// ID is the text of the row's first cell.
	ID string `json:"id"`

	// Carees is the text of the row's second cell.
	// NOTE(review): presumably a misspelling of "careers"; the name and JSON
	// key are kept as-is because they are part of the emitted file format.
	Carees string `json:"carees"`
}
type Company struct { | |
List []CompanyInfo `json:"company"` | |
TotalCompany int `json:"total_company"` | |
} | |
func NewCompany() *CompanyInfo { | |
return &CompanyInfo{ | |
TaxInfo: make(map[string]string), | |
} | |
} | |
// httpClient is shared by every request so that connections can be reused.
// The timeout covers the whole exchange (connect, headers and body read), so
// a stalled server cannot block the crawl forever.
var httpClient = &http.Client{Timeout: 30 * time.Second}

// get performs a single HTTP GET for url, without any retry.
//
// On success the caller owns the response and must close its Body. On failure
// the returned response is always nil.
func get(url string) (*http.Response, error) {
	req, err := http.NewRequest(http.MethodGet, url, nil)
	if err != nil {
		return nil, err
	}

	resp, err := httpClient.Do(req)
	if err != nil {
		return nil, err
	}
	return resp, nil
}
// Get http request with backoff retry | |
func Get(url string) (*http.Response, error) { | |
var err error | |
var resp *http.Response | |
bo := backoff.NewExponentialBackOff() | |
bo.MaxInterval = maxRetry | |
bo.MaxElapsedTime = maxRetry | |
for { | |
resp, err = get(url) | |
if err == nil { | |
break | |
} | |
fmt.Println("BackOff retry") | |
d := bo.NextBackOff() | |
if d == backoff.Stop { | |
fmt.Println("Retry time out") | |
break | |
} | |
fmt.Println("Retry in ", d) | |
time.Sleep(d) | |
} | |
if err != nil { | |
return nil, err | |
} | |
return resp, nil | |
} | |
// Get html document from url | |
func getNewDocument(url string) (*goquery.Document, error) { | |
resp, err := Get(url) | |
if err != nil { | |
fmt.Println(err) | |
} | |
defer resp.Body.Close() | |
if resp.StatusCode != 200 { | |
fmt.Printf("status code error: %d %s", resp.StatusCode, resp.Status) | |
} | |
doc, err := goquery.NewDocumentFromReader(resp.Body) | |
if err != nil { | |
fmt.Println(err) | |
} | |
return doc, nil | |
} | |
func crawlMasothue() { | |
var wg sync.WaitGroup | |
var allCompany []CompanyInfo | |
pipe := make(chan string) | |
done := make(chan bool) | |
go func() { | |
for { | |
url, more := <-pipe | |
if more { | |
fmt.Println("Extract url", url) | |
oneCompany, _ := extractCompanyInfo(url) | |
allCompany = append(allCompany, oneCompany...) | |
} else { | |
fmt.Println("Extract all url") | |
company := Company{ | |
List: allCompany, | |
TotalCompany: len(allCompany), | |
} | |
dataBytes, errMarshal := json.Marshal(company) | |
if errMarshal != nil { | |
fmt.Println(errMarshal) | |
} | |
os.WriteFile(fileName, dataBytes, 0700) | |
done <- true | |
return | |
} | |
} | |
}() | |
wg.Add(1) | |
go getUrl(pipe, &wg) | |
go func() { | |
wg.Wait() | |
close(pipe) | |
}() | |
<-done | |
} | |
func getUrl(pipe chan<- string, wg *sync.WaitGroup) error { | |
defer wg.Done() | |
doc, err := getNewDocument(basePath + TypeBusinessPath) | |
if err != nil { | |
return err | |
} | |
doc.Find("table tbody").Each(func(index int, tableHtml *goquery.Selection) { | |
tableHtml.Find("tr").Each(func(indexTr int, rowHtml *goquery.Selection) { | |
rowHtml.Find("td:last-child a[href]").Each(func(ndexTd int, tableCell *goquery.Selection) { | |
href, _ := tableCell.Attr("href") | |
for page := 1; page <= 10; page++ { | |
urlTypeCompany := fmt.Sprintf("%s%s?page=%d", basePath, href, page) | |
docChild, _ := getNewDocument(urlTypeCompany) | |
docChild.Find("div.tax-listing h3 a[href]").Each(func(index int, info *goquery.Selection) { | |
href, _ := info.Attr("href") | |
urlInfoCompany := fmt.Sprintf("%s%s", basePath, href) | |
pipe <- urlInfoCompany | |
}) | |
} | |
}) | |
}) | |
}) | |
return nil | |
} | |
func extractCompanyInfo(url string) ([]CompanyInfo, error) { | |
var company Company | |
companyInfo := NewCompany() | |
doc, err := getNewDocument(url) | |
if err != nil { | |
return nil, err | |
} | |
// extract tax info | |
doc.Find("table.table-taxinfo").Each(func(index int, tableTaxHtml *goquery.Selection) { | |
tableTaxHtml.Find("th span.copy").Each(func(indexTr int, rowTaxHtml *goquery.Selection) { | |
companyInfo.Name = rowTaxHtml.Text() | |
}) | |
tableTaxHtml.Find("tbody tr").Each(func(indexTr int, rowTaxHtml *goquery.Selection) { | |
row := make([]string, 0) | |
rowTaxHtml.Find("td").Each(func(ndexTd int, tableCell *goquery.Selection) { | |
row = append(row, tableCell.Text()) | |
}) | |
if len(row) != 1 { | |
companyInfo.TaxInfo[row[0]] = row[1] | |
} | |
}) | |
}) | |
// extract type business | |
doc.Find("table.table").Each(func(index int, tableBusinessHtml *goquery.Selection) { | |
tableBusinessHtml.Find("tbody tr").Each(func(indexTr int, rowBusinessHtml *goquery.Selection) { | |
row := make([]string, 0) | |
rowBusinessHtml.Find("td").Each(func(ndexTd int, tableCell *goquery.Selection) { | |
row = append(row, tableCell.Text()) | |
}) | |
businessInfo := BusinessInfo{ | |
ID: row[0], | |
Carees: row[1], | |
} | |
companyInfo.Business = append(companyInfo.Business, businessInfo) | |
}) | |
}) | |
company.List = append(company.List, *companyInfo) | |
return company.List, nil | |
} | |
func main() { | |
crawlMasothue() | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment