Created
May 10, 2018 14:58
-
-
Save imfht/4b5c062cfdcf2fd3537264f3dd8b9007 to your computer and use it in GitHub Desktop.
colly (a Golang web crawler framework) with auto-decode example
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
package main | |
import ( | |
"fmt" | |
"github.com/gocolly/colly" | |
"golang.org/x/net/html/charset" | |
"bytes" | |
"io/ioutil" | |
"github.com/saintfish/chardet" | |
"strings" | |
"regexp" | |
"github.com/astaxie/beego/logs" | |
"github.com/gocolly/colly/queue" | |
) | |
func get_head_encoding(response *colly.Response) string { | |
contentType := strings.ToLower(response.Headers.Get("content-type")) | |
rtn_value := "" | |
if len(contentType) > 0 { | |
if (strings.Contains(contentType, "charset")) { | |
re := regexp.MustCompile(`(?i)charset=(?P<charset>.*)`) | |
a := re.FindSubmatch([]byte(contentType)) | |
if len(a) > 0 { | |
rtn_value = string(a[1]) | |
} | |
} | |
} | |
return rtn_value | |
} | |
func get_body_encoding(response *colly.Response) string { | |
charset_re := regexp.MustCompile(`(?i)<meta.*?charset=["']*(?P<charset>.+?)["'>]`) | |
//println(string(response.Body[1:2000])) | |
temp := make([]byte, 1024) | |
if len(response.Body) > 1024 { | |
temp = response.Body[1:1024] | |
} else { | |
temp = response.Body | |
} | |
cs := charset_re.FindSubmatch(temp) | |
if len(cs) == 2 { | |
return string(cs[1]) | |
} | |
return "" | |
} | |
func convertToUTF8(str string, origEncoding string) string { | |
strBytes := []byte(str) | |
byteReader := bytes.NewReader(strBytes) | |
reader, _ := charset.NewReaderLabel(origEncoding, byteReader) | |
strBytes, _ = ioutil.ReadAll(reader) | |
return string(strBytes) | |
} | |
func test(some_text []byte) string { | |
detector := chardet.NewTextDetector() | |
result, err := detector.DetectBest(some_text) | |
if err == nil { | |
fmt.Printf( | |
"Detected charset is %s, language is %s\n", | |
result.Charset, | |
result.Language) | |
} | |
return result.Charset | |
} | |
func batch_test() { | |
// Instantiate default collector | |
c := colly.NewCollector( | |
// Visit only domains: hackerspaces.org, wiki.hackerspaces.org | |
//colly.AllowedDomains("hackerspaces.org", "wiki.hackerspaces.org"), | |
) | |
c.OnResponse(func(response *colly.Response) { | |
logs.Info("%s,%d,%s,%s\n", response.Request.URL.Host, response.StatusCode, get_head_encoding(response), get_body_encoding(response)) | |
}) | |
// Before making a request print "Visiting ..." | |
c.OnRequest(func(r *colly.Request) { | |
logs.Debug("Visiting", r.URL.String()) | |
}) | |
data, _ := ioutil.ReadFile("top-10000") | |
// Start scraping on https://hackerspaces.org | |
lines := string(data) | |
q, _ := queue.New( | |
2, // Number of consumer threads | |
&queue.InMemoryQueueStorage{MaxSize: 10000}, // Use default queue storage | |
) | |
for _, t := range strings.Split(lines, "\n") { | |
q.AddURL("http://" + t) | |
} | |
c.Async = true | |
q.Run(c) | |
c.Wait() | |
} | |
func main() { | |
batch_test() | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
The top-10000 input file looks like: