Use Go to crawl website URLs: a small concurrent crawler that collects same-site links with goquery.
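To try it, assuming a Go toolchain with a GOPATH set up, fetching the single dependency and running the file should suffice: go get github.com/PuerkitoBio/goquery, then go run it.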
package main

import (
    "fmt"
    "net/url"
    "strings"
    "time"

    "github.com/PuerkitoBio/goquery"
)

const (
    uriSuffix = ".html" // page suffix that marks a crawlable URL
)
func main() {
    start := time.Now()
    // seed URLs
    initURLs := []string{"http://xiezhenye.com/", "http://www.laruence.com/"}
    // URLs that have already been collected
    store := make(map[string]bool)
    // channel carrying the URLs discovered by each crawl
    chanURL := make(chan []string, 10000)
    // number of outstanding crawls; the main loop stops when it reaches zero
    wait := 0
    // semaphore limiting the number of concurrent crawler goroutines
    chanLink := make(chan struct{}, 10)
    // start one goroutine per seed URL
    for _, url := range initURLs {
        wait++
        chanLink <- struct{}{}
        go func(url string) {
            chanURL <- link(url)
            <-chanLink
        }(url)
    }
    // main loop: consume one result per iteration, spawn one crawl per new URL
    for ; wait > 0; wait-- {
        urls := <-chanURL
        if urls == nil {
            continue
        }
        for _, url := range urls {
            // skip URLs that have already been collected
            if _, ok := store[url]; ok {
                fmt.Println("repeat!!! ->", url)
                continue
            }
            // record that this URL has been seen
            store[url] = true
            // start crawling it
            fmt.Println("crawl -> ", url)
            wait++
            chanLink <- struct{}{}
            go func(url string) {
                chanURL <- link(url)
                <-chanLink
            }(url)
        }
    }
    // done: print the collected URLs
    i := 0
    for url := range store {
        i++
        fmt.Printf("%6d %s\n", i, url)
    }
    fmt.Printf("total: %d, elapsed: %v\n", len(store), time.Since(start))
}
// link crawls a page and returns the URLs on it that match the crawl rule
func link(url string) []string {
    urls := crawl(url)
    newURLs := make([]string, 0, len(urls))
    for _, href := range urls {
        href = fixURL(href, url)
        if strings.Contains(href, uriSuffix) {
            newURLs = append(newURLs, href)
        }
    }
    return newURLs
}
// crawl fetches a page and extracts every URL on it
func crawl(url string) []string {
    // NewDocument fetches and parses the page (deprecated in newer
    // goquery releases in favor of NewDocumentFromReader)
    dom, err := goquery.NewDocument(url)
    if err != nil {
        return nil
    }
    var urls []string
    as := dom.Find("a")
    for i := 0; i < as.Size(); i++ {
        if href, exist := as.Eq(i).Attr("href"); exist {
            urls = append(urls, href)
        }
    }
    return urls
}
// fixURL filters out off-site URLs and resolves relative paths
func fixURL(href, uri string) string {
    if href == "" {
        return ""
    }
    // drop pure anchor links
    if href[0] == '#' {
        return ""
    }
    // strip the fragment from other links
    if pos := strings.Index(href, "#"); pos > 0 {
        href = href[0:pos]
    }
    // filter javascript: links
    if strings.Contains(href, "javascript:") {
        return ""
    }
    // filter mailto: links
    if strings.Contains(href, "mailto:") {
        return ""
    }
    u, err := url.Parse(uri)
    if err != nil {
        return ""
    }
    // absolute URL: keep it only if it stays on the same host
    if strings.Contains(href, "://") {
        if !strings.HasPrefix(href, u.Scheme+"://"+u.Host) {
            return ""
        }
        return href
    }
    // root-relative path
    if href[0] == '/' {
        return u.Scheme + "://" + u.Host + href
    }
    // relative path: resolve against the directory of the current page,
    // not the full page path (otherwise "a/b.html" + "c.html" would
    // yield "a/b.html/c.html")
    dir := u.Path
    if pos := strings.LastIndex(dir, "/"); pos >= 0 {
        dir = dir[:pos]
    }
    return u.Scheme + "://" + u.Host + dir + "/" + href
}
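As an aside, the hand-rolled same-host filtering and relative-path handling in fixURL could also lean on net/url's ResolveReference, which resolves "../" segments, root-relative paths, and scheme-relative URLs per RFC 3986. A minimal sketch under the same same-host policy, reusing the net/url import above (resolveURL is a hypothetical helper, not part of the gist):

func resolveURL(href, page string) string {
    base, err := url.Parse(page)
    if err != nil {
        return ""
    }
    ref, err := url.Parse(href)
    if err != nil {
        return ""
    }
    // RFC 3986 resolution of href against the page it was found on
    abs := base.ResolveReference(ref)
    // off-site links (and non-http schemes such as mailto:) end up
    // with a different or empty host and are dropped
    if abs.Host != base.Host {
        return ""
    }
    abs.Fragment = "" // drop anchors, as fixURL does
    return abs.String()
}

One behavioral difference: a pure-anchor href resolves back to the page's own URL rather than to an empty string, which the dedup map in main would absorb.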