package main

import (
	"bytes"
	"errors"
	"fmt"
	"io"
	"log"
	"net/http"
	"os"
	"regexp"
	"strings"
	"sync"
	"time"

	"golang.org/x/net/html"
)
const (
	rootUrl  = "https://en.wikipedia.org"
	mainPage = "/wiki/Main_Page"
	sleepSec = 3
)

var (
	validUrl  *regexp.Regexp
	searchKey string
	wg        sync.WaitGroup
)
func init() {
	if len(os.Args) != 2 {
		fmt.Printf("usage: %s <search key>\n", os.Args[0])
		os.Exit(1)
	}
	searchKey = os.Args[1]
	// Only follow plain article links of the form /wiki/Some_Article.
	validUrl = regexp.MustCompile(`^/wiki/[a-zA-Z_]*$`)
}
// processQueue pops the first link off the queue, fetches it, and returns
// the page body together with the remaining queue.
func processQueue(links []string) (*bytes.Buffer, []string, error) {
	if len(links) == 0 {
		return nil, nil, errors.New("all links processed")
	}
	link := links[0]
	links = links[1:]

	// Be polite to the server between requests.
	time.Sleep(sleepSec * time.Second)

	resp, err := http.Get(link)
	if err != nil {
		return nil, links, err
	}
	defer resp.Body.Close()
	if resp.StatusCode != http.StatusOK {
		return nil, links, fmt.Errorf("GET %s: %s", link, resp.Status)
	}

	var b bytes.Buffer
	if _, err = io.Copy(&b, resp.Body); err != nil {
		return nil, links, err
	}
	return &b, links, nil
}
// parseOutGoing scans a fetched page for outgoing article links, appending
// each new one to the crawl queue. It returns the grown queue, plus true as
// soon as a link matches the search key.
func parseOutGoing(b *bytes.Buffer, links []string) ([]string, bool) {
	tokenizer := html.NewTokenizer(b)
	linkSet := make(map[string]bool)
	for {
		tt := tokenizer.Next()
		if tt == html.ErrorToken {
			// End of document: queue up every link we collected.
			for link := range linkSet {
				fmt.Printf("adding %s to crawl queue\n", link)
				links = append(links, link)
			}
			return links, false
		}
		token := tokenizer.Token()
		if token.Type == html.StartTagToken && token.Data == "a" {
			for _, attr := range token.Attr {
				if attr.Key == "href" {
					url := attr.Val
					if url != mainPage && validUrl.MatchString(url) {
						newLink := fmt.Sprintf("%s%s", rootUrl, url)
						if strings.Contains(strings.ToLower(newLink), strings.ToLower(searchKey)) {
							fmt.Printf("stopping search - we found %s\n", newLink)
							return links, true
						}
						linkSet[newLink] = true
					}
				}
			}
		}
		if token.Type == html.StartTagToken && token.Data == "title" {
			// The next token holds the page title text.
			tokenizer.Next()
			title := tokenizer.Token()
			fmt.Printf("visiting page %s\n", title)
		}
	}
}
// grep drains the crawl queue, fetching and parsing one page per iteration,
// until a link matches the search key or the queue runs dry.
func grep(links []string) {
	defer wg.Done()
	for {
		data, rest, err := processQueue(links)
		links = rest
		if err != nil {
			log.Println(err)
			if len(links) == 0 {
				return
			}
			continue
		}
		var found bool
		links, found = parseOutGoing(data, links)
		if found {
			return
		}
	}
}
func main() {
	// Seed the crawl with a random article and search outward from there.
	seed := []string{rootUrl + "/wiki/Special:Random"}
	wg.Add(1)
	go grep(seed)
	wg.Wait()
}
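A quick way to try it, assuming the gist is saved locally as main.go (the filename is arbitrary) and the one external dependency has been fetched:

    go get golang.org/x/net/html
    go run main.go Golang

The crawler starts from a random article and keeps following /wiki/ links, printing each page title it visits, until some outgoing link contains the search key (here "Golang", matched case-insensitively) or the queue empties.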