Skip to content

Instantly share code, notes, and snippets.

@abhijat
Last active February 11, 2018 10:09
Show Gist options
  • Save abhijat/673a5065c80fe4354e949563240128fb to your computer and use it in GitHub Desktop.
package main
import (
"net/http"
"log"
"golang.org/x/net/html"
"regexp"
"fmt"
"errors"
"bytes"
"io"
"strings"
"time"
"os"
"sync"
)
// rootUrl/mainPage identify the crawl target; sleepSec is the politeness
// delay (in seconds) applied before each HTTP fetch in processQueue.
const (
rootUrl = "https://en.wikipedia.org"
mainPage = "/wiki/Main_Page"
sleepSec = 3
)
// Package-level crawl state, shared between main and the grep goroutine.
var (
validUrl *regexp.Regexp // compiled in init; accepts plain /wiki/Title paths only
searchKey string // substring to look for (compared case-insensitively); set from os.Args[1]
wg sync.WaitGroup // lets main block until the search goroutine signals completion
)
func init() {
if len(os.Args) != 2 {
fmt.Printf("usage: %s <search key>\n", os.Args[0])
os.Exit(1)
}
searchKey = os.Args[1]
validUrl = regexp.MustCompile(`^/wiki/[a-zA-Z_]*$`)
}
// processQueue takes the first link off the queue, fetches it after a
// politeness delay, and returns the page body.
//
// It returns an error when the queue is empty, the fetch fails, or the
// server answers with a non-200 status.
//
// NOTE(review): the reslice below updates only this function's local
// slice header — the caller's queue is never shortened, so the caller
// keeps re-submitting the same head element. Confirm whether the
// remaining queue should be returned to the caller.
func processQueue(links []string) (*bytes.Buffer, error) {
	if len(links) == 0 {
		return nil, errors.New("all links processed")
	}
	link, links := links[0], links[1:]
	_ = links
	// Politeness delay so we do not hammer the server.
	time.Sleep(sleepSec * time.Second)
	resp, err := http.Get(link)
	if err != nil {
		return nil, err
	}
	defer resp.Body.Close()
	if resp.StatusCode != http.StatusOK {
		// BUG FIX: the original returned (nil, err) here, but err is
		// always nil on this path, so callers received (nil, nil) and
		// passed a nil buffer downstream. Return a real error instead.
		return nil, fmt.Errorf("fetching %s: unexpected status %s", link, resp.Status)
	}
	var b bytes.Buffer
	if _, err := io.Copy(&b, resp.Body); err != nil {
		return nil, err
	}
	return &b, nil
}
// parseOutGoing tokenizes the fetched page, prints the page title as it
// is encountered, and collects outgoing article links. If any candidate
// link contains the search key (case-insensitive), it reports the hit,
// releases the WaitGroup so main can exit, and returns immediately.
//
// NOTE(review): the appends at end-of-document write into this
// function's local copy of the links slice header, so the caller never
// observes the new links — the crawl queue never actually grows.
// Confirm whether the updated queue should be returned instead.
func parseOutGoing(b *bytes.Buffer, links []string) {
	// Robustness: a failed fetch can hand us a nil buffer, and
	// html.NewTokenizer would panic on the first Read from it.
	if b == nil {
		return
	}
	tokenizer := html.NewTokenizer(b)
	linkSet := make(map[string]bool) // dedupes hrefs within this page
	for {
		tt := tokenizer.Next()
		if tt == html.ErrorToken {
			// End of document (or parse error): queue the unique links.
			for link := range linkSet {
				fmt.Printf("adding %s to crawl queue\n", link)
				links = append(links, link)
			}
			return
		}
		token := tokenizer.Token()
		// Matches both <a> start and end tags; end tags carry no
		// attributes, so the inner loop is a no-op for them.
		if token.Data == "a" {
			for _, attr := range token.Attr {
				if attr.Key == "href" {
					url := attr.Val
					if url != mainPage && validUrl.MatchString(url) {
						newLink := fmt.Sprintf("%s%s", rootUrl, url)
						if strings.Contains(strings.ToLower(newLink), strings.ToLower(searchKey)) {
							fmt.Printf("stopping search - we found %s\n", newLink)
							wg.Done()
							return
						}
						linkSet[newLink] = true
					}
				}
			}
		}
		if token.Data == "title" && token.Type == html.StartTagToken {
			// Consume the text node that follows <title> for progress output.
			tokenizer.Next()
			title := tokenizer.Token()
			fmt.Printf("visiting page %s\n", title)
		}
	}
}
// grep drives the crawl loop: it repeatedly pops a link off the queue,
// fetches it, and scans the body for the search key. It stops — and
// releases the WaitGroup so main can exit — once the queue can no
// longer be processed.
func grep(links []string) {
	for {
		data, err := processQueue(links)
		if err != nil {
			// BUG FIX: the original logged the error and then fed a nil
			// buffer to parseOutGoing (a panic via html.NewTokenizer),
			// and looped forever once the queue drained without ever
			// calling wg.Done. Log, signal completion, and stop instead.
			log.Println(err)
			wg.Done()
			return
		}
		parseOutGoing(data, links)
	}
}
// main seeds the crawl with a random Wikipedia article and blocks until
// the background search signals completion through the WaitGroup.
func main() {
	wg.Add(1)
	go grep([]string{"https://en.wikipedia.org/wiki/Special:Random"})
	wg.Wait()
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment