@phagenlocher
Created August 18, 2016 13:00
A simple web crawler written in Go
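The crawler limits the number of concurrent fetches with a simple token scheme: a buffered channel of booleans serves as a semaphore, and each goroutine takes a token before issuing its HTTP request and hands it back when it returns. Links discovered in the fetched pages are sent back to the main loop over a second buffered channel, deduplicated with a map, and recorded in a SQLite database (urls.db).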
package main

import (
	"fmt"
	"net/http"
	"os"
	"strconv"
	"strings"

	"github.com/mxk/go-sqlite/sqlite3"
	"golang.org/x/net/html"
)
// crawl fetches the given URL, extracts all absolute links from its
// HTML and sends them on next_url. The r_token channel acts as a
// semaphore limiting the number of concurrent fetches.
func crawl(url string, next_url chan string, r_token chan bool) {
	// Take a token (blocks until a token is available in the channel)
	<-r_token
	// Give the token back at function exit
	defer func() {
		r_token <- true
	}()
	// Get the HTML for our URL
	resp, err := http.Get(url)
	if err != nil {
		return
	}
	defer resp.Body.Close()
	// Print info
	fmt.Println("Crawled", url)
	// Tokenize the HTML
	z := html.NewTokenizer(resp.Body)
	for {
		// If the HTML has ended, we break out of the loop
		token := z.Next()
		if token == html.ErrorToken {
			break
		}
		// A new tag has started
		if token == html.StartTagToken {
			// Check if the token is an <a> tag
			if name, _ := z.TagName(); string(name) == "a" {
				for {
					// Get the next attribute
					key, val, more := z.TagAttr()
					// Check if the attribute is "href"
					if string(key) == "href" {
						href := string(val)
						// Only absolute http(s) URLs are followed;
						// relative links are skipped.
						if strings.HasPrefix(href, "http://") ||
							strings.HasPrefix(href, "https://") {
							// The URL is valid, so send it to the URL channel
							next_url <- href
						}
					}
					// There are no more attributes, so we break out
					// of the attribute search loop.
					if !more {
						break
					}
				}
			}
		}
	}
}
// parseArguments reads the number of goroutines, the number of URLs
// to collect and the start URL from the command line. The boolean
// return value reports whether parsing failed.
func parseArguments() (int, int, string, bool) {
	// Check for the correct number of arguments
	if len(os.Args) != 4 {
		// Print usage
		fmt.Println("Invalid arguments!")
		fmt.Printf("Usage: %s $1 $2 $3\n", os.Args[0])
		fmt.Println(" $1: Number of concurrent goroutines")
		fmt.Println(" $2: Number of URLs to search for")
		fmt.Println(" $3: URL to start crawling on")
		fmt.Printf("\nExample: %s 100 1000 http://example.com\n", os.Args[0])
		return 0, 0, "", true
	}
	// Convert the numeric arguments, rejecting anything that is
	// not a positive number
	first, err1 := strconv.Atoi(os.Args[1])
	second, err2 := strconv.Atoi(os.Args[2])
	if err1 != nil || err2 != nil || first < 1 || second < 1 {
		fmt.Println("The first two arguments must be positive numbers!")
		return 0, 0, "", true
	}
	third := os.Args[3]
	return first, second, third, false
}
func main() {
	// Parse arguments
	num_routines, num_urls, start_url, fail := parseArguments()
	if fail {
		return
	}
	// Make a buffered channel of strings for the URLs
	next_url := make(chan string, num_routines)
	next_url <- start_url
	// Create a buffered channel holding the tokens
	r_tokens := make(chan bool, num_routines)
	for i := 0; i < num_routines; i++ {
		r_tokens <- true
	}
	// Create a map (string -> bool) to check whether a URL
	// has already been visited
	m := map[string]bool{}
	// Create/connect to the database
	c, err := sqlite3.Open("urls.db")
	if err != nil {
		fmt.Println("Could not open urls.db:", err)
		return
	}
	defer c.Close()
	// Create a table for our URLs (the error is ignored in case
	// the table already exists)
	c.Exec("CREATE TABLE urls(url TEXT)")
	for i := 0; i < num_urls; {
		// Get the next URL
		url := <-next_url
		// If the URL has already been visited, skip it
		if m[url] {
			continue
		}
		// Otherwise mark the URL as visited, increment the
		// counter, insert it into the database and start a
		// new crawl goroutine for it.
		m[url] = true
		i++
		c.Exec("INSERT INTO urls VALUES(?)", url)
		go crawl(url, next_url, r_tokens)
	}
}
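To try it out, assuming the file is saved as main.go and both dependencies have been fetched into the GOPATH (this gist predates Go modules):

go get github.com/mxk/go-sqlite/sqlite3 golang.org/x/net/html
go run main.go 100 1000 http://example.com

The collected URLs land in urls.db in the working directory and can be inspected with the sqlite3 command line tool, for example: sqlite3 urls.db "SELECT count(*) FROM urls".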