Created
August 18, 2016 13:00
-
-
Save phagenlocher/a7998abd657d9d51f487f4af99acf81d to your computer and use it in GitHub Desktop.
A simple web crawler written in Go
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| package main | |
| import ( | |
| "os" | |
| "strconv" | |
| "fmt" | |
| "github.com/mxk/go-sqlite/sqlite3" | |
| "golang.org/x/net/html" | |
| "net/http" | |
| "strings" | |
| ) | |
| func crawl(url string, next_url chan string, r_token chan bool) { | |
| // Take a token (waits until a new token appears in a | |
| // empty channel) | |
| <-r_token | |
| // Give a token back at function exit | |
| defer func() { | |
| r_token <- true | |
| }() | |
| // Get the HTML for our url | |
| resp, err := http.Get(url) | |
| if err != nil { | |
| return | |
| } | |
| defer resp.Body.Close() | |
| // Print info | |
| fmt.Println("Crawled", url) | |
| // Tokenize the HTML | |
| z := html.NewTokenizer(resp.Body) | |
| for { | |
| // If the HTML has ended, we break out of the loop | |
| token := z.Next() | |
| if token == html.ErrorToken { | |
| break | |
| } | |
| // New Token started | |
| if token == html.StartTagToken { | |
| // Check if the token is an <a> tag | |
| if name, _ := z.TagName(); string(name) == "a" { | |
| for { | |
| // Get the next attribute | |
| name, val, more := z.TagAttr() | |
| // Check if the attribute is "href" | |
| if string(name) == "href" { | |
| // Cast Url | |
| url = string(val) | |
| // Check if the URL is valid | |
| if !strings.HasPrefix(url, "http://") { | |
| if !strings.HasPrefix(url, "https://") { | |
| continue | |
| } | |
| } | |
| // The URL is valid so send it to the Url channel | |
| next_url <- url | |
| } | |
| // There are no more attributes so we break out of the | |
| // attribute search loop. | |
| if !more { | |
| break | |
| } | |
| } | |
| } | |
| } | |
| } | |
| } | |
// parseArguments reads the three command-line arguments from os.Args.
//
// It returns, in order: the number of concurrent crawlers, the number of
// URLs to search for, the URL to start crawling on, and a flag that is true
// when the arguments are unusable. When the flag is true, usage help has been
// printed to standard output and the other results are zero values.
//
// Arguments are rejected when there are not exactly three of them, when
// either number is not an integer, when the crawler count is below 1, or
// when the URL count is negative.
func parseArguments() (int, int, string, bool) {
	// usage prints the help text and yields the "invalid" result tuple.
	usage := func() (int, int, string, bool) {
		fmt.Println("Invalid arguments!")
		fmt.Printf("Usage: %s $1 $2 $3\n", os.Args[0])
		fmt.Println(" $1: Number of concurrent processes")
		fmt.Println(" $2: Number of URLs to search for")
		fmt.Println(" $3: URL to start crawling on")
		fmt.Printf("\nExample: %s 100 1000 http://example.com\n", os.Args[0])
		return 0, 0, "", true
	}

	if len(os.Args) != 4 {
		return usage()
	}

	// main uses the crawler count as a channel capacity, so a value that
	// failed to parse (which Atoi reports as 0) or is not positive must not
	// be passed on.
	routines, err := strconv.Atoi(os.Args[1])
	if err != nil || routines < 1 {
		return usage()
	}

	urls, err := strconv.Atoi(os.Args[2])
	if err != nil || urls < 0 {
		return usage()
	}

	return routines, urls, os.Args[3], false
}
| func main() { | |
| // Parse arguments | |
| num_routines, num_urls, start_url, err := parseArguments() | |
| if err { | |
| return | |
| } | |
| // Make buffered channel with strings for the urls | |
| next_url := make(chan string, num_routines) | |
| next_url <- start_url | |
| // Create buffered channel for tokens | |
| r_tokens := make(chan bool, num_routines) | |
| for i := 0; i < num_routines; i++ { | |
| r_tokens <- true | |
| } | |
| // Create map (string -> bool) to check if a url has | |
| // been visited | |
| m := map[string]bool{} | |
| // Create/Connect to database | |
| c, _ := sqlite3.Open("urls.db") | |
| defer c.Close() | |
| // Create a table for our urls | |
| c.Exec("CREATE TABLE urls(url TEXT)") | |
| for i := 0; i < num_urls; { | |
| // Get next url | |
| url := <-next_url | |
| // Check if the url has been visited | |
| _, found := m[url] | |
| // If yes, go to the next url | |
| if found { | |
| continue | |
| } | |
| // If not, add the url to the map, increment | |
| // the counter, insert it into the database | |
| // and start new crawl goroutine with that | |
| // url. | |
| m[url] = true | |
| i++ | |
| c.Exec("INSERT INTO urls VALUES(?) ", url) | |
| go crawl(url, next_url, r_tokens) | |
| } | |
| } |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment