Last active
February 15, 2016 07:51
-
-
Save mickelsonm/b34daf128fc9ee8bfb92 to your computer and use it in GitHub Desktop.
A Go program that scrapes a website for U.S. states and their counties.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
package main | |
import ( | |
"encoding/json" | |
"fmt" | |
"sort" | |
"strings" | |
"sync" | |
"github.com/PuerkitoBio/goquery" | |
) | |
// numWorkers is the number of goroutines main starts to fetch county pages
// concurrently.
const numWorkers = 10
// State is one scraped state together with its counties.
type State struct {
	Name     string   `json:"name"`
	Counties Counties `json:"counties"`
}

// County is a single county, identified by its name.
type County struct {
	Name string `json:"name"`
}

// States is a list of State values; it implements sort.Interface and orders
// its elements by Name.
type States []State

// Counties is a list of County values; it implements sort.Interface and
// orders its elements by Name.
type Counties []County

// Len returns the number of states in the list.
func (ss States) Len() int {
	return len(ss)
}

// Swap exchanges the states at positions i and j.
func (ss States) Swap(i, j int) {
	ss[j], ss[i] = ss[i], ss[j]
}

// Less reports whether the state at i sorts before the state at j by name.
func (ss States) Less(i, j int) bool {
	left, right := ss[i].Name, ss[j].Name
	return left < right
}

// Len returns the number of counties in the list.
func (cs Counties) Len() int {
	return len(cs)
}

// Swap exchanges the counties at positions i and j.
func (cs Counties) Swap(i, j int) {
	cs[j], cs[i] = cs[i], cs[j]
}

// Less reports whether the county at i sorts before the county at j by name.
func (cs Counties) Less(i, j int) bool {
	left, right := cs[i].Name, cs[j].Name
	return left < right
}
func fetchCounties(stateName string) (Counties, error) { | |
var counties Counties | |
subdoc, err := goquery.NewDocument( | |
fmt.Sprintf("http://www.zipcodestogo.com/%s/", stateName)) | |
if err != nil { | |
return counties, err | |
} | |
var county County | |
countyMap := make(map[string]uint) | |
subdoc.Find("#container table tbody tr td:nth-child(3)").Each(func(i int, s *goquery.Selection) { | |
txt := s.Text() | |
if txt == "" || strings.Contains(txt, "KB") || txt == "Zip Code" || txt == "City" || txt == "County" { | |
return | |
} | |
if _, exists := countyMap[txt]; !exists { | |
county = County{ | |
Name: txt, | |
} | |
counties = append(counties, county) | |
countyMap[txt] = 1 | |
} | |
}) | |
return counties, nil | |
} | |
func main() { | |
var states States | |
var wg sync.WaitGroup | |
fetchChan := make(chan string) | |
stateChan := make(chan State) | |
doc, err := goquery.NewDocument("http://www.zipcodestogo.com/ZIP-Codes-by-State.htm") | |
if err != nil { | |
return | |
} | |
for i := 0; i < numWorkers; i++ { | |
wg.Add(1) | |
go func() { | |
for name := range fetchChan { | |
state := State{ | |
Name: name, | |
} | |
counties, err := fetchCounties(state.Name) | |
if err != nil { | |
fmt.Println("err fetching state:", state.Name) | |
return | |
} | |
state.Counties = counties | |
sort.Sort(state.Counties) | |
stateChan <- state | |
} | |
wg.Done() | |
}() | |
} | |
go func() { | |
doc.Find("#page #content ul.listStates li a").Each(func(i int, s *goquery.Selection) { | |
name, exists := s.Attr("title") | |
if !exists { | |
//as an alternative let's look at the href | |
tmp, _ := s.Attr("href") | |
//why they forgot to have a title for Idaho is perplexing... | |
if strings.Contains(tmp, "Idaho") { | |
name = "Idaho" | |
} | |
} | |
name = strings.Replace(name, " Zip Codes", "", -1) | |
if strings.Contains("$", name) { | |
return | |
} | |
fetchChan <- name | |
}) | |
close(fetchChan) | |
}() | |
go func() { | |
wg.Wait() | |
close(stateChan) | |
}() | |
for s := range stateChan { | |
states = append(states, s) | |
} | |
sort.Sort(states) | |
//js, err := json.MarshalIndent(states, "", " ") | |
js, err := json.Marshal(states) | |
if err != nil { | |
return | |
} | |
fmt.Println(string(js)) | |
} |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
package main | |
import ( | |
"encoding/json" | |
"fmt" | |
"sort" | |
"strings" | |
"github.com/PuerkitoBio/goquery" | |
) | |
func main() { | |
doc, err := goquery.NewDocument("http://www.zipcodestogo.com/ZIP-Codes-by-State.htm") | |
if err != nil { | |
return | |
} | |
processCounties := func(s State) (Counties, error) { | |
var counties Counties | |
subdoc, err := goquery.NewDocument( | |
fmt.Sprintf("http://www.zipcodestogo.com/%s/", s.Name)) | |
if err != nil { | |
return counties, err | |
} | |
var county County | |
countyMap := make(map[string]uint) | |
subdoc.Find("#container table tbody tr td:nth-child(3)").Each(func(i int, s *goquery.Selection) { | |
txt := s.Text() | |
if txt == "" || strings.Contains(txt, "KB") || txt == "Zip Code" || txt == "City" || txt == "County" { | |
return | |
} | |
if _, exists := countyMap[txt]; !exists { | |
county = County{ | |
Name: txt, | |
} | |
counties = append(counties, county) | |
countyMap[txt] = 1 | |
} | |
}) | |
return counties, nil | |
} | |
var states States | |
var state State | |
doc.Find("#page #content ul.listStates li a").Each(func(i int, s *goquery.Selection) { | |
name, _ := s.Attr("title") | |
name = strings.Replace(name, " Zip Codes", "", -1) | |
if strings.Contains("$", name) { | |
return | |
} | |
state = State{ | |
Name: name, | |
} | |
state.Counties, _ = processCounties(state) | |
sort.Sort(state.Counties) | |
states = append(states, state) | |
}) | |
sort.Sort(states) | |
//js, err := json.MarshalIndent(states, "", " ") | |
js, err := json.Marshal(states) | |
if err != nil { | |
return | |
} | |
fmt.Println(string(js)) | |
} | |
// States is a list of State values; it implements sort.Interface and orders
// its elements by Name.
type States []State

// State is one scraped state together with its counties.
type State struct {
	Name     string   `json:"name"`
	Counties Counties `json:"counties"`
}

// Counties is a list of County values; it implements sort.Interface and
// orders its elements by Name.
type Counties []County

// County is a single county, identified by its name.
type County struct {
	Name string `json:"name"`
}

// Len returns the number of states in the list.
func (ss States) Len() int {
	return len(ss)
}

// Swap exchanges the states at positions i and j.
func (ss States) Swap(i, j int) {
	ss[j], ss[i] = ss[i], ss[j]
}

// Less reports whether the state at i sorts before the state at j by name.
func (ss States) Less(i, j int) bool {
	left, right := ss[i].Name, ss[j].Name
	return left < right
}

// Len returns the number of counties in the list.
func (cs Counties) Len() int {
	return len(cs)
}

// Swap exchanges the counties at positions i and j.
func (cs Counties) Swap(i, j int) {
	cs[j], cs[i] = cs[i], cs[j]
}

// Less reports whether the county at i sorts before the county at j by name.
func (cs Counties) Less(i, j int) bool {
	left, right := cs[i].Name, cs[j].Name
	return left < right
}
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
package main | |
import ( | |
"encoding/json" | |
"fmt" | |
"sort" | |
"strings" | |
"sync" | |
"github.com/PuerkitoBio/goquery" | |
) | |
// numWorkers is the number of goroutines main starts to fetch county pages
// concurrently.
const numWorkers = 10
func main() { | |
var states States | |
var wg sync.WaitGroup | |
fetchChan := make(chan string) | |
stateChan := make(chan State) | |
doc, err := goquery.NewDocument("http://www.zipcodestogo.com/ZIP-Codes-by-State.htm") | |
if err != nil { | |
return | |
} | |
for i := 0; i < numWorkers; i++ { | |
wg.Add(1) | |
go func() { | |
for name := range fetchChan { | |
state := State{ | |
Name: name, | |
Abbreviation: getAbbreviation(name), | |
} | |
counties, err := fetchCounties(state.Name) | |
if err != nil { | |
fmt.Println("err fetching state:", state.Name) | |
return | |
} | |
state.Counties = counties | |
sort.Sort(state.Counties) | |
stateChan <- state | |
} | |
wg.Done() | |
}() | |
} | |
go func() { | |
doc.Find("#page #content ul.listStates li a").Each(func(i int, s *goquery.Selection) { | |
name, exists := s.Attr("title") | |
if !exists { | |
//as an alternative let's look at the href | |
tmp, _ := s.Attr("href") | |
//why they forgot to have a title for Idaho is perplexing... | |
if strings.Contains(tmp, "Idaho") { | |
name = "Idaho" | |
} | |
} | |
name = strings.Replace(name, " Zip Codes", "", -1) | |
if strings.Contains("$", name) { | |
return | |
} | |
fetchChan <- name | |
}) | |
close(fetchChan) | |
}() | |
go func() { | |
wg.Wait() | |
close(stateChan) | |
}() | |
for s := range stateChan { | |
states = append(states, s) | |
} | |
sort.Sort(states) | |
js, err := json.MarshalIndent(states, "", " ") | |
//js, err := json.Marshal(states) | |
if err != nil { | |
return | |
} | |
fmt.Println(string(js)) | |
} | |
func fetchCounties(stateName string) (Counties, error) { | |
var counties Counties | |
doc, err := goquery.NewDocument( | |
fmt.Sprintf("http://www.zipcodestogo.com/%s/", stateName)) | |
if err != nil { | |
return counties, err | |
} | |
var indexColumn int | |
var zipCode, cityName, countyName string | |
countyMap := make(map[string]County) | |
cityMap := make(map[string]City) | |
doc.Find("#container #leftCol table tbody tr td table tbody tr:nth-child(n+3) td").Each(func(i int, s *goquery.Selection) { | |
txt := s.Text() | |
if txt == "View Map" { | |
zipCode = "" | |
cityName = "" | |
countyName = "" | |
indexColumn = 0 | |
return | |
} | |
switch indexColumn { | |
case 0: | |
zipCode = txt | |
case 1: | |
cityName = txt | |
case 2: | |
countyName = txt | |
} | |
if zipCode != "" && cityName != "" && countyName != "" { | |
if county, countyExists := countyMap[countyName]; !countyExists { | |
count := County{ | |
Name: countyName, | |
} | |
cit := City{ | |
Name: cityName, | |
ZipCodes: []string{zipCode}, | |
} | |
count.Cities = append(count.Cities, cit) | |
cityMap[cityName] = cit | |
countyMap[countyName] = count | |
} else { | |
if city, cityExists := cityMap[cityName]; !cityExists { | |
cit := City{ | |
Name: cityName, | |
ZipCodes: []string{zipCode}, | |
} | |
county.Cities = append(county.Cities, cit) | |
cityMap[cityName] = cit | |
countyMap[countyName] = county | |
} else { | |
city.ZipCodes = append(city.ZipCodes, zipCode) | |
cityMap[cityName] = city | |
for i, c := range county.Cities { | |
if c.Name == cityName { | |
county.Cities[i] = city | |
break | |
} | |
} | |
countyMap[countyName] = county | |
} | |
} | |
indexColumn = 0 | |
} | |
indexColumn++ | |
}) | |
for _, county := range countyMap { | |
sort.Sort(county.Cities) | |
counties = append(counties, county) | |
} | |
return counties, nil | |
} | |
// stateAbbreviations maps lower-cased state, district and territory names to
// their two-letter abbreviations.
var stateAbbreviations = map[string]string{
	"alabama":                  "AL",
	"alaska":                   "AK",
	"arizona":                  "AZ",
	"arkansas":                 "AR",
	"california":               "CA",
	"colorado":                 "CO",
	"connecticut":              "CT",
	"delaware":                 "DE",
	"florida":                  "FL",
	"georgia":                  "GA",
	"hawaii":                   "HI",
	"idaho":                    "ID",
	"illinois":                 "IL",
	"indiana":                  "IN",
	"iowa":                     "IA",
	"kansas":                   "KS",
	"kentucky":                 "KY",
	"louisiana":                "LA",
	"maine":                    "ME",
	"maryland":                 "MD",
	"massachusetts":            "MA",
	"michigan":                 "MI",
	"minnesota":                "MN",
	"mississippi":              "MS",
	"missouri":                 "MO",
	"montana":                  "MT",
	"nebraska":                 "NE",
	"nevada":                   "NV",
	"new hampshire":            "NH",
	"new jersey":               "NJ",
	"new mexico":               "NM",
	"new york":                 "NY",
	"north carolina":           "NC",
	"north dakota":             "ND",
	"ohio":                     "OH",
	"oklahoma":                 "OK",
	"oregon":                   "OR",
	"pennsylvania":             "PA",
	"rhode island":             "RI",
	"south carolina":           "SC",
	"south dakota":             "SD",
	"tennessee":                "TN",
	"texas":                    "TX",
	"utah":                     "UT",
	"vermont":                  "VT",
	"virginia":                 "VA",
	"washington":               "WA",
	"west virginia":            "WV",
	"wisconsin":                "WI",
	"wyoming":                  "WY",
	"american samoa":           "AS",
	"district of columbia":     "DC",
	"guam":                     "GU",
	"northern mariana islands": "MP",
	"puerto rico":              "PR",
	"virgin islands":           "VI",
}

// getAbbreviation returns the two-letter abbreviation for the given state,
// district or territory name. The name is lower-cased before the lookup, so
// matching ignores case; a name that is not in the table yields "".
func getAbbreviation(name string) string {
	return stateAbbreviations[strings.ToLower(name)]
}
// State is one scraped state with its abbreviation and counties.
type State struct {
	Name         string   `json:"name"`
	Abbreviation string   `json:"abbreviation"`
	Counties     Counties `json:"counties"`
}

// County is a single county together with the cities listed for it.
type County struct {
	Name   string `json:"name"`
	Cities Cities `json:"cities"`
}

// City is a single city together with its zip codes.
type City struct {
	Name     string   `json:"name"`
	ZipCodes []string `json:"zipcodes"`
}

// States is a list of State values; it implements sort.Interface and orders
// its elements by Name.
type States []State

// Counties is a list of County values; it implements sort.Interface and
// orders its elements by Name.
type Counties []County

// Cities is a list of City values; it implements sort.Interface and orders
// its elements by Name.
type Cities []City

// Len returns the number of states in the list.
func (ss States) Len() int {
	return len(ss)
}

// Swap exchanges the states at positions i and j.
func (ss States) Swap(i, j int) {
	ss[j], ss[i] = ss[i], ss[j]
}

// Less reports whether the state at i sorts before the state at j by name.
func (ss States) Less(i, j int) bool {
	left, right := ss[i].Name, ss[j].Name
	return left < right
}

// Len returns the number of counties in the list.
func (cs Counties) Len() int {
	return len(cs)
}

// Swap exchanges the counties at positions i and j.
func (cs Counties) Swap(i, j int) {
	cs[j], cs[i] = cs[i], cs[j]
}

// Less reports whether the county at i sorts before the county at j by name.
func (cs Counties) Less(i, j int) bool {
	left, right := cs[i].Name, cs[j].Name
	return left < right
}

// Len returns the number of cities in the list.
func (ct Cities) Len() int {
	return len(ct)
}

// Swap exchanges the cities at positions i and j.
func (ct Cities) Swap(i, j int) {
	ct[j], ct[i] = ct[i], ct[j]
}

// Less reports whether the city at i sorts before the city at j by name.
func (ct Cities) Less(i, j int) bool {
	left, right := ct[i].Name, ct[j].Name
	return left < right
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
active-states-counties-json.go - This version completes the task in ~3 seconds by using goroutines and channels.
old-states-counties-json.go - This version runs fully synchronously and takes ~16 seconds to do the same job.