Skip to content

Instantly share code, notes, and snippets.

@mickelsonm
Last active February 15, 2016 07:51
Show Gist options
  • Save mickelsonm/b34daf128fc9ee8bfb92 to your computer and use it in GitHub Desktop.
Save mickelsonm/b34daf128fc9ee8bfb92 to your computer and use it in GitHub Desktop.
A Go program that scrapes a website for U.S. states and their counties.
package main
import (
"encoding/json"
"fmt"
"sort"
"strings"
"sync"
"github.com/PuerkitoBio/goquery"
)
// numWorkers is the number of goroutines main starts to fetch county pages
// concurrently.
const numWorkers = 10
type (
	// State is a state together with its counties.
	State struct {
		Name     string   `json:"name"`
		Counties Counties `json:"counties"`
	}

	// County is a single county, identified by name.
	County struct {
		Name string `json:"name"`
	}

	// States is a list of State values that sorts by Name via sort.Interface.
	States []State

	// Counties is a list of County values that sorts by Name via sort.Interface.
	Counties []County
)

// Len returns the number of states in the list.
func (list States) Len() int {
	return len(list)
}

// Swap exchanges the states at positions i and j.
func (list States) Swap(i, j int) {
	tmp := list[i]
	list[i] = list[j]
	list[j] = tmp
}

// Less reports whether the state at i has a name ordered before the one at j.
func (list States) Less(i, j int) bool {
	return list[j].Name > list[i].Name
}

// Len returns the number of counties in the list.
func (list Counties) Len() int {
	return len(list)
}

// Swap exchanges the counties at positions i and j.
func (list Counties) Swap(i, j int) {
	tmp := list[i]
	list[i] = list[j]
	list[j] = tmp
}

// Less reports whether the county at i has a name ordered before the one at j.
func (list Counties) Less(i, j int) bool {
	return list[j].Name > list[i].Name
}
// fetchCounties downloads the zipcodestogo.com page for stateName and returns
// the distinct county names found in the third cell of its table rows, in the
// order they are first seen. A non-nil error means the page could not be
// loaded; the returned Counties is nil in that case.
func fetchCounties(stateName string) (Counties, error) {
	pageURL := fmt.Sprintf("http://www.zipcodestogo.com/%s/", stateName)
	page, err := goquery.NewDocument(pageURL)
	if err != nil {
		return nil, err
	}

	var result Counties
	seen := make(map[string]uint)
	cells := page.Find("#container table tbody tr td:nth-child(3)")
	cells.Each(func(_ int, cell *goquery.Selection) {
		name := cell.Text()
		switch {
		case name == "", name == "Zip Code", name == "City", name == "County":
			// Empty cells and header labels are not county names.
			return
		case strings.Contains(name, "KB"):
			// Cells mentioning "KB" are skipped as well.
			return
		}
		// Record each county only the first time it appears.
		if _, dup := seen[name]; dup {
			return
		}
		seen[name] = 1
		result = append(result, County{Name: name})
	})
	return result, nil
}
func main() {
var states States
var wg sync.WaitGroup
fetchChan := make(chan string)
stateChan := make(chan State)
doc, err := goquery.NewDocument("http://www.zipcodestogo.com/ZIP-Codes-by-State.htm")
if err != nil {
return
}
for i := 0; i < numWorkers; i++ {
wg.Add(1)
go func() {
for name := range fetchChan {
state := State{
Name: name,
}
counties, err := fetchCounties(state.Name)
if err != nil {
fmt.Println("err fetching state:", state.Name)
return
}
state.Counties = counties
sort.Sort(state.Counties)
stateChan <- state
}
wg.Done()
}()
}
go func() {
doc.Find("#page #content ul.listStates li a").Each(func(i int, s *goquery.Selection) {
name, exists := s.Attr("title")
if !exists {
//as an alternative let's look at the href
tmp, _ := s.Attr("href")
//why they forgot to have a title for Idaho is perplexing...
if strings.Contains(tmp, "Idaho") {
name = "Idaho"
}
}
name = strings.Replace(name, " Zip Codes", "", -1)
if strings.Contains("$", name) {
return
}
fetchChan <- name
})
close(fetchChan)
}()
go func() {
wg.Wait()
close(stateChan)
}()
for s := range stateChan {
states = append(states, s)
}
sort.Sort(states)
//js, err := json.MarshalIndent(states, "", " ")
js, err := json.Marshal(states)
if err != nil {
return
}
fmt.Println(string(js))
}
package main
import (
"encoding/json"
"fmt"
"sort"
"strings"
"github.com/PuerkitoBio/goquery"
)
// main scrapes the state index page of zipcodestogo.com, fetches each state's
// counties one state at a time, and prints all states (sorted by name, each
// with its counties sorted by name) to stdout as compact JSON.
func main() {
	doc, err := goquery.NewDocument("http://www.zipcodestogo.com/ZIP-Codes-by-State.htm")
	if err != nil {
		fmt.Println("err fetching state list:", err)
		return
	}

	// processCounties loads the page of state s and returns the distinct
	// county names from the third cell of its table rows, in first-seen order.
	processCounties := func(s State) (Counties, error) {
		subdoc, err := goquery.NewDocument(
			fmt.Sprintf("http://www.zipcodestogo.com/%s/", s.Name))
		if err != nil {
			return nil, err
		}
		var counties Counties
		seen := make(map[string]struct{})
		subdoc.Find("#container table tbody tr td:nth-child(3)").Each(func(i int, cell *goquery.Selection) {
			txt := cell.Text()
			// Skip empty cells, header labels and cells mentioning "KB".
			if txt == "" || strings.Contains(txt, "KB") || txt == "Zip Code" || txt == "City" || txt == "County" {
				return
			}
			if _, dup := seen[txt]; dup {
				return
			}
			seen[txt] = struct{}{}
			counties = append(counties, County{Name: txt})
		})
		return counties, nil
	}

	var states States
	doc.Find("#page #content ul.listStates li a").Each(func(i int, s *goquery.Selection) {
		name, exists := s.Attr("title")
		if !exists {
			// The Idaho link has no title attribute, so recognise it by its
			// href. A missing href yields "" and simply does not match.
			if href, _ := s.Attr("href"); strings.Contains(href, "Idaho") {
				name = "Idaho"
			}
		}
		name = strings.Replace(name, " Zip Codes", "", -1)
		// Skip links that produced no name and names containing "$".
		if name == "" || strings.Contains(name, "$") {
			return
		}
		state := State{Name: name}
		counties, err := processCounties(state)
		if err != nil {
			// Report and skip the state instead of emitting it without counties.
			fmt.Println("err fetching state:", name, err)
			return
		}
		state.Counties = counties
		sort.Sort(state.Counties)
		states = append(states, state)
	})
	sort.Sort(states)

	// Compact output; use json.MarshalIndent(states, "", " ") for readable JSON.
	js, err := json.Marshal(states)
	if err != nil {
		fmt.Println("err encoding states:", err)
		return
	}
	fmt.Println(string(js))
}
// State is a state together with its counties.
type State struct {
	Name     string   `json:"name"`
	Counties Counties `json:"counties"`
}

// States is a list of State values that sorts by Name via sort.Interface.
type States []State

// Len returns how many states the list holds.
func (states States) Len() int {
	n := len(states)
	return n
}

// Swap exchanges the states at positions i and j.
func (states States) Swap(i, j int) {
	states[j], states[i] = states[i], states[j]
}

// Less reports whether the name of state i sorts before the name of state j.
func (states States) Less(i, j int) bool {
	a, b := states[i].Name, states[j].Name
	return a < b
}

// County is a single county, identified by name.
type County struct {
	Name string `json:"name"`
}

// Counties is a list of County values that sorts by Name via sort.Interface.
type Counties []County

// Len returns how many counties the list holds.
func (counties Counties) Len() int {
	n := len(counties)
	return n
}

// Swap exchanges the counties at positions i and j.
func (counties Counties) Swap(i, j int) {
	counties[j], counties[i] = counties[i], counties[j]
}

// Less reports whether the name of county i sorts before the name of county j.
func (counties Counties) Less(i, j int) bool {
	a, b := counties[i].Name, counties[j].Name
	return a < b
}
package main
import (
"encoding/json"
"fmt"
"sort"
"strings"
"sync"
"github.com/PuerkitoBio/goquery"
)
// numWorkers is the number of goroutines main starts to fetch county pages
// concurrently.
const numWorkers = 10
func main() {
var states States
var wg sync.WaitGroup
fetchChan := make(chan string)
stateChan := make(chan State)
doc, err := goquery.NewDocument("http://www.zipcodestogo.com/ZIP-Codes-by-State.htm")
if err != nil {
return
}
for i := 0; i < numWorkers; i++ {
wg.Add(1)
go func() {
for name := range fetchChan {
state := State{
Name: name,
Abbreviation: getAbbreviation(name),
}
counties, err := fetchCounties(state.Name)
if err != nil {
fmt.Println("err fetching state:", state.Name)
return
}
state.Counties = counties
sort.Sort(state.Counties)
stateChan <- state
}
wg.Done()
}()
}
go func() {
doc.Find("#page #content ul.listStates li a").Each(func(i int, s *goquery.Selection) {
name, exists := s.Attr("title")
if !exists {
//as an alternative let's look at the href
tmp, _ := s.Attr("href")
//why they forgot to have a title for Idaho is perplexing...
if strings.Contains(tmp, "Idaho") {
name = "Idaho"
}
}
name = strings.Replace(name, " Zip Codes", "", -1)
if strings.Contains("$", name) {
return
}
fetchChan <- name
})
close(fetchChan)
}()
go func() {
wg.Wait()
close(stateChan)
}()
for s := range stateChan {
states = append(states, s)
}
sort.Sort(states)
js, err := json.MarshalIndent(states, "", " ")
//js, err := json.Marshal(states)
if err != nil {
return
}
fmt.Println(string(js))
}
// fetchCounties downloads the zipcodestogo.com page for stateName and groups
// the zip codes listed there by county and, within each county, by city.
//
// The returned counties are in no particular order (callers sort them); the
// cities of each county are sorted by name and each city's zip codes keep the
// order in which they appear on the page. A non-nil error means the page could
// not be loaded; the returned Counties is nil in that case.
func fetchCounties(stateName string) (Counties, error) {
	doc, err := goquery.NewDocument(
		fmt.Sprintf("http://www.zipcodestogo.com/%s/", stateName))
	if err != nil {
		return nil, err
	}

	// zipsByCounty[county][city] collects the zip codes of one city. Cities
	// are tracked per county so that a city name occurring in several counties
	// never mixes its zip codes between them.
	zipsByCounty := make(map[string]map[string][]string)

	// The selected cells are visited one by one; indexColumn tracks which
	// column of the current row a cell belongs to (0 zip, 1 city, 2 county).
	var indexColumn int
	var zipCode, cityName, countyName string
	doc.Find("#container #leftCol table tbody tr td table tbody tr:nth-child(n+3) td").Each(func(i int, s *goquery.Selection) {
		txt := s.Text()
		// A "View Map" cell resets the state so the next cell starts a new row.
		if txt == "View Map" {
			zipCode, cityName, countyName = "", "", ""
			indexColumn = 0
			return
		}
		switch indexColumn {
		case 0:
			zipCode = txt
		case 1:
			cityName = txt
		case 2:
			countyName = txt
		}
		if zipCode != "" && cityName != "" && countyName != "" {
			cities, ok := zipsByCounty[countyName]
			if !ok {
				cities = make(map[string][]string)
				zipsByCounty[countyName] = cities
			}
			cities[cityName] = append(cities[cityName], zipCode)
			indexColumn = 0
		}
		indexColumn++
	})

	// Left nil when nothing was found, so an empty result still encodes as
	// JSON null.
	var counties Counties
	for name, cities := range zipsByCounty {
		county := County{Name: name, Cities: make(Cities, 0, len(cities))}
		for city, zips := range cities {
			county.Cities = append(county.Cities, City{Name: city, ZipCodes: zips})
		}
		sort.Sort(county.Cities)
		counties = append(counties, county)
	}
	return counties, nil
}
// stateAbbreviations maps lower-cased state and territory names to their
// two-letter abbreviations.
var stateAbbreviations = map[string]string{
	"alabama":                  "AL",
	"alaska":                   "AK",
	"arizona":                  "AZ",
	"arkansas":                 "AR",
	"california":               "CA",
	"colorado":                 "CO",
	"connecticut":              "CT",
	"delaware":                 "DE",
	"florida":                  "FL",
	"georgia":                  "GA",
	"hawaii":                   "HI",
	"idaho":                    "ID",
	"illinois":                 "IL",
	"indiana":                  "IN",
	"iowa":                     "IA",
	"kansas":                   "KS",
	"kentucky":                 "KY",
	"louisiana":                "LA",
	"maine":                    "ME",
	"maryland":                 "MD",
	"massachusetts":            "MA",
	"michigan":                 "MI",
	"minnesota":                "MN",
	"mississippi":              "MS",
	"missouri":                 "MO",
	"montana":                  "MT",
	"nebraska":                 "NE",
	"nevada":                   "NV",
	"new hampshire":            "NH",
	"new jersey":               "NJ",
	"new mexico":               "NM",
	"new york":                 "NY",
	"north carolina":           "NC",
	"north dakota":             "ND",
	"ohio":                     "OH",
	"oklahoma":                 "OK",
	"oregon":                   "OR",
	"pennsylvania":             "PA",
	"rhode island":             "RI",
	"south carolina":           "SC",
	"south dakota":             "SD",
	"tennessee":                "TN",
	"texas":                    "TX",
	"utah":                     "UT",
	"vermont":                  "VT",
	"virginia":                 "VA",
	"washington":               "WA",
	"west virginia":            "WV",
	"wisconsin":                "WI",
	"wyoming":                  "WY",
	"american samoa":           "AS",
	"district of columbia":     "DC",
	"guam":                     "GU",
	"northern mariana islands": "MP",
	"puerto rico":              "PR",
	"virgin islands":           "VI",
}

// getAbbreviation returns the two-letter abbreviation for the state or
// territory called name, compared case-insensitively. It returns "" when the
// name is not in the table.
func getAbbreviation(name string) string {
	key := strings.ToLower(name)
	abbr, found := stateAbbreviations[key]
	if !found {
		return ""
	}
	return abbr
}
type (
	// State is a state with its abbreviation and its counties.
	State struct {
		Name         string   `json:"name"`
		Abbreviation string   `json:"abbreviation"`
		Counties     Counties `json:"counties"`
	}

	// County is a county together with its cities.
	County struct {
		Name   string `json:"name"`
		Cities Cities `json:"cities"`
	}

	// City is a city together with its zip codes.
	City struct {
		Name     string   `json:"name"`
		ZipCodes []string `json:"zipcodes"`
	}

	// States is a list of State values that sorts by Name via sort.Interface.
	States []State

	// Counties is a list of County values that sorts by Name via sort.Interface.
	Counties []County

	// Cities is a list of City values that sorts by Name via sort.Interface.
	Cities []City
)

// Len returns the number of states in the list.
func (list States) Len() int {
	return len(list)
}

// Swap exchanges the states at positions i and j.
func (list States) Swap(i, j int) {
	tmp := list[i]
	list[i] = list[j]
	list[j] = tmp
}

// Less reports whether the state at i has a name ordered before the one at j.
func (list States) Less(i, j int) bool {
	return list[j].Name > list[i].Name
}

// Len returns the number of counties in the list.
func (list Counties) Len() int {
	return len(list)
}

// Swap exchanges the counties at positions i and j.
func (list Counties) Swap(i, j int) {
	tmp := list[i]
	list[i] = list[j]
	list[j] = tmp
}

// Less reports whether the county at i has a name ordered before the one at j.
func (list Counties) Less(i, j int) bool {
	return list[j].Name > list[i].Name
}

// Len returns the number of cities in the list.
func (list Cities) Len() int {
	return len(list)
}

// Swap exchanges the cities at positions i and j.
func (list Cities) Swap(i, j int) {
	tmp := list[i]
	list[i] = list[j]
	list[j] = tmp
}

// Less reports whether the city at i has a name ordered before the one at j.
func (list Cities) Less(i, j int) bool {
	return list[j].Name > list[i].Name
}
@mickelsonm
Copy link
Author

active-states-counties-json.go - This version finishes the task in ~3 seconds by using goroutines and channels.

old-states-counties-json.go - This version fetches every state sequentially; it gets the job done in ~16 seconds.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment