Go Crawler
package main

import (
    "fmt"
    "log"
    "net/http"
    "strconv"
    "strings"
    "sync"
    "time"

    "github.com/PuerkitoBio/goquery"
    "gopkg.in/mgo.v2"
    "gopkg.in/mgo.v2/bson"
)

// perror aborts the whole program on any error.
func perror(err error) {
    if err != nil {
        log.Fatal(err)
    }
}

// ScrapePage downloads a product page, extracts its nutritional information
// and sends the result down infoChannel. Note that perror aborts the whole
// crawl if a single request fails.
func ScrapePage(url string, infoChannel chan InformacaoNutricional) {
    fmt.Printf("Scraping page %s\n", url)
    info := InformacaoNutricional{Propriedades: make(map[string]string)}
    doc, err := goquery.NewDocument(url)
    perror(err)
    fmt.Printf("Page data received: %s\n", url)
    info.Nome = strings.TrimSpace(doc.Find(".product-header > .product-header__heading").Text())
    info.LojaUrl = url
    doc.Find(".product-grid table").Each(func(t_i int, table *goquery.Selection) {
        // The second header row holds the serving size.
        info.Medida = strings.TrimSpace(table.Find("thead > tr").Eq(1).Text())
        // Each body row is a nutrient name/value pair.
        table.Find("tbody > tr").Each(func(tr_i int, row *goquery.Selection) {
            info.Propriedades[strings.TrimSpace(row.Find("td").Eq(0).Text())] = strings.TrimSpace(row.Find("td").Eq(1).Text())
        })
    })
    infoChannel <- info
}

// AddToList receives one scraped result and, if it looks valid, appends it to
// the current batch. The index parameter is unused but kept for the call site.
func AddToList(index int, infoChannel chan InformacaoNutricional) {
    defer wg.Done()
    info := <-infoChannel
    if info.Nome != "" && info.Medida != "" {
        mu.Lock() // infos is shared by many AddToList goroutines
        infos = append(infos, info)
        mu.Unlock()
    }
}

var (
    wg    sync.WaitGroup
    mu    sync.Mutex    // guards infos
    infos []interface{} // current batch of scraped results
)

func main() {
    // Product pages can be slow; allow up to three minutes per response.
    http.DefaultTransport.(*http.Transport).ResponseHeaderTimeout = time.Minute * 3
    mongodbServer := "127.0.0.1:27017"
    fmt.Printf("Connecting to MongoDB on %s\n", mongodbServer)
    session, err := mgo.Dial(mongodbServer)
    perror(err)
    defer session.Close()
    fmt.Printf("Connected to MongoDB on %s\n", mongodbServer)
    alimentosCollection := session.DB("CrawlerAlimentos").C("Alimentos")
    totalPerLoop := 100 // pages per batch
    max := 10000000     // highest product id to try
    infoChannel := make(chan InformacaoNutricional)
    for n := 0; n < (max / totalPerLoop); n++ {
        start := (n * totalPerLoop) + 1
        end := (n + 1) * totalPerLoop
        infos = make([]interface{}, 0)
        for i := start; i <= end; i++ {
            url := "http://www.paodeacucar.com.br/produto/" + strconv.Itoa(i)
            var info InformacaoNutricional
            // mgo lowercases struct field names when marshalling, so the
            // stored key is "lojaurl". A not-found error is deliberately
            // ignored: a zero-value info means the page was never crawled.
            alimentosCollection.Find(bson.M{"lojaurl": url}).One(&info)
            if info.Nome == "" {
                // Add must run before the goroutines start, otherwise
                // wg.Wait below could return before they are counted.
                wg.Add(1)
                go ScrapePage(url, infoChannel)
                go AddToList(i, infoChannel)
            } else {
                fmt.Printf("Skipping page: %s\n", url)
            }
            time.Sleep(150 * time.Millisecond) // throttle requests
        }
        wg.Wait()
        if len(infos) > 0 {
            fmt.Println("Writing batch to db")
            perror(alimentosCollection.Insert(infos...))
        }
        time.Sleep(1 * time.Second)
    }
    fmt.Println("Done!")
}

// InformacaoNutricional ("nutritional information") is one product record.
type InformacaoNutricional struct {
    LojaUrl      string            // store page URL
    Nome         string            // product name
    Medida       string            // serving size
    Propriedades map[string]string // nutrient name -> value
}
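
For reference, a minimal sketch of reading the scraped records back out of MongoDB, assuming the same defaults as above (a local mongod on 127.0.0.1:27017 and the CrawlerAlimentos/Alimentos collection); the limit of 10 is an arbitrary choice for the example:

package main

import (
    "fmt"
    "log"

    "gopkg.in/mgo.v2"
    "gopkg.in/mgo.v2/bson"
)

type InformacaoNutricional struct {
    LojaUrl      string
    Nome         string
    Medida       string
    Propriedades map[string]string
}

func main() {
    session, err := mgo.Dial("127.0.0.1:27017")
    if err != nil {
        log.Fatal(err)
    }
    defer session.Close()

    var results []InformacaoNutricional
    // bson.M{} matches every document; Limit keeps the example output short.
    err = session.DB("CrawlerAlimentos").C("Alimentos").Find(bson.M{}).Limit(10).All(&results)
    if err != nil {
        log.Fatal(err)
    }
    for _, info := range results {
        fmt.Printf("%s (%s): %d nutrients\n", info.Nome, info.Medida, len(info.Propriedades))
    }
}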