Last active
October 7, 2017 14:13
-
-
Save detorto/ccc8884a1245c726d48b01d22d0c8d19 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
package main | |
import ( | |
"fmt" | |
"net/url" | |
"github.com/PuerkitoBio/goquery" | |
"time" | |
"strings" | |
"strconv" | |
"math/rand" | |
"net/http" | |
"gopkg.in/headzoo/surf.v1" | |
"github.com/headzoo/surf/browser" | |
"github.com/headzoo/surf/agent" | |
"html/template" | |
"gopkg.in/mgo.v2" | |
"gopkg.in/mgo.v2/bson" | |
"log" | |
) | |
// db is the process-wide MongoDB session, initialized in main and shared
// by the HTTP handlers (the scraper goroutines dial their own sessions).
var db *mgo.Session
// Mention is one scraped news result for a submitted query, persisted in
// the "mentions" collection.
type Mention struct {
	Query   string    // the search query text that produced this mention
	Time    time.Time // publication time parsed from Timeraw (best effort)
	Timeraw string    // raw date string exactly as scraped from the page
	Brief   string    // result snippet/summary text
	Link    string    // URL of the news article
	Name    string    // article title
	Soruce  string    // publishing outlet; [sic] misspelling kept — renaming would change the stored bson field name
	Aggr    string    // aggregator the mention came from: "Google" or "Yandex"
	ID      bson.ObjectId `bson:"_id,omitempty"`
}
// Query is one user-submitted search query together with its scraping
// progress, persisted in the "queries" collection.
type Query struct {
	SumbitTime    time.Time // submission time; [sic] misspelling kept — renaming would change the stored bson field name
	Text          string    // the query text
	Status        string    // human-readable progress string shown in the UI
	MentionsCount int       // running total of mentions stored for this query
	LastMention   time.Time // initialized to submission time; not updated in the visible code — TODO confirm intent
	Yparsed, Gparsed int    // number of Yandex / Google result pages already scraped
	ID            bson.ObjectId `bson:"_id,omitempty"`
}
func getURL(bow *browser.Browser, url string, useragent string) (*goquery.Selection) { | |
//var doc *goquery.Document | |
err := bow.Open(url) | |
if err != nil { | |
panic(err) | |
} | |
return bow.Dom() | |
} | |
func randomUserAgent() string { | |
ua := []string{ "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36", | |
"Mozilla/5.0 (Windows NT 6.3; WOW64; rv:53.0) Gecko/20100101 Firefox/53.0", | |
"Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36", | |
"Mozilla/5.0 (Windows NT 10.0; WOW64; rv:53.0) Gecko/20100101 Firefox/53.0", | |
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.96 Safari/537.36", | |
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_4) AppleWebKit/603.1.30 (KHTML, like Gecko) Version/10.1 Safari/603.1.30", | |
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.133 Safari/537.36", | |
} | |
return ua[rand.Intn(len(ua))] | |
} | |
// randomSleepMS returns a random delay in milliseconds, in 100 ms steps,
// between 500 and 5900 inclusive. Used to pace scraper requests so they
// look less like a bot.
func randomSleepMS() int {
	const stepMS = 100
	steps := 5 + rand.Intn(55) // 5..59 hundred-millisecond steps
	return steps * stepMS
}
// ScrapeGoogleNewsForDates fetches one results page of Google News for the
// given query (10 results per page, selected via the "start" parameter)
// and converts every result into a Mention. The boolean return is true
// when Google redirected to its "sorry" page, i.e. the scraper is blocked.
func ScrapeGoogleNewsForDates(bow *browser.Browser,query string, page int) ([]Mention, bool) {
	// parseTime interprets Google's result dates: an absolute "Jan 2, 2006"
	// form, or a relative form beginning with a digit (e.g. "2 hours ago")
	// which is approximated by the current time. On a parse failure the
	// error is printed and the zero time.Time is returned.
	parseTime := func(tm string) time.Time {
		day := strings.Split(tm," ")[0]
		if _, err := strconv.Atoi(day); err == nil {
			return time.Now()
		}
		lay := "Jan 2, 2006"
		t, err := time.Parse(lay, strings.Trim(tm," "))
		if err != nil {
			fmt.Println(err)
		}
		return t
	}
	var mentions [] Mention
	// Build https://www.google.com/search?hl=en&q=<query>&tbm=nws&start=<10*page>
	// (tbm=nws restricts the search to the News vertical).
	Url, _ := url.Parse("https://www.google.com/search")
	parameters := url.Values{"hl":{"en"},"q":{query},"tbm":{"nws"},"start":{strconv.Itoa(10*page)}}
	Url.RawQuery = parameters.Encode()
	doc := getURL(bow, Url.String(), randomUserAgent())
	// Google redirects rate-limited clients to google.com/sorry (captcha).
	if strings.Contains(bow.Url().String(),"google.com/sorry") {
		return mentions, true
	}
	// Each organic result is wrapped in an element with class "g".
	doc.Find(".g").Each(func(i int, s *goquery.Selection) {
		var m Mention
		nametg := s.Find("a")
		m.Name = nametg.Text()
		link,_ := nametg.Attr("href")//("href")
		m.Link = link
		m.Brief = s.Find(".st").Text()
		// The span holds "<source> - <date>"; everything before the last
		// dash is treated as the source name, the remainder as the date.
		// NOTE(review): a dash inside the source name is folded into src
		// with the separator spaces dropped — confirm acceptable.
		src_time := s.Find("span").Text()
		src_time_sl := strings.Split(src_time,"-")
		src := strings.Join(src_time_sl[0 : len(src_time_sl)-1],"")
		time := src_time_sl[len(src_time_sl)-1]
		m.Timeraw = time
		m.Time = parseTime(time)
		m.Soruce = src
		m.Aggr = "Google"
		m.Query = query
		//fmt.Printf("Aggr: %s\nSource: %s\nName: %s\nTime: %s [%s]\nLink: %s\nBrief: %s\n\n",m.aggr, m.soruce, m.name, m.time, m.timeraw, m.link, m.brief)
		mentions = append(mentions,m)
	})
	return mentions, false
}
func getMonthLong(m string) time.Month { | |
mm := map[string]time.Month {"января":time.January, | |
"февраля":time.February, | |
"марта":time.March, | |
"апреля":time.April, | |
"мая":time.May, | |
"июня":time.June, | |
"июля":time.February, | |
"августа":time.August, | |
"сентября":time.September, | |
"октября":time.October, | |
"ноября":time.November, | |
"декбря":time.December} | |
return mm[strings.ToLower(strings.Trim(m,""))] | |
} | |
func ScrapeYandexNewsForDates(bow *browser.Browser, query string, page int) ([]Mention, bool) { | |
parseTime := func(tm string) time.Time { | |
var day int | |
var month time.Month | |
var year int | |
firstl := strings.Split(tm," ") | |
if len(firstl) <= 1{ | |
return time.Now() | |
} | |
first:= firstl[0] | |
if _ ,err := strconv.Atoi(first); err == nil { | |
d,_ := strconv.Atoi(first) | |
day = d | |
fmt.Println(strings.Split(tm," ")) | |
month = getMonthLong(strings.Split(tm," ")[1]) | |
year = time.Now().Year() | |
fmt.Println("%s %s %s\n",year,month,day) | |
return time.Date(year,month,day,23, 0, 0, 0, time.UTC) | |
} else { | |
if first == "вчера"{ | |
year = time.Now().Year() | |
day = time.Now().Day() | |
month = time.Now().Month() | |
return time.Date(year,month,day-1,23, 0, 0, 0, time.UTC) | |
} | |
layout := "02.01.06" | |
t, err := time.Parse(layout, first) | |
if err != nil { | |
panic(err) | |
} | |
return t | |
} | |
return time.Now() | |
} | |
var mentions [] Mention | |
Url, _ := url.Parse("https://news.yandex.ru/yandsearch") | |
parameters := url.Values{"numdoc":{"30"},"text":{query},"rpt":{"nnews"},"p":{strconv.Itoa(page)}, "rel":{"tm"}} | |
Url.RawQuery = parameters.Encode() | |
doc := getURL(bow, Url.String(), randomUserAgent()) | |
if strings.Contains(bow.Url().String(),"https://news.yandex.ru/showcaptcha") { | |
return mentions, true | |
} | |
doc.Find(".search-item").Each(func(i int, s *goquery.Selection) { | |
var m Mention | |
m.Soruce = s.Find(".document__provider-name").Text() | |
m.Name = s.Find(".document__title").Text() | |
m.Time = parseTime(s.Find(".document__time").Text()) | |
m.Timeraw = s.Find(".document__time").Text() | |
link,_ := s.Find(".link").Attr("href") | |
m.Link = link | |
m.Brief = s.Find(".document__snippet").Text() | |
m.Aggr = "Yandex" | |
m.Query = query | |
mentions = append(mentions,m) | |
}) | |
return mentions,false | |
} | |
// Page is a simple title/body pair for rendering an HTML page.
// NOTE(review): appears unused in the visible portion of this file —
// confirm before removing.
type Page struct {
	Title string
	Body []byte
}
func saveMentions(men []Mention) { | |
session, err := mgo.Dial("localhost") | |
b := session | |
defer session.Close() | |
c := b.DB("db").C("mentions") | |
for _,m := range men{ | |
err = c.Insert(&m) | |
if err != nil { | |
log.Fatal(err) | |
} | |
} | |
} | |
// scrapper_yandex runs forever, repeatedly walking every stored query and
// scraping up to mpage pages of Yandex News results for each. Per-query
// progress lives in the Yparsed counter; scraped mentions are written to
// the "mentions" collection. Started as a goroutine from main.
func scrapper_yandex(){
	session, err := mgo.Dial("localhost")
	b := session
	defer session.Close()
	// Maximum number of result pages to fetch per query.
	mpage := 5
	// updateStatus writes a human-readable progress string onto the query
	// document so the web UI can display it. Panics on update failure.
	updateStatus := func (ide bson.ObjectId , status string) {
		c := b.DB("db").C("queries")
		colQuerier := bson.M{"_id": ide}
		change := bson.M{"$set": bson.M{"status":status}}
		err := c.Update(colQuerier, change)
		if err != nil {
			panic(err)
		}
	}
	// NOTE(review): if Dial failed, this panic fires with session possibly
	// nil, and the deferred Close would then panic again — confirm.
	if err != nil {
		panic(err)
	}
	c := b.DB("db").C("queries")
	bow := surf.NewBrowser()
	bow.SetUserAgent(randomUserAgent())
	//mpage := 5
	for {
		result := Query{}
		// NOTE(review): "SumbitTime" [sic] mirrors the misspelled struct
		// field, but mgo lowercases field names on save, so this sort key
		// likely matches no stored field — verify against the collection.
		iter:=c.Find(nil).Sort("-SumbitTime").Iter()
		for iter.Next(&result) {
			if result.Yparsed == mpage {
				// This query is fully scraped; move on to the next one.
				continue
			} else {
				fmt.Println("Goin scrap YANDEX", result.Text, result.Yparsed)
				updateStatus(result.ID,fmt.Sprintf("Parsing yandex...[%d]",result.Yparsed))
				ymentions,ybanned := ScrapeYandexNewsForDates(bow,result.Text,result.Yparsed)
				if ybanned{
					// Captcha hit: back off for a few minutes without
					// advancing Yparsed, then continue with the next query.
					fmt.Println("Banned in yandex", result.Text, result.Yparsed)
					updateStatus(result.ID,fmt.Sprintf("Baned yandex...[%d]",result.Yparsed))
					time.Sleep(3 * time.Minute)
					continue
				}
				saveMentions(ymentions)
				// Advance the page counter and accumulate the mention count.
				colQuerier := bson.M{"_id": result.ID}
				change := bson.M{"$set": bson.M{"yparsed":result.Yparsed+1,"mentionscount":result.MentionsCount+len(ymentions)}}
				err := c.Update(colQuerier, change)
				if err != nil {
					panic(err)
				}
				// Randomized 1.5–6.9s delay between requests.
				time.Sleep(time.Duration(randomSleepMS()+1000)*time.Millisecond)
			}
		}
	}
}
// scrapper_google runs forever, repeatedly walking every stored query and
// scraping up to mpage pages of Google News results for each. Per-query
// progress lives in the Gparsed counter; scraped mentions are written to
// the "mentions" collection. Started as a goroutine from main.
func scrapper_google(){
	session, err := mgo.Dial("localhost")
	b := session
	defer session.Close()
	// Maximum number of result pages to fetch per query.
	mpage := 5
	// updateStatus writes a human-readable progress string onto the query
	// document so the web UI can display it. Panics on update failure.
	updateStatus := func (ide bson.ObjectId , status string) {
		c := b.DB("db").C("queries")
		colQuerier := bson.M{"_id": ide}
		change := bson.M{"$set": bson.M{"status":status}}
		err := c.Update(colQuerier, change)
		if err != nil {
			panic(err)
		}
	}
	// NOTE(review): if Dial failed, this panic fires with session possibly
	// nil, and the deferred Close would then panic again — confirm.
	if err != nil {
		panic(err)
	}
	c := b.DB("db").C("queries")
	bow := surf.NewBrowser()
	// Unlike the Yandex scraper, this one fixes the browser UA to Safari;
	// the per-request UA is chosen inside ScrapeGoogleNewsForDates.
	bow.SetUserAgent(agent.Safari())
	//mpage := 5
	for {
		result := Query{}
		// NOTE(review): "SumbitTime" [sic] mirrors the misspelled struct
		// field, but mgo lowercases field names on save, so this sort key
		// likely matches no stored field — verify against the collection.
		iter:=c.Find(nil).Sort("-SumbitTime").Iter()
		for iter.Next(&result) {
			if result.Gparsed == mpage {
				// This query is fully scraped; move on to the next one.
				continue
			} else {
				fmt.Println("Goin scrap GOOGLE", result.Text, result.Gparsed)
				updateStatus(result.ID,fmt.Sprintf("Parsing google...[%d]",result.Gparsed))
				gmentions,gbanned := ScrapeGoogleNewsForDates(bow,result.Text,result.Gparsed)
				if gbanned{
					// Captcha hit: back off for a couple of minutes without
					// advancing Gparsed, then continue with the next query.
					fmt.Println("Banned in google", result.Text, result.Gparsed)
					updateStatus(result.ID,fmt.Sprintf("Baned google...[%d]",result.Gparsed))
					time.Sleep(2 * time.Minute)
					continue
				}
				saveMentions(gmentions)
				// Advance the page counter and accumulate the mention count.
				colQuerier := bson.M{"_id": result.ID}
				change := bson.M{"$set": bson.M{"gparsed":result.Gparsed+1,"mentionscount":result.MentionsCount+len(gmentions)}}
				err := c.Update(colQuerier, change)
				if err != nil {
					panic(err)
				}
				// Randomized 0.5–5.9s delay between requests.
				time.Sleep(time.Duration(randomSleepMS())*time.Millisecond)
			}
		}
	}
}
func submitQueryForScrap(query string) { | |
c := db.DB("db").C("queries") | |
err := c.Insert(&Query{time.Now(), query, "Pending...",0, time.Now(), 0, 0, ""}) | |
if err != nil { | |
log.Fatal(err) | |
} | |
} | |
func getEnginesStatus() map[string]string{ | |
c := db.DB("db").C("engines") | |
var engines []struct {Name, Status string} | |
var res map[string]string | |
c.Find(nil).All(&engines) | |
for _,v := range engines { | |
res[v.Name] = v.Status | |
} | |
return res | |
} | |
// MainPage is the intended view model for the index template.
// NOTE(review): the field is unexported, so html/template could never
// read it; mainpage currently passes the raw status map to the template
// instead — confirm which was intended.
type MainPage struct {
	engines_status map[string]string
}
func mainpage(w http.ResponseWriter, r *http.Request) { | |
t, _ := template.ParseFiles("./templates/main.html") | |
var p MainPage | |
p.engines_status = getEnginesStatus() | |
t.Execute(w, getEnginesStatus()) | |
} | |
// result echoes the request path (without its leading slash) back to the
// client. Placeholder handler; the live results page is served by resultp.
func result(w http.ResponseWriter, r *http.Request) {
	name := r.URL.Path[1:]
	fmt.Fprintf(w, "Hi there, I love %s!", name)
}
func submit(w http.ResponseWriter, r *http.Request) { | |
query := r.FormValue("query") | |
if query != "" { | |
submitQueryForScrap(query) | |
} | |
http.Redirect(w, r, "/", http.StatusFound) | |
} | |
func getQueries() []Query { | |
c := db.DB("db").C("queries") | |
result := []Query{} | |
iter := c.Find(nil).Limit(100).Iter() | |
err := iter.All(&result) | |
if err != nil { | |
log.Fatal(err) | |
} | |
return result | |
} | |
func queries(w http.ResponseWriter, r *http.Request) { | |
queries := getQueries() | |
for _,q:= range queries { | |
fmt.Fprintf(w, "<a href=\"/result?q=%s\">Text: %s, status=%s subtime=%s Cont=%d </a><p>", q.Text,q.Text, q.Status, q.SumbitTime, q.MentionsCount) | |
} | |
} | |
func getMentions(qwery string) []Mention { | |
c := db.DB("db").C("mentions") | |
result := []Mention{} | |
iter := c.Find(bson.M{"query":qwery}).Iter() | |
err := iter.All(&result) | |
if err != nil { | |
log.Fatal(err) | |
} | |
return result | |
} | |
func resultp(w http.ResponseWriter, r *http.Request) { | |
mentions := getMentions(r.URL.Query()["q"][0]) | |
for _,m:= range mentions { | |
fmt.Fprintf(w, "Aggr: %s<p>Source: %s<p>Name: %s<p>Time: %s [%s]<p>Link: %s<p>Brief: %s<p><p>",m.Aggr, m.Soruce, m.Name, m.Time, m.Timeraw, m.Link, m.Brief) | |
} | |
} | |
func main() { | |
session, err := mgo.Dial("localhost") | |
if err != nil { | |
panic(err) | |
} | |
db = session | |
defer session.Close() | |
// Optional. Switch the session to a monotonic behavior. | |
session.SetMode(mgo.Monotonic, true) | |
fmt.Println("Mongo client initialized") | |
http.HandleFunc("/", mainpage) | |
http.HandleFunc("/queries/", queries) | |
http.HandleFunc("/result", resultp) | |
http.HandleFunc("/submit/", submit) | |
go scrapper_google() | |
go scrapper_yandex() | |
http.ListenAndServe(":8080", nil) | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment