Skip to content

Instantly share code, notes, and snippets.

@detorto
Last active October 7, 2017 14:13
Show Gist options
  • Save detorto/ccc8884a1245c726d48b01d22d0c8d19 to your computer and use it in GitHub Desktop.
package main
import (
"fmt"
"net/url"
"github.com/PuerkitoBio/goquery"
"time"
"strings"
"strconv"
"math/rand"
"net/http"
"gopkg.in/headzoo/surf.v1"
"github.com/headzoo/surf/browser"
"github.com/headzoo/surf/agent"
"html/template"
"gopkg.in/mgo.v2"
"gopkg.in/mgo.v2/bson"
"log"
)
// db is the shared MongoDB session used by the HTTP handlers; set in main().
var db *mgo.Session
// Mention is one scraped news result, persisted to the "mentions" collection.
type Mention struct {
Query string // search text this mention was found for
Time time.Time // best-effort parsed publication time
Timeraw string // publication time exactly as scraped
Brief string // snippet/summary text
Link string // article URL
Name string // article title
Soruce string // publisher name; NOTE(review): typo for "Source", kept because other code and stored BSON use this name
Aggr string // aggregator that produced this mention ("Google" or "Yandex")
ID bson.ObjectId `bson:"_id,omitempty"`
}
// Query is a user-submitted search stored in the "queries" collection and
// processed page-by-page by the scraper goroutines.
type Query struct {
SumbitTime time.Time // submission timestamp; NOTE(review): typo for "SubmitTime", kept for BSON/sort-key compatibility
Text string // the search phrase
Status string // human-readable progress string shown in the UI
MentionsCount int // running total of mentions saved for this query
LastMention time.Time // time of the most recent mention
Yparsed, Gparsed int // next result page to scrape on Yandex / Google
ID bson.ObjectId `bson:"_id,omitempty"`
}
// getURL navigates bow to url with the given user agent and returns the
// resulting page DOM. It panics if the request fails (matching the file's
// existing error style).
//
// Fix: the useragent parameter was previously accepted but never used, so
// the per-request user-agent rotation done by callers silently did nothing.
func getURL(bow *browser.Browser, url string, useragent string) *goquery.Selection {
	bow.SetUserAgent(useragent)
	if err := bow.Open(url); err != nil {
		panic(err)
	}
	return bow.Dom()
}
// randomUserAgent returns one desktop-browser User-Agent string chosen at
// random from a fixed pool, used to vary outgoing scrape requests.
func randomUserAgent() string {
	agents := []string{
		"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36",
		"Mozilla/5.0 (Windows NT 6.3; WOW64; rv:53.0) Gecko/20100101 Firefox/53.0",
		"Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36",
		"Mozilla/5.0 (Windows NT 10.0; WOW64; rv:53.0) Gecko/20100101 Firefox/53.0",
		"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.96 Safari/537.36",
		"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_4) AppleWebKit/603.1.30 (KHTML, like Gecko) Version/10.1 Safari/603.1.30",
		"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.133 Safari/537.36",
	}
	pick := rand.Intn(len(agents))
	return agents[pick]
}
// randomSleepMS returns a random delay in milliseconds: a multiple of 100
// between 500 and 5900 inclusive.
func randomSleepMS() int {
	deciseconds := rand.Intn(55) + 5 // uniform in [5, 59]
	return deciseconds * 100
}
// ScrapeGoogleNewsForDates fetches one Google News results page (10 results
// per page) for query and returns the scraped mentions plus a flag that is
// true when Google redirected to its "sorry" captcha page, i.e. the client
// is currently rate-limited/banned.
func ScrapeGoogleNewsForDates(bow *browser.Browser,query string, page int) ([]Mention, bool) {
// parseTime converts Google's result timestamp to a time.Time. Relative
// forms such as "2 hours ago" (leading token is a number) are approximated
// by time.Now(); absolute dates use the "Jan 2, 2006" layout.
parseTime := func(tm string) time.Time {
day := strings.Split(tm," ")[0]
if _, err := strconv.Atoi(day); err == nil {
return time.Now()
}
lay := "Jan 2, 2006"
t, err := time.Parse(lay, strings.Trim(tm," "))
if err != nil {
fmt.Println(err) // parse failure only logged; the zero time.Time is returned
}
return t
}
var mentions [] Mention
Url, _ := url.Parse("https://www.google.com/search")
// tbm=nws selects the News vertical; start paginates in steps of 10.
parameters := url.Values{"hl":{"en"},"q":{query},"tbm":{"nws"},"start":{strconv.Itoa(10*page)}}
Url.RawQuery = parameters.Encode()
doc := getURL(bow, Url.String(), randomUserAgent())
// Google redirects rate-limited clients to google.com/sorry (captcha).
if strings.Contains(bow.Url().String(),"google.com/sorry") {
return mentions, true
}
// Each ".g" element is one search result.
doc.Find(".g").Each(func(i int, s *goquery.Selection) {
var m Mention
nametg := s.Find("a")
m.Name = nametg.Text()
link,_ := nametg.Attr("href")//("href")
m.Link = link
m.Brief = s.Find(".st").Text()
// The span holds "Source - time": the last "-"-separated chunk is the
// timestamp, everything before it the source name.
// NOTE(review): a source name containing "-" loses those characters
// because the leading parts are re-joined with "".
src_time := s.Find("span").Text()
src_time_sl := strings.Split(src_time,"-")
src := strings.Join(src_time_sl[0 : len(src_time_sl)-1],"")
time := src_time_sl[len(src_time_sl)-1]
m.Timeraw = time
m.Time = parseTime(time)
m.Soruce = src
m.Aggr = "Google"
m.Query = query
//fmt.Printf("Aggr: %s\nSource: %s\nName: %s\nTime: %s [%s]\nLink: %s\nBrief: %s\n\n",m.aggr, m.soruce, m.name, m.time, m.timeraw, m.link, m.brief)
mentions = append(mentions,m)
})
return mentions, false
}
func getMonthLong(m string) time.Month {
mm := map[string]time.Month {"января":time.January,
"февраля":time.February,
"марта":time.March,
"апреля":time.April,
"мая":time.May,
"июня":time.June,
"июля":time.February,
"августа":time.August,
"сентября":time.September,
"октября":time.October,
"ноября":time.November,
"декбря":time.December}
return mm[strings.ToLower(strings.Trim(m,""))]
}
// ScrapeYandexNewsForDates fetches one Yandex News results page for query
// and returns the scraped mentions plus a flag that is true when Yandex
// answered with its captcha page (rate-limited/banned).
func ScrapeYandexNewsForDates(bow *browser.Browser, query string, page int) ([]Mention, bool) {
// parseTime converts Yandex's Russian timestamp text to a time.Time.
// Handled forms: "<day> <month-name> ..." (current year assumed),
// "вчера ..." (yesterday), "dd.mm.yy", and single-token strings (e.g. a
// bare clock time) which are approximated by time.Now().
parseTime := func(tm string) time.Time {
var day int
var month time.Month
var year int
firstl := strings.Split(tm," ")
if len(firstl) <= 1{
// single token with no date information -> treat as "now"
return time.Now()
}
first:= firstl[0]
if _ ,err := strconv.Atoi(first); err == nil {
// "<day> <month-name>" form: assume the current year
d,_ := strconv.Atoi(first)
day = d
fmt.Println(strings.Split(tm," "))
month = getMonthLong(strings.Split(tm," ")[1])
year = time.Now().Year()
fmt.Println("%s %s %s\n",year,month,day) // NOTE(review): Println used with format verbs; should be Printf
return time.Date(year,month,day,23, 0, 0, 0, time.UTC)
} else {
if first == "вчера"{
// "вчера" = yesterday; time.Date normalizes day-1 across month boundaries
year = time.Now().Year()
day = time.Now().Day()
month = time.Now().Month()
return time.Date(year,month,day-1,23, 0, 0, 0, time.UTC)
}
// absolute "dd.mm.yy" date; panics on unparseable input
layout := "02.01.06"
t, err := time.Parse(layout, first)
if err != nil {
panic(err)
}
return t
}
return time.Now() // NOTE(review): unreachable — both branches above return
}
var mentions [] Mention
Url, _ := url.Parse("https://news.yandex.ru/yandsearch")
// numdoc=30 results per page, rpt=nnews news search, rel=tm sort by time
parameters := url.Values{"numdoc":{"30"},"text":{query},"rpt":{"nnews"},"p":{strconv.Itoa(page)}, "rel":{"tm"}}
Url.RawQuery = parameters.Encode()
doc := getURL(bow, Url.String(), randomUserAgent())
// Yandex redirects rate-limited clients to its showcaptcha page.
if strings.Contains(bow.Url().String(),"https://news.yandex.ru/showcaptcha") {
return mentions, true
}
// Each ".search-item" element is one news result.
doc.Find(".search-item").Each(func(i int, s *goquery.Selection) {
var m Mention
m.Soruce = s.Find(".document__provider-name").Text()
m.Name = s.Find(".document__title").Text()
m.Time = parseTime(s.Find(".document__time").Text())
m.Timeraw = s.Find(".document__time").Text()
link,_ := s.Find(".link").Attr("href")
m.Link = link
m.Brief = s.Find(".document__snippet").Text()
m.Aggr = "Yandex"
m.Query = query
mentions = append(mentions,m)
})
return mentions,false
}
// Page is a title/body pair.
// NOTE(review): nothing visible in this file references Page — it looks like
// a leftover from a template example and is a candidate for removal.
type Page struct {
Title string
Body []byte
}
// saveMentions inserts each Mention into the "mentions" collection over a
// fresh localhost session, terminating the process (log.Fatal, matching the
// file's existing style) on any error.
//
// Fix: the error from mgo.Dial was silently discarded; a failed dial then
// caused a nil-pointer panic on session.Close / Insert.
func saveMentions(men []Mention) {
	session, err := mgo.Dial("localhost")
	if err != nil {
		log.Fatal(err)
	}
	defer session.Close()
	col := session.DB("db").C("mentions")
	for i := range men {
		if err := col.Insert(&men[i]); err != nil {
			log.Fatal(err)
		}
	}
}
// scrapper_yandex runs forever: it repeatedly walks all stored queries
// (newest first) and scrapes the next Yandex News page for each one that has
// not yet reached mpage pages, persisting results and progress to MongoDB.
// Intended to run as a goroutine (see main).
func scrapper_yandex(){
session, err := mgo.Dial("localhost")
b := session
defer session.Close()
// mpage is the maximum number of result pages scraped per query.
mpage := 5
// updateStatus writes a human-readable progress string onto the query doc.
updateStatus := func (ide bson.ObjectId , status string) {
c := b.DB("db").C("queries")
colQuerier := bson.M{"_id": ide}
change := bson.M{"$set": bson.M{"status":status}}
err := c.Update(colQuerier, change)
if err != nil {
panic(err)
}
}
// NOTE(review): the Dial error is only checked here, after b/defer already
// dereference-depend on session; a failed dial panics on session.Close.
if err != nil {
panic(err)
}
c := b.DB("db").C("queries")
bow := surf.NewBrowser()
bow.SetUserAgent(randomUserAgent())
//mpage := 5
for {
result := Query{}
iter:=c.Find(nil).Sort("-SumbitTime").Iter()
for iter.Next(&result) {
if result.Yparsed == mpage {
// this query is fully scraped; skip it
continue
} else {
fmt.Println("Goin scrap YANDEX", result.Text, result.Yparsed)
updateStatus(result.ID,fmt.Sprintf("Parsing yandex...[%d]",result.Yparsed))
ymentions,ybanned := ScrapeYandexNewsForDates(bow,result.Text,result.Yparsed)
if ybanned{
// captcha hit: back off for a few minutes before the next query
fmt.Println("Banned in yandex", result.Text, result.Yparsed)
updateStatus(result.ID,fmt.Sprintf("Baned yandex...[%d]",result.Yparsed))
time.Sleep(3 * time.Minute)
continue
}
saveMentions(ymentions)
// advance the page cursor and mention count on the query document
colQuerier := bson.M{"_id": result.ID}
change := bson.M{"$set": bson.M{"yparsed":result.Yparsed+1,"mentionscount":result.MentionsCount+len(ymentions)}}
err := c.Update(colQuerier, change)
if err != nil {
panic(err)
}
// randomized politeness delay (at least one extra second) between requests
time.Sleep(time.Duration(randomSleepMS()+1000)*time.Millisecond)
}
}
}
}
// scrapper_google runs forever: it repeatedly walks all stored queries
// (newest first) and scrapes the next Google News page for each one that has
// not yet reached mpage pages, persisting results and progress to MongoDB.
// Intended to run as a goroutine (see main).
func scrapper_google(){
session, err := mgo.Dial("localhost")
b := session
defer session.Close()
// mpage is the maximum number of result pages scraped per query.
mpage := 5
// updateStatus writes a human-readable progress string onto the query doc.
updateStatus := func (ide bson.ObjectId , status string) {
c := b.DB("db").C("queries")
colQuerier := bson.M{"_id": ide}
change := bson.M{"$set": bson.M{"status":status}}
err := c.Update(colQuerier, change)
if err != nil {
panic(err)
}
}
// NOTE(review): the Dial error is only checked here, after b/defer already
// dereference-depend on session; a failed dial panics on session.Close.
if err != nil {
panic(err)
}
c := b.DB("db").C("queries")
bow := surf.NewBrowser()
bow.SetUserAgent(agent.Safari())
//mpage := 5
for {
result := Query{}
iter:=c.Find(nil).Sort("-SumbitTime").Iter()
for iter.Next(&result) {
if result.Gparsed == mpage {
// this query is fully scraped; skip it
continue
} else {
fmt.Println("Goin scrap GOOGLE", result.Text, result.Gparsed)
updateStatus(result.ID,fmt.Sprintf("Parsing google...[%d]",result.Gparsed))
gmentions,gbanned := ScrapeGoogleNewsForDates(bow,result.Text,result.Gparsed)
if gbanned{
// captcha hit: back off for a couple of minutes before the next query
fmt.Println("Banned in google", result.Text, result.Gparsed)
updateStatus(result.ID,fmt.Sprintf("Baned google...[%d]",result.Gparsed))
time.Sleep(2 * time.Minute)
continue
}
saveMentions(gmentions)
// advance the page cursor and mention count on the query document
colQuerier := bson.M{"_id": result.ID}
change := bson.M{"$set": bson.M{"gparsed":result.Gparsed+1,"mentionscount":result.MentionsCount+len(gmentions)}}
err := c.Update(colQuerier, change)
if err != nil {
panic(err)
}
// randomized politeness delay between requests
time.Sleep(time.Duration(randomSleepMS())*time.Millisecond)
}
}
}
}
// submitQueryForScrap stores a new pending Query document so the scraper
// goroutines will pick it up. Terminates the process (log.Fatal, matching
// the file's existing style) on an insert error.
//
// Fix: the composite literal now uses field names instead of positional
// values, which would silently misassign fields if Query ever changes.
func submitQueryForScrap(query string) {
	c := db.DB("db").C("queries")
	q := &Query{
		SumbitTime:    time.Now(),
		Text:          query,
		Status:        "Pending...",
		MentionsCount: 0,
		LastMention:   time.Now(),
		Yparsed:       0,
		Gparsed:       0,
		// ID left as the zero ObjectId; omitted from BSON via omitempty.
	}
	if err := c.Insert(q); err != nil {
		log.Fatal(err)
	}
}
// getEnginesStatus reads the "engines" collection and returns a map of
// engine name -> status string. A query error is logged and yields an
// empty map.
//
// Fix: the result map was never allocated, so any non-empty collection
// caused an assignment-to-nil-map panic; the previously ignored Find error
// is now logged.
func getEnginesStatus() map[string]string {
	c := db.DB("db").C("engines")
	var engines []struct{ Name, Status string }
	if err := c.Find(nil).All(&engines); err != nil {
		log.Println("getEnginesStatus:", err)
	}
	res := make(map[string]string, len(engines))
	for _, v := range engines {
		res[v.Name] = v.Status
	}
	return res
}
// MainPage was intended as the template payload for the index handler.
// NOTE(review): the field is unexported, so html/template could never render
// it; mainpage actually passes the raw status map to the template instead.
type MainPage struct {
engines_status map[string]string
}
// mainpage renders ./templates/main.html with the current engine-status map
// (the template data shape is unchanged: map[string]string).
//
// Fixes: the template.ParseFiles error was ignored, so a missing/broken
// template file caused a nil-pointer panic on Execute; getEnginesStatus was
// called twice with the first result discarded through an unused MainPage
// value; the Execute error was dropped.
func mainpage(w http.ResponseWriter, r *http.Request) {
	t, err := template.ParseFiles("./templates/main.html")
	if err != nil {
		http.Error(w, err.Error(), http.StatusInternalServerError)
		return
	}
	if err := t.Execute(w, getEnginesStatus()); err != nil {
		log.Println("mainpage:", err)
	}
}
func result(w http.ResponseWriter, r *http.Request) {
fmt.Fprintf(w, "Hi there, I love %s!", r.URL.Path[1:])
}
// submit reads the "query" form value, enqueues it for scraping when it is
// non-empty, and always redirects back to the index page.
func submit(w http.ResponseWriter, r *http.Request) {
	if q := r.FormValue("query"); q != "" {
		submitQueryForScrap(q)
	}
	http.Redirect(w, r, "/", http.StatusFound)
}
// getQueries loads up to 100 Query documents from the "queries" collection.
// Terminates the process (log.Fatal) on a database error.
func getQueries() []Query {
	col := db.DB("db").C("queries")
	out := []Query{}
	if err := col.Find(nil).Limit(100).Iter().All(&out); err != nil {
		log.Fatal(err)
	}
	return out
}
// queries writes a crude HTML listing of all stored queries, each linking to
// its result page.
func queries(w http.ResponseWriter, r *http.Request) {
	for _, q := range getQueries() {
		fmt.Fprintf(w, "<a href=\"/result?q=%s\">Text: %s, status=%s subtime=%s Cont=%d </a><p>", q.Text, q.Text, q.Status, q.SumbitTime, q.MentionsCount)
	}
}
// getMentions returns every stored Mention whose "query" field equals qwery.
// Terminates the process (log.Fatal) on a database error.
func getMentions(qwery string) []Mention {
	col := db.DB("db").C("mentions")
	found := []Mention{}
	if err := col.Find(bson.M{"query": qwery}).Iter().All(&found); err != nil {
		log.Fatal(err)
	}
	return found
}
// resultp writes every stored Mention for the query named by the "q" URL
// parameter as crude HTML.
//
// Fix: r.URL.Query()["q"][0] panicked with an index-out-of-range when the
// parameter was absent; a missing/empty parameter now answers 400 instead.
func resultp(w http.ResponseWriter, r *http.Request) {
	q := r.URL.Query().Get("q")
	if q == "" {
		http.Error(w, "missing q parameter", http.StatusBadRequest)
		return
	}
	for _, m := range getMentions(q) {
		fmt.Fprintf(w, "Aggr: %s<p>Source: %s<p>Name: %s<p>Time: %s [%s]<p>Link: %s<p>Brief: %s<p><p>", m.Aggr, m.Soruce, m.Name, m.Time, m.Timeraw, m.Link, m.Brief)
	}
}
// main connects to MongoDB, registers the HTTP handlers, launches the two
// scraper goroutines, and serves on :8080.
//
// Fix: the error from http.ListenAndServe was discarded, so a failed bind
// (e.g. port already in use) made the process exit silently; it is now
// reported via log.Fatal.
func main() {
	session, err := mgo.Dial("localhost")
	if err != nil {
		panic(err)
	}
	db = session
	defer session.Close()
	// Optional. Switch the session to a monotonic behavior.
	session.SetMode(mgo.Monotonic, true)
	fmt.Println("Mongo client initialized")
	http.HandleFunc("/", mainpage)
	http.HandleFunc("/queries/", queries)
	http.HandleFunc("/result", resultp)
	http.HandleFunc("/submit/", submit)
	go scrapper_google()
	go scrapper_yandex()
	log.Fatal(http.ListenAndServe(":8080", nil))
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment