SecretMag crawler. #1
package crawler

import (
	"fmt"
	"time"

	"github.com/gocolly/colly"

	// NOTE: models and utils are project-internal packages that this gist
	// does not include; the import paths below are assumed and should be
	// adjusted to match the actual module layout.
	"crawler/models"
	"crawler/utils"
)
// SecretMag <struct>
// represents the Secret Magazine crawler.
type SecretMag struct{}
const (
	baseURL_SM  = "https://secretmag.ru"
	crawlURL_SM = "https://secretmag.ru/news"
)
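// NewsFunc <type>
// is the signature shared by all crawl steps that Run aggregates. This type
// is not part of the original gist; it is an assumed definition inferred
// from how Run builds and iterates the newsFuncs slice below.
type NewsFunc func() []models.News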
// Run <function>
// starts the crawling process and aggregates news from all crawl steps.
func (sm SecretMag) Run() []models.News {
	var totalNews []models.News

	newsFuncs := []NewsFunc{
		sm.runNews,
	}

	for _, f := range newsFuncs {
		tmpNews := f()
		totalNews = append(totalNews, tmpNews...)
	}

	return totalNews
}
func (sm SecretMag) runNews() []models.News {
	// creating a simple colly instance without any options
	newsCollector := colly.NewCollector()

	// slice of news that will be returned
	var news []models.News

	newsCollector.OnHTML(".wrapper", func(divWrapper *colly.HTMLElement) {
		divWrapper.ForEach(".container", func(i1 int, divContainer *colly.HTMLElement) {
			divContainer.ForEach(".item", func(i2 int, divItem *colly.HTMLElement) {
				link := divItem.ChildAttr("a[href]", "href")
				fullLink := fmt.Sprintf("%s%s", baseURL_SM, link)
				title := divItem.ChildText(".headline")

				// Hash the full link and use it as the news ID, so mongo
				// rejects duplicates instead of inserting them twice.
				_id := utils.MakeHash(fullLink)

				news = append(news, models.News{
					ID:         _id,
					Title:      title,
					Link:       fullLink,
					Preamble:   "",
					TimeAdded:  time.Now().Unix(),
					NewsType:   models.TypeNews,
					NewsSource: models.SecretMagNewsSource,
				})
			})
		})
	})

	// Visit blocks until crawling finishes because the collector is
	// synchronous; Wait is a no-op here but would matter with colly.Async(true).
	if err := newsCollector.Visit(crawlURL_SM); err != nil {
		fmt.Printf("SecretMag: failed to visit %s: %v\n", crawlURL_SM, err)
	}
	newsCollector.Wait()

	return news
}
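The crawler depends on a models package and a utils.MakeHash helper that are not part of this gist. The sketch below shows one minimal way they could look, assuming string IDs, string-typed news kinds and sources, and an MD5-based hash; the actual project may define them differently.

package models

// News <struct>
// holds a single crawled news item; the fields mirror what runNews populates.
type News struct {
	ID         string
	Title      string
	Link       string
	Preamble   string
	TimeAdded  int64
	NewsType   string
	NewsSource string
}

// Assumed constant values; the real ones are not shown in the gist.
const (
	TypeNews            = "news"
	SecretMagNewsSource = "secretmag"
)

package utils

import (
	"crypto/md5"
	"encoding/hex"
)

// MakeHash <function>
// returns a stable hex digest of s. MD5 is an assumption here; any
// deterministic hash works for the duplicate-rejecting ID used in runNews.
func MakeHash(s string) string {
	sum := md5.Sum([]byte(s))
	return hex.EncodeToString(sum[:])
}

With those in place, the crawler can be driven from any caller, for example:

items := crawler.SecretMag{}.Run()
fmt.Printf("fetched %d news items\n", len(items))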