Created
March 24, 2021 10:43
-
-
Save mojocn/b797fa2c74ad7e70c2fcc7c5172c3e61 to your computer and use it in GitHub Desktop.
Fetch Hacker News
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import ( | |
"crypto/tls" | |
"errors" | |
"fmt" | |
"github.com/PuerkitoBio/goquery" | |
"github.com/sirupsen/logrus" | |
"gorm.io/gorm" | |
"net/http" | |
"strings" | |
"time" | |
) | |
// HnCate names a Hacker News listing page. Its string value is used both as
// the URL path segment when the listing is fetched and as the category stored
// with each saved story.
type HnCate string

// Supported listing categories. These are fixed identifiers, so they are
// declared as typed constants rather than mutable package-level variables.
const (
	// HnCateNews selects the "news" listing.
	HnCateNews HnCate = "news"
	// HnCateShow selects the "show" listing.
	HnCateShow HnCate = "show"
)
type HackNew struct { | |
gorm.Model | |
TitleZh string `json:"title_zh" form:"title_zh"` | |
TitleEn string `json:"title_en" form:"title_en"` | |
Url string `gorm:"index" json:"url" form:"url"` | |
Cate string `json:"cate" comment:"news or show" form:"cate"` | |
} | |
func downloadHtml(url string) (*goquery.Document, error) { | |
tr := &http.Transport{ | |
TLSClientConfig: &tls.Config{InsecureSkipVerify: true}, | |
} | |
client := &http.Client{Transport: tr, Timeout: time.Second * 60} | |
req, err := http.NewRequest("GET", url, nil) | |
if err != nil { | |
return nil, err | |
} | |
req.Header.Set("cookie", "user=neochau&SlKqTK32QSFSiWQu1vGgCr4aqvTx5NxT") | |
req.Header.Set("User-Agent", "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.141 Safari/537.36") | |
res, err := client.Do(req) | |
if err != nil { | |
return nil, err | |
} | |
if res.StatusCode != 200 { | |
return nil, errors.New("the get request's response code is not 200") | |
} | |
defer res.Body.Close() | |
return goquery.NewDocumentFromReader(res.Body) | |
} | |
func SpiderHN(db *gorm.DB, cate HnCate) error { | |
doc, err := downloadHtml(fmt.Sprintf("https://news.ycombinator.com/%s", cate)) | |
if err != nil { | |
return err | |
} | |
doc.Find("a.storylink").Each(func(i int, s *goquery.Selection) { | |
url, _ := s.Attr("href") | |
if strings.HasPrefix(url, "/") { | |
url = "https://news.ycombinator.com" + url | |
} | |
titleEn := s.Text() | |
titleEn = strings.ReplaceAll(titleEn, "[", "") | |
titleEn = strings.ReplaceAll(titleEn, "]", "") | |
row := new(HackNew) | |
if errors.Is(db.Where("url = ?", url).Take(row).Error, gorm.ErrRecordNotFound) { | |
row.TitleEn = titleEn | |
row.Url = url | |
row.Cate = string(cate) | |
} | |
if row.TitleZh == "" { | |
zh, err := GoogleTranslate(titleEn) | |
if err != nil { | |
logrus.WithError(err).Error("google 翻译错误") | |
} | |
row.TitleZh = zh | |
time.Sleep(time.Second * 15) | |
} | |
err = db.Save(row).Error | |
if err != nil { | |
logrus.WithError(err).Error("upsert") | |
} | |
}) | |
return nil | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment