Skip to content

Instantly share code, notes, and snippets.

@zweite
Last active December 10, 2016 10:42
Show Gist options
  • Save zweite/ef5e3aed3450ed99e1beadb21baa0d95 to your computer and use it in GitHub Desktop.
Save zweite/ef5e3aed3450ed99e1beadb21baa0d95 to your computer and use it in GitHub Desktop.
豆瓣“开放”API 爬取电影信息
package main
import (
"encoding/json"
"io/ioutil"
"log"
"net/http"
"net/url"
"strconv"
"sync"
_ "github.com/go-sql-driver/mysql"
"github.com/jinzhu/gorm"
)
// Query-string parameter names sent to the search endpoint, and the
// endpoint URL itself. getUrl combines them into the request URL.
const (
Tag = "tag" // tag to search for; URL-encoded when the query string is built
PageLimit = "page_limit" // filled from MovieClient.PageLimit
PageStart = "page_start" // filled from MovieClient.PageStart
Sort = "sort" // sort order; main passes "recommend"
Type = "type" // subject type; NewMovieClient sets "movie"
DefaultUrl = "https://movie.douban.com/j/search_subjects"
)
// Database table names passed to gorm's Table() before inserting.
const (
MOVIE_TABLE = "movie" // receives the Movie values inserted by Run
TAG_TABLE = "tag" // receives the MovieTag values inserted by InsertTag
)
var (
// db is the shared gorm handle; it is assigned by initDB.
db *gorm.DB
// once guards the initialization that GetDB triggers.
once sync.Once
)
// GetDB returns the shared database handle, running initDB on the first call
// to open the connection.
//
// sync.Once.Do takes a function value. initDB has no result, so it must be
// passed uncalled; writing initDB() in the argument position does not
// compile.
func GetDB() *gorm.DB {
	once.Do(initDB)
	return db
}
// initDB opens the MySQL connection and stores it in the package-level db.
// If the connection cannot be opened the error is logged and the process
// exits via log.Fatal.
func initDB() {
	const dsn = "root:root@tcp(localhost:3306)/douban_movie?charset=utf8&parseTime=True&loc=Local"

	conn, openErr := gorm.Open("mysql", dsn)
	if openErr != nil {
		log.Fatal(openErr)
	}
	db = conn
}
// Movie is one element of the "subjects" array in the search response; the
// json tags name the response fields each member is decoded from. Run
// inserts every downloaded Movie into the movie table.
type Movie struct {
MovieId string `json:"id"` // response field "id", kept as a string
Rate string `json:"rate"` // rating as sent by the API (a string, not a number)
Cover string `json:"cover"` // presumably the cover image URL; TODO confirm against a live response
Cover_X int `json:"cover_x"` // presumably the cover width; TODO confirm
Cover_Y int `json:"cover_y"` // presumably the cover height; TODO confirm
Title string `json:"title"`
Url string `json:"url"` // presumably the movie's page URL; TODO confirm
Playable bool `json:"playable"`
IsBeetleSubject bool `json:"is_beetle_subject"`
IsNew bool `json:"is_new"`
}
// MovieTag links a movie id to a tag it was found under. InsertTag writes
// one of these to the tag table each time it is called.
type MovieTag struct {
MovieId string // id of the movie (the value of Movie.MovieId)
Tag string // tag of the client that downloaded the movie
}
// MovieClient holds the query parameters for crawling one tag. PageStart is
// the paging cursor: Download moves it forward by the number of movies it
// received.
type MovieClient struct {
Tag string // sent as the "tag" parameter
PageLimit int // sent as "page_limit"
PageStart int // sent as "page_start"; NewMovieClient starts it at 0
Sort string // sent as "sort"
Type string // sent as "type"; NewMovieClient sets it to "movie"
}
// NewMovieClient builds a client for the given tag, page size and sort
// order. The client starts at the first result (PageStart 0) and always
// queries the "movie" type.
func NewMovieClient(tag string, pageLimit int, sort string) *MovieClient {
	client := new(MovieClient)
	client.Tag = tag
	client.PageLimit = pageLimit
	client.PageStart = 0
	client.Sort = sort
	client.Type = "movie"
	return client
}
// getUrl returns the search endpoint URL with the client's current
// parameters encoded as its query string.
func (m *MovieClient) getUrl() string {
	query := url.Values{
		Type:      {m.Type},
		Tag:       {m.Tag},
		PageLimit: {strconv.Itoa(m.PageLimit)},
		PageStart: {strconv.Itoa(m.PageStart)},
		Sort:      {m.Sort},
	}
	return DefaultUrl + "?" + query.Encode()
}
// setPageStart moves the paging cursor to start, the offset that getUrl
// sends as "page_start" on the next request.
func (m *MovieClient) setPageStart(start int) {
m.PageStart = start
}
// httpStatusError reports a response whose status was not 200 OK. Its value
// is the status text taken from the response.
type httpStatusError string

// Error implements the error interface.
func (e httpStatusError) Error() string {
	return "unexpected HTTP status: " + string(e)
}

// Download fetches the next page of movies for the client's tag and advances
// PageStart by the number of movies received.
//
// It returns a nil slice and a non-nil error when the request fails, the
// server answers with a status other than 200 OK, the body cannot be read,
// or the body is not the expected JSON. PageStart is left untouched in all
// of those cases, so a failed call never moves the cursor.
func (m *MovieClient) Download() ([]*Movie, error) {
	resp, err := http.Get(m.getUrl())
	if err != nil {
		return nil, err
	}
	defer resp.Body.Close()

	// Reject error responses up front; otherwise their body is handed to
	// the JSON decoder and surfaces as a misleading parse error.
	if resp.StatusCode != http.StatusOK {
		return nil, httpStatusError(resp.Status)
	}

	data, err := ioutil.ReadAll(resp.Body)
	if err != nil {
		return nil, err
	}

	// The response is an object whose "subjects" key holds the movie list.
	result := make(map[string][]*Movie)
	if err := json.Unmarshal(data, &result); err != nil {
		return nil, err
	}

	movies := result["subjects"]
	m.setPageStart(m.PageStart + len(movies))
	return movies, nil
}
// InsertTag records in the tag table that the movie with the given id was
// found under this client's tag.
//
// The insert is best-effort: a failure is logged rather than returned, so
// the caller's crawl continues.
func (m *MovieClient) InsertTag(movieId string) {
	tdb := GetDB().Table(TAG_TABLE)
	if err := tdb.Create(MovieTag{MovieId: movieId, Tag: m.Tag}).Error; err != nil {
		log.Printf("insert tag %q for movie %s: %v", m.Tag, movieId, err)
	}
}
// Run crawls the client's tag page by page, storing every movie and its tag
// link, until a page comes back empty or the page cap is reached.
//
// A download error is fatal (log.Fatal). A failed movie insert is logged and
// the crawl continues; the tag link is still recorded for that movie.
func (m *MovieClient) Run() {
	// Crawl at most 1000 pages per tag.
	const maxPages = 1000

	for page := 0; page < maxPages; page++ {
		movieItems, err := m.Download()
		if err != nil {
			log.Fatal(err)
		}
		if len(movieItems) == 0 {
			break
		}

		tdb := GetDB().Table(MOVIE_TABLE)
		for _, movie := range movieItems {
			// The schema published with this program puts a unique key on
			// movie_id, so a movie already stored under another tag fails
			// here. Report it instead of dropping the error silently.
			if err := tdb.Create(movie).Error; err != nil {
				log.Printf("insert movie %s: %v", movie.MovieId, err)
			}
			m.InsertTag(movie.MovieId)
		}
	}
}
// main crawls each tag in turn, requesting 20 movies per page in
// "recommend" order.
func main() {
	const (
		pageLimit = 20
		sortBy    = "recommend"
	)

	tags := []string{
		"热门", "最新", "经典", "可播放", "豆瓣高分", "冷门佳片",
		"华语", "欧美", "韩国", "日本",
		"动作", "喜剧", "爱情", "科幻", "悬疑", "恐怖", "治愈",
	}

	for i := range tags {
		NewMovieClient(tags[i], pageLimit, sortBy).Run()
	}
}
-- Create syntax for TABLE 'movie'
-- One row per movie. Apart from `id`, the columns are the snake_case forms of
-- the Movie struct fields in the Go program (presumably how gorm maps them;
-- confirm against the gorm naming rules).
-- NOTE(review): the AUTO_INCREMENT=15613 seed looks like a leftover from
-- dumping a populated table; confirm whether a fresh table should start there.
CREATE TABLE `movie` (
`id` int(11) unsigned NOT NULL AUTO_INCREMENT, -- surrogate key; has no counterpart in the Movie struct
`movie_id` varchar(256) NOT NULL DEFAULT '',
`rate` varchar(256) DEFAULT '',
`cover` varchar(256) DEFAULT '',
`cover_x` int(11) DEFAULT NULL,
`cover_y` int(11) DEFAULT NULL,
`title` varchar(256) DEFAULT '',
`url` varchar(256) DEFAULT '',
`playable` tinyint(1) DEFAULT NULL,
`is_beetle_subject` tinyint(1) DEFAULT NULL,
`is_new` tinyint(1) DEFAULT NULL,
PRIMARY KEY (`id`),
UNIQUE KEY `movie_id` (`movie_id`) -- a second insert of the same movie_id is rejected
) ENGINE=InnoDB AUTO_INCREMENT=15613 DEFAULT CHARSET=utf8;
-- Create syntax for TABLE 'tag'
-- One row per (movie_id, tag) pair; the unique key below rejects duplicates.
-- NOTE(review): movie_id is varchar(32) here but varchar(256) in `movie`;
-- confirm the widths are meant to differ.
-- NOTE(review): the unique key is named `move_id`, which looks like a typo
-- for `movie_id`; renaming it on an existing database needs a migration.
-- NOTE(review): the AUTO_INCREMENT=5045 seed looks like a leftover from
-- dumping a populated table; confirm whether a fresh table should start there.
CREATE TABLE `tag` (
`id` int(11) unsigned NOT NULL AUTO_INCREMENT,
`movie_id` varchar(32) NOT NULL DEFAULT '',
`tag` varchar(11) NOT NULL DEFAULT '',
PRIMARY KEY (`id`),
UNIQUE KEY `move_id` (`movie_id`,`tag`)
) ENGINE=InnoDB AUTO_INCREMENT=5045 DEFAULT CHARSET=utf8;
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment