Last active
December 10, 2016 10:42
-
-
Save zweite/ef5e3aed3450ed99e1beadb21baa0d95 to your computer and use it in GitHub Desktop.
豆瓣“开放”API 爬取电影信息
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
package main | |
import ( | |
"encoding/json" | |
"io/ioutil" | |
"log" | |
"net/http" | |
"net/url" | |
"strconv" | |
"sync" | |
_ "github.com/go-sql-driver/mysql" | |
"github.com/jinzhu/gorm" | |
) | |
// Query-string parameter names accepted by the Douban search endpoint.
const (
	// Tag is the parameter carrying the (URL-encoded) tag to search for.
	Tag = "tag"
	// PageLimit is the parameter carrying the number of items per page.
	PageLimit = "page_limit"
	// PageStart is the parameter carrying the offset of the first item.
	PageStart = "page_start"
	// Sort is the parameter carrying the sort order, e.g. "recommend".
	Sort = "sort"
	// Type is the parameter carrying the subject type, e.g. "movie".
	Type = "type"
)

// DefaultUrl is the endpoint every search request is sent to.
const DefaultUrl = "https://movie.douban.com/j/search_subjects"
// Names of the MySQL tables the crawler writes to.
const (
	// MOVIE_TABLE is the table that stores one row per crawled movie.
	MOVIE_TABLE = "movie"
	// TAG_TABLE is the table that stores (movie id, tag) pairs.
	TAG_TABLE = "tag"
)
var ( | |
db *gorm.DB | |
once sync.Once | |
) | |
// init db | |
func GetDB() *gorm.DB { | |
once.Do(initDB()) | |
return db | |
} | |
func initDB() { | |
var err error | |
db, err = gorm.Open("mysql", "root:root@tcp(localhost:3306)/douban_movie?charset=utf8&parseTime=True&loc=Local") | |
if err != nil { | |
log.Fatal(err) | |
} | |
} | |
// Movie is a single entry of the "subjects" array returned by the search
// endpoint. Each field is decoded from the JSON key named in its tag.
type Movie struct {
	// Identification and rating, both delivered as strings.
	MovieId string `json:"id"`
	Rate    string `json:"rate"`

	// Cover image location; Cover_X / Cover_Y are presumably its pixel
	// dimensions (TODO confirm against the API).
	Cover   string `json:"cover"`
	Cover_X int    `json:"cover_x"`
	Cover_Y int    `json:"cover_y"`

	// Display title and link to the movie page.
	Title string `json:"title"`
	Url   string `json:"url"`

	// Boolean flags passed through unchanged from the response.
	Playable        bool `json:"playable"`
	IsBeetleSubject bool `json:"is_beetle_subject"`
	IsNew           bool `json:"is_new"`
}
// MovieTag links one movie to one tag it was found under.
type MovieTag struct {
	MovieId string // id of the movie, as found in Movie.MovieId
	Tag     string // tag the movie was listed under
}
// MovieClient holds the query parameters for one tag's search requests.
// PageStart acts as the paging cursor and moves forward as pages are fetched.
type MovieClient struct {
	Tag       string // tag to search for
	PageLimit int    // number of items requested per page
	PageStart int    // offset of the next item to request
	Sort      string // sort order sent to the endpoint
	Type      string // subject type sent to the endpoint
}
func NewMovieClient(tag string, pageLimit int, sort string) *MovieClient { | |
return &MovieClient{ | |
Tag: tag, | |
PageLimit: pageLimit, | |
PageStart: 0, | |
Sort: sort, | |
Type: "movie", | |
} | |
} | |
func (m *MovieClient) getUrl() string { | |
val := make(url.Values) | |
val.Add(Type, m.Type) | |
val.Add(Tag, m.Tag) | |
val.Add(PageLimit, strconv.Itoa(m.PageLimit)) | |
val.Add(PageStart, strconv.Itoa(m.PageStart)) | |
val.Add(Sort, m.Sort) | |
return DefaultUrl + "?" + val.Encode() | |
} | |
func (m *MovieClient) setPageStart(start int) { | |
m.PageStart = start | |
} | |
func (m *MovieClient) Download() ([]*Movie, error) { | |
resp, err := http.Get(m.getUrl()) | |
if err != nil { | |
return nil, err | |
} | |
defer resp.Body.Close() | |
data, err := ioutil.ReadAll(resp.Body) | |
if err != nil { | |
return nil, err | |
} | |
result := make(map[string][]*Movie) | |
err = json.Unmarshal(data, &result) | |
movieItem := result["subjects"] | |
m.setPageStart(m.PageStart + len(movieItem)) | |
return movieItem, err | |
} | |
func (m *MovieClient) InsertTag(movieId string) { | |
tdb := GetDB().Table(TAG_TABLE) | |
tdb.Create(MovieTag{MovieId: movieId, Tag: m.Tag}) | |
} | |
func (m *MovieClient) Run() { | |
// 最大爬1000页 | |
for i := 0; i < 1000; i++ { | |
movieItems, err := m.Download() | |
if err != nil { | |
log.Fatal(err) | |
} | |
if len(movieItems) == 0 { | |
break | |
} | |
tdb := GetDB().Table(MOVIE_TABLE) | |
for _, movie := range movieItems { | |
tdb.Create(movie) | |
m.InsertTag(movie.MovieId) | |
} | |
} | |
} | |
func main() { | |
tags := []string{"热门", "最新", "经典", "可播放", "豆瓣高分", "冷门佳片", "华语", "欧美", "韩国", "日本", "动作", "喜剧", "爱情", "科幻", "悬疑", "恐怖", "治愈"} | |
for _, tag := range tags { | |
movieClient := NewMovieClient(tag, 20, "recommend") | |
movieClient.Run() | |
} | |
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
-- Create syntax for TABLE 'movie'
-- One row per crawled movie. `movie_id` is the id delivered by the API and is
-- unique, so a second insert of the same movie is rejected.
-- `movie_id` is varchar(32), the same width as `tag`.`movie_id`: a utf8
-- varchar(256) needs 768 bytes, which exceeds the 767-byte index key limit of
-- InnoDB COMPACT/REDUNDANT row formats and makes the unique key fail there.
-- No AUTO_INCREMENT start value is set; a fresh table starts counting at 1.
CREATE TABLE `movie` (
  `id` int(11) unsigned NOT NULL AUTO_INCREMENT,
  `movie_id` varchar(32) NOT NULL DEFAULT '',
  `rate` varchar(256) DEFAULT '',
  `cover` varchar(256) DEFAULT '',
  `cover_x` int(11) DEFAULT NULL,
  `cover_y` int(11) DEFAULT NULL,
  `title` varchar(256) DEFAULT '',
  `url` varchar(256) DEFAULT '',
  `playable` tinyint(1) DEFAULT NULL,
  `is_beetle_subject` tinyint(1) DEFAULT NULL,
  `is_new` tinyint(1) DEFAULT NULL,
  PRIMARY KEY (`id`),
  UNIQUE KEY `movie_id` (`movie_id`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8;
-- Create syntax for TABLE 'tag'
-- One row per (movie, tag) pair; the composite unique key rejects a second
-- insert of the same pair.
-- No AUTO_INCREMENT start value is set; a fresh table starts counting at 1.
CREATE TABLE `tag` (
  `id` int(11) unsigned NOT NULL AUTO_INCREMENT,
  `movie_id` varchar(32) NOT NULL DEFAULT '',
  `tag` varchar(11) NOT NULL DEFAULT '',
  PRIMARY KEY (`id`),
  UNIQUE KEY `movie_id_tag` (`movie_id`,`tag`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8;
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment