Last active
December 20, 2015 22:59
-
-
Save klb3713/6209281 to your computer and use it in GitHub Desktop.
python 爬取优酷视频信息,并存储到mongodb
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# -*- coding: utf-8 -*- | |
__author__ = 'klb3713' | |
import re | |
import json | |
import urllib2 | |
from lxml import etree | |
from multiprocessing import Process | |
from pymongo import Connection | |
# Module-level MongoDB handles, created once at import time.
# `con` is the connection to the local server; `youku_videos` is the
# "youkuVideos" collection of the "weibo" database, into which getVideos()
# inserts the scraped video documents.
con = Connection('127.0.0.1:27017')
youku_videos = con['weibo']['youkuVideos']
def get_html(url, coding='utf-8', timeout=30):
    """Download ``url`` and return its body decoded as text.

    This is a best-effort fetch: any failure (bad URL, network error,
    timeout, undecodable bytes) yields an empty string instead of raising,
    so callers must be prepared to receive ``""``.

    Args:
        url: Address to fetch.
        coding: Character encoding used to decode the response body.
        timeout: Seconds to wait on the connection before giving up, so a
            stalled server cannot hang the whole crawl.

    Returns:
        The decoded response body, or ``""`` if anything went wrong.
    """
    try:
        res = urllib2.urlopen(url, timeout=timeout)
        try:
            return res.read().decode(coding)
        finally:
            # Always release the socket, even when read/decode fails.
            res.close()
    except Exception:
        # Deliberately broad (this is the best-effort boundary), but unlike a
        # bare ``except:`` it lets KeyboardInterrupt/SystemExit propagate.
        return ""
def getVideoInfo(urls, channel_name):
    """Scrape metadata for each video page in ``urls``.

    For every URL the page is fetched with get_html(), parsed with lxml, and
    turned into a dict with the keys ``url``, ``navigation``, ``title``,
    ``upVideoTimes``, ``downVideoTimes``, ``playTimes``, ``description`` and
    ``channel``. A page that cannot be fetched or parsed is reported on
    stdout (URL, then the error) and skipped; it never aborts the batch.

    Args:
        urls: Iterable of video page URLs.
        channel_name: Label stored under ``channel`` in every result.

    Returns:
        A list of dicts, one per successfully scraped page (possibly empty).
    """
    videos = []
    for url in urls:
        try:
            html = get_html(url)
            vid_match = re.search(r"videoId = '(\d+)'", html)
            if vid_match is None:
                # get_html() returns "" on failure, so fail with a clear
                # message rather than an AttributeError on None.
                raise ValueError("videoId not found in page")
            vid = vid_match.group(1)

            tree = etree.HTML(html)
            video = {}
            video['url'] = url
            video["navigation"] = tree.xpath('//div[@class="guide"]/div/a/text()')

            title = tree.xpath('/html/head/title/text()')[0]
            # Keep only the text before the site suffix. partition() returns
            # the whole title when the suffix is absent, whereas slicing with
            # find() == -1 would silently drop the last character.
            video["title"] = title.partition(u'—在线播放')[0]

            # Counters are rendered with thousands separators, e.g. "1,234".
            for counter_id in ("upVideoTimes", "downVideoTimes"):
                counter_text = tree.xpath('//*[@id="%s"]/text()' % counter_id)[0]
                video[counter_id] = int(counter_text.replace(',', ''))

            # NOTE(review): 'vv' is presumably the view count - confirm
            # against the endpoint's actual response.
            video_info = json.loads(get_html("http://v.youku.com/QVideo/~ajax/getVideoPlayInfo?__rt=1&__ro=&id=%s&sid=0&type=vv" % vid))
            video["playTimes"] = video_info['vv']

            # The description element may be missing or present but empty;
            # both cases map to "" instead of discarding the video.
            description = tree.xpath('//*[@id="text_long"]/text()')
            video["description"] = description[0] if description else ""

            video["channel"] = channel_name
            videos.append(video)
        except Exception as e:
            print(url)
            print(e)
            continue
    return videos
def getVideos(channel_url, channel_name, max_videos=1000, batch_size=100):
    """Crawl one channel listing and store its videos in ``youku_videos``.

    The channel page's pagination block gives the last page number; every
    listing page from 1 to that number is fetched and scanned for video URLs.
    The de-duplicated URLs (capped at ``max_videos``) are scraped with
    getVideoInfo() in batches of ``batch_size`` and each non-empty batch is
    inserted into the MongoDB collection.

    If the channel page cannot be fetched or its pagination cannot be
    understood, the problem is printed and the function returns without
    inserting anything, so one bad channel does not stop the others.

    Args:
        channel_url: URL of the channel's listing page.
        channel_name: Label stored with every video of this channel.
        max_videos: Upper bound on the number of video URLs processed. Which
            URLs survive the cap is arbitrary, since they pass through a set.
        batch_size: Number of URLs scraped per insert; must be positive.

    Returns:
        None.
    """
    base_url = "http://www.youku.com"
    html = get_html(channel_url)
    if not html:
        # get_html() returns "" when the download failed.
        print(channel_url)
        print("channel page could not be fetched")
        return

    tree = etree.HTML(html)
    last_page_hrefs = tree.xpath('//ul[@class="pages"]/li[last()]/a/@href')
    if not last_page_hrefs:
        print(channel_url)
        print("pagination links not found")
        return

    # The last pagination link ends in "<N>.html"; N is the page count and
    # everything before it is the common prefix of all page URLs.
    last_page_url = base_url + last_page_hrefs[0]
    page_match = re.search(r'(\d+)\.html$', last_page_url)
    if page_match is None:
        print(last_page_url)
        print("unexpected pagination link format")
        return
    page_url_pre = last_page_url[:page_match.start()]
    max_page = int(page_match.group(1))

    found_urls = set()
    for page in range(1, max_page + 1):
        page_url = page_url_pre + str(page) + ".html"
        try:
            page_html = get_html(page_url)
            found_urls.update(re.findall(r'http://v.youku.com/v_show/id_[^?"]+', page_html))
        except Exception as e:
            print(page_url)
            print(e)
            continue

    urls = list(found_urls)[:max_videos]
    for start in range(0, len(urls), batch_size):
        videos_info = getVideoInfo(urls[start:start + batch_size], channel_name)
        # Every URL in a batch may have failed; inserting an empty list is
        # rejected by pymongo, so skip it.
        if videos_info:
            youku_videos.insert(videos_info)
def main():
    """Crawl every configured channel sequentially via getVideos()."""
    # (listing URL, channel label) pairs.
    channels = (
        ("http://www.youku.com/v_showlist/c91.html", "news"),
        ("http://www.youku.com/v_showlist/c105.html", "tech"),
        ("http://www.youku.com/v_showlist/c86.html", "ent"),
        ("http://www.youku.com/v_showlist/c94.html", "fun"),
        ("http://www.youku.com/v_showlist/c88.html", "travel"),
        ("http://www.youku.com/v_showlist/c90.html", "baby"),
        ("http://www.youku.com/v_olist/c_87.html", "edu"),
        ("http://www.youku.com/v_olist/c_84.html", "jilupian"),
        ("http://www.youku.com/v_showlist/c92.html", "dv"),
        ("http://www.youku.com/v_showlist/c98.html", "sports"),
        ("http://www.youku.com/v_showlist/c89.html", "fashion"),
        ("http://www.youku.com/v_showlist/c99.html", "game"),
        ("http://www.youku.com/v_showlist/c104.html", "auto"),
    )
    for channel_url, channel_name in channels:
        getVideos(channel_url, channel_name)
    # NOTE: a multi-process variant was sketched here but is disabled: start
    # one multiprocessing.Process(target=getVideos, args=(url, name)) per
    # channel, collect them in a list, then join() each one.
# Run the full crawl only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
    # Manual debugging calls, kept for reference. The getVideoInfo sample
    # omits the channel_name argument that the function requires.
    # getVideos("http://www.youku.com/v_showlist/c91.html", "news")
    # getVideoInfo(["http://v.youku.com/v_show/id_XNTkzNjc3MzA0.html"])
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment