Created
June 8, 2014 17:28
-
-
Save cnbeining/1fc6a80e3e7982d079db to your computer and use it in GitHub Desktop.
Biligrab 0.1
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
'''
Biligrab 0.1
Beining@ACICFG
http://www.cnbeining.com/
A very simple tool to download comments from Bilibili.
Requires: Python 2.7
Usage:
python Biligrab01.py
>>>av12450
>>>P1
As simple as it gets.
'''
import sys | |
import os | |
from StringIO import StringIO | |
import gzip | |
import urllib2 | |
import sys | |
#---------------------------------------------------------------------- | |
def find_cid_api(vid, p):
    """Look up a video's cid, part name and title through the Bilibili API.

    Nothing is returned. Results are published through the module-level
    globals ``cid``, ``partname``, ``title`` and ``videourl``. ``cid`` is
    reset to 0 and both names to '' before the request, so those defaults
    remain in place when the request fails or a tag is missing.

    vid -- video (av) number; converted with str() when building URLs.
    p   -- page/part number; converted with str() when building URLs.
    """
    global cid
    global partname
    global title
    global videourl
    import httplib  # local import: only needed to name the HTTP error type

    cid = 0
    title = ''
    partname = ''
    biliurl = 'http://api.bilibili.tv/view?type=xml&appkey=876fe0ebd0e67a0f&id=' + str(vid) + '&page=' + str(p)
    videourl = 'http://www.bilibili.tv/video/av' + str(vid) + '/index_' + str(p) + '.html'
    print('Fetching webpage...')
    request = urllib2.Request(biliurl, headers={
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/31.0.1650.63 Safari/537.36',
        'Cache-Control': 'no-cache',
        'Pragma': 'no-cache',
    })
    # Only the network round-trip is guarded. Catching named error types
    # (urllib2.URLError is an IOError) keeps Ctrl-C and programming errors
    # from being reported as a connection failure.
    try:
        response = urllib2.urlopen(request)
        try:
            data = response.read()
        finally:
            response.close()
    except (IOError, httplib.HTTPException):  # API unreachable or bad reply
        print('ERROR: Cannot connect to API server!')
        return

    # NOTE(review): the fixed slice offsets presumably assume each tag sits
    # alone on its line behind two leading characters -- confirm against a
    # live API response.
    # There is no early exit, so the last matching line wins for each field.
    for line in data.split('\n'):
        if 'cid' in line:
            cid = line[7:-6]
            print('cid is ' + str(cid))
        if 'partname' in line:
            partname = line[12:-11]
            print('partname is ' + str(partname))
        if 'title' in line:
            title = line[9:-8]
            print('title is ' + str(title))
def main(vid, p):
    """Download the comment XML for one part of a Bilibili video.

    Steps:
      1. Ask the API for the part's cid, part name and title (the first
         matching line wins for each field).
      2. If no cid was found, fetch the video page itself and take the
         value of the first ``cid=`` parameter found in it.
      3. If there is still no cid, ask the user for one; an empty answer
         exits the program.
      4. Run curl to save the comments as ``<name>.xml``, where the name is
         the part name, else the title, else the cid.

    vid -- video (av) number; converted with str() when building URLs.
    p   -- page/part number; converted with str() when building URLs.

    Always ends by raising SystemExit via sys.exit(). Network errors from
    urllib2 are not caught here and propagate to the caller.
    """
    import subprocess  # local import: lets curl run without a shell

    cid = 0
    title = ''
    partname = ''
    biliurl = 'http://api.bilibili.tv/view?type=xml&appkey=876fe0ebd0e67a0f&id=' + str(vid) + '&page=' + str(p)
    videourl = 'http://www.bilibili.tv/video/av' + str(vid) + '/index_' + str(p) + '.html'

    print('Fetching webpage...')
    response = urllib2.urlopen(urllib2.Request(biliurl))
    try:
        data = response.read()
    finally:
        response.close()
    data_list = data.split('\n')

    # NOTE(review): the fixed slice offsets presumably assume each tag sits
    # alone on its line behind two leading characters -- confirm against a
    # live API response.
    for line in data_list:
        if 'cid' in line:
            cid = line[7:-6]
            print('cid is ' + str(cid))
            break
    for line in data_list:
        if 'partname' in line:
            partname = line[12:-11]
            print('partname is ' + str(partname))
            break
    for line in data_list:
        if 'title' in line:
            title = line[9:-8]
            print('title is ' + str(title))
            break

    # Truthiness covers both "never found" (0) and an empty slice ('').
    if not cid:
        print('Cannot find cid, trying to do it brutely...')
        print('Fetching webpage...')
        request = urllib2.Request(videourl)
        request.add_header('Accept-encoding', 'gzip')
        response = urllib2.urlopen(request)
        try:
            # Always read the page body; only gunzip when the server says so.
            # Otherwise the stale API payload from above would be re-scanned.
            data = response.read()
            if response.info().get('Content-Encoding') == 'gzip':
                data = gzip.GzipFile(fileobj=StringIO(data)).read()
        finally:
            response.close()
        # TODO: read title
        for line in data.split('\n'):
            if 'cid=' in line:
                # Take the value of the first '&'-separated field on the line.
                cid = line.split('&')[0].split('=')[-1]
                print('cid is ' + str(cid))
                break

    if not cid:
        cid = raw_input('Input the cid by yourself!').strip()
        if not cid:
            sys.exit()

    filename = partname or title or str(cid)
    # The name comes from a remote server: keep it inside the current
    # directory by neutralising path separators.
    for separator in (os.sep, os.altsep):
        if separator:
            filename = filename.replace(separator, '_')

    print('Fetching XML...')
    # An argument list (no shell) means quotes or metacharacters in the
    # remote-supplied title cannot inject commands.
    try:
        subprocess.call([
            'curl', '-o', filename + '.xml', '--compressed',
            'http://comment.bilibili.tv/' + str(cid) + '.xml',
        ])
    except OSError:
        print('ERROR: Cannot run curl, is it installed?')
        sys.exit(1)
    print('The XML file, ' + filename + '.xml should be ready...enjoy!')
    sys.exit()
# Script entry: prompt for the av number and the part number, then download.
# raw_input() keeps the answer exactly as typed. Python 2's input() would
# eval() it, running arbitrary expressions and reading a leading-zero id
# as an octal number.
vid = raw_input('av').strip()
p = raw_input('P').strip()
main(vid, p)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment