Skip to content

Instantly share code, notes, and snippets.

@YieldNull
Created November 17, 2015 13:33
Show Gist options
  • Save YieldNull/fdfce91358bbf79549ca to your computer and use it in GitHub Desktop.
Save YieldNull/fdfce91358bbf79549ca to your computer and use it in GitHub Desktop.
A 'Crawler' for http://msdn.itellyou.cn/
#!/usr/bin/env python
# coding:utf-8
"""
A 'Crawler' for http://msdn.itellyou.cn/
"""
import re
import urllib
import urllib2
import json
import sqlite3
# Shared connection to the SQLite file that receives the crawl results.
dbconn = sqlite3.connect('msdn.db')
def do_post(url, params):
    """
    Send form-encoded parameters to a URL and return the decoded reply.

    :param url: address the request is sent to.
    :param params: mapping (or sequence of pairs) accepted by
        ``urllib.urlencode``; the encoded result becomes the request body.
    :return: the response body decoded as UTF-8.
    """
    # Supplying a data argument is what makes urllib2 issue a POST
    # instead of a GET.
    body = urllib.urlencode(params)
    response = urllib2.urlopen(url, body)
    try:
        return response.read().decode('utf-8')
    finally:
        # Release the connection deterministically rather than waiting
        # for the response object to be garbage collected.
        response.close()
def get_menus():
    """
    Download the site's front page and extract the top-level menu entries.

    Every ``data-target="#collapse_<id>"><name></a>`` fragment found in
    the page contributes one entry.

    :return: list of ``(id, name)`` tuples in the order they appear in the
        page; an empty list when the page contains no such fragment.
    """
    page = urllib.urlopen('http://msdn.itellyou.cn/')
    try:
        src = page.read().decode('utf-8')
    finally:
        # Close the response explicitly so the socket is not left to the
        # garbage collector.
        page.close()
    # Two non-greedy groups: the id suffix after "#collapse_" and the
    # link text up to the closing </a>.
    pat = r'data-target="#collapse_(.*?)">(.*?)</a>'
    return re.findall(pat, src)
def get_sub_menus(mid):
    """
    Look up the sub-menus that belong to one top-level menu.

    Posts ``mid`` as the ``id`` form field to the ``Category/Index``
    endpoint and parses the JSON reply.

    Reply layout as noted by the original author:
        ({'id':<id>,'name':<name>})+

    :param mid: menu identifier sent as the ``id`` field.
    :return: the decoded JSON document.
    """
    endpoint = 'http://msdn.itellyou.cn/Category/Index'
    reply = do_post(endpoint, {'id': mid})
    return json.loads(reply)
def get_lang_list(sid):
    """
    Look up the languages available for one sub-menu.

    Posts ``sid`` as the ``id`` form field to the ``Category/GetLang``
    endpoint and parses the JSON reply.

    Reply layout as noted by the original author:
        {"status":true,"result":({'id':<id>,'lang':<lang>})+}

    :param sid: sub-menu identifier sent as the ``id`` field.
    :return: the ``result`` member of the decoded reply.
    :raises KeyError: if the reply has no ``result`` member.
    """
    endpoint = 'http://msdn.itellyou.cn/Category/GetLang'
    reply = json.loads(do_post(endpoint, {'id': sid}))
    return reply['result']
def get_iso_list(sid, lid):
    """
    Look up the ISO entries of one sub-menu in one language.

    Posts ``id``, ``lang`` and ``filter=true`` to the ``Category/GetList``
    endpoint and parses the JSON reply.

    Reply layout as noted by the original author:
        {"status":true,"result":({'id':<id>,'name':<name>,
                                  'post':<date>,'url':<url>})+}

    :param sid: sub-menu identifier sent as the ``id`` field.
    :param lid: language identifier sent as the ``lang`` field.
    :return: the ``result`` member of the decoded reply.
    :raises KeyError: if the reply has no ``result`` member.
    """
    endpoint = 'http://msdn.itellyou.cn/Category/GetList'
    form = {'id': sid, 'lang': lid, 'filter': 'true'}
    reply = json.loads(do_post(endpoint, form))
    return reply['result']
def get_iso(iid):
    """
    Look up the download details of a single ISO.

    Posts ``iid`` as the ``id`` form field to the ``Category/GetProduct``
    endpoint and parses the JSON reply.

    Reply layout as noted by the original author:
        {"status":true,"result":({'DownLoad':<url>,'FileName':<fname>,
                                  'PostDateString':<2008-09-08>,
                                  'SHA1':<SHA1>,'size':<size>})+
        }

    :param iid: ISO identifier sent as the ``id`` field.
    :return: the ``result`` member of the decoded reply.
    :raises KeyError: if the reply has no ``result`` member.
    """
    endpoint = 'http://msdn.itellyou.cn/Category/GetProduct'
    reply = json.loads(do_post(endpoint, {'id': iid}))
    return reply['result']
def create_db():
    """
    (Re)create the ``iso`` table in the module-level ``dbconn`` database.

    Any existing ``iso`` table is dropped first, so calling this discards
    all previously stored rows.
    """
    c = dbconn.cursor()
    try:
        # SQLite only treats a column as an auto-assigned rowid alias when
        # its declared type is exactly "INTEGER" followed by PRIMARY KEY.
        # The earlier spelling "INTEGER AUTO INCREMENT PRIMARY KEY" was
        # parsed as a type named "INTEGER AUTO INCREMENT", which left id
        # NULL for every inserted row.
        c.executescript("""
            DROP TABLE IF EXISTS iso;
            CREATE TABLE iso(
                id INTEGER PRIMARY KEY AUTOINCREMENT,
                name TEXT,
                fname TEXT,
                cate TEXT,
                product TEXT,
                lang TEXT,
                url TEXT,
                sha1 TEXT,
                size TEXT,
                date TEXT
            );
        """)
        dbconn.commit()
    finally:
        # Release the cursor even when the script fails.
        c.close()
if __name__ == '__main__':
    # Walk the whole catalogue (menu -> sub-menu -> language -> ISO) and
    # store one row per ISO in a freshly created ``iso`` table.
    create_db()
    insert_sql = ('INSERT INTO iso(name, fname, cate, product, lang, '
                  'url, sha1, size, date) '
                  'VALUES(?, ?, ?, ?, ?, ?, ?, ?, ?)')
    c = dbconn.cursor()
    try:
        for menu_id, menu_name in get_menus():
            for sub in get_sub_menus(menu_id):
                for lang in get_lang_list(sub['id']):
                    for iso in get_iso_list(sub['id'], lang['id']):
                        detail = get_iso(iso['id'])
                        # Progress output. The parenthesised single
                        # argument prints the same on Python 2 and 3.
                        print(detail['FileName'])
                        c.execute(insert_sql, (
                            iso['name'],
                            detail['FileName'],
                            menu_name,
                            sub['name'],
                            lang['lang'],
                            detail['DownLoad'],
                            detail['SHA1'],
                            detail['size'],
                            detail['PostDateString'],
                        ))
        # Rows are committed only after the whole crawl has succeeded.
        dbconn.commit()
    finally:
        # Always release the cursor and the database file, including when
        # a request or an insert raises part-way through.
        c.close()
        dbconn.close()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment