Skip to content

Instantly share code, notes, and snippets.

@muxuezi
Last active August 29, 2015 14:06
Show Gist options
  • Save muxuezi/2b8b51076291d483f93c to your computer and use it in GitHub Desktop.
Save muxuezi/2b8b51076291d483f93c to your computer and use it in GitHub Desktop.
Crawl the [duokan](http://www.duokan.com/list/1-1) book index with aiohttp
import asyncio
import aiohttp
import bs4
import tqdm
import re
# Shared result list: print_magnet() appends one first_magnet() string per
# crawled page, and the end of the script writes the whole list to 'allbooks'.
allBook = []
@asyncio.coroutine
def get(*args, **kwargs):
    """Issue an HTTP GET and return the response body decoded as UTF-8.

    Every positional and keyword argument is forwarded unchanged to
    ``aiohttp.request`` after the ``'GET'`` method argument.
    """
    # NOTE(review): generator-based coroutines (@asyncio.coroutine plus
    # ``yield from``) are the pre-``async def`` style -- confirm the target
    # Python and aiohttp versions still accept them.
    resp = yield from aiohttp.request('GET', *args, **kwargs)
    body = yield from resp.text(encoding='utf-8')
    return body
@asyncio.coroutine
def wait_with_progress(coros):
    """Wait for every awaitable in ``coros`` while drawing a tqdm progress bar.

    ``coros`` must support ``len()``, which is used as the bar's total.
    Results of the individual awaitables are discarded.
    """
    finished = asyncio.as_completed(coros)
    progress = tqdm.tqdm(finished, total=len(coros))
    for next_done in progress:
        # Each item yielded by as_completed() still has to be waited on.
        yield from next_done
def first_magnet(page):
    """Extract the book index from the HTML of one listing page.

    Args:
        page: HTML markup of a listing page.

    Returns:
        A single string holding one line per
        ``<li class="u-bookitm1 j-bookitm">`` element, in document order.
        Each line is the stripped text of the first ``<a>`` inside the
        item's ``<div class="wrap">``, a tab, the item's stripped
        ``data-id`` attribute, and a newline. Returns the empty string when
        the page contains no such element.

    Raises:
        AttributeError: if an item has no ``div.wrap`` or no ``<a>`` in it.
        KeyError: if an item has no ``data-id`` attribute.
    """
    # Name the parser explicitly. Without it bs4 picks whichever parser is
    # installed, so the same page can parse differently on another machine
    # (and bs4 warns about the guess).
    soup = bs4.BeautifulSoup(page, 'html.parser')
    items = soup.find_all('li', class_="u-bookitm1 j-bookitm")
    return ''.join(
        '{name}\t{id}\n'.format(
            name=item.find('div', class_="wrap").find('a').text.strip(),
            id=item['data-id'].strip())
        for item in items
    )
@asyncio.coroutine
def print_magnet(query):
    """Crawl one listing page and record the books found on it.

    ``query`` is the page number substituted into the duokan list URL. The
    page is downloaded while holding the module-level semaphore ``sem`` and
    parsed with ``first_magnet``; the resulting text is appended to the
    module-level ``allBook`` list, and ``query`` is printed as a progress
    marker.
    """
    page_url = 'http://www.duokan.com/list/1-{}'.format(query)
    # The semaphore guards the download; parsing runs once it is released.
    with (yield from sem):
        html = yield from get(page_url, compress=True)
    allBook.append(first_magnet(html))
    print(query)
# Listing pages to crawl: 1 through 2014 (total page count taken from the
# site's homepage).
allpages = range(1, 2015)
# Upper bound on the number of downloads in flight at once.
sem = asyncio.Semaphore(5)
loop = asyncio.get_event_loop()
# gather(..., return_exceptions=True) lets the crawl continue when a page
# fails and returns each exception in input order, so failures can be
# reported instead of being dropped silently.
results = loop.run_until_complete(
    asyncio.gather(*[print_magnet(d) for d in allpages],
                   return_exceptions=True))
for page_no, outcome in zip(allpages, results):
    if isinstance(outcome, BaseException):
        print('page {} failed: {!r}'.format(page_no, outcome))
# allBook holds one chunk per successfully crawled page, in completion order
# (not page order). Write it as UTF-8 explicitly so non-ASCII book names do
# not depend on the platform's default encoding.
with open('allbooks', 'w', encoding='utf-8') as f:
    f.writelines(allBook)
print('K.O.')
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment