Skip to content

Instantly share code, notes, and snippets.

@leVirve
Last active July 30, 2016 06:51
Show Gist options
  • Save leVirve/6e4af77a0ac547c1b1e800289c7aac15 to your computer and use it in GitHub Desktop.
Save leVirve/6e4af77a0ac547c1b1e800289c7aac15 to your computer and use it in GitHub Desktop.
Hack my own `dcard-spider` to give it incremental crawling capability.
import time
import datetime
from dcard import Dcard
def hack_forums_get_meta_function(dcard):
    """Swap ``dcard.forums._get_paged_metas`` for a date-bounded pager.

    The replacement generator pages through post metas and stops as soon
    as the last entry of a fetched page has an ``updatedAt`` value that
    sorts before "24 hours prior to the moment this function was called".
    That cut-off is what makes the crawl incremental.

    Args:
        dcard: Object whose ``forums`` attribute receives the replacement
            ``_get_paged_metas`` callable.
    """
    # Function-scope import so this block does not rely on a module-level
    # ``logger`` (the name was previously used without being defined, which
    # raised NameError the first time an empty page was returned).
    import logging

    logger = logging.getLogger(__name__)

    # The bound is fixed at patch time; format it once instead of on every
    # page.  NOTE(review): the comparison below is a plain string compare,
    # which presumably relies on ``updatedAt`` being an ISO-8601 UTC
    # timestamp -- confirm against the API payload.
    oldest_allowed = (
        datetime.datetime.utcnow() - datetime.timedelta(days=1)
    ).isoformat()

    def get_paged_metas(self, pages, sort):
        """Yield at most ``pages`` pages of metas, newest first.

        Stops early when a page comes back empty or when the last entry of
        a page is older than the cut-off (that whole page is not yielded).
        """
        params = {'popular': False} if sort == 'new' else {}
        for page in range(pages):
            data = self.client.get(self.posts_meta_url, params=params)
            if not data:
                logger.warning('[%s] 已到最末頁,第%d頁!', self.forum, page)
                return
            if data[-1]['updatedAt'] < oldest_allowed:
                return
            # Ask the next request for entries before the last one seen.
            params['before'] = data[-1]['id']
            yield data

    # NOTE(review): this stores a plain function on ``dcard.forums``; whether
    # ``self`` is bound automatically on lookup depends on what
    # ``dcard.forums`` is (instance vs. class) -- verify against the library.
    dcard.forums._get_paged_metas = get_paged_metas
if __name__ == '__main__':
    # Wall-clock timestamp used to report the total run time at the end.
    started_at = time.time()

    # Build the client and install the date-bounded pager on its forums.
    client = Dcard()
    forum_name = 'bg'
    hack_forums_get_meta_function(client)

    # Upper limit on how many metas to request; the patched pager is
    # expected to stop earlier once it reaches entries past its cut-off.
    max_metas = 60000
    forum = client.forums(forum_name)
    collected = forum.get_metas(num=max_metas)
    print(forum_name, len(collected))

    elapsed = time.time() - started_at
    print('{:.05} sec'.format(elapsed))
import time
import datetime
from dcard import Dcard
if __name__ == '__main__':
    # Wall-clock timestamp used to report the total run time at the end.
    started_at = time.time()

    # Anything last updated more than one hour ago is outside the window.
    one_hour = datetime.timedelta(hours=1)
    cutoff = datetime.datetime.utcnow() - one_hour
    client = Dcard()

    # First pass: fetch only the latest metas, bounded by the cut-off.
    forum_name = 'bg'
    forum = client.forums(forum_name)
    recent = forum.get_metas(
        num=client.forums.infinite_page,
        timebound=cutoff.isoformat(),
    )
    print(forum_name, len(recent))

    # Second pass: fetch metas with no time bound at all.
    forum_name = 'freshman'
    forum = client.forums(forum_name)
    everything = forum.get_metas(num=client.forums.infinite_page)
    print(forum_name, len(everything))

    elapsed = time.time() - started_at
    print('{:.05} sec'.format(elapsed))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment