Last active
July 30, 2016 06:51
-
-
Save leVirve/6e4af77a0ac547c1b1e800289c7aac15 to your computer and use it in GitHub Desktop.
Hack my own `dcard-spider` to give it incremental-crawling capability.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import time | |
import datetime | |
from dcard import Dcard | |
def hack_forums_get_meta_function(dcard):
    """Replace ``dcard.forums._get_paged_metas`` with a time-bounded pager.

    The replacement generator stops paging as soon as the last post of a
    fetched page was updated more than one day before this function was
    called, which lets a crawl pick up only recent posts.

    Args:
        dcard: Object whose ``forums`` attribute receives the patched
            ``_get_paged_metas`` function.
    """
    # Function-scope import so this block is self-contained; the original
    # referenced an undefined ``logger`` and raised NameError on an empty page.
    import logging

    logger = logging.getLogger(__name__)

    # The cutoff is fixed once, at patch time (naive UTC, one day back).
    target_date = datetime.datetime.utcnow() - datetime.timedelta(days=1)
    # Hoisted out of the loop: the ISO string never changes between pages.
    boundary = target_date.isoformat()

    def get_paged_metas(self, pages, sort):
        """Yield pages of post metas until empty, too old, or ``pages`` is hit.

        Args:
            self: Object providing ``client.get``, ``posts_meta_url`` and
                ``forum``.
            pages: Maximum number of pages to request.
            sort: ``'new'`` requests non-popular ordering; any other value
                sends no ordering parameter.

        Yields:
            Each fetched page (whatever ``self.client.get`` returned).
        """
        params = {'popular': False} if sort == 'new' else {}
        for page in range(pages):
            data = self.client.get(self.posts_meta_url, params=params)
            if not data:
                logger.warning('[%s] 已到最末頁,第%d頁!', self.forum, page)
                return
            # Plain string comparison of timestamps.
            # NOTE(review): assumes 'updatedAt' is an ISO-8601 UTC string so
            # that lexicographic order matches chronological order - confirm.
            # NOTE(review): the whole page is dropped here, including any
            # posts on it that are newer than the boundary - confirm intended.
            if data[-1]['updatedAt'] < boundary:
                return
            # Continue from the last post seen on this page.
            params['before'] = data[-1]['id']
            yield data

    # NOTE(review): a plain function stored on an instance is not bound
    # automatically; verify how the library invokes ``_get_paged_metas``.
    dcard.forums._get_paged_metas = get_paged_metas
if __name__ == '__main__':
    # Wall-clock start, used to report the total run time at the end.
    started = time.time()

    client = Dcard()
    forum_name = 'bg'

    # Install the time-bounded pager before any metas are requested.
    hack_forums_get_meta_function(client)

    # Upper limit on the number of metas requested from the forum.
    bound = 60000
    metas = client.forums(forum_name).get_metas(num=bound)

    print(forum_name, len(metas))
    elapsed = time.time() - started
    print('{:.05} sec'.format(elapsed))
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import time | |
import datetime | |
from dcard import Dcard | |
if __name__ == '__main__':
    # Wall-clock start, used to report the total run time at the end.
    started = time.time()
    one_hour_ago = datetime.datetime.utcnow() - datetime.timedelta(hours=1)
    client = Dcard()

    # Only get the latest metas: pass the one-hour-ago timestamp as timebound.
    forum_name = 'bg'
    forum = client.forums(forum_name)
    recent_metas = forum.get_metas(
        num=client.forums.infinite_page,
        timebound=one_hour_ago.isoformat(),
    )
    print(forum_name, len(recent_metas))

    # Fetch infinite metas: same page count, but no timebound.
    forum_name = 'freshman'
    forum = client.forums(forum_name)
    all_metas = forum.get_metas(num=client.forums.infinite_page)
    print(forum_name, len(all_metas))

    elapsed = time.time() - started
    print('{:.05} sec'.format(elapsed))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment