Skip to content

Instantly share code, notes, and snippets.

@bebound
Last active August 29, 2015 14:09
Show Gist options
  • Save bebound/478fe04ddee2134f7c16 to your computer and use it in GitHub Desktop.
Save bebound/478fe04ddee2134f7c16 to your computer and use it in GitHub Desktop.
import re
import requests
def find_users(url):
    """Fetch *url* and return the user names scraped from the page.

    A name is whatever the regular expression captures between
    ``utf-8&fr=pb" target="_blank">`` and the next ``</a>`` in the
    response text. Names are returned in document order, duplicates
    included.

    Args:
        url: Address of the page to download.

    Returns:
        A list of captured names. The list is empty when the request
        raises, or when the server does not answer with 200 OK, so the
        caller can always ``extend`` with or take ``len`` of the result.
    """
    print('getting:', url)
    try:
        # Without a timeout a stalled connection would block forever.
        r = requests.get(url, timeout=30)
    except requests.RequestException:
        print('fetching', url, 'failed')
        return []
    if r.status_code != requests.codes.ok:
        print('fetching', url, 'failed')
        return []
    return re.findall(r'utf-8&fr=pb" target="_blank">(.*?)</a>', r.text)
def write_txt(users):
    """Write *users* to ``result.txt``, one ``@``-prefixed name per line.

    The file is created in the current working directory and overwritten
    if it already exists. No trailing newline is written after the last
    name.

    Args:
        users: Iterable of user-name strings.
    """
    lines = '\n'.join('@' + user for user in users)
    # Explicit UTF-8 so non-ASCII names do not depend on the platform's
    # default locale encoding.
    with open('result.txt', 'w', encoding='utf-8') as f:
        f.write(lines)
def main():
    """Interactively collect the users of each URL typed at the prompt.

    For every URL entered, pages are fetched with ``find_users`` until a
    page yields a number of users other than ``users_per_page``. If the
    URL already contains ``pn=``, only that single page is fetched. The
    de-duplicated names (first-seen order) are printed and handed to
    ``write_txt``.

    The prompt repeats until a blank line or end-of-input is received.
    """
    # Example: base_url = 'http://tieba.baidu.com/p/3413476976'
    # A page is treated as "full" (so another one is requested) only when
    # exactly this many users were found on it.
    users_per_page = 30
    while True:
        try:
            base_url = input('url:').strip()
        except EOFError:
            break
        if not base_url:
            break

        single_page = 'pn=' in base_url
        # Append the page parameter correctly when the URL already carries
        # a query string.
        separator = '&' if '?' in base_url else '?'

        all_users = []
        # NOTE(review): the first follow-up request uses pn=1; if the site
        # numbers pages from 1 this repeats the bare URL (duplicates are
        # dropped below) -- confirm before changing the start value.
        page = 0
        url = base_url
        while True:
            # Guard against a falsy/None result so a failed fetch ends
            # the loop instead of raising inside extend().
            new_users = find_users(url) or []
            all_users.extend(new_users)
            if single_page or len(new_users) != users_per_page:
                break
            page += 1
            url = base_url + separator + 'pn=' + str(page)

        # dict preserves insertion order: O(n) order-keeping de-duplication.
        final_users = list(dict.fromkeys(all_users))
        print(len(final_users), 'users\n', final_users)
        write_txt(final_users)
        print('write txt')
# Start the interactive prompt only when run as a script, not on import.
if __name__ == '__main__':
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment