Skip to content

Instantly share code, notes, and snippets.

@gandroz
Created July 4, 2020 02:40
Show Gist options
  • Save gandroz/f4cc5ca52b3c2e84abf28e8a73bc708e to your computer and use it in GitHub Desktop.
Save gandroz/f4cc5ca52b3c2e84abf28e8a73bc708e to your computer and use it in GitHub Desktop.
Scrape the tags and number of claps of popular articles on Medium
import re

import requests
from bs4 import BeautifulSoup
def convert_to_num(s):
    """Convert a displayed count such as ``"1.2K"`` into an integer.

    Args:
        s: Either an ``int`` (returned unchanged) or a string holding a
            count. The string may carry a ``K`` (thousands) or ``M``
            (millions) marker, thousands separators (``,``) and
            surrounding whitespace.

    Returns:
        The count as an ``int``.

    Raises:
        ValueError: If the string cannot be parsed as a number.
    """
    if isinstance(s, int):
        return s

    text = s.strip().replace(',', '')
    for suffix, factor in (('K', 1000), ('M', 1000000)):
        if suffix in text:
            # round() instead of int(): binary floats can land just below
            # the intended value (e.g. 1000 * 1.001), and truncation would
            # then lose one unit.
            return round(factor * float(text.replace(suffix, '')))
    return int(text)
# Enrich every post with the tags and the clap count found on its page.
# NOTE(review): `posts` is not defined in the visible part of this file;
# presumably an iterable of dicts that each hold a 'url' key -- confirm
# against the code that builds it.

# Compiled once, outside the loop, since they are reused for every link/button.
_TAG_HREF_RE = re.compile(r"^/tagged/(.*)")
_CLAPS_TEXT_RE = re.compile(r"(.*) claps")

for post in posts:
    # Defaults that remain when the page cannot be fetched or nothing matches.
    post['tag'] = []
    post['claps'] = -1
    try:
        # A timeout keeps one unresponsive server from stalling the whole run.
        r = requests.get(post['url'], timeout=30)
    except requests.RequestException:
        # Report the failing URL and move on to the next post.
        print(post['url'])
        continue
    # Name the parser explicitly so the result does not depend on which
    # optional parsers happen to be installed.
    soup = BeautifulSoup(r.text, 'html.parser')
    # tags: links whose href starts with "/tagged/"
    for link in soup.find_all('a', href=True):
        res = _TAG_HREF_RE.search(link['href'])
        if res:
            post['tag'].append(res.group(1))
    # claps: button text of the form "<count> claps"; if several buttons
    # match, the last one wins
    for button in soup.find_all('button'):
        res = _CLAPS_TEXT_RE.search(button.text)
        if res:
            post['claps'] = convert_to_num(res.group(1))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment