Skip to content

Instantly share code, notes, and snippets.

@rounakdatta
Created October 8, 2019 18:41
Show Gist options
  • Save rounakdatta/76de10ff48ea484303ca3bd41e959be4 to your computer and use it in GitHub Desktop.
Save rounakdatta/76de10ff48ea484303ca3bd41e959be4 to your computer and use it in GitHub Desktop.
Structuring data from techinternship.io
import requests
from bs4 import BeautifulSoup
import re
import json
# Endpoints of the site being scraped.
LOGIN_URL = 'https://www.techinternship.io/login?next=discover'
DISCOVER_URL = "https://www.techinternship.io/discover"

# Seconds to wait for each HTTP request; requests has no default timeout and
# would otherwise block indefinitely on an unresponsive server.
REQUEST_TIMEOUT = 30

# Matches everything up to the start of the value assigned to the page's
# `all_prev` JavaScript variable. An opening single quote is consumed when
# present (presumably the value is embedded as a quoted JS string literal,
# which is why the value used to have its quotes stripped -- TODO confirm).
ALL_PREV_ASSIGNMENT = re.compile(r"var\s+all_prev\s*=\s*'?\s*")

# Fields printed for every internship, in output order.
INTERNSHIP_FIELDS = ('name', 'companyName', 'link', 'industry', 'location')


def extract_all_prev_json(script_text):
    """Return the JSON text assigned to ``var all_prev`` in *script_text*.

    The JSON document is delimited by actually decoding it rather than by
    scanning for the next ``;``, so semicolons and apostrophes inside the
    data are preserved.

    Args:
        script_text: Contents of a ``<script>`` element; may be ``None`` or
            empty (e.g. for scripts loaded via ``src``).

    Returns:
        The raw JSON substring, or ``None`` when *script_text* contains no
        ``var all_prev = ...`` assignment.

    Raises:
        json.JSONDecodeError: The assignment exists but its value is not
            valid JSON.
    """
    if not script_text:
        return None
    assignment = ALL_PREV_ASSIGNMENT.search(script_text)
    if assignment is None:
        return None
    start = assignment.end()
    # raw_decode stops at the end of the first complete JSON document and
    # reports that index, ignoring whatever JavaScript follows it.
    _, end = json.JSONDecoder().raw_decode(script_text, start)
    return script_text[start:end]


def find_all_prev_json(soup):
    """Return the ``all_prev`` JSON text from the first script that defines it.

    Every ``<script>`` element is inspected instead of relying on a fixed
    position in the document, so added or removed scripts do not break the
    lookup.

    Args:
        soup: Parsed ``BeautifulSoup`` document of the discover page.

    Returns:
        The raw JSON text, or ``None`` if no script defines ``all_prev``.
    """
    for script in soup.find_all('script'):
        raw_json = extract_all_prev_json(script.string)
        if raw_json is not None:
            return raw_json
    return None


def internship_lines(internship):
    """Return the output lines for one internship record.

    Args:
        internship: Mapping with the keys listed in ``INTERNSHIP_FIELDS``.

    Returns:
        A list: a ``'---'`` separator, the five field values, and a closing
        ``'---'`` separator.

    Raises:
        KeyError: A required field is missing from *internship*.
    """
    return ['---', *(internship[field] for field in INTERNSHIP_FIELDS), '---']


def main():
    """Log in, download the discover page and print every internship found.

    Raises:
        requests.HTTPError: The login or the page request returned an error
            status.
        RuntimeError: No script on the page defines ``all_prev``.
    """
    login_payload = {
        "email": "xxx",
        "password": "yyy",
    }
    with requests.Session() as session:
        login_response = session.post(
            LOGIN_URL, data=login_payload, timeout=REQUEST_TIMEOUT
        )
        login_response.raise_for_status()
        discover_page = session.get(DISCOVER_URL, timeout=REQUEST_TIMEOUT)
        discover_page.raise_for_status()

    page_soup = BeautifulSoup(discover_page.content, "html5lib")
    data = find_all_prev_json(page_soup)
    if data is None:
        raise RuntimeError(
            "could not find 'var all_prev' in any <script> of the discover "
            "page (did the login succeed?)"
        )
    print(data)

    for internship in json.loads(data):
        for line in internship_lines(internship):
            print(line)


if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment