import csv
import json

from bs4 import BeautifulSoup
from selenium import webdriver
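
# Assumes chromedriver.exe is available in the working directory (Selenium 3-style executable_path argument).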
BROWSER = webdriver.Chrome(executable_path="chromedriver.exe")
TOTAL_PERSONS = 100
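

# Scrape the Bloomberg Billionaires Index page and return each table column as a list of strings.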
def data_scraper():
    BROWSER.get("https://www.bloomberg.com/billionaires/")
    html_source = BROWSER.page_source
    BROWSER.close()
    soup = BeautifulSoup(html_source, 'html.parser')
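
    # Each table column is rendered as its own set of <div> cells; collect the text of every cell.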
    response_rank = soup.find_all('div', class_='table-cell t-rank')
    ranks = [rank.get_text().strip() for rank in response_rank][:TOTAL_PERSONS]

    response_name = soup.find_all('div', class_='table-cell t-name')
    names = [name.get_text().strip() for name in response_name][:TOTAL_PERSONS]
    links = [(name.find('a')['href']).replace("./", "") for name in response_name]
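
    # The remaining columns appear to include a leading non-data cell,
    # hence the [1:TOTAL_PERSONS+1] slices below.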
    response_worth = soup.find_all('div', class_='table-cell active t-nw')
    worths = [worth.get_text().strip()
              for worth in response_worth][1:TOTAL_PERSONS+1]

    response_last_change = soup.find_all('div', class_='t-lcd')
    last_changes = [change.get_text().strip()
                    for change in response_last_change][1:TOTAL_PERSONS+1]

    response_ytd = soup.find_all('div', class_='t-ycd')
    ytds = [ytd.get_text().strip() for ytd in response_ytd][1:TOTAL_PERSONS+1]

    response_country = soup.find_all('div', class_='table-cell t-country')
    countries = [country.get_text().strip()
                 for country in response_country][1:TOTAL_PERSONS+1]

    response_industry = soup.find_all('div', class_='table-cell t-industry')
    industries = [industry.get_text().strip()
                  for industry in response_industry][1:TOTAL_PERSONS+1]

    data_dict = {
        "ranks": ranks,
        "names": names,
        "links": links,
        "worths": worths,
        "last_changes": last_changes,
        "ytds": ytds,
        "countries": countries,
        "industries": industries
    }
    return data_dict
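

# Write the scraped data to a CSV file, one row per person.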
def write_to_csv(data: dict) -> None:
    columns = ['Rank', 'Name', 'Link', 'Total net worth($)', '$ Last change',
               '$ YTD change', 'Country/Region', 'Industry']
    with open(f"top-{TOTAL_PERSONS}-persons.csv", "w", newline="") as csv_file:
        writer = csv.DictWriter(csv_file, fieldnames=columns)
        writer.writeheader()
        for i in range(TOTAL_PERSONS):
            temp = {
                "Rank": data["ranks"][i],
                "Name": data["names"][i],
                "Link": f"https://www.bloomberg.com/billionaires/{data['links'][i]}",
                "Total net worth($)": data["worths"][i],
                "$ Last change": data["last_changes"][i],
                "$ YTD change": data["ytds"][i],
                "Country/Region": data["countries"][i],
                "Industry": data["industries"][i]
            }
            writer.writerow(temp)
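

# Write the same data to a JSON file as a list of per-person dictionaries.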
def write_to_json(data: dict) -> None:
    data_list = []
    for i in range(TOTAL_PERSONS):
        temp = {
            "Rank": data["ranks"][i],
            "Name": data["names"][i],
            "Link": f"https://www.bloomberg.com/billionaires/{data['links'][i]}",
            "Total net worth($)": data["worths"][i],
            "$ Last change": data["last_changes"][i],
            "$ YTD change": data["ytds"][i],
            "Country/Region": data["countries"][i],
            "Industry": data["industries"][i]
        }
        data_list.append(temp)
    with open(f"top-{TOTAL_PERSONS}-persons.json", "w") as json_file:
        json.dump(data_list, json_file)
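

# Scrape once, then export the same dataset to both CSV and JSON.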
if __name__ == '__main__':
    data = data_scraper()
    write_to_csv(data)
    write_to_json(data)