Created February 5, 2021 13:04
Python script to scrape Wikipedia's coffee producing countries 2019
import pandas as pd
import requests
from bs4 import BeautifulSoup

url = 'https://en.wikipedia.org/wiki/List_of_countries_by_coffee_production'
r = requests.get(url)
html = r.text

soup = BeautifulSoup(html, 'html.parser')

# Grab the first "wikitable" on the page and iterate over its data rows
table = soup.find('table', {"class": "wikitable"})
rows = table.find_all('tr')

data = []
for row in rows[1:]:
    cols = row.find_all('td')
    currentList = []
    for idx, ele in enumerate(cols):
        # Remove commas from the numbers
        x = ele.text.strip().replace(',', '')
        # Strip unwanted parenthesised text from the country name
        if idx == 1:
            oddCharacterIndex = x.find('(')
            if oddCharacterIndex != -1:
                x = x[0: oddCharacterIndex]
        currentList.append(x)
    data.append([item for item in currentList if item])

result = pd.DataFrame(
    data, columns=['Rank', 'Country', 'Bags', 'MetricTons', 'Pounds'])

with open(r'C:\Users\blusk\Downloads\temp.json', 'w') as f:
    f.write(result.to_json(orient='records'))
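As a quick sanity check, the exported JSON can be read back into a DataFrame with pandas (a minimal sketch, assuming the same output path used in the script above):

import pandas as pd

# Load the records-oriented JSON written by the scraper
# (the path is assumed to match the one used in the script above)
df = pd.read_json(r'C:\Users\blusk\Downloads\temp.json', orient='records')
print(df.head())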