Skip to content

Instantly share code, notes, and snippets.

View alexpetralia's full-sized avatar
🎯
Focusing

Alex Petralia alexpetralia

🎯
Focusing
View GitHub Profile
domain = 'http://www.igmchicago.org'
response = requests.get(domain + '/igm-economic-experts-panel')
soup = BeautifulSoup(response.text, 'lxml')
polls = soup.findAll('div', {'class': 'poll-listing'})
topics = {}
for poll in polls:
topic = poll.find('h2').get_text()
handle = poll.find('h3', {'class': 'surveyQuestion'}).a['href']
topics[topic] = handle
data = []
for topic, handle in topics.items():
print(handle)
response = requests.get(domain + handle)
soup = BeautifulSoup(response.text, 'lxml')
survey_questions = list('ABCD')[:len(soup.findAll('h3', {'class': 'surveyQuestion'}))]
survey_date = soup.find('h6').get_text()
tables = soup.findAll('table', {'class': 'responseDetail'})
for survey_question, table in zip(survey_questions, tables):
rows = table.findAll('tr')#, {'class': 'parent-row'})
for row in rows:
if row.get('class') == ['parent-row']:
cells = row.findAll('td')
response = cells[2].get_text().strip()
confidence = cells[3].get_text().strip()
comment = cells[4].get_text().strip()
tmp_data = {
'survey_date': dtutil.parse(survey_date),
col_order = ['survey_date', 'topic_name', 'topic_url', 'survey_question', 'economist_name', 'economist_url', 'economist_headshot', 'institution', 'response', 'confidence', 'comment']
df = pd.DataFrame(data, columns=col_order)
import sqlalchemy
engine = sqlalchemy.create_engine('sqlite:///igmchicago.db')
df.to_sql('igmchicago', engine, index=False, if_exists='replace')
cnxn = engine.connect()
r = cnxn.execute("""
WITH igm AS (
SELECT economist_name, institution, AVG(confidence) AS avg_conf
FROM igmchicago
GROUP BY economist_name, institution
)
SELECT * FROM igm i
WHERE avg_conf >
(SELECT AVG(avg_conf) FROM igm g WHERE i.institution = g.institution)
df.columns
df.shape
df.describe()
df.info()
any(df.duplicated())
df.head()
df.tail()
df['institution'].value_counts().plot(kind='bar')
# Correct response types
df.loc[df['response'].str.contains(r'did not', case=False) | df['response'].str.contains(r'---'), 'response'] = np.nan
# Convert empty string comments into null types
df['comment'] = df['comment'].replace(r'^$', np.nan, regex=True)
# Assign sex variable to economists
sex = pd.read_csv(os.path.join(os.path.dirname(__file__), 'economist_sex_mapping.csv'), index_col='economist_name')
df['sex'] = df['economist_name'].map(sex['sex'])
facet_labels = ['economist_name', 'institution', 'sex']
facets = df.groupby(facet_labels).first().reset_index()[facet_labels]
# Summary statistics
len(df.groupby('topic_name')) # 132 topics
len(df.groupby(['topic_name', 'survey_question'])) # 195 survey questions
len(facets['economist_name'].unique()) # 51 economists
facets.groupby('sex').size() # 11 female, 40 male
facets.groupby('institution').size().sort_values(ascending=False)