This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
domain = 'http://www.igmchicago.org' | |
response = requests.get(domain + '/igm-economic-experts-panel') | |
soup = BeautifulSoup(response.text, 'lxml') | |
polls = soup.findAll('div', {'class': 'poll-listing'}) | |
topics = {} | |
for poll in polls: | |
topic = poll.find('h2').get_text() | |
handle = poll.find('h3', {'class': 'surveyQuestion'}).a['href'] | |
topics[topic] = handle |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
data = [] | |
for topic, handle in topics.items(): | |
print(handle) | |
response = requests.get(domain + handle) | |
soup = BeautifulSoup(response.text, 'lxml') | |
survey_questions = list('ABCD')[:len(soup.findAll('h3', {'class': 'surveyQuestion'}))] | |
survey_date = soup.find('h6').get_text() | |
tables = soup.findAll('table', {'class': 'responseDetail'}) |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
for survey_question, table in zip(survey_questions, tables): | |
rows = table.findAll('tr')#, {'class': 'parent-row'}) | |
for row in rows: | |
if row.get('class') == ['parent-row']: | |
cells = row.findAll('td') | |
response = cells[2].get_text().strip() | |
confidence = cells[3].get_text().strip() | |
comment = cells[4].get_text().strip() | |
tmp_data = { | |
'survey_date': dtutil.parse(survey_date), |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
col_order = ['survey_date', 'topic_name', 'topic_url', 'survey_question', 'economist_name', 'economist_url', 'economist_headshot', 'institution', 'response', 'confidence', 'comment'] | |
df = pd.DataFrame(data, columns=col_order) |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import sqlalchemy | |
engine = sqlalchemy.create_engine('sqlite:///igmchicago.db') | |
df.to_sql('igmchicago', engine, index=False, if_exists='replace') |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
cnxn = engine.connect() | |
r = cnxn.execute(""" | |
WITH igm AS ( | |
SELECT economist_name, institution, AVG(confidence) AS avg_conf | |
FROM igmchicago | |
GROUP BY economist_name, institution | |
) | |
SELECT * FROM igm i | |
WHERE avg_conf > | |
(SELECT AVG(avg_conf) FROM igm g WHERE i.institution = g.institution) |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
df.columns | |
df.shape | |
df.describe() | |
df.info() | |
any(df.duplicated()) | |
df.head() | |
df.tail() | |
df['institution'].value_counts().plot(kind='bar') |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Correct response types | |
df.loc[df['response'].str.contains(r'did not', case=False) | df['response'].str.contains(r'---'), 'response'] = np.nan | |
# Convert empty string comments into null types | |
df['comment'] = df['comment'].replace(r'^$', np.nan, regex=True) | |
# Assign sex variable to economists | |
sex = pd.read_csv(os.path.join(os.path.dirname(__file__), 'economist_sex_mapping.csv'), index_col='economist_name') | |
df['sex'] = df['economist_name'].map(sex['sex']) |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
facet_labels = ['economist_name', 'institution', 'sex'] | |
facets = df.groupby(facet_labels).first().reset_index()[facet_labels] | |
# Summary statistics | |
len(df.groupby('topic_name')) # 132 topics | |
len(df.groupby(['topic_name', 'survey_question'])) # 195 survey questions | |
len(facets['economist_name'].unique()) # 51 economists | |
facets.groupby('sex').size() # 11 female, 40 male | |
facets.groupby('institution').size().sort_values(ascending=False) |