Skip to content

Instantly share code, notes, and snippets.

Show Gist options
  • Save macleginn/15d52037ba00cde6a1816afd0e83f6d2 to your computer and use it in GitHub Desktop.
Save macleginn/15d52037ba00cde6a1816afd0e83f6d2 to your computer and use it in GitHub Desktop.
Code for extracting speeches from the Stanford Congressional Record dataset
from copy import copy
from collections import defaultdict, Counter
from functools import cache
from glob import glob
import pandas as pd
import json
from tqdm import tqdm
@cache
def add_date_hyphens(date_string):
    """Turn a compact ``YYYYMMDD`` date into hyphenated ``YYYY-MM-DD``.

    The argument is passed through ``str()`` first, so an integer such as
    ``19810105`` is accepted as well as the equivalent string.  Any value
    that is not exactly eight digits after that conversion produces the
    literal string ``"Invalid date format"`` instead of raising.

    Results are memoised with ``functools.cache``, so the argument must be
    hashable.
    """
    compact = str(date_string)
    # Guard clause: anything other than an 8-character all-digit string is rejected.
    if len(compact) != 8 or not compact.isdigit():
        return "Invalid date format"
    # Year, month and day slices glued back together with hyphens.
    return "-".join((compact[:4], compact[4:6], compact[6:]))
# Map every speech_id to the party recorded for it, merging the
# pipe-separated *_SpeakerMap.txt files found in the corpus directory.
# If a speech_id occurs more than once, the entry read last wins.
speech_to_party = {}
speaker_map_paths = glob('../../corpora/congress-stanford/hein-daily/*_SpeakerMap.txt')
for map_path in tqdm(speaker_map_paths, desc='Speaker maps'):
    map_table = pd.read_csv(map_path, sep='|')
    # Pair the two columns row by row and merge them into the lookup in one go.
    speech_to_party.update(zip(map_table.speech_id, map_table.party))
# Collect per-speech metadata from every descr_*.txt file.  Entries are
# keyed by the speech id rendered as a string.
speech_info = {}
speech_to_speaker = defaultdict(list)
speech_descriptions = glob('../../corpora/congress-stanford/hein-daily/descr_*.txt')
for descr_path in tqdm(speech_descriptions, desc='Speech descriptions'):
    # Missing values become 'Unknown' before the date column is reformatted.
    descr_table = pd.read_csv(descr_path, sep='|').fillna('Unknown')
    descr_table['date'] = descr_table['date'].map(add_date_hyphens)
    # Columns: speech_id|chamber|date|number_within_file|speaker|first_name|last_name|state|gender
    for row in descr_table.itertuples():
        speech_info[str(row.speech_id)] = dict(
            first_name=row.first_name,
            last_name=row.last_name,
            # The party lookup uses the raw (non-stringified) speech id.
            party=speech_to_party.get(row.speech_id, 'Unknown'),
            date=row.date,
            chamber=row.chamber,
            state=row.state,
            gender=row.gender,
        )

# Print how many speeches carry each party label, most frequent first.
party_people = Counter(info['party'] for info in speech_info.values())
print(party_people.most_common())
# Read every speeches_*.txt file and sort the speeches into per-party
# lists.  A speech is kept only if it has at least 20 whitespace-separated
# tokens, has an entry in speech_info, and its party is 'D' or 'R'.
republican_speeches = []
democratic_speeches = []
speeches = glob('../../corpora/congress-stanford/hein-daily/speeches_*.txt')
for s in tqdm(speeches, desc='Speech files'):
    # Undecodable bytes are replaced rather than aborting the whole run.
    with open(s, 'r', encoding='utf-8', errors='replace') as inp:
        # Skip the first (header) line; the default keeps an empty file from raising.
        next(inp, None)
        for line in inp:
            line = line.strip()
            if not line:
                # Blank lines carry no record.
                continue
            # Split on the first '|' only: the speech text may contain '|' itself.
            speech_id, separator, speech = line.partition('|')
            if not separator:
                # A line without the separator cannot be unpacked into
                # (id, text); report it and move on instead of crashing.
                print(f'Malformed line in {s}: {line[:80]!r}')
                continue
            if len(speech.split()) < 20:
                continue
            try:
                speech_meta = speech_info[speech_id]
            except KeyError:
                print(f'No info on speech {speech_id}.')
                continue
            # Shallow copy of the metadata plus the text; speech_info stays untouched.
            speech_dict = {**speech_meta, 'speech': speech}
            # Every field except 'party' must be a string.  This is an explicit
            # raise rather than an assert so the check survives `python -O`.
            for k, v in speech_dict.items():
                if k != 'party' and not isinstance(v, str):
                    raise TypeError(f'Wrong value in {speech_dict}')
            party = speech_dict['party']
            if party == 'D':
                democratic_speeches.append(speech_dict)
            elif party == 'R':
                republican_speeches.append(speech_dict)
with open('hein_speeches_by_party.json', 'w', encoding='utf-8') as out:
json.dump({'democrat': democratic_speeches,
'republican': republican_speeches},
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment