Created
June 2, 2025 13:40
-
-
Save macleginn/15d52037ba00cde6a1816afd0e83f6d2 to your computer and use it in GitHub Desktop.
Code for extracting speeches from the Stanford Congressional Record dataset
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from copy import copy | |
from collections import defaultdict, Counter | |
from functools import cache | |
from glob import glob | |
import pandas as pd | |
import json | |
from tqdm import tqdm | |
@cache
def add_date_hyphens(date_string):
    """Reformat an 8-digit date such as 20250602 into '2025-06-02'.

    The argument is converted with ``str()`` first, so both integers and
    strings are accepted. Anything that is not exactly eight digits after
    conversion yields the literal string "Invalid date format" instead of
    raising. Results are memoized with ``functools.cache``, so the argument
    must be hashable.
    """
    digits = str(date_string)
    # Guard clause: reject everything that is not exactly eight digits.
    if len(digits) != 8 or not digits.isdigit():
        return "Invalid date format"
    # Slice into year / month / day and glue the parts with hyphens.
    return "-".join((digits[:4], digits[4:6], digits[6:]))
# Map each speech_id to the party recorded for it in the pipe-separated
# *_SpeakerMap.txt files. When the same speech_id occurs more than once,
# the entry read last wins.
speech_to_party = {}
for speaker_map in tqdm(
    glob('../../corpora/congress-stanford/hein-daily/*_SpeakerMap.txt'),
    desc='Speaker maps'
):
    speaker_map_df = pd.read_csv(speaker_map, sep='|')
    # A blank party cell is parsed as float NaN. Normalise it to the same
    # 'Unknown' placeholder the rest of the script uses for missing
    # metadata, so party values are always strings and missing parties are
    # tallied under a single key.
    parties = speaker_map_df['party'].fillna('Unknown')
    # Bulk-update from the two columns instead of looping over row tuples.
    speech_to_party.update(zip(speaker_map_df['speech_id'], parties))
# Per-speech metadata keyed by the speech id converted to a string.
speech_info = {}
speech_to_speaker = defaultdict(list)
speech_descriptions = glob('../../corpora/congress-stanford/hein-daily/descr_*.txt')
for speech_descr in tqdm(speech_descriptions, desc='Speech descriptions'):
    # Read one pipe-separated description file; empty cells become 'Unknown'.
    speech_descr_df = pd.read_csv(speech_descr, sep='|')
    speech_descr_df = speech_descr_df.fillna('Unknown')
    # Rewrite 8-digit dates as YYYY-MM-DD (other values are mapped to
    # add_date_hyphens' "Invalid date format" marker).
    speech_descr_df['date'] = speech_descr_df['date'].map(add_date_hyphens)
    # speech_id|chamber|date|number_within_file|speaker|first_name|last_name|state|gender
    for row in speech_descr_df.itertuples():
        # The party comes from the speaker maps, which are keyed by the raw
        # (unconverted) speech id; speeches absent there get 'Unknown'.
        speech_info[str(row.speech_id)] = dict(
            first_name=row.first_name,
            last_name=row.last_name,
            party=speech_to_party.get(row.speech_id, 'Unknown'),
            date=row.date,
            chamber=row.chamber,
            state=row.state,
            gender=row.gender,
        )
# Tally how many speeches are attributed to each party value and print the
# tallies from most to least frequent.
party_people = Counter()
party_people.update(meta['party'] for meta in speech_info.values())
print(party_people.most_common())
# Collect every sufficiently long speech by a Democratic ('D') or
# Republican ('R') speaker, each stored as its metadata plus the speech text.
republican_speeches = []
democratic_speeches = []
speeches = glob('../../corpora/congress-stanford/hein-daily/speeches_*.txt')
for s in tqdm(speeches, desc='Speech files'):
    # Undecodable bytes are replaced rather than aborting the run.
    with open(s, 'r', encoding='utf-8', errors='replace') as inp:
        # Skip the header line (a no-op for an empty file).
        next(inp, None)
        for line in inp:
            # Each record is "<speech_id>|<speech text>"; only the first
            # '|' separates the fields, later ones belong to the text.
            speech_id, sep, speech = line.strip().partition('|')
            if not sep:
                # Blank or malformed line without a separator: there is no
                # speech text to keep, so skip it instead of crashing on
                # tuple unpacking.
                continue
            # Drop speeches shorter than 20 whitespace-separated tokens.
            if len(speech.split()) < 20:
                continue
            try:
                speech_meta = speech_info[speech_id]
            except KeyError:
                print(f'No info on speech {speech_id}.')
                continue
            # Build a fresh dict so the shared metadata entry is not mutated.
            speech_dict = {**speech_meta, 'speech': speech}
            # Sanity check: every field except the party must be a string
            # (the party may be a non-string missing-value marker).
            for k, v in speech_dict.items():
                assert isinstance(v, str) or k == 'party', f'Wrong value in {speech_dict}'
            if speech_dict['party'] == 'D':
                democratic_speeches.append(speech_dict)
            elif speech_dict['party'] == 'R':
                republican_speeches.append(speech_dict)
with open('hein_speeches_by_party.json', 'w', encoding='utf-8') as out: | |
json.dump({'democrat': democratic_speeches, | |
'republican': republican_speeches}, |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment