macleginn · June 2, 2025 13:40
diff --git a/extract_speeches_from_stanford_congress_record.py b/extract_speeches_from_stanford_congress_record.py
 from copy import copy
 from collections import defaultdict, Counter
 from functools import cache
 from glob import glob
 import pandas as pd
 import json
 from tqdm import tqdm


 @cache
 def add_date_hyphens(date_string):
    """Convert a compact ``YYYYMMDD`` date into ``YYYY-MM-DD``.

    Args:
        date_string: The date as a string or number, e.g. ``"20250602"``
            or ``20250602``. Integral floats such as ``20250602.0`` are
            treated like the corresponding integer.

    Returns:
        The hyphenated date, or the literal string ``"Invalid date format"``
        when the input is not exactly eight ASCII digits. No calendar
        validation is performed on the month or day.
    """
    # An integer column with gaps may be loaded as floats; str() of such a
    # value ends in ".0" and would otherwise be rejected as invalid.
    if isinstance(date_string, float) and date_string.is_integer():
        date_string = int(date_string)
    text = str(date_string)
    # str.isdigit() alone also accepts non-ASCII digits (e.g. "²"), so
    # require ASCII to avoid emitting a malformed date.
    if len(text) == 8 and text.isascii() and text.isdigit():
        return f"{text[0:4]}-{text[4:6]}-{text[6:8]}"
    return "Invalid date format"


 # Build a lookup from speech id to the `party` column of every
 # pipe-separated *_SpeakerMap.txt file.
 speech_to_party = {}
 speaker_map_paths = glob('../../corpora/congress-stanford/hein-daily/*_SpeakerMap.txt')
 for map_path in tqdm(speaker_map_paths, desc='Speaker maps'):
    map_frame = pd.read_csv(map_path, sep='|')
    # A speech id seen again (in this or a later file) overwrites the
    # previously stored party.
    speech_to_party.update(
        (row.speech_id, row.party) for row in map_frame.itertuples()
    )

 # Gather per-speech metadata from the pipe-separated descr_*.txt files.
 speech_info = {}
 speech_to_speaker = defaultdict(list)
 speech_descriptions = glob('../../corpora/congress-stanford/hein-daily/descr_*.txt')
 for descr_path in tqdm(speech_descriptions, desc='Speech descriptions'):
    # Columns:
    # speech_id|chamber|date|number_within_file|speaker|first_name|last_name|state|gender
    descr_frame = pd.read_csv(descr_path, sep='|').fillna('Unknown')
    descr_frame['date'] = descr_frame.date.map(add_date_hyphens)
    for row in descr_frame.itertuples():
        # Metadata is keyed by the stringified speech id, while the party
        # lookup uses the id exactly as it was stored in speech_to_party.
        speech_info[str(row.speech_id)] = dict(
            first_name=row.first_name,
            last_name=row.last_name,
            party=speech_to_party.get(row.speech_id, 'Unknown'),
            date=row.date,
            chamber=row.chamber,
            state=row.state,
            gender=row.gender,
        )
 # Count the collected speeches per party value and print the tally,
 # most frequent first.
 party_people = Counter(meta['party'] for meta in speech_info.values())
 print(party_people.most_common())

 # Read every speeches_*.txt file and sort sufficiently long speeches into
 # the Democratic ('D') and Republican ('R') lists; other parties are dropped.
 republican_speeches = []
 democratic_speeches = []
 speeches = glob('../../corpora/congress-stanford/hein-daily/speeches_*.txt')
 for s in tqdm(speeches, desc='Speech files'):
    with open(s, 'r', encoding='utf-8', errors='replace') as inp:
        # Discard the first line of the file (presumably a column header);
        # the default keeps an empty file from raising StopIteration.
        next(inp, None)
        for line in inp:
            # partition() never raises: a blank or separator-less line gives
            # an empty speech that the length filter below discards, whereas
            # unpacking split('|', maxsplit=1) raised ValueError on such
            # lines and aborted the whole run.
            speech_id, _, speech = line.strip().partition('|')
            # Skip speeches shorter than 20 whitespace-separated tokens.
            if len(speech.split()) < 20:
                continue
            try:
                speech_meta = speech_info[speech_id]
            except KeyError:
                print(f'No info on speech {speech_id}.')
                continue
            # Shallow copy so the shared metadata entry is not mutated.
            speech_dict = copy(speech_meta)
            speech_dict['speech'] = speech
            # Sanity check: every field except 'party' must be a string.
            for k, v in speech_dict.items():
                assert isinstance(v, str) or k == 'party', f'Wrong value in {speech_dict}'
            if speech_dict['party'] == 'D':
                democratic_speeches.append(speech_dict)
            elif speech_dict['party'] == 'R':
                republican_speeches.append(speech_dict)
 with open('hein_speeches_by_party.json', 'w', encoding='utf-8') as out:
    json.dump({'democrat': democratic_speeches,
               'republican': republican_speeches},
	from copy import copy
	from collections import defaultdict, Counter
	from functools import cache
	from glob import glob
	import pandas as pd
	import json
	from tqdm import tqdm


	@cache
	def add_date_hyphens(date_string):
	    """Convert a compact ``YYYYMMDD`` date into ``YYYY-MM-DD``.

	    Args:
	        date_string: The date as a string or number, e.g. ``"20250602"``
	            or ``20250602``. Integral floats such as ``20250602.0`` are
	            treated like the corresponding integer.

	    Returns:
	        The hyphenated date, or the literal string
	        ``"Invalid date format"`` when the input is not exactly eight
	        ASCII digits. No calendar validation is performed on the month
	        or day.
	    """
	    # An integer column with gaps may be loaded as floats; str() of such
	    # a value ends in ".0" and would otherwise be rejected as invalid.
	    if isinstance(date_string, float) and date_string.is_integer():
	        date_string = int(date_string)
	    text = str(date_string)
	    # str.isdigit() alone also accepts non-ASCII digits (e.g. "²"), so
	    # require ASCII to avoid emitting a malformed date.
	    if len(text) == 8 and text.isascii() and text.isdigit():
	        return f"{text[0:4]}-{text[4:6]}-{text[6:8]}"
	    return "Invalid date format"


	# Build a lookup from speech id to the `party` column of every
	# pipe-separated *_SpeakerMap.txt file.
	speech_to_party = {}
	for speaker_map in tqdm(
	    glob('../../corpora/congress-stanford/hein-daily/*_SpeakerMap.txt'),
	    desc='Speaker maps'
	):
	    # The separator is a plain pipe: an escaped '\|' is an invalid string
	    # escape and makes pandas treat the separator as a regular expression.
	    speaker_map_df = pd.read_csv(speaker_map, sep='|')
	    # A speech id seen again overwrites the previously stored party.
	    for t in speaker_map_df.itertuples():
	        speech_to_party[t.speech_id] = t.party

	# Gather per-speech metadata from the pipe-separated descr_*.txt files.
	speech_info = {}
	speech_to_speaker = defaultdict(list)
	speech_descriptions = glob('../../corpora/congress-stanford/hein-daily/descr_*.txt')
	for speech_descr in tqdm(speech_descriptions, desc='Speech descriptions'):
	    # The separator is a plain pipe: an escaped '\|' is an invalid string
	    # escape and makes pandas treat the separator as a regular expression.
	    speech_descr_df = pd.read_csv(speech_descr, sep='|').fillna('Unknown')
	    speech_descr_df['date'] = speech_descr_df.date.map(add_date_hyphens)
	    # speech_id|chamber|date|number_within_file|speaker|first_name|last_name|state|gender
	    for t in speech_descr_df.itertuples():
	        # Metadata is keyed by the stringified speech id, while the party
	        # lookup uses the id exactly as it was stored in speech_to_party.
	        speech_info[str(t.speech_id)] = {
	            'first_name': t.first_name,
	            'last_name': t.last_name,
	            'party': speech_to_party.get(t.speech_id, 'Unknown'),
	            'date': t.date,
	            'chamber': t.chamber,
	            'state': t.state,
	            'gender': t.gender
	        }
	# Count the collected speeches per party value and print the tally,
	# most frequent first.
	party_people = Counter(speech['party'] for speech in speech_info.values())
	print(party_people.most_common())

	# Read every speeches_*.txt file and sort sufficiently long speeches into
	# the Democratic ('D') and Republican ('R') lists; other parties are dropped.
	republican_speeches = []
	democratic_speeches = []
	speeches = glob('../../corpora/congress-stanford/hein-daily/speeches_*.txt')
	for s in tqdm(speeches, desc='Speech files'):
	    with open(s, 'r', encoding='utf-8', errors='replace') as inp:
	        # Discard the first line of the file (presumably a column header);
	        # the default keeps an empty file from raising StopIteration.
	        next(inp, None)
	        for line in inp:
	            # Split on a plain pipe: '\|' would look for a literal
	            # backslash-pipe that never occurs, so every line failed to
	            # unpack. partition() also tolerates blank or separator-less
	            # lines: they give an empty speech that the length filter
	            # below discards.
	            speech_id, _, speech = line.strip().partition('|')
	            # Skip speeches shorter than 20 whitespace-separated tokens.
	            if len(speech.split()) < 20:
	                continue
	            try:
	                speech_meta = speech_info[speech_id]
	            except KeyError:
	                print(f'No info on speech {speech_id}.')
	                continue
	            # Shallow copy so the shared metadata entry is not mutated.
	            speech_dict = copy(speech_meta)
	            speech_dict['speech'] = speech
	            # Sanity check: every field except 'party' must be a string.
	            for k, v in speech_dict.items():
	                assert isinstance(v, str) or k == 'party', f'Wrong value in {speech_dict}'
	            if speech_dict['party'] == 'D':
	                democratic_speeches.append(speech_dict)
	            elif speech_dict['party'] == 'R':
	                republican_speeches.append(speech_dict)
	with open('hein_speeches_by_party.json', 'w', encoding='utf-8') as out:
	json.dump({'democrat': democratic_speeches,
	'republican': republican_speeches},