Skip to content

Instantly share code, notes, and snippets.

@mhl
Created May 24, 2012 13:48
Show Gist options
  • Save mhl/2781638 to your computer and use it in GitHub Desktop.
Save mhl/2781638 to your computer and use it in GitHub Desktop.
#!/usr/bin/env python
import os, re, sys, xml.sax
from lxml import etree
from collections import defaultdict
from xml.sax.handler import ContentHandler
from guess_mp_genders import *
import datetime
# Cut-off date: debate files whose filename date is earlier than this are
# skipped by the tallying loop further down.
since = datetime.date(year=2010, month=5, day=6)

# Directory that is listed for "debatesYYYY-MM-DD..." XML files.
debates_directory = "/media/no2id/ukparse-git-svn/parldata/scrapedxml/debates"
class MinisterParser(ContentHandler):
    """SAX handler that collects the ``matchid`` of every ``moffice`` element.

    After a document has been parsed with this handler, ``ministers`` holds
    the set of ``matchid`` attribute values seen on ``moffice`` elements.
    All other elements are ignored.
    """

    def __init__(self):
        # Initialise the base handler explicitly so its own state exists
        # before any parser touches it.  The explicit form (rather than
        # super()) also works when ContentHandler is an old-style class.
        ContentHandler.__init__(self)
        self.ministers = set()

    def startElement(self, name, attr):
        """Record the ``matchid`` attribute when a ``moffice`` element opens.

        Raises ``KeyError`` if a ``moffice`` element has no ``matchid``
        attribute.
        """
        if name == "moffice":
            self.ministers.add(attr['matchid'])
# Collect the matchid of every <moffice> element in the ministers file.
minister_parser = MinisterParser()
with open("ministers-2010.xml") as ministers_file:
    xml.sax.parse(ministers_file, minister_parser)

# Guess a gender for every <member> in the members files.  A member id that
# appears in both files keeps the guess made from the later file.
member_to_gender = {}
members_to_check = set()
for members_filename in ("all-members-2010.xml", "all-members.xml"):
    members_root = etree.parse(members_filename).getroot()
    for member in members_root.xpath('member'):
        attributes = member.attrib
        member_id = attributes['id']
        first_name = attributes['firstname'].strip()
        last_name = attributes['lastname']
        guessed = guess_gender_from_first_name(first_name, 0.05)
        member_to_gender[member_id] = guessed
        if guessed != Gender.CHECK:
            continue
        # The first name was not decisive enough; remember the member so it
        # can be reported for manual checking.
        members_to_check.add((first_name, last_name, member_id))

for unresolved in sorted(members_to_check):
    print("Need to check %s %s (%s)" % unresolved)
class GenderResult:
    """Pair of counters: speeches seen, and how many of them were interrupted."""

    def __init__(self):
        # Both tallies start at zero; callers increment the attributes directly.
        self.speeches = self.speeches_interrupted = 0
# Tally speeches and interrupted speeches by guessed gender, separately for
# ministers and non-ministers, over every debate file dated on or after
# `since`, then print one report per category.
#
# Each debate file is parsed exactly once and every speech is filed under
# the category of its speaker; the directory is not re-parsed per category.
# As a consequence each "Missing speaker" warning is printed once per speech.

# Raw string so that \d is a regex escape, not a (deprecated) string escape.
debate_filename_re = re.compile(r"^debates(\d{4}-\d{2}-\d{2})")

# Keyed by "is the speaker a minister?"; each value is a list indexed by
# Gender.MALE / Gender.FEMALE.
results_by_category = {
    False: [GenderResult(), GenderResult()],
    True: [GenderResult(), GenderResult()],
}

# Sorted so that files (and therefore any warnings) come in a stable order.
for e in sorted(os.listdir(debates_directory)):
    # Only load debates from the 2010 parliament:
    m = debate_filename_re.search(e)
    if not m:
        continue
    date = datetime.datetime.strptime(m.group(1), "%Y-%m-%d").date()
    if date < since:
        continue
    filename = os.path.join(debates_directory, e)
    root = etree.parse(filename).getroot()
    for speech in root.xpath('speech'):
        speaker_id = speech.attrib.get('speakerid')
        if speaker_id is None or speaker_id == 'unknown':
            continue
        if speaker_id not in member_to_gender:
            print("Missing speaker from members file: %s" % (speaker_id,))
            continue
        gender = member_to_gender[speaker_id]
        if gender == Gender.CHECK:
            # Gender could not be guessed confidently; leave the speech out.
            continue
        is_minister = speaker_id in minister_parser.ministers
        result = results_by_category[is_minister][gender]
        result.speeches += 1
        # NOTE(review): under XPath 1.0, contains(text(), ...) converts only
        # the first text node of each element to a string -- confirm that
        # interruption markers never sit in a later text node.
        if speech.xpath(".//*[contains(text(),'Interruption.')]"):
            result.speeches_interrupted += 1

for just_ministers in (False, True):
    gender_results = results_by_category[just_ministers]
    print("------------------------------------------------------------------------")
    if just_ministers:
        print("ONLY MINISTERS:")
    else:
        print("ONLY NON MINISTERS")
    for gender in (Gender.MALE, Gender.FEMALE):
        gu = Gender.to_unicode(gender)
        total = gender_results[gender].speeches
        interrupted = gender_results[gender].speeches_interrupted
        print("%s speeches %s" % (gu, total))
        print("%s speeches interrupted %s" % (gu, interrupted))
        # Guard against a category with no speeches at all, which would
        # otherwise abort the report with ZeroDivisionError.
        if total:
            proportion = interrupted / float(total)
        else:
            proportion = "n/a (no speeches)"
        print("proportion of %s speeches interrupted %s" % (gu, proportion))
#!/usr/bin/env python
import re
class Gender:
    """Integer codes for a guessed gender, plus their display names."""

    # The codes double as indexes into ``unicode_versions`` (and into the
    # per-gender lists kept elsewhere in this code), so they stay 0, 1, 2.
    MALE, FEMALE, CHECK = range(3)

    unicode_versions = (u'male', u'female', u'check')

    @staticmethod
    def to_unicode(g):
        """Return the unicode display name stored at index ``g``."""
        display_names = Gender.unicode_versions
        return display_names[g]
# names_to_frequency[gender] maps a first name, as spelled in the data file,
# to the first numeric column that follows it on the same line.
# (Presumably the names are upper-case, since lookups upper-case the name
# first -- verify against the data files.)
names_to_frequency = [{}, {}]
# These data sets are from: http://www.census.gov/genealogy/names/names%5Ffiles.html
# As an alternative, the ONS have similar UK statistics.
us_filename = ["dist.male.first",
               "dist.female.first"]
# Compiled once instead of once per line.  The leading underscore keeps the
# pattern out of `from guess_mp_genders import *`.
_name_frequency_re = re.compile(r'(\w+)\s+([0-9\.]+)')
for gender in (Gender.MALE, Gender.FEMALE):
    with open(us_filename[gender]) as fp:
        for line in fp:
            m = _name_frequency_re.search(line)
            if m is None:
                # Blank or malformed line: skip it rather than crash with
                # AttributeError on m.group().
                continue
            try:
                frequency = float(m.group(2))
            except ValueError:
                # The pattern also matches strings such as "." or "1.2.3",
                # which are not numbers; ignore such lines.
                continue
            names_to_frequency[gender][m.group(1)] = frequency
def probability_male_from_first_name(first_name):
    """Estimate the probability that ``first_name`` belongs to a man.

    The name is upper-cased and looked up in the male and female tables of
    ``names_to_frequency``; a name missing from a table counts as frequency
    0 there.  The result is the male share of the combined frequency, or
    0.5 when the combined frequency is zero.
    """
    key = first_name.upper()
    male, female = (
        names_to_frequency[table].get(key, 0)
        for table in (Gender.MALE, Gender.FEMALE)
    )
    combined = male + female
    if combined == 0:
        # No evidence either way.
        return 0.5
    return male / combined
def guess_gender_from_first_name(first_name, cutoff):
    """Classify ``first_name`` as male, female, or needing a manual check.

    Returns ``Gender.FEMALE`` when the estimated probability of the name
    being male is below ``cutoff``, ``Gender.MALE`` when it is above
    ``1 - cutoff``, and ``Gender.CHECK`` otherwise.
    """
    male_probability = probability_male_from_first_name(first_name)
    if male_probability < cutoff:
        return Gender.FEMALE
    if male_probability > (1 - cutoff):
        return Gender.MALE
    # Neither threshold was crossed: the name is too ambiguous to call.
    return Gender.CHECK
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment