Created
May 24, 2012 13:48
-
-
Save mhl/2781638 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
import os, re, sys, xml.sax | |
from lxml import etree | |
from collections import defaultdict | |
from xml.sax.handler import ContentHandler | |
from guess_mp_genders import * | |
import datetime | |
# Debates dated before this day are skipped by the analysis loop, which only
# looks at the 2010 parliament.
since = datetime.date(2010, 5, 6)

# Directory holding the scraped debate XML files.  The historical default can
# be overridden with the DEBATES_DIRECTORY environment variable so the script
# is usable on machines with a different checkout location.
debates_directory = os.environ.get(
    "DEBATES_DIRECTORY",
    "/media/no2id/ukparse-git-svn/parldata/scrapedxml/debates")
class MinisterParser(ContentHandler):
    """SAX handler that collects the ``matchid`` of every ``<moffice>`` element.

    After a parse, ``ministers`` is the set of all ``matchid`` values seen.
    """

    def __init__(self):
        # ContentHandler is an old-style class on Python 2, so the base
        # initialiser is called explicitly instead of through super().
        ContentHandler.__init__(self)
        self.ministers = set()

    def startElement(self, name, attr):
        """Record the ``matchid`` attribute when *name* is ``moffice``.

        Raises KeyError if a ``moffice`` element has no ``matchid`` attribute.
        """
        if name == "moffice":
            self.ministers.add(attr['matchid'])
# Collect the ids of everyone listed in an <moffice> element of the 2010
# ministers file.
minister_parser = MinisterParser()
# Open as bytes so the XML parser decodes the document according to its own
# encoding declaration rather than the platform's default text encoding.
with open("ministers-2010.xml", "rb") as fp:
    xml.sax.parse(fp, minister_parser)
# Cutoff passed to guess_gender_from_first_name() for every member.
GENDER_CUTOFF = 0.05

# Maps each member id to the Gender value guessed from the first name.
member_to_gender = {}
# (first_name, last_name, member_id) for every member whose guess came back
# as Gender.CHECK, so they can be reported for manual review.
members_to_check = set()

for members_filename in ("all-members-2010.xml", "all-members.xml"):
    root = etree.parse(members_filename).getroot()
    for member in root.xpath('member'):
        member_id = member.attrib['id']
        first_name = member.attrib['firstname'].strip()
        last_name = member.attrib['lastname']
        gender = guess_gender_from_first_name(first_name, GENDER_CUTOFF)
        # A member present in both files is simply overwritten by the later
        # file's entry.
        member_to_gender[member_id] = gender
        if gender == Gender.CHECK:
            members_to_check.add((first_name, last_name, member_id))

for t in sorted(members_to_check):
    # Single-argument print(...) behaves identically on Python 2 and 3.
    print("Need to check %s %s (%s)" % t)
class GenderResult(object):
    """Running speech counters for one gender.

    ``speeches`` counts every speech attributed to the gender and
    ``speeches_interrupted`` counts those that were flagged as interrupted.
    """

    def __init__(self):
        self.speeches = 0
        self.speeches_interrupted = 0

    def __repr__(self):
        return "GenderResult(speeches=%r, speeches_interrupted=%r)" % (
            self.speeches, self.speeches_interrupted)
# Matches the ISO date embedded in file names that start with
# "debatesYYYY-MM-DD".
debate_date_re = re.compile(r"^debates(\d{4}-\d{2}-\d{2})")

# gender_results_by_role[is_minister][gender] -> GenderResult.
# A bool indexes as 0/1, so False selects non-ministers and True ministers.
gender_results_by_role = [
    [GenderResult(), GenderResult()],
    [GenderResult(), GenderResult()],
]

# Every debate file is parsed once and each speech is routed to the minister
# or non-minister bucket, instead of re-reading the whole directory for each
# of the two reports.  As a consequence each "Missing speaker" warning is
# printed once, before both reports.
for e in os.listdir(debates_directory):
    # Only load debates from the 2010 parliament:
    m = debate_date_re.search(e)
    if not m:
        continue
    date = datetime.datetime.strptime(m.group(1), "%Y-%m-%d").date()
    if date < since:
        continue
    filename = os.path.join(debates_directory, e)
    root = etree.parse(filename).getroot()
    for speech in root.xpath('speech'):
        speaker_id = speech.attrib.get('speakerid')
        if speaker_id is None or speaker_id == 'unknown':
            continue
        if speaker_id not in member_to_gender:
            print("Missing speaker from members file: %s" % speaker_id)
            continue
        gender = member_to_gender[speaker_id]
        if gender == Gender.CHECK:
            # Gender could not be guessed confidently; leave the speech out.
            continue
        is_minister = speaker_id in minister_parser.ministers
        result = gender_results_by_role[is_minister][gender]
        result.speeches += 1
        # NOTE(review): in XPath 1.0, contains(text(), ...) converts the
        # node-set to the string value of its first node, so only the first
        # text node of each descendant element is inspected -- confirm that
        # this is the intended definition of "interrupted".
        if speech.xpath(".//*[contains(text(),'Interruption.')]"):
            result.speeches_interrupted += 1

for just_ministers in (False, True):
    gender_results = gender_results_by_role[just_ministers]
    print("------------------------------------------------------------------------")
    if just_ministers:
        print("ONLY MINISTERS:")
    else:
        print("ONLY NON MINISTERS")
    for gender in (Gender.MALE, Gender.FEMALE):
        gu = Gender.to_unicode(gender)
        total = gender_results[gender].speeches
        interrupted = gender_results[gender].speeches_interrupted
        print("%s speeches %s" % (gu, total))
        print("%s speeches interrupted %s" % (gu, interrupted))
        # A group with no speeches has no meaningful proportion; report
        # "n/a" rather than failing with ZeroDivisionError.
        if total:
            proportion = interrupted / float(total)
        else:
            proportion = "n/a"
        print("proportion of %s speeches interrupted %s" % (gu, proportion))
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
import re | |
class Gender(object):
    """Integer codes for a guessed gender.

    The values are plain ints because they are used directly as indexes
    into ``unicode_versions`` (and by other code as list indexes).
    """

    MALE = 0
    FEMALE = 1
    # The name was not decisive enough to classify.
    CHECK = 2

    # Human-readable label for each code, indexed by the code itself.
    unicode_versions = (u'male', u'female', u'check')

    @staticmethod
    def to_unicode(g):
        """Return the unicode label for gender code *g*."""
        return Gender.unicode_versions[g]
# names_to_frequency[gender] maps a first name, exactly as spelled in the
# data file, to the frequency figure listed next to it.
names_to_frequency = [{}, {}]

# These data sets are from: http://www.census.gov/genealogy/names/names%5Ffiles.html
# As an alternative, the ONS have similar UK statistics.
us_filename = ["dist.male.first",
               "dist.female.first"]

# A name, whitespace, then its frequency figure.  Compiled once rather than
# looked up again for every line.
_name_frequency_re = re.compile(r'(\w+)\s+([0-9\.]+)')

for gender in (Gender.MALE, Gender.FEMALE):
    with open(us_filename[gender]) as fp:
        for line in fp:
            m = _name_frequency_re.search(line)
            if m is None:
                # Blank or malformed line: skip it instead of failing with
                # AttributeError on m.group().
                continue
            name = m.group(1)
            frequency = float(m.group(2))
            names_to_frequency[gender][name] = frequency
def probability_male_from_first_name(first_name):
    """Return the male share of the combined frequency for *first_name*.

    The name is upper-cased before it is looked up in both frequency
    tables.  A name found in neither table yields 0.5.
    """
    key = first_name.upper()
    males = names_to_frequency[Gender.MALE].get(key, 0)
    females = names_to_frequency[Gender.FEMALE].get(key, 0)
    combined = males + females
    if combined != 0:
        return males / combined
    # No data for this name in either table: call it even odds.
    return 0.5
def guess_gender_from_first_name(first_name, cutoff):
    """Classify *first_name* as a Gender code.

    Returns Gender.FEMALE when the male probability is below *cutoff*,
    Gender.MALE when it is above ``1 - cutoff``, and Gender.CHECK for
    everything in between.
    """
    male_probability = probability_male_from_first_name(first_name)
    if male_probability < cutoff:
        return Gender.FEMALE
    if male_probability > (1 - cutoff):
        return Gender.MALE
    return Gender.CHECK
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment