Created
August 19, 2014 21:08
-
-
Save anonymous/6d31d0a495f82db5748d to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import itertools
import os
import re

import networkx as nx
import numpy as np
from lxml import etree
def getNamesInAction(action,textNames,nameDict):
    """Return the graph names of every character mentioned in a stage direction.

    action    -- the text of the stage direction to scan
    textNames -- the character names as they are spelled in the text
    nameDict  -- maps each in-text spelling to the name used for the graph

    Names are tried longest first, and each match is blanked out of the
    working copy of the text so that a shorter name contained in a longer
    one is not reported a second time.
    """
    remaining = action
    found = []
    for candidate in sorted(textNames, key=len, reverse=True):
        if candidate not in remaining:
            continue
        found.append(nameDict[candidate])
        # blank out the match before trying the shorter names
        remaining = remaining.replace(candidate, " ")
    return found
# obviously you point to the .htm file wherever you have it
# find the full plays at:
# http://shakespeare.mit.edu/
filename = "ShakespearGraphs/Antony and Cleopatra Entire Play.htm"
# use a context manager so the file is closed even if reading or parsing fails
with open(filename) as f:
    html = etree.HTML(f.read())
# get all the characters in the play (the text of every <b> element)
chars = html.xpath("//b/text()")
# go through the list and make a mapping of the various spellings to the correct spelling we'll use for the graph
# runs of two or more whitespace characters collapse to one space, then the name is title-cased
inTextToActual = {c: re.sub(r"\s\s+", " ", c).title() for c in chars}
# the character names as we want them (sorted so the node order is reproducible)
charNames = sorted(set(inTextToActual.values()))
# the character names as they are in the text
charTextNames = sorted(inTextToActual)
# the italics elements contain all of the entrances and exits (and other things)
# go through the text and get all the actions, and the speakers that the action is associated with
italics = html.xpath("//i")
speaker = []
action = []
for el in italics:
    ac = el.findtext(".")
    # the general structure of the play is:
    # <a>
    #   <b> <name of speaker> </b>
    # </a>
    # <blockquote>
    #   ....
    #   <p>
    #     <i> <the action that happens> </i>
    #   </p>
    # </blockquote>
    # so we back up from the italics to get the name of the speaker
    # this lets us associate an "exit" to a character name
    try:
        sp = el.getparent().getparent().getprevious().xpath("b/text()")[0]
    except (AttributeError, IndexError):
        # AttributeError: a parent or previous sibling is missing (None)
        # IndexError: the preceding element holds no <b> text
        # either way there is no speaker, so store the blank placeholder
        sp = " "
    speaker.append(sp)
    action.append(ac)
# make a list of the groups of characters that are on stage together at the same time
# start with an empty stage
charactersTogether = [[]]
ind = 0
# compiled once; finds the first entrance/exit keyword anywhere in an action
stageWord = re.compile(r"(Enter|enter|Exeunt|Exit|exeunt|exit)")
# loop through actions and update who is on stage
# whenever a character is removed or added, we record a new group
for ac, sp in zip(action, speaker):
    found = stageWord.search(ac)
    if found is None:
        # not an entrance or exit, nothing changes on stage
        continue
    firstWord = found.group().lower()
    # work on a copy so the groups recorded earlier are never mutated
    onStage = list(charactersTogether[ind])
    if firstWord == "enter":
        # figure out who entered
        entrants = getNamesInAction(ac, charTextNames, inTextToActual)
        if not entrants:
            continue
        onStage.extend(entrants)
    elif firstWord == "exit":
        # then a character exits
        # it could be the speaking character, or not
        leavers = getNamesInAction(ac, charTextNames, inTextToActual)
        if not leavers:
            # no name in the direction, so the speaker is the one leaving;
            # the speaker may be the " " placeholder or otherwise unknown,
            # in which case nobody is removed instead of raising KeyError
            actual = inTextToActual.get(sp)
            leavers = [] if actual is None else [actual]
        onStage = [c for c in onStage if c not in leavers]
    else:
        # the pattern only matches enter/exit/exeunt, so this is "exeunt"
        # figure out if everyone leaves or just some characters
        direction = ac.lower()
        if direction.strip() == "exeunt":
            # then everyone leaves
            onStage = []
        elif "all but" in direction:
            # then everyone but the named characters leaves
            onStage = getNamesInAction(ac, charTextNames, inTextToActual)
        else:
            # then the characters in the list leave
            leavers = getNamesInAction(ac, charTextNames, inTextToActual)
            onStage = [c for c in onStage if c not in leavers]
    # keep each character at most once per group, preserving first-seen order
    seen = set()
    uniqueOnStage = []
    for c in onStage:
        if c not in seen:
            seen.add(c)
            uniqueOnStage.append(c)
    charactersTogether.append(uniqueOnStage)
    ind += 1
# with the characters listed, make the adjacency matrix
nChars = len(charNames)
adj = np.zeros((nChars, nChars))
# look up each name's row/column once instead of scanning charNames for every pair
charIndex = {name: idx for idx, name in enumerate(charNames)}
# loop over all pairs of characters in each stage group
# make their connection in the adjacency matrix 1
# there will be tons of overlap here
for sitch in charactersTogether:
    # set() drops repeated names, so every pair below has two distinct characters
    for a, b in itertools.combinations(set(sitch), 2):
        inda = charIndex[a]
        indb = charIndex[b]
        # sharing the stage is symmetric, so fill both halves of the matrix
        adj[inda, indb] = 1
        adj[indb, inda] = 1
# make the graph and save it as a .gexf for Gephi
G = nx.Graph(adj)
# nodes come out of the matrix numbered by row, so rename them to the character names
G = nx.relabel_nodes(G, dict(enumerate(charNames)))
# strip whatever extension the input file has rather than assuming four characters
nx.write_gexf(G, "%s.gexf" % os.path.splitext(filename)[0])
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment