August 19, 2014 21:08
diff --git a/gistfile1.py b/gistfile1.py
 import itertools
 import re

 import networkx as nx
 import numpy as np
 from lxml import etree

 def getNamesInAction(action,textNames,nameDict):
    """Return the graph names of the characters mentioned in *action*.

    textNames holds the spellings as they occur in the play text and
    nameDict maps each of those spellings to the name used for the graph.
    Spellings are tried longest first and every hit is blanked out of a
    working copy of the text, so a short name that is part of a longer,
    already matched name is only reported if it also occurs elsewhere.
    """
    remaining = action
    found = []
    candidates = list(textNames)
    candidates.sort(key=len, reverse=True)
    for candidate in candidates:
        if candidate not in remaining:
            continue
        found.append(nameDict[candidate])
        # blank the hit so shorter names contained in it cannot match again
        remaining = remaining.replace(candidate, " ")
    return found

 # Point this at the saved .htm file of the play, wherever you keep it.
 # The full plays can be found at:
 # http://shakespeare.mit.edu/
 filename = "ShakespearGraphs/Antony and Cleopatra  Entire Play.htm"
 # Read through a context manager so the handle is closed even if the read
 # or the parse raises.
 with open(filename) as f:
    html = etree.HTML(f.read())
 # get all the characters in the play (the text of every <b> element)
 chars = html.xpath("//b/text()")
 # Map every spelling found in the text to the spelling used for the graph:
 # runs of two or more whitespace characters become one space, then the
 # name is title-cased.
 inTextToActual = {c: re.sub(r"\s\s+", " ", c).title() for c in chars}
 # the character names as we want them
 charNames = list(set(inTextToActual.values()))
 # the character names as they are in the text
 charTextNames = list(set(inTextToActual.keys()))
 # The italics elements contain all of the entrances and exits (and other
 # things).  Go through them and record the text of every action together
 # with the speaker that the action is associated with.
 #
 # The general structure of the play is:
 # <a>
 #    <b> <name of speaker> </b>
 # </a>
 # <blockquote>
 #    ....
 #    <p>
 #       <i> <the action that happens> </i>
 #    </p>
 # </blockquote>
 # so we back up from the italics to get the name of the speaker.
 # This lets us associate an "exit" with a character name.
 italics = html.xpath("//i")
 speaker = []
 action = []
 for i in italics:
    ac = i.findtext(".")
    try:
        sp = i.getparent().getparent().getprevious().xpath("b/text()")[0]
    except (AttributeError, IndexError):
        # AttributeError: one of the parent/previous lookups returned None.
        # IndexError: the preceding element holds no <b> text.
        # Either way there is no speaker, so store a blank placeholder to
        # keep speaker and action the same length.
        sp = " "
    speaker.append(sp)
    action.append(ac)
 # Build the list of groups of characters that are on stage at the same time.
 # The stage starts empty; every entrance or exit appends a new group derived
 # from the most recent one, and ind always points at that latest group.
 directionPattern = re.compile("(Enter|enter|Exeunt|Exit|exeunt|exit)")
 charactersTogether = [[]]
 ind = 0
 for ac,sp in zip(action,speaker):
    firstWord = directionPattern.search(ac)
    if firstWord is None:
        # not a direction that moves anybody on or off the stage
        continue
    firstWord = firstWord.group().lower()
    onStage = charactersTogether[ind]
    named = getNamesInAction(ac,charTextNames,inTextToActual)
    if firstWord == "enter":
        if not named:
            # nobody we know entered, so the stage is unchanged
            continue
        newGroup = onStage + named
    elif firstWord == "exit":
        # Then a character exits: the ones named in the direction or, when
        # none is named, the speaking character.
        if named:
            leavers = named
        elif sp in inTextToActual:
            leavers = [inTextToActual[sp]]
        else:
            # The speaker is unknown (blank placeholder); nobody can be
            # removed, and looking the placeholder up would raise KeyError.
            leavers = []
        newGroup = [c for c in onStage if c not in leavers]
    # The pattern only matches enter/exit/exeunt, so from here on the
    # direction is an "exeunt".
    elif ac.lower().strip() == "exeunt":
        # a bare "Exeunt": everyone leaves
        newGroup = []
    elif "all but" in ac.lower():
        # everyone leaves except the named characters
        newGroup = named
    else:
        # only the named characters leave
        newGroup = [c for c in onStage if c not in named]
    # Keep each character at most once per group, preserving order; a
    # character can be listed again when it "enters" while already on stage.
    deduped = []
    for c in newGroup:
        if c not in deduped:
            deduped.append(c)
    charactersTogether.append(deduped)
    ind += 1
    
 # With the stage groups known, build the adjacency matrix: two characters
 # are connected when they appear together in at least one group.
 nChars = len(charNames)
 adj = np.zeros((nChars,nChars))
 # Look matrix positions up in a dict; charNames.index() would rescan the
 # whole list for every pair.  charNames comes from a set, so each name has
 # exactly one position.
 charIndex = {n:i for i,n in enumerate(charNames)}
 # Loop over all ordered pairs of characters in each stage group and set
 # their connection in the adjacency matrix to 1.  Many groups share pairs,
 # so the same cells are written repeatedly.
 for sitch in charactersTogether:
    for a,b in itertools.product(sitch,sitch):
        if a == b:
            continue
        adj[charIndex[a],charIndex[b]] = 1
 # make the graph and save it as a .gexf for Gephi
 G = nx.Graph(adj)
 G = nx.relabel_nodes(G,dict(enumerate(charNames)))
 # filename[:-4] drops the four-character ".htm" extension
 nx.write_gexf(G,"%s.gexf"%filename[:-4])
	import networkx as nx
	from lxml import etree
	import re
	import itertools

	def getNamesInAction(action,textNames,nameDict):
	    """Return the graph names of the characters mentioned in *action*.

	    textNames holds the spellings as they occur in the play text and
	    nameDict maps each of those spellings to the name used for the graph.
	    Spellings are tried longest first and every hit is blanked out of a
	    working copy of the text, so a short name that is part of a longer,
	    already matched name is only reported if it also occurs elsewhere.
	    """
	    act = action
	    sortNames = sorted(textNames, key=len, reverse=True)
	    returnNames = []
	    for nm in sortNames:
	        if nm in act:
	            returnNames.append(nameDict[nm])
	            # blank the hit so shorter names inside it cannot match again
	            act = act.replace(nm," ")
	    return returnNames

	# Point this at the saved .htm file of the play, wherever you keep it.
	# The full plays can be found at:
	# http://shakespeare.mit.edu/
	# NOTE(review): the earlier copy of this script in this file spells the
	# path with two spaces before "Entire"; confirm which one matches the
	# file on disk.
	filename = "ShakespearGraphs/Antony and Cleopatra Entire Play.htm"
	# Read through a context manager so the handle is closed even if the read
	# or the parse raises.
	with open(filename) as f:
	    html = etree.HTML(f.read())
	# get all the characters in the play (the text of every <b> element)
	chars = html.xpath("//b/text()")
	# Map every spelling found in the text to the spelling used for the graph:
	# runs of two or more whitespace characters become one space, then the
	# name is title-cased.
	inTextToActual = {c: re.sub(r"\s\s+", " ", c).title() for c in chars}
	# the character names as we want them
	charNames = list(set(inTextToActual.values()))
	# the character names as they are in the text
	charTextNames = list(set(inTextToActual.keys()))
	# The italics elements contain all of the entrances and exits (and other
	# things).  Go through them and record the text of every action together
	# with the speaker that the action is associated with.
	#
	# The general structure of the play is:
	# <a>
	#    <b> <name of speaker> </b>
	# </a>
	# <blockquote>
	#    ....
	#    <p>
	#       <i> <the action that happens> </i>
	#    </p>
	# </blockquote>
	# so we back up from the italics to get the name of the speaker.
	# This lets us associate an "exit" with a character name.
	italics = html.xpath("//i")
	speaker = []
	action = []
	for i in italics:
	    ac = i.findtext(".")
	    try:
	        sp = i.getparent().getparent().getprevious().xpath("b/text()")[0]
	    except (AttributeError, IndexError):
	        # AttributeError: one of the parent/previous lookups returned None.
	        # IndexError: the preceding element holds no <b> text.
	        # Either way there is no speaker, so store a blank placeholder to
	        # keep speaker and action the same length.
	        sp = " "
	    speaker.append(sp)
	    action.append(ac)
	# Build the list of groups of characters that are on stage at the same time.
	# The stage starts empty; every entrance or exit appends a new group derived
	# from the most recent one, and ind always points at that latest group.
	# The alternation must use plain "|"; escaped as "\|" the pattern would only
	# match the literal pipe-separated text and never a stage direction.
	directionPattern = re.compile("(Enter|enter|Exeunt|Exit|exeunt|exit)")
	charactersTogether = [[]]
	ind = 0
	for ac,sp in zip(action,speaker):
	    firstWord = directionPattern.search(ac)
	    if firstWord is None:
	        # not a direction that moves anybody on or off the stage
	        continue
	    firstWord = firstWord.group().lower()
	    onStage = charactersTogether[ind]
	    named = getNamesInAction(ac,charTextNames,inTextToActual)
	    if firstWord == "enter":
	        if not named:
	            # nobody we know entered, so the stage is unchanged
	            continue
	        newGroup = onStage + named
	    elif firstWord == "exit":
	        # Then a character exits: the ones named in the direction or, when
	        # none is named, the speaking character.
	        if named:
	            leavers = named
	        elif sp in inTextToActual:
	            leavers = [inTextToActual[sp]]
	        else:
	            # The speaker is unknown (blank placeholder); nobody can be
	            # removed, and looking the placeholder up would raise KeyError.
	            leavers = []
	        newGroup = [c for c in onStage if c not in leavers]
	    # The pattern only matches enter/exit/exeunt, so from here on the
	    # direction is an "exeunt".
	    elif ac.lower().strip() == "exeunt":
	        # a bare "Exeunt": everyone leaves
	        newGroup = []
	    elif "all but" in ac.lower():
	        # everyone leaves except the named characters
	        newGroup = named
	    else:
	        # only the named characters leave
	        newGroup = [c for c in onStage if c not in named]
	    # Keep each character at most once per group, preserving order; a
	    # character can be listed again when it "enters" while already on stage.
	    deduped = []
	    for c in newGroup:
	        if c not in deduped:
	            deduped.append(c)
	    charactersTogether.append(deduped)
	    ind += 1

	# With the stage groups known, build the adjacency matrix: two characters
	# are connected when they appear together in at least one group.
	nChars = len(charNames)
	adj = np.zeros((nChars,nChars))
	# Look matrix positions up in a dict; charNames.index() would rescan the
	# whole list for every pair.  charNames comes from a set, so each name has
	# exactly one position.
	charIndex = {n:i for i,n in enumerate(charNames)}
	# Loop over all ordered pairs of characters in each stage group and set
	# their connection in the adjacency matrix to 1.  Many groups share pairs,
	# so the same cells are written repeatedly.
	for sitch in charactersTogether:
	    for a,b in itertools.product(sitch,sitch):
	        if a == b:
	            continue
	        adj[charIndex[a],charIndex[b]] = 1
	# make the graph and save it as a .gexf for Gephi
	G = nx.Graph(adj)
	G = nx.relabel_nodes(G,dict(enumerate(charNames)))
	# filename[:-4] drops the four-character ".htm" extension
	nx.write_gexf(G,"%s.gexf"%filename[:-4])
No results found