Created
November 13, 2015 21:16
-
-
Save ejcer/aab9517eb2bee9e8e800 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import xml.etree.ElementTree as ET | |
from math import sqrt | |
import json | |
import numpy as np | |
import pandas as pd | |
from pandas import Series, DataFrame | |
import os | |
import io | |
# Every relative path used by this script (the XML input and the JSON output)
# is resolved against this hard-coded project directory.
project_dir = '/home/edward/workspace/school/context_slices'
os.chdir(project_dir)
# The return value is discarded here; presumably a leftover from interactive
# use, where the current directory would be echoed back.
os.getcwd()
# combine aliases to a uniform name
def filter(item, tag, aliases):
    """Return the primary name for *item* if an alias record covers it.

    Each record in *aliases* is a sequence laid out as
    ``[type_tag, primary_name, other_name, ...]``.

    NOTE: the name shadows the built-in ``filter``; it is kept because the
    rest of this script calls the function under this name.

    Args:
        item: the entity text to normalise.
        tag: the entity type (XML tag name) of *item*.
        aliases: iterable of alias records as described above.

    Returns:
        The record's primary name when a record of type *tag* lists *item*
        among its names; otherwise *item* unchanged.
    """
    for alias in aliases:
        # Match the tag only against the type slot and the item only against
        # the name slots.  Testing membership in the whole record would let
        # an item equal to the type tag (or a tag equal to a name) match.
        # Records too short to hold a primary name are skipped.
        if len(alias) > 1 and alias[0] == tag and item in alias[1:]:
            return alias[1]
    return item
# find co-occurrence of items
def cooccur(item1, item2, tree_root=None):
    """Count the documents in which both *item1* and *item2* occur.

    A document is any ``document`` element found under the tree root.  The
    texts of its direct children are considered, except children tagged
    ``docID`` or ``docText`` and children without text.

    NOTE(review): child texts are compared verbatim; no alias normalisation
    is applied here -- confirm that this is intended.

    Args:
        item1: first entity text.
        item2: second entity text (may equal *item1*).
        tree_root: element to search; defaults to the module-level ``root``
            so existing two-argument calls keep working.

    Returns:
        The number of documents containing both texts.
    """
    if tree_root is None:
        # Fall back to the tree parsed at module level by this script.
        tree_root = root
    cooccurence = 0
    for doc in tree_root.iter('document'):
        # A set gives constant-time membership tests for the two look-ups.
        texts = set(
            child.text
            for child in doc
            if child.text is not None and child.tag not in ('docID', 'docText')
        )
        if item1 in texts and item2 in texts:
            cooccurence += 1
    return cooccurence
#build item-item matrix | |
# def item_item(): | |
# item_item = [] | |
# for pri_item in items: | |
# cooccurence = 0 | |
# row = [] | |
# #print pri_item | |
# for itr_item in items: | |
# #print itr_item | |
# cooccurence = cooccur(pri_item, itr_item) | |
# row.append(cooccurence) | |
# #print cooccurence | |
# item_item.append(row) | |
# return item_item | |
# Parse the corpus; the path is relative to the current working directory.
tree = ET.parse('datafiles/crescent.xml')
root = tree.getroot()

# build aliases dictionary
# Each record in `aliases` is [type_tag, primary_name, other_name, ...];
# `aliasedType` collects the type tags that have at least one alias.
aliasedType = []
aliases = []
for name in root.iter('alias'):
    pid = []
    oid = []
    # The first grandchild supplies the record's type tag and primary name.
    alias = [name[0][0].tag, name[0][0].text]
    for n in name:
        if "primary" in n.tag:
            pid.append(n)
        if "other" in n.tag:
            oid.append(n)
            alias.append(n[0].text)
    aliases.append(alias)
    # check if primary id and other id are of the same type
    for o_index, o in enumerate(oid):
        for p in pid:
            # Compare the wrapped type tags.  The previous `o == p` compared
            # two distinct Element objects, which never compare equal, so
            # the mismatch report below could not fire.
            if o[0].tag != p[0].tag:
                # Single-argument print(...) emits the same text under the
                # Python 2 print statement and the Python 3 function.
                print("%s %s" % ("xml file error: id of different type ===> index: ", o_index))
                break
            # extract the types that have aliases; test and append the same
            # value (the record's type tag) so the list stays duplicate-free
            elif alias[0] not in aliasedType:
                aliasedType.append(alias[0])
##print aliasedType, aliases | |
#doc_item = doc_item() | |
# find items: the distinct entity texts of all documents, with aliased
# names replaced by their primary name, in sorted order
items = []
seen_items = set()
for doc in root.iter('document'):
    for child in doc:
        item = child.text
        # Skip empty elements and the document's id / full-text fields.
        if item is None or child.tag in ('docID', 'docText'):
            continue
        if child.tag in aliasedType:
            item = filter(item, child.tag, aliases)
        # De-duplicate AFTER alias normalisation.  Checking the raw text
        # first would append the same primary name once per alias spelling.
        if item not in seen_items:
            seen_items.add(item)
            items.append(item)
items.sort()
item_item = []
# Node/link structure that is serialised to JSON at the end of the script.
# Each node's "group" is its index in `items`; links refer to nodes by index.
graph = {"nodes": [], "links": []}
for idx, node_name in enumerate(items):
    graph["nodes"].append({"group": idx, "name": node_name})

link_count = 0
# NOTE(review): every ordered pair is visited, so self pairs (idx1 == idx2)
# and both (a, b) and (b, a) are emitted -- confirm the consumer expects that.
for idx1, item1 in enumerate(items):
    for idx2, item2 in enumerate(items):
        cooccurence = cooccur(item1, item2)
        # Pairs seen together more than once are always linked; pairs seen
        # exactly once are linked only while fewer than 100 links exist.
        # `== 1` replaces `is 1`: identity comparison against an int literal
        # only works through CPython's small-integer caching.
        if cooccurence > 1 or (cooccurence == 1 and link_count < 100):
            graph["links"].append({"source": idx1, "target": idx2, "value": cooccurence})
            link_count += 1
# for idx1, item1 in enumerate(items): | |
# cooccurence = 0 | |
# row = [] | |
# print "this happened" | |
# for idx2, item2 in enumerate(items): | |
# cooccurence = cooccur(item1, item2) | |
# row.append(cooccurence) | |
# item_item.append(row) | |
# print item_item | |
# | |
# df = DataFrame(item_item, columns = items, index=items) | |
# df.to_csv("item_item_dataframe.csv") | |
#node_links = {"nodes":[{"name":"Myriel","group":1}], "links":[{"source":1,"target":0,"value":1}]} | |
# Write the graph as pretty-printed UTF-8 JSON, keeping non-ASCII characters
# unescaped in the output.
with io.open('text-team-webPrototype/graph.json', 'w', encoding='utf-8') as f:
    graph_json = json.dumps(
        graph,
        indent=4,
        separators=(',', ': '),
        ensure_ascii=False,
    )
    # A text-mode io.open file accepts only unicode objects under Python 2,
    # hence the explicit coercion before writing.
    f.write(unicode(graph_json))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment