Last active
January 18, 2016 19:15
-
-
Save christopherkullenberg/9d44056a222d5ebbdcae to your computer and use it in GitHub Desktop.
swepubgexf.py
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import os
from itertools import combinations
from os import listdir

from lxml import etree as ET
#import xml.etree.ElementTree as ET #Use this if you don't have lxml installed

from gexf import *
# Bookkeeping filled in by the extraction loop further down.
records = 0  # control counter: how many records were read
therecords = []  # collects one {author: institution} dictionary per record

# Create the GEXF document and its single undirected, static graph.
gexf = Gexf("Author-Institution network", "GU")
graph = gexf.addGraph("undirected", "static", "Swepub network")

# Declare the two string node attributes; the returned handles are used
# later when attribute values are attached to author nodes.
attribute_node = graph.addNodeAttribute("University", "default_value", "string")
attribute_nodetwo = graph.addNodeAttribute("Institution", "default_value", "string")
# Directory holding the XML files to read.
DATA_DIR = "GUYear2015N47761Searched20160110/"

# Field tags that carry an author: 100 is the first author, 700 the others.
AUTHOR_TAGS = ("100", "700")

# The loop for extracting author/institution pairs from the xml-files.
# Every record contributes one {author: institution} dictionary to therecords.
for filename in listdir(DATA_DIR):
    # Binary mode lets the XML parser pick the encoding from the document
    # itself instead of decoding with the platform's default text encoding.
    with open(os.path.join(DATA_DIR, filename), "rb") as currentFile:
        tree = ET.parse(currentFile)
    root = tree.getroot()
    for child in root[0]:  # every element under root[0] is counted as one record
        records += 1  # Add to the control counter
        coauthors = {}
        for c in child:  # This iterates over the fields of the record
            if c.get("tag") not in AUTHOR_TAGS:
                continue
            # Forget the author of the previous field, so an institution can
            # never be credited to a name that belongs to another field.
            author = None
            for value in c:
                code = value.get("code")
                if code == "a":  # a is author name
                    author = value.text
                elif code == "u" and author is not None:  # u is institution
                    # An author is only recorded once an institution is seen;
                    # a later "u" in the same field overwrites an earlier one.
                    coauthors[author] = value.text
        therecords.append(coauthors)  # Add each dictionary to the list above
# Drop records that are EXACT duplicates of an earlier one: same
# author/institution items in the same order. Only the first occurrence
# is kept.
# Caveat: two different articles written by an identical group of authors
# also collapse into a single record here.
seen = set()
therecordsdeduplicated = []
for record in therecords:
    fingerprint = tuple(record.items())  # hashable snapshot of the dictionary
    if fingerprint in seen:
        continue
    seen.add(fingerprint)
    therecordsdeduplicated.append(record)
# Build the co-author pairs of every article and register each author as a node.
edges = []  # one list of (author, author) pairs per co-authored article
coauthorcounter = 0  # counts the articles with a minimum of 2 authors
for article in therecordsdeduplicated:
    if len(article) < 2:  # single-author (or empty) records produce no pairs
        continue
    coauthorcounter += 1
    # All possible relations between the authors of this article. Computed
    # once per article: building the list inside the per-author loop below
    # would append every pair once per author and inflate the edge count.
    edges.append(list(combinations(article, 2)))
    for key, value in article.items():
        # Presumably the text reads "University, Institution" -- verify against
        # the data. A missing institution text is treated as an empty string.
        newvalue = (value or "").split(', ')
        n = graph.addNode(key, key)  # the author name serves as both id and label
        # split() always returns at least one element, so index 0 is safe.
        print(newvalue[0])
        n.addAttribute(attribute_node, newvalue[0])
        if len(newvalue) > 1:
            print(newvalue[1])
            n.addAttribute(attribute_nodetwo, newvalue[1])
        else:
            print("nothing here...")
# Flatten the per-article pair lists into one list of (source, target) tuples.
authoredges = [pair for article_pairs in edges for pair in article_pairs]

# Create one graph edge per pair; the position in the flat list is the edge id.
for edge_id, (source, target) in enumerate(authoredges):
    graph.addEdge(edge_id, source, target)
# Print some meta-data
print("There are " + str(records) + " records")
print("There are " + str(coauthorcounter) + " co-authored articles (more than 1 author)\n")

# Write the graph to disk. The with-block closes the file even when writing
# fails, so the output is flushed and the handle is not leaked.
with open("coauthors.gexf", "wb") as gexf_file:
    gexf.write(gexf_file)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment