|
""" |
|
|
|
Reads a GraphML file and instantiates the right type of NetworkX graph. |
|
|
|
Notes: |
|
1. This is still work in progress. In its current state, the code will comfortably read "simple" GraphML files. |
|
2. The next step is to enable the routine to selectively read in (graph, node, edge) level data based on the |
|
namespace used. |
|
3. This will probably be implemented with named tuples and it should support at least simple reads (if not full |
|
complex writes too). |
|
|
|
:author:Athansios Anastasiou |
|
:date: April 2020 |
|
""" |
|
import sys |
|
import lxml.etree |
|
import networkx |
|
|
|
# Namespace URI that identifies a GraphML document.
GRAPHML_NAMESPACE = "http://graphml.graphdrawing.org/xmlns"

# Prefix bound to the GraphML namespace when the document declares it as the
# default (un-prefixed) namespace. XPath cannot address an un-prefixed
# namespace, so an artificial prefix has to be registered for it.
FALLBACK_PREFIX = "_q_q"


class GraphMLFormatError(Exception):
    """Raised when the input file is not a usable GraphML document."""


def parse_boolean(text):
    """Convert a GraphML boolean literal to a ``bool``.

    ``bool("false")`` is ``True`` (any non-empty string is truthy), so the
    builtin cannot be used to marshal GraphML booleans.

    :param text: The literal, e.g. ``"true"``, ``"False"``, ``"1"``, ``"0"``.
    :returns: The corresponding boolean.
    :raises ValueError: If ``text`` is not a recognised boolean literal.
    """
    literal = text.strip().lower()
    if literal in ("true", "1"):
        return True
    if literal in ("false", "0"):
        return False
    raise ValueError(f"Invalid GraphML boolean value: {text!r}")


# Maps the attr.type of GraphML to the callable producing the native value.
DATA_TYPE_CONVERSION_MAP = {
    "boolean": parse_boolean,
    "int": int,
    "long": int,
    "float": float,
    "double": float,
    "string": str,
}


def convert_value(raw_text, attr_type):
    """Marshal the text of a ``<data>``/``<default>`` element to a native value.

    :param raw_text: Element text, or ``None`` for an empty element.
    :param attr_type: The ``attr.type`` declared by the corresponding key.
    :returns: The converted value. An empty element yields ``""`` for strings
        and ``None`` (meaning "no value") for every other type.
    :raises KeyError: If ``attr_type`` is not a known GraphML type.
    :raises ValueError: If the text cannot be converted to the declared type.
    """
    converter = DATA_TYPE_CONVERSION_MAP[attr_type]
    if raw_text is None:
        return "" if converter is str else None
    return converter(raw_text)


def resolve_graphml_prefix(nsmap):
    """Find the prefix of the GraphML namespace and build an XPath-safe map.

    :param nsmap: Mapping of prefix (``None`` for the default namespace) to
        namespace URI, as declared on the document root.
    :returns: ``None`` if no GraphML namespace is declared, otherwise a tuple
        ``(prefix, xpath_nsmap)``. ``xpath_nsmap`` never contains the ``None``
        prefix because XPath evaluation rejects it.
    """
    matches = [(prefix, uri) for prefix, uri in nsmap.items()
               if GRAPHML_NAMESPACE in uri]
    if not matches:
        return None
    prefix, uri = matches[0]
    # Always drop the default namespace entry, even when GraphML itself has a
    # named prefix: any None key makes the XPath call fail.
    xpath_nsmap = {key: value for key, value in nsmap.items()
                   if key is not None}
    if prefix is None:
        prefix = FALLBACK_PREFIX
        # Make sure the artificial prefix cannot overwrite a declared one.
        while prefix in xpath_nsmap:
            prefix += "_"
        xpath_nsmap[prefix] = uri
    return prefix, xpath_nsmap


def read_key_specs(doc, domain, glns, nsmap):
    """Describe the attribute keys that apply to ``domain`` elements.

    Keys without an ``id`` or an ``attr.name`` (for example vendor extension
    keys) are skipped because they cannot be marshalled to a named attribute.

    :param doc: The parsed document.
    :param domain: ``"node"`` or ``"edge"``.
    :param glns: Prefix bound to the GraphML namespace in ``nsmap``.
    :param nsmap: Prefix to namespace URI map used for XPath evaluation.
    :returns: A list of dictionaries with the entries ``id``, ``name``,
        ``type`` and, only when the key declares one, ``default`` (raw text).
    """
    key_elements = doc.xpath(
        f"/{glns}:graphml/{glns}:key"
        f"[@for='{domain}' or @for='all' or not(@for)]",
        namespaces=nsmap,
    )
    specs = []
    for key in key_elements:
        key_id = key.get("id")
        name = key.get("attr.name")
        if key_id is None or name is None:
            continue
        spec = {"id": key_id, "name": name,
                "type": key.get("attr.type", "string")}
        defaults = key.xpath(f"{glns}:default", namespaces=nsmap)
        if defaults:
            spec["default"] = defaults[0].text
        specs.append(spec)
    return specs


def read_raw_data(element, glns, nsmap):
    """Return ``{key id: raw text}`` for the ``<data>`` children of an element.

    If the same key appears more than once, the first occurrence wins.
    """
    raw_data = {}
    for data in element.xpath(f"{glns}:data", namespaces=nsmap):
        raw_data.setdefault(data.get("key"), data.text)
    return raw_data


def marshal_data(key_specs, raw_data):
    """Build the attribute dictionary of one node or edge.

    ``<data key="...">`` refers to the ``id`` of a key (not to its
    ``attr.name``). When an element carries no data for a key, the key's
    default value is substituted; if there is no default either, the
    attribute is left out.

    :param key_specs: Key descriptions as produced by ``read_key_specs``.
    :param raw_data: ``{key id: raw text}`` as produced by ``read_raw_data``.
    :returns: ``{attr.name: converted value}``.
    """
    values = {}
    for spec in key_specs:
        if spec["id"] in raw_data:
            raw_text = raw_data[spec["id"]]
        elif "default" in spec:
            raw_text = spec["default"]
        else:
            continue
        value = convert_value(raw_text, spec["type"])
        if value is not None:
            values[spec["name"]] = value
    return values


def edge_pair_key(source, target, is_directed):
    """Return the dictionary key under which parallel edges are grouped.

    In an undirected graph ``a-b`` and ``b-a`` connect the same pair of
    nodes, so the endpoints are put in a canonical order.
    """
    if is_directed:
        return (source, target)
    return tuple(sorted((source, target)))


def load_graphml(input_file):
    """Read a GraphML file into the appropriate type of networkx graph.

    The graph class (``Graph``, ``DiGraph``, ``MultiGraph`` or
    ``MultiDiGraph``) is chosen from the ``edgedefault`` attribute and from
    whether any pair of nodes is connected by more than one edge.

    :param input_file: Path of the GraphML file.
    :returns: The populated networkx graph, including isolated nodes.
    :raises GraphMLFormatError: If the file lacks the GraphML namespace or the
        ``edgedefault`` attribute, contains duplicate node ids, or has an edge
        that refers to an undeclared node.
    """
    doc = lxml.etree.parse(input_file)
    resolved = resolve_graphml_prefix(doc.getroot().nsmap)
    # If you cannot find the namespace, maybe this is not a graphml file
    if resolved is None:
        raise GraphMLFormatError(
            f"{input_file} does not contain the default graphml namespace")
    glns, nsmap = resolved

    # Note, there are more attributes which could be more telling about the
    # nature of the graph, please see:
    # http://graphml.graphdrawing.org/specification/dtd.html#key
    node_keys = read_key_specs(doc, "node", glns, nsmap)
    edge_keys = read_key_specs(doc, "edge", glns, nsmap)
    graph_nodes = doc.xpath(
        f"/{glns}:graphml/{glns}:graph/{glns}:node", namespaces=nsmap)
    graph_edges = doc.xpath(
        f"/{glns}:graphml/{glns}:graph/{glns}:edge", namespaces=nsmap)

    # Is this a directed or undirected network?
    edge_default = doc.xpath(
        f"/{glns}:graphml/{glns}:graph/@edgedefault", namespaces=nsmap)
    if not edge_default:
        raise GraphMLFormatError(
            f"{input_file} does not contain a graph with an edgedefault "
            f"attribute")
    is_directed = edge_default[0].lower() == "directed"

    # Create a dictionary dict<node_id>:<node_data>
    node_dict = {}
    for a_node in graph_nodes:
        node_id = a_node.attrib["id"]
        # Some basic validation here
        if node_id in node_dict:
            raise GraphMLFormatError(
                f"Node id {node_id} is not unique in {input_file}")
        node_dict[node_id] = marshal_data(
            node_keys, read_raw_data(a_node, glns, nsmap))

    # Create a dictionary dict<node pair>:list<(edge_id, edge_data)>. The
    # structure of this dictionary also determines if the graph is a
    # multigraph. The edge id is optional in GraphML, hence it may be None.
    edge_dict = {}
    for an_edge in graph_edges:
        edge_from = an_edge.attrib["source"]
        edge_to = an_edge.attrib["target"]
        for endpoint in (edge_from, edge_to):
            if endpoint not in node_dict:
                raise GraphMLFormatError(
                    f"Edge refers to undeclared node id {endpoint} "
                    f"in {input_file}")
        edge_data = marshal_data(
            edge_keys, read_raw_data(an_edge, glns, nsmap))
        pair = edge_pair_key(edge_from, edge_to, is_directed)
        edge_dict.setdefault(pair, []).append((an_edge.get("id"), edge_data))

    # It is a multigraph if any pair of nodes is connected more than once.
    is_multigraph = any(len(parallel) > 1 for parallel in edge_dict.values())

    # This mapping determines the right networkx data type based on the
    # available data read from the file.
    graph_type_map = {
        (False, False): networkx.Graph,
        (False, True): networkx.MultiGraph,
        (True, False): networkx.DiGraph,
        (True, True): networkx.MultiDiGraph,
    }
    G = graph_type_map[(is_directed, is_multigraph)]()

    # Add every declared node first so that isolated nodes are not lost.
    for node_id, node_data in node_dict.items():
        G.add_node(node_id, **node_data)

    # Now add all edges. For multigraphs "key" identifies the parallel edge;
    # for simple graphs networkx stores it as an ordinary edge attribute.
    for (edge_from, edge_to), parallel in edge_dict.items():
        for edge_id, edge_data in parallel:
            if edge_id is None:
                G.add_edge(edge_from, edge_to, **edge_data)
            else:
                G.add_edge(edge_from, edge_to, key=edge_id, **edge_data)
    return G


if __name__ == "__main__":
    # SETUP THE INPUT FILE HERE, or pass it as the first command line argument
    input_file = sys.argv[1] if len(sys.argv) > 1 else "airlines.graphml"
    try:
        G = load_graphml(input_file)
    except GraphMLFormatError as error:
        print(error)
        sys.exit(1)