Created
November 29, 2013 10:41
-
-
Save butlermh/7704077 to your computer and use it in GitHub Desktop.
Python script to convert the Apple forum data set (http://sifaka.cs.uiuc.edu/~wang296/Data/index.html) to GML so that it can be imported into Gephi.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import glob
import itertools
import os

import networkx as nx
from dateutil.parser import parse
from unidecode import unidecode
'''This file requires the networkx, unidecode and python-dateutil packages to be installed, e.g.
easy_install networkx
easy_install unidecode
easy_install python-dateutil'''
class ForumToGraph(object):
    """Convert the Apple Forums thread files into GML graphs.

    Two undirected graphs are built:

    * ``author_graph`` -- one node per author; two authors are linked when
      they contributed to the same topic.
    * ``topic_graph`` -- one node per topic; two topics are linked when they
      had contributions from the same author.

    Every node carries ``number_of_posts``, ``oldest`` and ``newest`` (the
    timestamp strings exactly as read from the input) and, once
    ``add_topic_links`` has run, ``length_of_posting`` in whole days.
    Topic nodes additionally carry ``title``.
    """

    def __init__(self):
        self.topic_graph = nx.Graph()
        self.author_graph = nx.Graph()
        self.author_to_topics = {}  # author id -> set of topic ids
        self.topic_to_authors = {}  # topic id -> set of author ids
        # Parsed (oldest, newest) datetimes per node. Kept beside the graphs
        # so each timestamp is parsed only once and the graphs themselves
        # hold nothing but GML-friendly strings and numbers.
        self._author_span = {}
        self._topic_span = {}

    @staticmethod
    def _attrs(graph, node_id):
        """Return the mutable attribute dict of ``node_id`` in ``graph``.

        networkx 2.x exposes it as ``graph.nodes[n]`` (``graph.node`` was
        removed in 2.4); in networkx 1.x ``nodes`` is a plain method, so
        subscripting it raises TypeError and ``graph.node[n]`` is used.
        """
        try:
            return graph.nodes[node_id]
        except TypeError:
            return graph.node[node_id]

    @staticmethod
    def _mean(total, count):
        """Return ``total / count`` as a float, or 0.0 when ``count`` is 0."""
        return float(total) / count if count else 0.0

    @staticmethod
    def _parse_time(text):
        """Return ``text`` parsed as a datetime, or None if empty/unparsable."""
        if not text:
            return None
        try:
            return parse(text)
        except (ValueError, OverflowError):
            return None

    @classmethod
    def _days_between(cls, attrs):
        """Return whole days between ``attrs['oldest']`` and ``attrs['newest']``.

        Returns 0 when either timestamp is missing or cannot be parsed.
        """
        oldest = cls._parse_time(attrs.get('oldest', ''))
        newest = cls._parse_time(attrs.get('newest', ''))
        if oldest is None or newest is None:
            return 0
        return (newest - oldest).days

    @staticmethod
    def _record_time(attrs, spans, key, post_time, when):
        """Widen the oldest/newest range of one node to include a post.

        ``attrs`` receives the raw ``post_time`` string; ``spans[key]`` holds
        the matching parsed datetimes, which are what is actually compared
        (comparing the raw strings would order them alphabetically).
        """
        oldest, newest = spans.get(key, (None, None))
        if oldest is None or when < oldest:
            oldest = when
            attrs['oldest'] = post_time
        if newest is None or when > newest:
            newest = when
            attrs['newest'] = post_time
        spans[key] = (oldest, newest)

    def _start_topic(self, topic_id, title):
        """Register ``topic_id`` as a fresh topic containing one post."""
        self.topic_to_authors[topic_id] = set()
        self._topic_span.pop(topic_id, None)
        self.topic_graph.add_node(topic_id, title=title, number_of_posts=1,
                                  oldest='', newest='')

    def read_file(self, filename):
        """Read one thread file and store it in the topic and author graphs.

        The first ``<ID>`` line names the topic; every further ``<ID>`` line
        counts as one more post in it. The first non-empty ``<Title>`` becomes
        the topic title. ``<Author>`` and ``<Time>`` lines update the post
        counts and the oldest/newest timestamps. If two authors contributed
        to the same topic they are regarded as being connected.

        ``<Author>``/``<Time>`` lines seen before any ``<ID>`` are ignored,
        and a timestamp that cannot be parsed is skipped (its post is still
        counted).
        """
        topic_id = ''
        topic_title = ''
        author_id = None
        with open(filename) as f:
            for line in f:
                if line.startswith('<ID>'):
                    if topic_id == '':
                        topic_id = line[4:].strip()
                        if topic_id:
                            # Create the node right away so later lines can
                            # rely on it even if <Title> is late or missing.
                            self._start_topic(topic_id, topic_title)
                    else:
                        self._attrs(self.topic_graph,
                                    topic_id)['number_of_posts'] += 1
                elif line.startswith('<Title>'):
                    if topic_title == '':
                        topic_title = unidecode(line[7:].strip())
                        print('%s %s' % (filename, topic_title))
                        if topic_id:
                            self._attrs(self.topic_graph,
                                        topic_id)['title'] = topic_title
                elif topic_id == '':
                    # No topic to attach this line to yet.
                    continue
                elif line.startswith('<Author>'):
                    # unfortunately gml does not support unicode
                    author_id = unidecode(line[8:].strip().replace(' ', '_'))
                    if author_id in self.author_graph:
                        self._attrs(self.author_graph,
                                    author_id)['number_of_posts'] += 1
                    else:
                        self.author_graph.add_node(author_id,
                                                   number_of_posts=1,
                                                   oldest='', newest='')
                    self.author_to_topics.setdefault(author_id,
                                                     set()).add(topic_id)
                    self.topic_to_authors[topic_id].add(author_id)
                elif line.startswith('<Time>'):
                    post_time = line[6:].strip()
                    when = self._parse_time(post_time)
                    if when is None:
                        continue
                    if author_id is not None:
                        self._record_time(
                            self._attrs(self.author_graph, author_id),
                            self._author_span, author_id, post_time, when)
                    self._record_time(
                        self._attrs(self.topic_graph, topic_id),
                        self._topic_span, topic_id, post_time, when)
        # The graph is undirected, so each unordered pair is needed only once;
        # sorting keeps the edge order reproducible between runs.
        authors = sorted(self.topic_to_authors.get(topic_id, ()))
        for author_a, author_b in itertools.combinations(authors, 2):
            self.author_graph.add_edge(author_a, author_b)

    def add_topic_links(self):
        """Compute posting durations and link topics that share an author.

        Sets ``length_of_posting`` (whole days between the oldest and newest
        post, 0 when unknown) on every topic and author node, and adds an
        edge between every two topics that had contributions from the same
        author.
        """
        for topic_id in self.topic_to_authors:
            attrs = self._attrs(self.topic_graph, topic_id)
            attrs['length_of_posting'] = self._days_between(attrs)
        for author_id, topics in self.author_to_topics.items():
            attrs = self._attrs(self.author_graph, author_id)
            attrs['length_of_posting'] = self._days_between(attrs)
            for topic_a, topic_b in itertools.combinations(sorted(topics), 2):
                self.topic_graph.add_edge(topic_a, topic_b)

    def print_statistics(self):
        """Print summary statistics for both graphs to standard output.

        ``add_topic_links`` must have been called first, because the
        ``length_of_posting`` attribute it sets is read here. With no
        authors or topics loaded, every ratio is reported as zero.
        """
        print("Author graph")
        print("------------")
        num_authors = len(self.author_to_topics)
        print("Total number of authors %s" % num_authors)
        print("Total number of links between authors %s"
              % self.author_graph.number_of_edges())
        author_no_contacts = 0
        only_one_post = 0
        sum_contacts = 0
        sum_posts = 0
        sum_membership_period = 0
        for author_id in self.author_to_topics:
            # len() of the adjacency view works on networkx 1.x and 2.x,
            # unlike len(G.neighbors(n)), which is an iterator on 2.x.
            contacts = len(self.author_graph[author_id])
            if contacts == 0:
                author_no_contacts += 1
            sum_contacts += contacts
            attrs = self._attrs(self.author_graph, author_id)
            posts = attrs['number_of_posts']
            if posts == 1:
                only_one_post += 1
            sum_posts += posts
            sum_membership_period += attrs['length_of_posting']
        percent = 100.00
        print("Total number of authors with no contacts %s = %s %%"
              % (author_no_contacts,
                 self._mean(author_no_contacts * percent, num_authors)))
        print("Total number of authors who only posted once %s = %s %%"
              % (only_one_post,
                 self._mean(only_one_post * percent, num_authors)))
        # Convert to float *before* dividing; dividing first truncates the
        # average to an integer under Python 2.
        print("Average number of contacts per author %s"
              % self._mean(sum_contacts, num_authors))
        print("Average number of posts per author %s"
              % self._mean(sum_posts, num_authors))
        # Day averages are deliberately reported as whole days.
        average_membership = (sum_membership_period // num_authors
                              if num_authors else 0)
        print("Average length of author membership %s days"
              % average_membership)
        print("Topic graph")
        print("-----------")
        topic_num = len(self.topic_to_authors)
        print("Total number of topics %s" % topic_num)
        print("Total number of links between topics %s"
              % self.topic_graph.number_of_edges())
        sum_topic_lifetime = sum(
            self._attrs(self.topic_graph, topic_id)['length_of_posting']
            for topic_id in self.topic_to_authors)
        average_lifetime = sum_topic_lifetime // topic_num if topic_num else 0
        print("Average topic lifetime %s days" % average_lifetime)
        topic_only_one_author = sum(
            1 for authors in self.topic_to_authors.values()
            if len(authors) == 1)
        print("Total number of topics with only one author %s = %s %%"
              % (topic_only_one_author,
                 self._mean(topic_only_one_author * percent, topic_num)))

    def save_graphs(self):
        """Store the graphs in GML format so they can be processed with Gephi.

        Writes ``authors.gml`` and ``topics.gml`` to the current directory.
        """
        # GML file of the author graph (authors linked by shared topics)
        nx.write_gml(self.author_graph, 'authors.gml')
        # GML file of the topic graph (topics linked by shared authors)
        nx.write_gml(self.topic_graph, 'topics.gml')

    def read_dir(self, file_path):
        """Read every ``*.html.token`` file in each sub-directory of ``file_path``.

        Entries of ``file_path`` that are not directories are skipped. Files
        are opened by their joined path, so the process working directory is
        never changed; a relative ``file_path`` is resolved against the
        caller's working directory.
        """
        for directory in sorted(os.listdir(file_path)):
            subdir = os.path.join(file_path, directory)
            if not os.path.isdir(subdir):
                continue
            for name in sorted(os.listdir(subdir)):
                # Same selection as glob('*.html.token'), which also leaves
                # out dot-files, without tripping over glob metacharacters
                # in the directory names.
                if name.endswith('.html.token') and not name.startswith('.'):
                    self.read_file(os.path.join(subdir, name))
# Script driver: ingest every thread file found under the relative directory
# 'apple', derive the topic links and posting durations, report the summary
# statistics, then write the GML files.
f = ForumToGraph()
f.read_dir('apple')
# add_topic_links() sets the 'length_of_posting' attribute that
# print_statistics() reads, so it has to run first.
f.add_topic_links()
f.print_statistics()
f.save_graphs()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment