Created
June 2, 2010 18:21
-
-
Save anonymous/422761 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Target is to generate a graph file in gexf format (http://gexf.net/format) for Gephi | |
#----------------------------------------------------- | |
# STEP 1 | |
# Generate nodes and edgelist from each email log file | |
#----------------------------------------------------- | |
setwd("C:/R") | |
# use sqldf for operations suited for db http://code.google.com/p/sqldf/ | |
library(sqldf) | |
# define utility functions | |
object.sizes <- function(obs=ls(envir=.GlobalEnv)){return(rev(sort(sapply(obs, function (object.name) object.size(get(object.name))))))} | |
# Create an empty data frame from a header list | |
empty.df<- function(header){ | |
df<-data.frame(matrix(matrix(rep(1,length(header)),1),1)) | |
colnames(df)<-header | |
return(df[NULL,]) | |
} | |
# break down large data problem into smaller ones | |
max_rows<-200000 | |
filelist<-c("Emails Dec 2009.txt", "Emails Jan 2010.txt", "Emails Feb 2010.txt") | |
# data format in the email logs is as below | |
# date from_address from_name to_address to_name | |
# 12-12-2009 [email protected] John Doe [email protected] Jane Smith | |
for(k in 1:length(filelist)){ | |
system.time(raw_emails<-read.csv(filelist[k],sep="\t",header=T,strip.white=TRUE)) | |
# remove system-generated emails | |
emails<-raw_emails[-grep('donotreply|error|webmaster|www', paste(raw_emails[,3],raw_emails[,5]), ignore.case=TRUE, value=FALSE),] | |
# filenames for collecting nodes and edges | |
node_file<-paste("nodes",k,"-",filelist[k],sep="") | |
edge_file<-paste("edges",k,"-",filelist[k],sep="") | |
# get to_nodes in "node_id, node_label" format | |
to_nodes<-emails[c(-1,-2,-3)] | |
# get from_nodes in "node_id, node_label" format | |
from_nodes<-emails[c(-1,-4,-5)] | |
# get edgelist in "from_node_id, to_node_id" format | |
edgelist<-emails[c(-1,-3,-5)] | |
# change column names for rbind | |
colnames(to_nodes)<-c("id","name") | |
colnames(from_nodes)<-c("id","name") | |
all_nodes<-rbind(to_nodes,from_nodes) | |
# convert all nodes and edgelist to lowercase... using SQL | |
system.time(all_nodes_lowercase<-sqldf('SELECT LOWER(id) uid, LOWER(name) label FROM all_nodes')) | |
system.time(edgelist_lowercase<-sqldf('SELECT LOWER(Originator) originator, LOWER(Recipient) recipient FROM edgelist')) | |
unique_nodes<-unique(all_nodes_lowercase) | |
sorted_unique_nodes<- unique_nodes[order(unique_nodes[,1]),] | |
write.csv(sorted_unique_nodes, file = node_file, row.names=FALSE, quote = FALSE) | |
num_blocks<-ceiling(nrow(edgelist_lowercase)/max_rows) | |
start_row<-0 | |
edgecount <- empty.df(c("originator","recipient","count(1)")) | |
for(i in 1:num_blocks){ | |
sql_statement<-paste('select originator, recipient, count(1) FROM (select originator, recipient FROM edgelist_lowercase LIMIT ', start_row, ',', max_rows, ') group by originator, recipient order by originator, recipient') | |
print(system.time(counts<-sqldf(sql_statement))) | |
edgecount<- rbind(edgecount, counts) | |
start_row<-start_row + max_rows | |
} | |
system.time(sqldf("create index edgecount1 on edgecount (originator, recipient)")) | |
system.time(final_edgecount <- sqldf("select originator, recipient, sum(count_1_) FROM edgecount group by originator, recipient order by originator, recipient")) | |
write.csv(final_edgecount, file = edge_file, row.names=FALSE, quote = FALSE) | |
} | |
#---------------------------------------------------------------------- | |
# STEP 2 | |
# Combine node and edgelist files into one large node and edgelist file | |
#---------------------------------------------------------------------- | |
all_file_nodes <- empty.df(c("id","label")) | |
all_file_edges <- empty.df(c("originator","recipient", "sum.count_1_.")) | |
for(k in 1:length(filelist)){ | |
node_file<-paste("nodes",k,"-",filelist[k],sep="") | |
edge_file<-paste("edges",k,"-",filelist[k],sep="") | |
# read each node file and rbind with all_file_nodes | |
system.time(nodes<-read.csv(node_file,sep=",",header=T,strip.white=TRUE)) | |
all_file_nodes <- rbind(all_file_nodes, nodes) | |
# read each edge file and rbind with all_file_edges | |
system.time(edges<-read.csv(edge_file,sep=",",header=T,strip.white=TRUE)) | |
all_file_edges <- rbind(all_file_edges, edges) | |
} | |
unique_all_file_nodes<-unique(all_file_nodes) | |
sorted_unique_all_file_nodes<- unique_all_file_nodes[order(unique_all_file_nodes[,1]),] | |
# write nodes in this form --- <node id="0" label="Hello" /> | |
nodexml<-paste("<node id=\"", sorted_unique_all_file_nodes[,1], "\"", " label=\"", sorted_unique_all_file_nodes[,2],"\""," />", sep="") | |
write.csv(as.data.frame(nodexml, optional=TRUE), file = "All Nodes.txt", quote = FALSE, row.names = FALSE) | |
# edge operations | |
# use pragma table_info to see the table attributes to use in sum sql below | |
# sqldf("pragma table_info(all_file_edges)") | |
unique_all_file_edges<-sqldf('select originator, recipient, sum(sum_count_1__) FROM all_file_edges group by originator, recipient order by sum(sum_count_1__)') | |
nrow(unique_all_file_edges) | |
# filter out edges with wt 0 to 2 | |
thicker_edges<-unique_all_file_edges[unique_all_file_edges[,3]>3,] | |
# write edges in this form --- <edge id="0" source="0" target="1" type="directed" weight="2.4" /> | |
edgelistxml<-paste("<edge id=\"", rownames(thicker_edges), "\" ", "source=\"", thicker_edges[,1], "\" target=\"", thicker_edges[,2], "\" weight=\"", thicker_edges[,3], "\"/>", sep="") | |
# write edges for gexf file. Convert to data.frame to prevent printing the column name | |
write.csv(as.data.frame(edgelistxml, optional=TRUE), file = "All Edges.txt", quote = FALSE, row.names = FALSE) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment