Skip to content

Instantly share code, notes, and snippets.

@numeroteca
Last active May 17, 2021 10:36
Show Gist options
  • Save numeroteca/aa040b0488c914d1e4a37e40117ef062 to your computer and use it in GitHub Desktop.
Save numeroteca/aa040b0488c914d1e4a37e40117ef062 to your computer and use it in GitHub Desktop.
minor changes to adapt network.py to v2 API for twarc2. See https://github.com/DocNow/twarc/issues/461
#!/usr/bin/env python
# build a reply, quote, retweet network from a file of tweets and write it
# out as a gexf, dot, json or html file. You will need to have networkx
# installed and pydotplus if you want to use dot. The html presentation
# uses d3 to display the network graph in your browser.
#
# ./network.py tweets.jsonl network.html
#
# or
# ./network.py tweets.jsonl network.dot
#
# or
#
# ./network.py tweets.jsonl network.gexf
#
# if you would rather have the network oriented around nodes that are users
# instead of tweets use the --users flag
#
# ./network.py --users tweets.jsonl network.gexf
#
# if you would rather have the network oriented around nodes that are hashtags
# instead of tweets or users, use the --hashtags flag
#
# TODO: this is mostly here some someone can improve it :)
import sys
import json
import networkx
import optparse
import itertools
import time
from networkx import nx_pydot
from networkx.readwrite import json_graph
usage = "network.py tweets.jsonl graph.html"
opt_parser = optparse.OptionParser(usage=usage)
opt_parser.add_option(
"--retweets",
dest="retweets",
action="store_true",
help="include retweets"
)
opt_parser.add_option(
"--min_subgraph_size",
dest="min_subgraph_size",
type="int",
help="remove any subgraphs with a size smaller than this number"
)
opt_parser.add_option(
"--max_subgraph_size",
dest="max_subgraph_size",
type="int",
help="remove any subgraphs with a size larger than this number"
)
opt_parser.add_option(
"--users",
dest="users",
action="store_true",
help="show user relations instead of tweet relations"
)
opt_parser.add_option(
"--hashtags",
dest="hashtags",
action="store_true",
help="show hashtag relations instead of tweet relations"
)
options, args = opt_parser.parse_args()
if len(args) != 2:
opt_parser.error("must supply input and output file names")
tweets, output = args
G = networkx.DiGraph()
def add(from_user, from_id, to_user, to_id, type, created_at=None):
"adds a relation to the graph"
# storing start_data will allow for timestamps for gephi timeline, where nodes will appear on screen at their start dataset
# and stay on forever after
if (options.users or options.hashtags) and to_user:
G.add_node(from_user, screen_name=from_user, start_date=created_at)
G.add_node(to_user, screen_name=to_user, start_date=created_at)
if G.has_edge(from_user, to_user):
weight = G[from_user][to_user]['weight'] + 1
else:
weight = 1
G.add_edge(from_user, to_user, type=type, weight=weight)
elif not options.users and to_id:
G.add_node(from_id, screen_name=from_user, type=type)
if to_user:
G.add_node(to_id, screen_name=to_user)
else:
G.add_node(to_id)
G.add_edge(from_id, to_id, type=type)
def to_json(g):
j = {"nodes": [], "links": []}
for node_id, node_attrs in g.nodes(True):
j["nodes"].append({
"id": node_id,
"type": node_attrs.get("type"),
"screen_name": node_attrs.get("screen_name")
})
for source, target, attrs in g.edges(data=True):
j["links"].append({
"source": source,
"target": target,
"type": attrs.get("type")
})
return j
for line in open(tweets):
try:
t = json.loads(line)
except:
continue
from_id = t['id']
from_user = t['author']['username']
from_user_id = t['author']['id']
to_user = None
to_id = None
# standardize raw created at date to dd/MM/yyyy HH:mm:ss
# created_at_date = time.strftime('%d/%m/%Y %H:%M:%S', time.strptime(t["created_at"],'%a %b %d %H:%M:%S +0000 %Y'))
created_at_date = time.strftime('%d/%m/%Y %H:%M:%S', time.strptime(t["created_at"],'%Y-%m-%dT%H:%M:%S.%fZ'))
if options.users:
for u in t['entities'].get('user_mentions', []):
add(from_user, from_id, u['username'], None, 'reply', created_at_date)
elif options.hashtags:
hashtags = t['entities'].get('hashtags', [])
hashtag_pairs = list(itertools.combinations(hashtags, 2)) # list of all possible hashtag pairs
for u in hashtag_pairs:
# source hashtag: u[0]['text']
# target hashtag: u[1]['text']
add('#' + u[0]['text'], None, '#' + u[1]['text'], None, 'hashtag', created_at_date)
else:
if t.get('in_reply_to_status_id_str'):
to_id = t['in_reply_to_status_id_str']
to_user = t['in_reply_to_screen_name']
add(from_user, from_id, to_user, to_id, "reply")
if t.get('quoted_status'):
to_id = t['quoted_status']['id']
to_user = t['quoted_status']['author']['username']
to_user_id = t['quoted_status']['author']['id']
add(from_user, from_id, to_user, to_id, "quote")
if options.retweets and t.get('retweeted_status'):
to_id = t['retweeted_status']['id']
to_user = t['retweeted_status']['author']['username']
to_user_id = t['retweeted_status']['author']['id']
add(from_user, from_id, to_user, to_id, "retweet")
if options.min_subgraph_size or options.max_subgraph_size:
g_copy = G.copy()
for g in networkx.connected_component_subgraphs(G):
if options.min_subgraph_size and len(g) < options.min_subgraph_size:
g_copy.remove_nodes_from(g.nodes())
elif options.max_subgraph_size and len(g) > options.max_subgraph_size:
g_copy.remove_nodes_from(g.nodes())
G = g_copy
if output.endswith(".gexf"):
networkx.write_gexf(G, output)
elif output.endswith(".gml"):
networkx.write_gml(G, output)
elif output.endswith(".dot"):
nx_pydot.write_dot(G, output)
elif output.endswith(".json"):
json.dump(to_json(G), open(output, "w"), indent=2)
elif output.endswith(".html"):
graph_data = json.dumps(to_json(G), indent=2)
html = """<!DOCTYPE html>
<meta charset="utf-8">
<script src="https://platform.twitter.com/widgets.js"></script>
<script src="https://d3js.org/d3.v4.min.js"></script>
<script src="https://code.jquery.com/jquery-3.1.1.min.js"></script>
<style>
.links line {
stroke: #999;
stroke-opacity: 0.8;
stroke-width: 2px;
}
line.reply {
stroke: #999;
}
line.retweet {
stroke-dasharray: 5;
}
line.quote {
stroke-dasharray: 5;
}
.nodes circle {
stroke: red;
fill: red;
stroke-width: 1.5px;
}
circle.retweet {
fill: white;
stroke: #999;
}
circle.reply {
fill: #999;
stroke: #999;
}
circle.quote {
fill: yellow;
stroke: yellow;
}
#graph {
width: 99vw;
height: 99vh;
}
#tweet {
position: absolute;
left: 100px;
top: 150px;
}
</style>
<svg id="graph"></svg>
<div id="tweet"></div>
<script>
var width = $(window).width();
var height = $(window).height();
var svg = d3.select("svg")
.attr("height", height)
.attr("width", width);
var color = d3.scaleOrdinal(d3.schemeCategory20c);
var simulation = d3.forceSimulation()
.velocityDecay(0.6)
.force("link", d3.forceLink().id(function(d) { return d.id; }))
.force("charge", d3.forceManyBody())
.force("center", d3.forceCenter(width / 2, height / 2));
var graph = %s;
var link = svg.append("g")
.attr("class", "links")
.selectAll("line")
.data(graph.links)
.enter().append("line")
.attr("class", function(d) { return d.type; });
var node = svg.append("g")
.attr("class", "nodes")
.selectAll("circle")
.data(graph.nodes)
.enter().append("circle")
.attr("r", 5)
.attr("class", function(d) { return d.type; })
.call(d3.drag()
.on("start", dragstarted)
.on("drag", dragged)
.on("end", dragended));
node.append("title")
.text(function(d) { return d.id; });
node.on("click", function(d) {
$("#tweet").empty();
var rect = this.getBoundingClientRect();
var paneHeight = d.type == "retweet" ? 50 : 200;
var paneWidth = d.type == "retweet" ? 75 : 500;
var left = rect.x - paneWidth / 2;
if (rect.y > height / 2) {
var top = rect.y - paneHeight;
} else {
var top = rect.y + 10;
}
var tweet = $("#tweet");
tweet.css({left: left, top: top});
if (d.type == "retweet") {
twttr.widgets.createFollowButton(d.screen_name, tweet[0], {size: "large"});
} else {
twttr.widgets.createTweet(d.id, tweet[0], {conversation: "none"});
}
d3.event.stopPropagation();
});
svg.on("click", function(d) {
$("#tweet").empty();
});
simulation
.nodes(graph.nodes)
.on("tick", ticked);
simulation.force("link")
.links(graph.links);
function ticked() {
link
.attr("x1", function(d) { return d.source.x; })
.attr("y1", function(d) { return d.source.y; })
.attr("x2", function(d) { return d.target.x; })
.attr("y2", function(d) { return d.target.y; });
node
.attr("cx", function(d) { return d.x; })
.attr("cy", function(d) { return d.y; });
}
function dragstarted(d) {
if (!d3.event.active) simulation.alphaTarget(0.3).restart();
d.fx = d.x;
d.fy = d.y;
}
function dragged(d) {
d.fx = d3.event.x;
d.fy = d3.event.y;
}
function dragended(d) {
if (!d3.event.active) simulation.alphaTarget(0);
d.fx = null;
d.fy = null;
}
</script>
""" % graph_data
open(output, "w").write(html)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment