Last active
May 17, 2021 10:36
-
-
Save numeroteca/aa040b0488c914d1e4a37e40117ef062 to your computer and use it in GitHub Desktop.
minor changes to adapt network.py to v2 API for twarc2. See https://github.com/DocNow/twarc/issues/461
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
# build a reply, quote, retweet network from a file of tweets and write it | |
# out as a gexf, dot, json or html file. You will need to have networkx | |
# installed and pydotplus if you want to use dot. The html presentation | |
# uses d3 to display the network graph in your browser. | |
# | |
# ./network.py tweets.jsonl network.html | |
# | |
# or | |
# ./network.py tweets.jsonl network.dot | |
# | |
# or | |
# | |
# ./network.py tweets.jsonl network.gexf | |
# | |
# if you would rather have the network oriented around nodes that are users | |
# instead of tweets use the --users flag | |
# | |
# ./network.py --users tweets.jsonl network.gexf | |
# | |
# if you would rather have the network oriented around nodes that are hashtags | |
# instead of tweets or users, use the --hashtags flag | |
# | |
# TODO: this is mostly here so someone can improve it :)
import sys | |
import json | |
import networkx | |
import optparse | |
import itertools | |
import time | |
from networkx import nx_pydot | |
from networkx.readwrite import json_graph | |
usage = "network.py tweets.jsonl graph.html" | |
opt_parser = optparse.OptionParser(usage=usage) | |
opt_parser.add_option( | |
"--retweets", | |
dest="retweets", | |
action="store_true", | |
help="include retweets" | |
) | |
opt_parser.add_option( | |
"--min_subgraph_size", | |
dest="min_subgraph_size", | |
type="int", | |
help="remove any subgraphs with a size smaller than this number" | |
) | |
opt_parser.add_option( | |
"--max_subgraph_size", | |
dest="max_subgraph_size", | |
type="int", | |
help="remove any subgraphs with a size larger than this number" | |
) | |
opt_parser.add_option( | |
"--users", | |
dest="users", | |
action="store_true", | |
help="show user relations instead of tweet relations" | |
) | |
opt_parser.add_option( | |
"--hashtags", | |
dest="hashtags", | |
action="store_true", | |
help="show hashtag relations instead of tweet relations" | |
) | |
options, args = opt_parser.parse_args() | |
if len(args) != 2: | |
opt_parser.error("must supply input and output file names") | |
tweets, output = args | |
# Directed relation graph: edges run from the replying / quoting / retweeting
# node toward its target (populated by add() below).
G = networkx.DiGraph()
def add(from_user, from_id, to_user, to_id, type, created_at=None):
    """Record one relation in the module-level graph G.

    In --users/--hashtags mode the nodes are user names (or hashtags) and
    each edge carries a weight counting how often the pair was seen.
    Otherwise the nodes are tweet ids annotated with the author's screen
    name, and edges are typed reply/quote/retweet links.
    """
    # start_date lets the Gephi timeline show a node from its first
    # appearance onwards (it then stays visible forever after).
    grouping_by_name = options.users or options.hashtags
    if grouping_by_name and to_user:
        for name in (from_user, to_user):
            G.add_node(name, screen_name=name, start_date=created_at)
        existing = G.get_edge_data(from_user, to_user)
        weight = existing['weight'] + 1 if existing else 1
        G.add_edge(from_user, to_user, type=type, weight=weight)
    elif not options.users and to_id:
        G.add_node(from_id, screen_name=from_user, type=type)
        if to_user:
            G.add_node(to_id, screen_name=to_user)
        else:
            G.add_node(to_id)
        G.add_edge(from_id, to_id, type=type)
def to_json(g):
    """Serialize graph *g* into the {"nodes": [...], "links": [...]} dict
    consumed by the d3 force-layout template below."""
    nodes = [
        {
            "id": node_id,
            "type": attrs.get("type"),
            "screen_name": attrs.get("screen_name"),
        }
        for node_id, attrs in g.nodes(data=True)
    ]
    links = [
        {"source": src, "target": dst, "type": attrs.get("type")}
        for src, dst, attrs in g.edges(data=True)
    ]
    return {"nodes": nodes, "links": links}
# Read the tweets file (one JSON document per line, twarc2 flattened v2
# format) and add one graph relation per mention / hashtag pair / reply /
# quote / retweet.
with open(tweets) as tweets_file:
    for line in tweets_file:
        try:
            t = json.loads(line)
        except ValueError:
            # skip blank or corrupt lines; json.JSONDecodeError subclasses
            # ValueError, so this no longer swallows unrelated errors
            continue
        from_id = t['id']
        from_user = t['author']['username']
        from_user_id = t['author']['id']
        to_user = None
        to_id = None
        # standardize the v2 ISO-8601 created_at to dd/MM/yyyy HH:mm:ss,
        # the format used for Gephi timeline start dates in add()
        created_at_date = time.strftime(
            '%d/%m/%Y %H:%M:%S',
            time.strptime(t["created_at"], '%Y-%m-%dT%H:%M:%S.%fZ'))
        # v2 tweets with no entities simply omit the key
        entities = t.get('entities', {})
        if options.users:
            # v2 calls these "mentions" (each with a "username" key); fall
            # back to the v1.1 "user_mentions"/"screen_name" spelling so
            # older data keeps working
            for u in entities.get('mentions',
                                  entities.get('user_mentions', [])):
                add(from_user, from_id,
                    u.get('username', u.get('screen_name')),
                    None, 'reply', created_at_date)
        elif options.hashtags:
            hashtags = entities.get('hashtags', [])
            # every unordered pair of hashtags that co-occur in this tweet
            for first, second in itertools.combinations(hashtags, 2):
                # v2 hashtag entities use "tag"; v1.1 used "text"
                add('#' + first.get('tag', first.get('text')), None,
                    '#' + second.get('tag', second.get('text')), None,
                    'hashtag', created_at_date)
        else:
            # NOTE(review): the lookups below use v1.1 field names
            # (in_reply_to_status_id_str, quoted_status, retweeted_status);
            # flattened v2 data exposes these via referenced_tweets instead,
            # so these branches may never fire on pure v2 input -- confirm
            # against the data before relying on tweet-level output.
            if t.get('in_reply_to_status_id_str'):
                to_id = t['in_reply_to_status_id_str']
                to_user = t['in_reply_to_screen_name']
                add(from_user, from_id, to_user, to_id, "reply")
            if t.get('quoted_status'):
                to_id = t['quoted_status']['id']
                to_user = t['quoted_status']['author']['username']
                to_user_id = t['quoted_status']['author']['id']
                add(from_user, from_id, to_user, to_id, "quote")
            if options.retweets and t.get('retweeted_status'):
                to_id = t['retweeted_status']['id']
                to_user = t['retweeted_status']['author']['username']
                to_user_id = t['retweeted_status']['author']['id']
                add(from_user, from_id, to_user, to_id, "retweet")

if options.min_subgraph_size or options.max_subgraph_size:
    g_copy = G.copy()
    # connected_component_subgraphs() was removed in networkx 2.4, and the
    # plain connected-components routines reject directed graphs, so filter
    # on the weakly connected components of the DiGraph instead.
    for component in networkx.weakly_connected_components(G):
        too_small = (options.min_subgraph_size
                     and len(component) < options.min_subgraph_size)
        too_large = (options.max_subgraph_size
                     and len(component) > options.max_subgraph_size)
        if too_small or too_large:
            g_copy.remove_nodes_from(component)
    G = g_copy
# Write the graph in the format implied by the output file's extension.
if output.endswith(".gexf"):
    networkx.write_gexf(G, output)
elif output.endswith(".gml"):
    networkx.write_gml(G, output)
elif output.endswith(".dot"):
    nx_pydot.write_dot(G, output)
elif output.endswith(".json"):
    # close the file deterministically instead of leaking the handle
    with open(output, "w") as out_file:
        json.dump(to_json(G), out_file, indent=2)
elif output.endswith(".html"):
    graph_data = json.dumps(to_json(G), indent=2)
    # %s below is replaced by the serialized graph JSON
    html = """<!DOCTYPE html>
<meta charset="utf-8">
<script src="https://platform.twitter.com/widgets.js"></script>
<script src="https://d3js.org/d3.v4.min.js"></script>
<script src="https://code.jquery.com/jquery-3.1.1.min.js"></script>
<style>
.links line {
  stroke: #999;
  stroke-opacity: 0.8;
  stroke-width: 2px;
}
line.reply {
  stroke: #999;
}
line.retweet {
  stroke-dasharray: 5;
}
line.quote {
  stroke-dasharray: 5;
}
.nodes circle {
  stroke: red;
  fill: red;
  stroke-width: 1.5px;
}
circle.retweet {
  fill: white;
  stroke: #999;
}
circle.reply {
  fill: #999;
  stroke: #999;
}
circle.quote {
  fill: yellow;
  stroke: yellow;
}
#graph {
  width: 99vw;
  height: 99vh;
}
#tweet {
  position: absolute;
  left: 100px;
  top: 150px;
}
</style>
<svg id="graph"></svg>
<div id="tweet"></div>
<script>
var width = $(window).width();
var height = $(window).height();
var svg = d3.select("svg")
    .attr("height", height)
    .attr("width", width);
var color = d3.scaleOrdinal(d3.schemeCategory20c);
var simulation = d3.forceSimulation()
    .velocityDecay(0.6)
    .force("link", d3.forceLink().id(function(d) { return d.id; }))
    .force("charge", d3.forceManyBody())
    .force("center", d3.forceCenter(width / 2, height / 2));
var graph = %s;
var link = svg.append("g")
    .attr("class", "links")
  .selectAll("line")
  .data(graph.links)
  .enter().append("line")
    .attr("class", function(d) { return d.type; });
var node = svg.append("g")
    .attr("class", "nodes")
  .selectAll("circle")
  .data(graph.nodes)
  .enter().append("circle")
    .attr("r", 5)
    .attr("class", function(d) { return d.type; })
    .call(d3.drag()
        .on("start", dragstarted)
        .on("drag", dragged)
        .on("end", dragended));
node.append("title")
    .text(function(d) { return d.id; });
node.on("click", function(d) {
  $("#tweet").empty();
  var rect = this.getBoundingClientRect();
  var paneHeight = d.type == "retweet" ? 50 : 200;
  var paneWidth = d.type == "retweet" ? 75 : 500;
  var left = rect.x - paneWidth / 2;
  if (rect.y > height / 2) {
    var top = rect.y - paneHeight;
  } else {
    var top = rect.y + 10;
  }
  var tweet = $("#tweet");
  tweet.css({left: left, top: top});
  if (d.type == "retweet") {
    twttr.widgets.createFollowButton(d.screen_name, tweet[0], {size: "large"});
  } else {
    twttr.widgets.createTweet(d.id, tweet[0], {conversation: "none"});
  }
  d3.event.stopPropagation();
});
svg.on("click", function(d) {
  $("#tweet").empty();
});
simulation
    .nodes(graph.nodes)
    .on("tick", ticked);
simulation.force("link")
    .links(graph.links);
function ticked() {
  link
      .attr("x1", function(d) { return d.source.x; })
      .attr("y1", function(d) { return d.source.y; })
      .attr("x2", function(d) { return d.target.x; })
      .attr("y2", function(d) { return d.target.y; });
  node
      .attr("cx", function(d) { return d.x; })
      .attr("cy", function(d) { return d.y; });
}
function dragstarted(d) {
  if (!d3.event.active) simulation.alphaTarget(0.3).restart();
  d.fx = d.x;
  d.fy = d.y;
}
function dragged(d) {
  d.fx = d3.event.x;
  d.fy = d3.event.y;
}
function dragended(d) {
  if (!d3.event.active) simulation.alphaTarget(0);
  d.fx = null;
  d.fy = null;
}
</script>
""" % graph_data
    # utf-8 matches the <meta charset="utf-8"> declared in the template;
    # the platform default encoding could fail on non-ASCII screen names
    with open(output, "w", encoding="utf-8") as out_file:
        out_file.write(html)
else:
    # previously an unknown extension silently produced no output at all
    sys.exit("unsupported output extension; use .gexf, .gml, .dot, .json or .html")
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment