Last active
October 16, 2019 10:12
-
-
Save agmangas/ad67c484430a1a05ffec62402a533e16 to your computer and use it in GitHub Desktop.
Eleccia Dataset Builder
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import json
import math
import pprint

from pymongo import MongoClient
# Connection URL handed to MongoClient in main().
MONGO_URL = "mongodb://devit.fundacionctic.org:27117"
# When True, a node's "value" is log base 2 of its size instead of the raw size.
LOG_SCALE = False
# Only node documents whose "size" is >= this threshold are exported.
SIZE_THRESHOLD = 200
def find_node_party(client, name):
    """Return the "partido" field of the account document for *name*.

    Looks up a single document in ``politica.seguidores_cuentas_info`` whose
    "cuenta" field equals *name*.

    Args:
        client: Connected client exposing ``politica.seguidores_cuentas_info``.
        name: Account name to match against the "cuenta" field.

    Returns:
        The value of "partido", or None when no document matches or the
        matching document has no "partido" field.
    """
    doc = client.politica.seguidores_cuentas_info.find_one({"cuenta": name})
    if not doc:
        return None
    # .get() so a document without the field yields None instead of KeyError.
    return doc.get("partido")
def find_node_image(client, name):
    """Return the "profile_image_url" field of the account document for *name*.

    Looks up a single document in ``politica.seguidores_cuentas_info`` whose
    "cuenta" field equals *name*.

    Args:
        client: Connected client exposing ``politica.seguidores_cuentas_info``.
        name: Account name to match against the "cuenta" field.

    Returns:
        The value of "profile_image_url", or None when no document matches
        or the matching document has no such field.
    """
    doc = client.politica.seguidores_cuentas_info.find_one({"cuenta": name})
    if not doc:
        return None
    # .get() so a document without the field yields None instead of KeyError.
    return doc.get("profile_image_url")
def find_node_human_name(client, name):
    """Return the "nombre" field of the account document for *name*.

    Looks up a single document in ``politica.seguidores_cuentas_info`` whose
    "cuenta" field equals *name*.

    Args:
        client: Connected client exposing ``politica.seguidores_cuentas_info``.
        name: Account name to match against the "cuenta" field.

    Returns:
        The value of "nombre", or None when no document matches or the
        matching document has no "nombre" field.
    """
    doc = client.politica.seguidores_cuentas_info.find_one({"cuenta": name})
    if not doc:
        return None
    # .get() so a document without the field yields None instead of KeyError.
    return doc.get("nombre")
def main():
    """Export the follower graph from MongoDB to ``./output.json``.

    Reads node documents with "size" >= SIZE_THRESHOLD from
    ``politica.seguidores_simplificado_nodos`` and every edge document from
    ``politica.seguidores_simplificado_ejes``, then writes a JSON object of
    the form ``{"nodes": [...], "edges": [...]}``.

    Each node carries "id", "name", "label", "title", "value" and "group";
    when the account lookup yields an image URL or a human-readable name,
    "shape"/"image" are added and "label"/"title" are overridden.

    Raises:
        ValueError: If two selected node documents share the same "id_node".
    """
    client = MongoClient(MONGO_URL)

    try:
        print("Finding nodes")

        node_docs = list(client.politica.seguidores_simplificado_nodos.find({
            "size": {"$gte": SIZE_THRESHOLD}
        }))

        print("Building nodes")

        nodes = [{
            "id": idx,
            "name": doc["id_node"],
            "label": doc["id_node"],
            "title": doc["id_node"],
            "value": math.log(doc["size"], 2) if LOG_SCALE else doc["size"],
            "group": find_node_party(client, doc["id_node"])
        } for idx, doc in enumerate(node_docs)]

        for node in nodes:
            image_url = find_node_image(client, node["name"])

            if image_url is not None:
                node.update({"shape": "circularImage", "image": image_url})

            human_name = find_node_human_name(client, node["name"])

            if human_name is not None:
                node.update({"label": human_name, "title": human_name})

        nodes_by_name = {item["name"]: item for item in nodes}

        # Edges are resolved by node name, so names must be unique. An
        # explicit raise (rather than assert) keeps the check under -O.
        if len(nodes) != len(nodes_by_name):
            raise ValueError("Duplicate node names found in the node set")

        print("Building edges")

        edge_docs = list(client.politica.seguidores_simplificado_ejes.find())
    finally:
        # All documents are materialised above; release the connection.
        client.close()

    # Ids are assigned before filtering, so the surviving edge ids may be
    # non-contiguous.
    edges = [{
        "id": idx,
        "from": nodes_by_name.get(doc["id_node"], {}).get("id", None),
        "to": nodes_by_name.get(doc["cuenta"], {}).get("id", None)
    } for idx, doc in enumerate(edge_docs)]

    # Drop edges with an endpoint that is not among the selected nodes.
    edges = [
        item for item in edges
        if item["from"] is not None and item["to"] is not None
    ]

    print("Final # nodes: {}".format(len(nodes)))
    print("Final # edges: {}".format(len(edges)))

    with open("./output.json", "w", encoding="utf-8") as fh:
        json.dump({"nodes": nodes, "edges": edges}, fh)
# Run the export only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment