Instantly share code, notes, and snippets.
Created
July 3, 2015 18:24
-
Star
1
(1)
You must be signed in to star a gist -
Fork
0
(0)
You must be signed in to fork a gist
-
-
Save alexwoolford/2e0f96ba39dca045e597 to your computer and use it in GitHub Desktop.
Python script to build a Neo4j graph from MySQL data
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| #!/usr/bin/env python | |
| import MySQLdb | |
| from py2neo import Graph | |
| import logging | |
| import sys | |
| from py2neo.packages.httpstream import http | |
| import json | |
| """\ | |
| Export cookies and urls from MySQL, then import to Neo4j | |
| """ | |
| root = logging.getLogger() | |
| root.setLevel(logging.DEBUG) | |
| ch = logging.StreamHandler(sys.stdout) | |
| ch.setLevel(logging.DEBUG) | |
| formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s') | |
| ch.setFormatter(formatter) | |
| root.addHandler(ch) | |
| class CookieUrlExport: | |
| # TODO: delete csv files if they already exist | |
| def __init__(self): | |
| logging.info("connecting to MySQL") | |
| self.conn = MySQLdb.connect(user="root", passwd="????????", host="localhost", db="test") | |
| self.cursor = self.conn.cursor() | |
| def write_urls(self): | |
| logging.info("writing urls.csv") | |
| sql = """\ | |
| SELECT 'url' | |
| UNION ALL | |
| SELECT DISTINCT | |
| CONCAT('`', url, '`') AS url | |
| FROM cookie_guid_url_labels | |
| ORDER BY url | |
| INTO OUTFILE '/tmp/urls.csv' | |
| FIELDS TERMINATED BY ',' ENCLOSED BY '"' | |
| LINES TERMINATED BY '\\n' """ | |
| self.cursor.execute(sql) | |
| def write_cookies(self): | |
| logging.info("writing cookies.csv") | |
| sql = """\ | |
| SELECT 'cookie' | |
| UNION ALL | |
| SELECT DISTINCT | |
| CONCAT('`', cookie, '`') AS cookie | |
| FROM cookie_guid_url_labels | |
| ORDER BY cookie | |
| INTO OUTFILE '/tmp/cookies.csv' | |
| FIELDS TERMINATED BY ',' ENCLOSED BY '"' | |
| LINES TERMINATED BY '\\n'; | |
| """ | |
| self.cursor.execute(sql) | |
| def write_relationships(self): | |
| logging.info("writing relationships.csv") | |
| sql = """\ | |
| SELECT 'cookie', 'url', 'action' | |
| UNION ALL | |
| SELECT DISTINCT | |
| CONCAT('`', cookie, '`') AS cookie, | |
| CONCAT('`', url, '`') AS url, | |
| 'VISITED' | |
| FROM cookie_guid_url_labels | |
| ORDER BY cookie, url | |
| INTO OUTFILE '/tmp/relationships.csv' | |
| FIELDS TERMINATED BY ',' ENCLOSED BY '"' | |
| LINES TERMINATED BY '\\n'; | |
| """ | |
| self.cursor.execute(sql) | |
| def __del__(self): | |
| logging.info("closing MySQL connection") | |
| self.conn.close() | |
| class CookieUrlImport: | |
| def __init__(self): | |
| http.socket_timeout = 9999 | |
| self.graph = Graph() | |
| def delete_all_nodes_relationships(self): | |
| logging.info("deleting all nodes and relationships from neo4j") | |
| self.graph.delete_all() | |
| def load_url_nodes(self): | |
| logging.info("loading url nodes") | |
| cypher_query = """\ | |
| USING PERIODIC COMMIT 500 | |
| LOAD CSV WITH HEADERS FROM "file:/tmp/urls.csv" AS csvLine | |
| CREATE (u:Url { url: csvLine.url }) | |
| """ | |
| self.graph.cypher.execute(cypher_query) | |
| def load_cookie_nodes(self): | |
| logging.info("loading cookie nodes") | |
| cypher_query = """\ | |
| USING PERIODIC COMMIT 500 | |
| LOAD CSV WITH HEADERS FROM "file:/tmp/cookies.csv" AS csvLine | |
| CREATE (c:Cookie { cookie: csvLine.cookie }) | |
| """ | |
| self.graph.cypher.execute(cypher_query) | |
| def create_unique_constraints(self): | |
| # This speeds up the `MATCH` function below: | |
| logging.info("creating unique constraint on url nodes") | |
| self.graph.cypher.execute('CREATE CONSTRAINT ON (url:Url) ASSERT url.url IS UNIQUE') | |
| logging.info("creating unique constraint on cookie nodes") | |
| self.graph.cypher.execute('CREATE CONSTRAINT ON (cookie:Cookie) ASSERT cookie.cookie IS UNIQUE') | |
| def load_relationships(self): | |
| logging.info("loading relationships") | |
| cypher_query = """\ | |
| USING PERIODIC COMMIT 500 | |
| LOAD CSV WITH HEADERS FROM "file:/tmp/relationships.csv" AS csvLine | |
| MATCH (cookie:Cookie { cookie: csvLine.cookie}),(url:Url { url: csvLine.url}) | |
| CREATE (cookie)-[:VISITED { url: csvLine.url }]->(url) | |
| """ | |
| self.graph.cypher.execute(cypher_query) | |
| class CookieUrlLabels: | |
| def __init__(self): | |
| logging.info("connecting to MySQL") | |
| self.conn = MySQLdb.connect(user="awoolford", passwd="????????", host="deepthought", db="test") | |
| self.cursor = self.conn.cursor() | |
| self.graph = Graph() | |
| def get_url_labels(self): | |
| sql = """\ | |
| SELECT | |
| url, | |
| label_1 AS label, | |
| score_1 AS score, | |
| '1' AS level | |
| FROM cookie_guid_url_labels | |
| WHERE label_1 != '' | |
| UNION | |
| SELECT | |
| url, | |
| label_2 AS label, | |
| score_2 AS score, | |
| '2' AS level | |
| FROM cookie_guid_url_labels | |
| WHERE label_2 != '' | |
| UNION | |
| SELECT | |
| url, | |
| label_3 AS label, | |
| score_3 AS score, | |
| '3' AS level | |
| FROM cookie_guid_url_labels | |
| WHERE label_3 != '' | |
| """ | |
| self.cursor.execute(sql) | |
| records = self.cursor.fetchall() | |
| for record in records: | |
| url, taxonomy, score, level = record | |
| cypher_query = """MATCH (u:Url {{url: "`{0}`"}}) SET u.taxonomy = {1}, u.score = {2}""".format(url, json.dumps(taxonomy), score) | |
| self.graph.cypher.execute(cypher_query) | |
| if __name__ == "__main__": | |
| exp = CookieUrlExport() | |
| exp.write_urls() | |
| exp.write_cookies() | |
| exp.write_relationships() | |
| del exp | |
| imp = CookieUrlImport() | |
| imp.delete_all_nodes_relationships() | |
| imp.load_url_nodes() | |
| imp.load_cookie_nodes() | |
| imp.create_unique_constraints() | |
| imp.load_relationships() | |
| del imp | |
| lab = CookieUrlLabels() | |
| lab.get_url_labels() | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment