Created
April 3, 2014 12:29
-
-
Save taniki/9953402 to your computer and use it in GitHub Desktop.
Import Wikipedia visits dataset into Neo4j
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| import json | |
| import requests | |
# --- Configuration ---------------------------------------------------------
# Input file: read line by line, one JSON document per line (see main()).
dataset = "data/wkp-steps.20140402.json"
# URL that every Cypher request is POSTed to.
neo4j_endpoint = "http://localhost:7474/db/data/cypher"

# --- State filled by main() and consumed by the import_* functions ---------
# Distinct page URLs and distinct user ids seen in the dataset.
pages, users = set(), set()
# One {"user_id", "page_url", "time"} dict per dataset record, in file order.
visits = list()
def main():
    """Load the visit log and import it into Neo4j.

    Reads ``dataset`` line by line, parsing every non-blank line as a JSON
    object with ``url``, ``userId`` and ``time`` keys. Each record is echoed
    to stdout, its URL and user id are added to the module-level ``pages``
    and ``users`` sets, and a visit entry is appended to ``visits``.

    Once the whole file has been read, the number of distinct pages and
    users is printed and the pages, users and visits are imported, in that
    order.

    Raises:
        IOError / OSError: if ``dataset`` cannot be opened.
        ValueError: if a non-blank line is not valid JSON.
        KeyError: if a record lacks one of the expected keys.
    """
    # Context manager so the file handle is closed even if parsing fails.
    with open(dataset) as steps:
        for step_json in steps:
            if not step_json.strip():
                # Tolerate blank lines instead of crashing in json.loads().
                continue

            data = json.loads(step_json)
            print(data)

            pages.add(data["url"])
            users.add(data["userId"])
            visits.append({
                "user_id": data["userId"],
                "page_url": data["url"],
                "time": data["time"],
            })

    print("pages: %i" % len(pages))
    print("users: %i" % len(users))

    # Nodes first: import_visits() MATCHes existing User and Page nodes.
    import_pages()
    import_users()
    import_visits()
def import_pages():
    """Create the ``Page`` nodes in Neo4j for the URLs collected in ``pages``.

    A single Cypher ``CREATE`` request is POSTed to ``neo4j_endpoint``; its
    ``properties`` parameter is a list holding one ``{"url", "title"}`` map
    per distinct URL. Nothing is sent when ``pages`` is empty.

    Raises:
        requests.RequestException: if the request cannot be sent or the
            server answers with an HTTP error status.
    """
    if not pages:
        # Nothing to create; skip the round trip.
        return

    payload = {
        "query": "CREATE (p:Page { properties }) RETURN p",
        "params": {
            "properties": [
                # url2title() currently returns the URL unchanged, so the
                # stored title is the URL itself.
                {"url": url, "title": url2title(url)}
                for url in pages
            ],
        },
    }

    response = requests.post(
        neo4j_endpoint,
        data=json.dumps(payload),
        # The body is JSON; say so explicitly rather than sending it untyped.
        headers={"Content-Type": "application/json"},
    )
    # Fail loudly: a silently failed node import would leave import_visits()
    # with nothing to MATCH.
    response.raise_for_status()
def import_users():
    """Create the ``User`` nodes in Neo4j for the ids collected in ``users``.

    A single Cypher ``CREATE`` request is POSTed to ``neo4j_endpoint``; its
    ``properties`` parameter is a list holding one ``{"google_id"}`` map per
    distinct user id. Nothing is sent when ``users`` is empty.

    Raises:
        requests.RequestException: if the request cannot be sent or the
            server answers with an HTTP error status.
    """
    if not users:
        # Nothing to create; skip the round trip.
        return

    payload = {
        "query": "CREATE (p:User { properties }) RETURN p",
        "params": {
            # `user_id` rather than `id`, to avoid shadowing the builtin.
            "properties": [{"google_id": user_id} for user_id in users],
        },
    }

    response = requests.post(
        neo4j_endpoint,
        data=json.dumps(payload),
        # The body is JSON; say so explicitly rather than sending it untyped.
        headers={"Content-Type": "application/json"},
    )
    # Fail loudly: a silently failed node import would leave import_visits()
    # with nothing to MATCH.
    response.raise_for_status()
def import_visits():
    """Create a ``visited`` relationship in Neo4j for every entry in ``visits``.

    For each visit, one Cypher request is POSTed to ``neo4j_endpoint``. The
    query matches the ``User`` node by ``google_id`` and the ``Page`` node by
    ``url``, then creates a ``visited`` relationship from the user to the
    page carrying the visit's ``time``.

    Raises:
        requests.RequestException: if a request cannot be sent or the server
            answers with an HTTP error status; remaining visits are not sent.
    """
    # Loop-invariant, so built once rather than per visit.
    query = """
        MATCH (u:User),(p:Page)
        WHERE u.google_id = {user} AND p.url = {page}
        CREATE (u)-[r:visited { time: {time} }]->(p)
        RETURN r
    """
    headers = {"Content-Type": "application/json"}

    # One request per visit: share a session so the HTTP connection is
    # reused instead of being re-opened for every relationship.
    with requests.Session() as session:
        for v in visits:
            payload = {
                "query": query,
                "params": {
                    "user": v["user_id"],
                    "page": v["page_url"],
                    "time": v["time"],
                },
            }
            response = session.post(
                neo4j_endpoint,
                data=json.dumps(payload),
                headers=headers,
            )
            # Surface failures instead of dropping the response unseen.
            response.raise_for_status()
def url2title(url):
    """Return the title to use for the page at *url*.

    No transformation is applied: the URL itself is returned as the title.
    """
    title = url
    return title
# Run the import only when executed as a script, so that importing this
# module (e.g. to reuse a helper) does not read the dataset or contact Neo4j.
if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment