Skip to content

Instantly share code, notes, and snippets.

@neelabalan
Created August 21, 2024 07:31
Show Gist options
  • Save neelabalan/67942b1593f379c24a58ebd55f1ba119 to your computer and use it in GitHub Desktop.
Save neelabalan/67942b1593f379c24a58ebd55f1ba119 to your computer and use it in GitHub Desktop.
Load GH Archive hourly event dumps (.json.gz) into MongoDB
import os
import gzip
import json
from pymongo import MongoClient
# Connection to the MongoDB server that receives the loaded documents.
mongo_uri = "mongodb://nebula.local:27017"
client = MongoClient(mongo_uri)

# Destination namespace: collection `gh_archive` inside database `analytics`.
db = client.get_database("analytics")
collection = db.get_collection("gh_archive")

# Local folder that is scanned for `.json.gz` files.
directory = "/Users/neelabalan/data_gharchive"
def insert_jsonl_files(directory, batch_size=1000, target=None):
    """Load every ``.json.gz`` file in *directory* into a MongoDB collection.

    Each matching file is read as gzip-compressed UTF-8 text containing one
    JSON document per line. Documents are buffered and written with
    ``insert_many`` in batches of at most *batch_size*, so a whole file is
    never held in memory at once. A progress line is printed per file.

    Args:
        directory: Path of the folder to scan (not recursive).
        batch_size: Maximum number of documents per ``insert_many`` call.
        target: Object exposing ``insert_many(list_of_dicts)``. When omitted,
            the module-level ``collection`` is used.

    Raises:
        ValueError: If *batch_size* is smaller than 1.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be at least 1")

    # Look up the module-level default only when the caller gave no target.
    sink = collection if target is None else target

    # os.listdir() returns entries in arbitrary order; sorting makes the
    # load order deterministic across runs and machines.
    for filename in sorted(os.listdir(directory)):
        if not filename.endswith(".json.gz"):
            continue

        file_path = os.path.join(directory, filename)
        bulk_data = []
        with gzip.open(file_path, "rt", encoding="utf-8") as gz_file:
            for line in gz_file:
                # Blank lines carry no document and would make json.loads
                # raise, aborting the whole load.
                if not line.strip():
                    continue
                bulk_data.append(json.loads(line))
                if len(bulk_data) >= batch_size:
                    sink.insert_many(bulk_data)
                    # Rebind rather than clear(): the list just handed to
                    # insert_many must not be mutated behind its back.
                    bulk_data = []

        # Flush the final, partially filled batch of this file.
        if bulk_data:
            sink.insert_many(bulk_data)

        print(f"Inserted data from {filename} into MongoDB.")
if __name__ == "__main__":
    # Run the load only when executed as a script, not when imported.
    insert_jsonl_files(directory)
@neelabalan
Copy link
Author

wget https://data.gharchive.org/2020-01-{01..31}-{0..23}.json.gz

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment