Created
August 21, 2024 07:31
-
-
Save neelabalan/67942b1593f379c24a58ebd55f1ba119 to your computer and use it in GitHub Desktop.
Load from gh archive
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import gzip
import json
import os

from pymongo import MongoClient
# Address of the MongoDB server the loader talks to.
mongo_uri = "mongodb://nebula.local:27017"
client = MongoClient(mongo_uri)

# Database and collection that insert_jsonl_files() writes into by default.
db = client['analytics']
collection = db['gh_archive']

# Local folder that is scanned for *.json.gz archives when run as a script.
directory = '/Users/neelabalan/data_gharchive'
def insert_jsonl_files(directory, *, batch_size=1000, target_collection=None):
    """Load every ``.json.gz`` file in *directory* into a MongoDB collection.

    Each archive is read as gzip-compressed, UTF-8, newline-delimited JSON:
    every non-blank line is parsed with ``json.loads`` and the resulting
    documents are written with ``insert_many`` in batches. A progress line is
    printed after each file.

    Args:
        directory: Folder to scan (not recursive). Entries whose names do not
            end in ``.json.gz`` are ignored.
        batch_size: Maximum number of documents passed to one ``insert_many``
            call. Must be at least 1.
        target_collection: Object exposing ``insert_many(documents)``.
            Defaults to the module-level ``collection``.

    Raises:
        ValueError: If *batch_size* is less than 1.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be at least 1")
    if target_collection is None:
        # Looked up at call time so the default follows the module global.
        target_collection = collection

    # Sorted so that repeated runs process the archives in the same order;
    # os.listdir() makes no ordering guarantee.
    for filename in sorted(os.listdir(directory)):
        if not filename.endswith(".json.gz"):
            continue

        file_path = os.path.join(directory, filename)
        batch = []
        with gzip.open(file_path, 'rt', encoding='utf-8') as gz_file:
            for line in gz_file:
                # A blank line (e.g. a trailing newline) is not a JSON
                # document; skipping it avoids aborting the whole load.
                if not line.strip():
                    continue
                batch.append(json.loads(line))
                if len(batch) >= batch_size:
                    target_collection.insert_many(batch)
                    # Rebind instead of clearing: the driver may still hold
                    # a reference to the list that was just handed over.
                    batch = []

        # Flush whatever is left over from the last partial batch.
        if batch:
            target_collection.insert_many(batch)
        print(f"Inserted data from {filename} into MongoDB.")
if __name__ == "__main__":
    # Run the load only when executed as a script, not when imported.
    insert_jsonl_files(directory)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
wget https://data.gharchive.org/2020-01-{01..31}-{0..23}.json.gz