Created
March 13, 2024 18:32
-
-
Save jeremyboggs/f0d35fb1dcd80bf53dd512be18baeaec to your computer and use it in GitHub Desktop.
Converts JSONL generated from Browsertrix to CSV.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import csv
import json

# Identify the JSONL fields we wish to retrieve and write
# to the CSV file.
fields = ['id', 'title', 'text', 'url']


def jsonl_to_csv(jsonl_path='pages.jsonl', csv_path='pages.csv', columns=None):
    """Convert a JSONL file of page objects into a CSV file.

    The first line of the JSONL file is skipped because it holds metadata
    about the crawl rather than a page object. Every following non-blank
    line is parsed as one JSON object and written as one CSV row.

    Args:
        jsonl_path: Path of the JSONL file to read.
        csv_path: Path of the CSV file to create (overwritten if it exists).
        columns: Keys to copy from each JSON object, in column order.
            Defaults to the module-level ``fields`` list.

    Returns:
        The number of page rows written, not counting the header row.

    Raises:
        OSError: If either file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    if columns is None:
        columns = fields

    rows_written = 0
    # Open the input first so that a missing or unreadable JSONL file does
    # not truncate an existing CSV file.
    with open(jsonl_path, 'r', encoding='utf-8') as json_file:
        # newline='' lets the csv module control line endings itself;
        # without it, Windows output gets a blank line after every row.
        with open(csv_path, 'w', newline='', encoding='utf-8') as csv_file:
            writer = csv.writer(csv_file)
            # Write the header row to the CSV using our predefined fields.
            writer.writerow(columns)

            # Skip the first line in the JSONL, since that contains metadata
            # about the crawl and is not a specific page object.
            next(json_file, None)

            # Iterate the file lazily instead of loading every line at once.
            for line in json_file:
                # Tolerate blank lines (e.g. a trailing newline at EOF).
                if not line.strip():
                    continue
                page = json.loads(line)
                # Convert each value to a string and replace new lines with
                # a space; fields missing from the object become an empty
                # placeholder.
                writer.writerow([
                    str(page[column]).replace('\n', ' ') if column in page else ''
                    for column in columns
                ])
                rows_written += 1
    return rows_written


if __name__ == '__main__':
    jsonl_to_csv()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment