jeremyboggs · March 13, 2024 18:32
diff --git a/jsonl-to-csv.py b/jsonl-to-csv.py
 import json
 import csv

 # Identify the JSONL fields we wish to retrieve and write
 # to the CSV file. Their order here is the column order of the CSV.
 fields = ['id','title','text','url']

 # Open a new CSV file and open the existing JSONL file.
 # newline='' lets the csv module control line endings itself (without it,
 # extra blank rows can appear on Windows). UTF-8 is requested explicitly so
 # the result does not depend on the platform's default encoding.
 with open('pages.csv', 'w', newline='', encoding='utf-8') as csv_file, \
         open("pages.jsonl", "r", encoding='utf-8') as json_file:
    # Create a CSV writer object.
    writer = csv.writer(csv_file)

    # Write the header row to the CSV using our predefined fields.
    writer.writerow(fields)

    # Loop over each line in the JSONL file to read each page object.
    # Iterating the file object directly streams it line by line instead
    # of loading every line into memory first.
    for i, line in enumerate(json_file):

        # Skip the first line in the JSONL, since that contains metadata
        # about the crawl and is not a specific page object.
        if i == 0:
            continue

        # Skip blank lines (for example a trailing empty line at the end
        # of the file), which json.loads() would otherwise reject.
        if not line.strip():
            continue

        json_data = json.loads(line)
        datarow = []
        for field in fields:

            # Confirm the field is a key in the JSONL object.
            if field in json_data:

                # Convert value to a string and replace new lines with a space.
                value = str(json_data[field]).replace('\n',' ')
                datarow.append(value)
            else:
                # If the JSON object does not contain the field, insert an empty placeholder.
                datarow.append("")
        writer.writerow(datarow)
	import json
	import csv

	# Identify the JSONL fields we wish to retrieve and write
	# to the CSV file. Their order here is the column order of the CSV.
	fields = ['id','title','text','url']

	# Open a new CSV file and open the existing JSONL file.
	# newline='' is what the csv module expects for files it writes, and the
	# explicit UTF-8 encoding keeps the output independent of the platform
	# default.
	with open('pages.csv', 'w', newline='', encoding='utf-8') as csv_file, \
	        open("pages.jsonl", "r", encoding='utf-8') as json_file:
	    # Create a CSV writer object.
	    writer = csv.writer(csv_file)

	    # Write the header row to the CSV using our predefined fields.
	    writer.writerow(fields)

	    # Loop over each line in the JSONL file to read each page object,
	    # streaming the file rather than reading it all into memory.
	    for i, line in enumerate(json_file):

	        # Skip the first line in the JSONL, since that contains metadata
	        # about the crawl and is not a specific page object. Blank lines
	        # are skipped as well because json.loads() cannot parse them.
	        if i == 0 or not line.strip():
	            continue

	        json_data = json.loads(line)

	        # Build one CSV row: for each wanted field, convert the value to a
	        # string and replace new lines with a space; if the JSON object
	        # does not contain the field, insert an empty placeholder.
	        datarow = [
	            str(json_data[field]).replace('\n',' ') if field in json_data else ""
	            for field in fields
	        ]
	        writer.writerow(datarow)
No results found