Skip to content

Instantly share code, notes, and snippets.

@AnalogJ
Last active October 19, 2024 00:57
Show Gist options
  • Save AnalogJ/4c965e547a887f1cdcb5676efe3a08a5 to your computer and use it in GitHub Desktop.
Save AnalogJ/4c965e547a887f1cdcb5676efe3a08a5 to your computer and use it in GitHub Desktop.
import json
import csv
import os
# Define the input files and output file
input_files = [f'pitchbook-{i}.json' for i in range(1, 12)]
output_file = 'investors_data.csv'
# Define the headers for the CSV file
headers = [
"Investor", "Investments", "Active Portfolio", "AUM", "Primary Investor Type", "HQ Location", "Description",
"Preferred Industry", "Preferred Investment Types", "Website", "Primary Contact", "Primary Contact Title",
"Primary Contact Phone", "Primary Contact Email"
]
# Open the CSV file for writing
with open(output_file, 'w', newline='') as csvfile:
csvwriter = csv.writer(csvfile)
csvwriter.writerow(headers)
# Loop through each file
for input_file in input_files:
# Load data from the JSON file
if os.path.exists(input_file):
with open(input_file, 'r') as f:
data = json.load(f)
# Extract the relevant data rows
data_rows = data.get("dataRows", [])
# Loop through each data row to extract necessary fields
for row in data_rows:
columns = row.get("columnValues", {})
investor = (columns.get("investorName", [{}]) or [{}])[0].get("name", "")
investments = (columns.get("investments", [{}]) or [{}])[0].get("value", "")
active_portfolio = (columns.get("activeInvestedCompanies", [{}]) or [{}])[0].get("value", "")
aum = (columns.get("capitalUnderManagement", [{}]) or [{}])[0].get("amount", "")
primary_investor_type = (columns.get("primaryInvestorType", [{}]) or [{}])[0].get("value", "")
hq_location = (columns.get("hqLocation", [{}]) or [{}])[0].get("value", "")
description = (columns.get("description", [{}]) or [{}])[0].get("value", "")
# Handle lists for Preferred Industry and Preferred Investment Types
preferred_industry = ", ".join([entry.get("value", "") for entry in columns.get("preferredIndustry", [])])
preferred_investment_types = ", ".join([entry.get("value", "") for entry in columns.get("preferredInvestmentTypes", [])])
website = (columns.get("investorWebsite", [{}]) or [{}])[0].get("value", "")
primary_contact = (columns.get("primaryContact", [{}]) or [{}])[0].get("name", "")
primary_contact_title = (columns.get("primaryContactTitle", [{}]) or [{}])[0].get("value", "")
primary_contact_phone = (columns.get("primaryContactPhone", [{}]) or [{}])[0].get("value", "")
primary_contact_email = (columns.get("primaryContactEmail", [{}]) or [{}])[0].get("value", "")
# Write the extracted row to the CSV file
csvwriter.writerow([
investor, investments, active_portfolio, aum, primary_investor_type, hq_location, description,
preferred_industry, preferred_investment_types, website, primary_contact, primary_contact_title,
primary_contact_phone, primary_contact_email
])
print(f"Data extraction complete. CSV file saved as {output_file}")
import json
import csv
import os
# Load the JSON data from the file
input_file_pattern = 'pitchbook-{}.json'
output_file_path = 'extracted_deals.csv'
# Define the headers for the CSV file
headers = [
"Company Name", "Deal Date", "Deal Type", "Deal Size", "Investor Name",
"Deal Synopsis", "Financing Status", "Investors", "Primary Industry Code",
"Verticals", "Company Website", "HQ Location"
]
def extract_deal_data(deal):
"""
Extract relevant fields from a single deal data entry.
"""
data = deal.get('columnValues', {})
return {
"Company Name": (data.get("companyName", [{}]) or [{}])[0].get("name", ""),
"Deal Date": (data.get("dealDate", [{}]) or [{}])[0].get("asOfdate", ""),
"Deal Type": (data.get("dealType", [{}]) or [{}])[0].get("value", ""),
"Deal Size": (data.get("dealSize", [{}]) or [{}])[0].get("amount", ""),
"Deal Synopsis": (data.get("dealSynopsis", [{}]) or [{}])[0].get("value", ""),
"Financing Status": (data.get("dealFinancingStatus", [{}]) or [{}])[0].get("value", ""),
"Investors": ", ".join([investor.get("name", "") for investor in data.get("investors", [])]),
"Primary Industry Code": (data.get("primaryIndustryCode", [{}]) or [{}])[0].get("value", ""),
"Verticals": ", ".join([vertical.get("value", "") for vertical in data.get("verticals", [])]),
"Company Website": (data.get("companyWebsite", [{}]) or [{}])[0].get("value", ""),
"HQ Location": (data.get("hqLocation", [{}]) or [{}])[0].get("value", "")
}
# Open the CSV file for writing
with open(output_file_path, 'w', newline='') as csv_file:
csv_writer = csv.DictWriter(csv_file, fieldnames=headers)
csv_writer.writeheader()
# Loop through the files from pitchbook-1.json to pitchbook-21.json
for i in range(1, 22):
input_file_path = input_file_pattern.format(i)
# Check if the file exists
if not os.path.exists(input_file_path):
print(f"File {input_file_path} not found, skipping...")
continue
# Read the input JSON file
with open(input_file_path, 'r') as json_file:
data = json.load(json_file)
# Iterate through the JSON data and write each row to the CSV
for deal in data.get('dataRows', []):
row = extract_deal_data(deal)
csv_writer.writerow(row)
print(f"Data successfully extracted to {output_file_path}")
set -eo pipefail
for i in {1..100}; do
curl -o "pitchbook-$i.json" "https://example.com/fizz/page/$i?ref=pagination"
done
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment