Last active
October 19, 2024 00:57
-
-
Save AnalogJ/4c965e547a887f1cdcb5676efe3a08a5 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import json | |
import csv | |
import os | |
# Define the input files and output file | |
input_files = [f'pitchbook-{i}.json' for i in range(1, 12)] | |
output_file = 'investors_data.csv' | |
# Define the headers for the CSV file | |
headers = [ | |
"Investor", "Investments", "Active Portfolio", "AUM", "Primary Investor Type", "HQ Location", "Description", | |
"Preferred Industry", "Preferred Investment Types", "Website", "Primary Contact", "Primary Contact Title", | |
"Primary Contact Phone", "Primary Contact Email" | |
] | |
# Open the CSV file for writing | |
with open(output_file, 'w', newline='') as csvfile: | |
csvwriter = csv.writer(csvfile) | |
csvwriter.writerow(headers) | |
# Loop through each file | |
for input_file in input_files: | |
# Load data from the JSON file | |
if os.path.exists(input_file): | |
with open(input_file, 'r') as f: | |
data = json.load(f) | |
# Extract the relevant data rows | |
data_rows = data.get("dataRows", []) | |
# Loop through each data row to extract necessary fields | |
for row in data_rows: | |
columns = row.get("columnValues", {}) | |
investor = (columns.get("investorName", [{}]) or [{}])[0].get("name", "") | |
investments = (columns.get("investments", [{}]) or [{}])[0].get("value", "") | |
active_portfolio = (columns.get("activeInvestedCompanies", [{}]) or [{}])[0].get("value", "") | |
aum = (columns.get("capitalUnderManagement", [{}]) or [{}])[0].get("amount", "") | |
primary_investor_type = (columns.get("primaryInvestorType", [{}]) or [{}])[0].get("value", "") | |
hq_location = (columns.get("hqLocation", [{}]) or [{}])[0].get("value", "") | |
description = (columns.get("description", [{}]) or [{}])[0].get("value", "") | |
# Handle lists for Preferred Industry and Preferred Investment Types | |
preferred_industry = ", ".join([entry.get("value", "") for entry in columns.get("preferredIndustry", [])]) | |
preferred_investment_types = ", ".join([entry.get("value", "") for entry in columns.get("preferredInvestmentTypes", [])]) | |
website = (columns.get("investorWebsite", [{}]) or [{}])[0].get("value", "") | |
primary_contact = (columns.get("primaryContact", [{}]) or [{}])[0].get("name", "") | |
primary_contact_title = (columns.get("primaryContactTitle", [{}]) or [{}])[0].get("value", "") | |
primary_contact_phone = (columns.get("primaryContactPhone", [{}]) or [{}])[0].get("value", "") | |
primary_contact_email = (columns.get("primaryContactEmail", [{}]) or [{}])[0].get("value", "") | |
# Write the extracted row to the CSV file | |
csvwriter.writerow([ | |
investor, investments, active_portfolio, aum, primary_investor_type, hq_location, description, | |
preferred_industry, preferred_investment_types, website, primary_contact, primary_contact_title, | |
primary_contact_phone, primary_contact_email | |
]) | |
print(f"Data extraction complete. CSV file saved as {output_file}") |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import json | |
import csv | |
import os | |
# Load the JSON data from the file | |
input_file_pattern = 'pitchbook-{}.json' | |
output_file_path = 'extracted_deals.csv' | |
# Define the headers for the CSV file | |
headers = [ | |
"Company Name", "Deal Date", "Deal Type", "Deal Size", "Investor Name", | |
"Deal Synopsis", "Financing Status", "Investors", "Primary Industry Code", | |
"Verticals", "Company Website", "HQ Location" | |
] | |
def extract_deal_data(deal): | |
""" | |
Extract relevant fields from a single deal data entry. | |
""" | |
data = deal.get('columnValues', {}) | |
return { | |
"Company Name": (data.get("companyName", [{}]) or [{}])[0].get("name", ""), | |
"Deal Date": (data.get("dealDate", [{}]) or [{}])[0].get("asOfdate", ""), | |
"Deal Type": (data.get("dealType", [{}]) or [{}])[0].get("value", ""), | |
"Deal Size": (data.get("dealSize", [{}]) or [{}])[0].get("amount", ""), | |
"Deal Synopsis": (data.get("dealSynopsis", [{}]) or [{}])[0].get("value", ""), | |
"Financing Status": (data.get("dealFinancingStatus", [{}]) or [{}])[0].get("value", ""), | |
"Investors": ", ".join([investor.get("name", "") for investor in data.get("investors", [])]), | |
"Primary Industry Code": (data.get("primaryIndustryCode", [{}]) or [{}])[0].get("value", ""), | |
"Verticals": ", ".join([vertical.get("value", "") for vertical in data.get("verticals", [])]), | |
"Company Website": (data.get("companyWebsite", [{}]) or [{}])[0].get("value", ""), | |
"HQ Location": (data.get("hqLocation", [{}]) or [{}])[0].get("value", "") | |
} | |
# Open the CSV file for writing | |
with open(output_file_path, 'w', newline='') as csv_file: | |
csv_writer = csv.DictWriter(csv_file, fieldnames=headers) | |
csv_writer.writeheader() | |
# Loop through the files from pitchbook-1.json to pitchbook-21.json | |
for i in range(1, 22): | |
input_file_path = input_file_pattern.format(i) | |
# Check if the file exists | |
if not os.path.exists(input_file_path): | |
print(f"File {input_file_path} not found, skipping...") | |
continue | |
# Read the input JSON file | |
with open(input_file_path, 'r') as json_file: | |
data = json.load(json_file) | |
# Iterate through the JSON data and write each row to the CSV | |
for deal in data.get('dataRows', []): | |
row = extract_deal_data(deal) | |
csv_writer.writerow(row) | |
print(f"Data successfully extracted to {output_file_path}") |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
set -eo pipefail | |
for i in {1..100}; do | |
curl -o "pitchbook-$i.json" "https://example.com/fizz/page/$i?ref=pagination" | |
done |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment