AnalogJ · October 19, 2024 00:57
diff --git a/extract-investor.py b/extract-investor.py
 import json
 import csv
 import os

 # Define the input files and output file
 input_files = [f'pitchbook-{i}.json' for i in range(1, 12)]
 output_file = 'investors_data.csv'

 # Define the headers for the CSV file
 headers = [
    "Investor", "Investments", "Active Portfolio", "AUM", "Primary Investor Type", "HQ Location", "Description", 
    "Preferred Industry", "Preferred Investment Types", "Website", "Primary Contact", "Primary Contact Title", 
    "Primary Contact Phone", "Primary Contact Email"
 ]

 # Open the CSV file for writing
 with open(output_file, 'w', newline='') as csvfile:
    csvwriter = csv.writer(csvfile)
    csvwriter.writerow(headers)
    
    # Loop through each file
    for input_file in input_files:
        # Load data from the JSON file
        if os.path.exists(input_file):
            with open(input_file, 'r') as f:
                data = json.load(f)
                
                # Extract the relevant data rows
                data_rows = data.get("dataRows", [])
                
                # Loop through each data row to extract necessary fields
                for row in data_rows:
                    columns = row.get("columnValues", {})
                    
                    investor = (columns.get("investorName", [{}]) or [{}])[0].get("name", "")
                    investments = (columns.get("investments", [{}]) or [{}])[0].get("value", "")
                    active_portfolio = (columns.get("activeInvestedCompanies", [{}]) or [{}])[0].get("value", "")
                    aum = (columns.get("capitalUnderManagement", [{}]) or [{}])[0].get("amount", "")
                    primary_investor_type = (columns.get("primaryInvestorType", [{}]) or [{}])[0].get("value", "")
                    hq_location = (columns.get("hqLocation", [{}]) or [{}])[0].get("value", "")
                    description = (columns.get("description", [{}]) or [{}])[0].get("value", "")
                    
                    # Handle lists for Preferred Industry and Preferred Investment Types
                    preferred_industry = ", ".join([entry.get("value", "") for entry in columns.get("preferredIndustry", [])])
                    preferred_investment_types = ", ".join([entry.get("value", "") for entry in columns.get("preferredInvestmentTypes", [])])
                    
                    website = (columns.get("investorWebsite", [{}]) or [{}])[0].get("value", "")
                    primary_contact = (columns.get("primaryContact", [{}]) or [{}])[0].get("name", "")
                    primary_contact_title = (columns.get("primaryContactTitle", [{}]) or [{}])[0].get("value", "")
                    primary_contact_phone = (columns.get("primaryContactPhone", [{}]) or [{}])[0].get("value", "")
                    primary_contact_email = (columns.get("primaryContactEmail", [{}]) or [{}])[0].get("value", "")
                    
                    # Write the extracted row to the CSV file
                    csvwriter.writerow([
                        investor, investments, active_portfolio, aum, primary_investor_type, hq_location, description, 
                        preferred_industry, preferred_investment_types, website, primary_contact, primary_contact_title, 
                        primary_contact_phone, primary_contact_email
                    ])

 print(f"Data extraction complete. CSV file saved as {output_file}")
diff --git a/extract.py b/extract.py
 import json
 import csv
 import os

 # Load the JSON data from the file
 input_file_pattern = 'pitchbook-{}.json'
 output_file_path = 'extracted_deals.csv'

 # Define the headers for the CSV file
 headers = [
    "Company Name", "Deal Date", "Deal Type", "Deal Size", "Investor Name",
    "Deal Synopsis", "Financing Status", "Investors", "Primary Industry Code",
    "Verticals", "Company Website", "HQ Location"
 ]

 def extract_deal_data(deal):
    """
    Extract relevant fields from a single deal data entry.
    """
    data = deal.get('columnValues', {})
    return {
        "Company Name": (data.get("companyName", [{}]) or [{}])[0].get("name", ""),
        "Deal Date": (data.get("dealDate", [{}]) or [{}])[0].get("asOfdate", ""),
        "Deal Type": (data.get("dealType", [{}]) or [{}])[0].get("value", ""),
        "Deal Size": (data.get("dealSize", [{}]) or [{}])[0].get("amount", ""),
        "Deal Synopsis": (data.get("dealSynopsis", [{}]) or [{}])[0].get("value", ""),
        "Financing Status": (data.get("dealFinancingStatus", [{}]) or [{}])[0].get("value", ""),
        "Investors": ", ".join([investor.get("name", "") for investor in data.get("investors", [])]),
        "Primary Industry Code": (data.get("primaryIndustryCode", [{}]) or [{}])[0].get("value", ""),
        "Verticals": ", ".join([vertical.get("value", "") for vertical in data.get("verticals", [])]),
        "Company Website": (data.get("companyWebsite", [{}]) or [{}])[0].get("value", ""),
        "HQ Location": (data.get("hqLocation", [{}]) or [{}])[0].get("value", "")
    }

 # Open the CSV file for writing
 with open(output_file_path, 'w', newline='') as csv_file:
    csv_writer = csv.DictWriter(csv_file, fieldnames=headers)
    csv_writer.writeheader()

    # Loop through the files from pitchbook-1.json to pitchbook-21.json
    for i in range(1, 22):
        input_file_path = input_file_pattern.format(i)
        
        # Check if the file exists
        if not os.path.exists(input_file_path):
            print(f"File {input_file_path} not found, skipping...")
            continue

        # Read the input JSON file
        with open(input_file_path, 'r') as json_file:
            data = json.load(json_file)

        # Iterate through the JSON data and write each row to the CSV
        for deal in data.get('dataRows', []):
            row = extract_deal_data(deal)
            csv_writer.writerow(row)

 print(f"Data successfully extracted to {output_file_path}")
diff --git a/test.sh b/test.sh
 set -eo pipefail

 for i in {1..100}; do
  curl -o "pitchbook-$i.json" "https://example.com/fizz/page/$i?ref=pagination"
 done
	import json
	import csv
	import os

	# Define the input files and output file
	input_files = [f'pitchbook-{i}.json' for i in range(1, 12)]
	output_file = 'investors_data.csv'

	# Define the headers for the CSV file
	headers = [
	"Investor", "Investments", "Active Portfolio", "AUM", "Primary Investor Type", "HQ Location", "Description",
	"Preferred Industry", "Preferred Investment Types", "Website", "Primary Contact", "Primary Contact Title",
	"Primary Contact Phone", "Primary Contact Email"
	]

	# Open the CSV file for writing
	with open(output_file, 'w', newline='') as csvfile:
	csvwriter = csv.writer(csvfile)
	csvwriter.writerow(headers)

	# Loop through each file
	for input_file in input_files:
	# Load data from the JSON file
	if os.path.exists(input_file):
	with open(input_file, 'r') as f:
	data = json.load(f)

	# Extract the relevant data rows
	data_rows = data.get("dataRows", [])

	# Loop through each data row to extract necessary fields
	for row in data_rows:
	columns = row.get("columnValues", {})

	investor = (columns.get("investorName", [{}]) or [{}])[0].get("name", "")
	investments = (columns.get("investments", [{}]) or [{}])[0].get("value", "")
	active_portfolio = (columns.get("activeInvestedCompanies", [{}]) or [{}])[0].get("value", "")
	aum = (columns.get("capitalUnderManagement", [{}]) or [{}])[0].get("amount", "")
	primary_investor_type = (columns.get("primaryInvestorType", [{}]) or [{}])[0].get("value", "")
	hq_location = (columns.get("hqLocation", [{}]) or [{}])[0].get("value", "")
	description = (columns.get("description", [{}]) or [{}])[0].get("value", "")

	# Handle lists for Preferred Industry and Preferred Investment Types
	preferred_industry = ", ".join([entry.get("value", "") for entry in columns.get("preferredIndustry", [])])
	preferred_investment_types = ", ".join([entry.get("value", "") for entry in columns.get("preferredInvestmentTypes", [])])

	website = (columns.get("investorWebsite", [{}]) or [{}])[0].get("value", "")
	primary_contact = (columns.get("primaryContact", [{}]) or [{}])[0].get("name", "")
	primary_contact_title = (columns.get("primaryContactTitle", [{}]) or [{}])[0].get("value", "")
	primary_contact_phone = (columns.get("primaryContactPhone", [{}]) or [{}])[0].get("value", "")
	primary_contact_email = (columns.get("primaryContactEmail", [{}]) or [{}])[0].get("value", "")

	# Write the extracted row to the CSV file
	csvwriter.writerow([
	investor, investments, active_portfolio, aum, primary_investor_type, hq_location, description,
	preferred_industry, preferred_investment_types, website, primary_contact, primary_contact_title,
	primary_contact_phone, primary_contact_email
	])

	print(f"Data extraction complete. CSV file saved as {output_file}")
	set -eo pipefail

	for i in {1..100}; do
	curl -o "pitchbook-$i.json" "https://example.com/fizz/page/$i?ref=pagination"
	done