thunderpoot · March 2, 2025 12:45
diff --git a/readme.txt b/readme.txt
 Parquet Example Programs
 ========================

 These example programs demonstrate simple interactions with Parquet files using Python.

 1. `write_parquet.py`: This program generates a small example Parquet file. It shows the ease of creating Parquet files with Python using the `pandas` library.
 2. `read_parquet.py`: This program reads and displays the contents of the example Parquet file generated by `write_parquet.py`.
 3. `describe_parquet.py`: This program demonstrates how to inspect a Parquet file and report information such as its file size, row count, column count, and column names, using the `pyarrow` library.
 4. `filter_parquet.py`: This program demonstrates efficient filtering of a Parquet file with the `pyarrow` library, applying "predicate pushdown" so that only the matching rows are loaded.

 Dependencies:
 - Python 3.x
 - `pandas` library
 - `pyarrow` library

 For more information about Parquet files and the libraries, please refer to the official documentation:
 - parquet: https://parquet.apache.org/documentation/latest/
 - pandas: https://pandas.pydata.org/
 - pyarrow: https://arrow.apache.org/docs/python/index.html
diff --git a/describe_parquet.py b/describe_parquet.py
 import os
 import pyarrow.parquet as pq

 def describe_parquet(file_path):
    """Print the size, row/column counts and column names of a Parquet file.

    The row count and column names are taken from the file's metadata
    (``pq.read_metadata`` / ``pq.read_schema``) instead of loading the whole
    table, so describing a very large file stays cheap.

    Args:
        file_path: Path to the Parquet file to describe.
    """
    file_size = os.path.getsize(file_path)
    print(f"File Size: {file_size} bytes")

    # The metadata already records the row count and the schema, so there is
    # no need to materialise every row with pq.read_table().
    num_rows = pq.read_metadata(file_path).num_rows
    columns = pq.read_schema(file_path).names

    print(f"Number of rows: {num_rows}")
    print(f"Number of columns: {len(columns)}")

    print("Columns:")
    for column in columns:
        print(column)

 describe_parquet("example.parquet")

 # Output:
 # File Size: 3557 bytes
 # Number of rows: 7
 # Number of columns: 4
 # Columns:
 # Captain
 # Actor
 # Ship
 # Quote
diff --git a/filter_parquet.py b/filter_parquet.py
 import pyarrow.parquet as pq

 # To extract entries from a Parquet file where the Ship column exactly matches `USS Enterprise-D`
 # without loading the entire file into memory, you can use the filters argument in PyArrow's `read_table`
 # function to apply "predicate pushdown".  This method allows you to specify conditions that are used to
 # filter data during the read operation, which can significantly reduce memory usage by only loading the
 # relevant subset of data.  This is useful when dealing with Common Crawl's indexes, because they're huge!

 # Predicate pushdown: describe the rows we want as (column, operator, value)
 # tuples; here, only rows whose 'Ship' column is 'USS Enterprise-D'.
 filters = [
    ('Ship', '=', 'USS Enterprise-D'),
 ]

 # Hand the filters to read_table so the filtering happens during the read,
 # rather than loading a monstrously large file and filtering it afterwards.
 table = pq.read_table(
    'example.parquet',
    filters=filters,
 )

 # Optional: a pandas DataFrame is easier to view and manipulate
 filtered_df = table.to_pandas()
 print(filtered_df)

 # Output:
 #            Captain            Actor              Ship         Quote
 # 0  Jean-Luc Picard  Patrick Stewart  USS Enterprise-D   Make it so.
 # 1   Edward Jellico        Ronny Cox  USS Enterprise-D  Get it done.
diff --git a/read_parquet.py b/read_parquet.py
 import pandas as pd

 # Load the example Parquet file into a pandas DataFrame
 df = pd.read_parquet('example.parquet')

 # Print a heading, then the DataFrame itself starting on the next line
 print("Contents of the Parquet file:", df, sep="\n")


 # Output:
 # Contents of the Parquet file:
 #             Captain            Actor              Ship                            Quote
 # 0     James T. Kirk  William Shatner    USS Enterprise              Beam me up, Scotty!
 # 1   Jean-Luc Picard  Patrick Stewart  USS Enterprise-D                      Make it so.
 # 2    Benjamin Sisko     Avery Brooks      Deep Space 9                 It's a faaaaake!
 # 3   Kathryn Janeway     Kate Mulgrew       USS Voyager   There's coffee in that nebula.
 # 4   Jonathan Archer     Scott Bakula  Enterprise NX-01  We're not out here to play God.
 # 5  William T. Riker  Jonathan Frakes         USS Titan         I love surprise parties.
 # 6    Edward Jellico        Ronny Cox  USS Enterprise-D                     Get it done.
diff --git a/write_parquet.py b/write_parquet.py
 import pandas as pd

 # Column headings, in the order they appear in the file
 columns = ('Captain', 'Actor', 'Ship', 'Quote')

 # One record per captain: (captain, actor, ship, quote)
 records = [
    ('James T. Kirk', 'William Shatner', 'USS Enterprise', 'Beam me up, Scotty!'),
    ('Jean-Luc Picard', 'Patrick Stewart', 'USS Enterprise-D', 'Make it so.'),
    ('Benjamin Sisko', 'Avery Brooks', 'Deep Space 9', "It's a faaaaake!"),
    ('Kathryn Janeway', 'Kate Mulgrew', 'USS Voyager', "There's coffee in that nebula."),
    ('Jonathan Archer', 'Scott Bakula', 'Enterprise NX-01', "We're not out here to play God."),
    ('William T. Riker', 'Jonathan Frakes', 'USS Titan', 'I love surprise parties.'),
    ('Edward Jellico', 'Ronny Cox', 'USS Enterprise-D', 'Get it done.'),
 ]

 # Pivot the records into the column-name -> list-of-values mapping
 data = {name: list(values) for name, values in zip(columns, zip(*records))}
 df = pd.DataFrame(data)

 # Save the DataFrame as a Parquet file, leaving out the pandas index
 df.to_parquet('example.parquet', index=False)

 print("Parquet file 'example.parquet' has been created successfully.")
	Parquet Example Programs
	========================

	These example programs demonstrate simple interactions with Parquet files using Python.

	1. `write_parquet.py`: This program generates a small example Parquet file. It shows the ease of creating Parquet files with Python using the `pandas` library.
	2. `read_parquet.py`: This program reads and displays the contents of the example Parquet file generated by `write_parquet.py`.
	3. `describe_parquet.py`: This program demonstrates how to inspect a Parquet file and report information such as its file size, row count, column count, and column names, using the `pyarrow` library.
	4. `filter_parquet.py`: This program demonstrates efficient filtering of a Parquet file with the `pyarrow` library, applying "predicate pushdown" so that only the matching rows are loaded.

	Dependencies:
	- Python 3.x
	- `pandas` library
	- `pyarrow` library

	For more information about Parquet files and the libraries, please refer to the official documentation:
	- parquet: https://parquet.apache.org/documentation/latest/
	- pandas: https://pandas.pydata.org/
	- pyarrow: https://arrow.apache.org/docs/python/index.html
	import os
	import pyarrow.parquet as pq

	def describe_parquet(file_path):
	    """Print the size, row/column counts and column names of a Parquet file.

	    The row count and column names are taken from the file's metadata
	    (``pq.read_metadata`` / ``pq.read_schema``) instead of loading the whole
	    table, so describing a very large file stays cheap.

	    Args:
	        file_path: Path to the Parquet file to describe.
	    """
	    file_size = os.path.getsize(file_path)
	    print(f"File Size: {file_size} bytes")

	    # The metadata already records the row count and the schema, so there is
	    # no need to materialise every row with pq.read_table().
	    num_rows = pq.read_metadata(file_path).num_rows
	    columns = pq.read_schema(file_path).names

	    print(f"Number of rows: {num_rows}")
	    print(f"Number of columns: {len(columns)}")

	    print("Columns:")
	    for column in columns:
	        print(column)

	describe_parquet("example.parquet")

	# Output:
	# File Size: 3557 bytes
	# Number of rows: 7
	# Number of columns: 4
	# Columns:
	# Captain
	# Actor
	# Ship
	# Quote
	import pyarrow.parquet as pq

	# To extract entries from a Parquet file where the Ship column exactly matches `USS Enterprise-D`
	# without loading the entire file into memory, you can use the filters argument in PyArrow's `read_table`
	# function to apply "predicate pushdown". This method allows you to specify conditions that are used to
	# filter data during the read operation, which can significantly reduce memory usage by only loading the
	# relevant subset of data. This is useful when dealing with Common Crawl's indexes, because they're huge!

	# Predicate pushdown: describe the rows we want as (column, operator, value)
	# tuples; here, only rows whose 'Ship' column is 'USS Enterprise-D'.
	filters = [
	    ('Ship', '=', 'USS Enterprise-D'),
	]

	# Hand the filters to read_table so the filtering happens during the read,
	# rather than loading a monstrously large file and filtering it afterwards.
	table = pq.read_table(
	    'example.parquet',
	    filters=filters,
	)

	# Optional: a pandas DataFrame is easier to view and manipulate
	filtered_df = table.to_pandas()
	print(filtered_df)

	# Output:
	#            Captain            Actor              Ship         Quote
	# 0  Jean-Luc Picard  Patrick Stewart  USS Enterprise-D   Make it so.
	# 1   Edward Jellico        Ronny Cox  USS Enterprise-D  Get it done.
	import pandas as pd

	# Load the example Parquet file into a pandas DataFrame
	df = pd.read_parquet('example.parquet')

	# Print a heading, then the DataFrame itself starting on the next line
	print("Contents of the Parquet file:", df, sep="\n")


	# Output:
	# Contents of the Parquet file:
	#             Captain            Actor              Ship                            Quote
	# 0     James T. Kirk  William Shatner    USS Enterprise              Beam me up, Scotty!
	# 1   Jean-Luc Picard  Patrick Stewart  USS Enterprise-D                      Make it so.
	# 2    Benjamin Sisko     Avery Brooks      Deep Space 9                 It's a faaaaake!
	# 3   Kathryn Janeway     Kate Mulgrew       USS Voyager   There's coffee in that nebula.
	# 4   Jonathan Archer     Scott Bakula  Enterprise NX-01  We're not out here to play God.
	# 5  William T. Riker  Jonathan Frakes         USS Titan         I love surprise parties.
	# 6    Edward Jellico        Ronny Cox  USS Enterprise-D                     Get it done.
	import pandas as pd

	# Column headings, in the order they appear in the file
	columns = ('Captain', 'Actor', 'Ship', 'Quote')

	# One record per captain: (captain, actor, ship, quote)
	records = [
	    ('James T. Kirk', 'William Shatner', 'USS Enterprise', 'Beam me up, Scotty!'),
	    ('Jean-Luc Picard', 'Patrick Stewart', 'USS Enterprise-D', 'Make it so.'),
	    ('Benjamin Sisko', 'Avery Brooks', 'Deep Space 9', "It's a faaaaake!"),
	    ('Kathryn Janeway', 'Kate Mulgrew', 'USS Voyager', "There's coffee in that nebula."),
	    ('Jonathan Archer', 'Scott Bakula', 'Enterprise NX-01', "We're not out here to play God."),
	    ('William T. Riker', 'Jonathan Frakes', 'USS Titan', 'I love surprise parties.'),
	    ('Edward Jellico', 'Ronny Cox', 'USS Enterprise-D', 'Get it done.'),
	]

	# Pivot the records into the column-name -> list-of-values mapping
	data = {name: list(values) for name, values in zip(columns, zip(*records))}
	df = pd.DataFrame(data)

	# Save the DataFrame as a Parquet file, leaving out the pandas index
	df.to_parquet('example.parquet', index=False)

	print("Parquet file 'example.parquet' has been created successfully.")