Last active
February 5, 2024 04:46
-
-
Save t04glovern/8f7766b9b38e3d8b5bde6ca896fc3af1 to your computer and use it in GitHub Desktop.
Example of using the PyIceberg API with an existing Iceberg table (created with https://gist.github.com/t04glovern/04f6f2934353eb1d0fffd487e9b9b6a3).
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3
#
# -- Run other script to create the Iceberg table
#
# pip install boto3
# curl https://gist.githubusercontent.com/t04glovern/04f6f2934353eb1d0fffd487e9b9b6a3/raw \
# > lets-try-iceberg.py \
# && chmod +x lets-try-iceberg.py
# ./lets-try-iceberg.py --table lets_try_iceberg
#
# -- Run this script to connect to the Iceberg table using PyIceberg
#
# pip install pyarrow pyiceberg
# ./lets-try-pyiceberg.py --bucket <bucket-name>
import argparse | |
import logging | |
import os | |
from pyiceberg.catalog import load_catalog | |
# AWS region used both for the process environment and as the catalog's
# 'region_name' property in main().
aws_region: str = "us-west-2"

# Required for pyarrow to work with S3
os.environ.update({'AWS_DEFAULT_REGION': aws_region})

# Emit INFO-level records from the root logger so the script's output shows.
logging.basicConfig(level=logging.INFO)
def main(
    bucket_name: str,
    *,
    namespace: str = 'default',
    table_name: str = 'lets_try_iceberg',
    limit: int = 10,
) -> None:
    """Load a Glue-backed catalog and log a small sample of an Iceberg table.

    The function logs, in order: the catalog's namespaces, the tables found
    in ``namespace``, and the result of scanning ``table_name`` converted
    with ``to_arrow()``.

    Args:
        bucket_name: S3 bucket name; interpolated into the ``s3://...`` value
            passed as the catalog's ``uri`` property.
        namespace: Namespace whose tables are listed and from which the
            table is loaded. Defaults to ``'default'``.
        table_name: Name of the table to load and scan. Defaults to
            ``'lets_try_iceberg'``.
        limit: Value forwarded as the scan's ``limit``. Defaults to ``10``.
    """
    # Configure the catalog
    catalog = load_catalog(
        'default',
        **{
            'type': 'glue',
            'uri': f's3://{bucket_name}',
            'region_name': aws_region,
        },
    )

    # List the namespaces
    namespaces = catalog.list_namespaces()
    logging.info('Namespaces: %s', namespaces)

    # List the tables in the requested namespace
    tables = catalog.list_tables(namespace)
    logging.info('Tables: %s', tables)

    # Load the table, identified by (namespace, table name)
    table = catalog.load_table((namespace, table_name))

    # Scan only the columns of interest, capped at `limit`
    sample = table.scan(
        selected_fields=('id', 'timestamp', 'speed', 'temperature', 'location'),
        limit=limit,
    ).to_arrow()

    # Log the results
    logging.info('Scan: %s', sample)
if __name__ == '__main__':
    # Command-line interface: a single mandatory --bucket option.
    cli = argparse.ArgumentParser(description="PyIceberg sample script")
    cli.add_argument(
        "--bucket",
        required=True,
        type=str,
        help="The S3 bucket name where the existing Iceberg table is stored.",
    )

    # Parse the arguments and pass the bucket name straight to the entry point.
    main(cli.parse_args().bucket)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment