Last active
September 30, 2024 21:11
-
-
Save GarrettMooney/38a02a8b3ffe760faa03a81d7df40152 to your computer and use it in GitHub Desktop.
Query Athena, return a Polars DataFrame, and subquery it via DuckDB
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# /// script
# requires-python = ">=3.12"
# dependencies = [
#   "duckdb",
#   "polars",
#   "pyarrow",
#   "pyathena",
#   "s3fs",
# ]
# ///
import duckdb
import polars as pl
from pyathena import connect
from pyathena.arrow.cursor import ArrowCursor
class Config:
    """Connection settings used when the module-level Athena cursor is built."""

    # Passed to pyathena's `connect` as `s3_staging_dir`; placeholder value
    # that must be replaced with a real S3 location before use.
    staging: str = "s3://<XXX>/"
    # Passed to pyathena's `connect` as `region_name`; placeholder value.
    region: str = "<YYY>"
# Single shared cursor, created once at import time from the Config values.
# ArrowCursor is requested so results can be fetched with `as_arrow()`.
cursor = connect(
    s3_staging_dir=Config.staging,
    region_name=Config.region,
    cursor_class=ArrowCursor,
).cursor()
def query_athena(query: str) -> pl.DataFrame:
    """Run ``query`` on Athena and return the result as a Polars DataFrame.

    The statement is executed on the module-level ``cursor``; its result is
    fetched with ``as_arrow()`` and converted with ``pl.from_arrow``.

    Args:
        query: SQL text passed unchanged to ``cursor.execute``.

    Returns:
        The query result converted to a ``pl.DataFrame``.
    """
    arrow_result = cursor.execute(query).as_arrow()
    return pl.from_arrow(arrow_result)
def query_polars(query: str) -> pl.DataFrame:
    """Run ``query`` through DuckDB and return the result as a Polars DataFrame.

    Assumes the query refers to an in-memory dataframe.

    NOTE(review): DuckDB resolves Python variable names in SQL via replacement
    scans, and which stack frames it searches depends on the DuckDB version
    and settings. Confirm that a dataframe held in the *caller's* local scope
    is visible from inside this wrapper; module-level dataframes are the safe
    case.

    Args:
        query: SQL text passed unchanged to ``duckdb.sql``.

    Returns:
        The DuckDB result materialised with ``.pl()``.
    """
    return duckdb.sql(query).pl()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment