Skip to content

Instantly share code, notes, and snippets.

@GarrettMooney
Last active September 30, 2024 21:11
Show Gist options
  • Save GarrettMooney/38a02a8b3ffe760faa03a81d7df40152 to your computer and use it in GitHub Desktop.
Save GarrettMooney/38a02a8b3ffe760faa03a81d7df40152 to your computer and use it in GitHub Desktop.
Query Athena, return the result as a Polars DataFrame, then run sub-queries on it via DuckDB
# /// script
# requires-python = ">=3.12"
# dependencies = [
# "duckdb",
# "polars",
# "pyarrow",
# "pyathena",
# "s3fs",
# ]
# ///
import duckdb
import polars as pl
from pyathena import connect
from pyathena.arrow.cursor import ArrowCursor
class Config:
    """Connection settings used when opening the Athena connection."""

    # Passed to pyathena's ``connect`` as ``s3_staging_dir``.
    # Placeholder value: replace with a real S3 location before use.
    staging: str = "s3://<XXX>/"
    # Passed to pyathena's ``connect`` as ``region_name``.
    # Placeholder value: replace with a real AWS region before use.
    region: str = "<YYY>"
# Module-level cursor shared by query_athena. It is created once, when the
# module is loaded, with ArrowCursor so results can be read via ``as_arrow()``.
cursor = connect(
    s3_staging_dir=Config.staging,
    region_name=Config.region,
    cursor_class=ArrowCursor,
).cursor()
def query_athena(query: str) -> pl.DataFrame:
    """Run ``query`` on Athena and return the result as a polars DataFrame.

    The statement is executed through the module-level ``cursor``; the result
    is read with ``as_arrow()`` and converted with ``pl.from_arrow``.

    Args:
        query: SQL text to execute on Athena.

    Returns:
        The query result as produced by ``pl.from_arrow``.
    """
    arrow_result = cursor.execute(query).as_arrow()
    return pl.from_arrow(arrow_result)
def query_polars(query: str) -> pl.DataFrame:
    """Run ``query`` with DuckDB and return the result as a polars DataFrame.

    Assumes the tables named in ``query`` are in-memory dataframes that DuckDB
    can resolve on its own; nothing is registered explicitly here.

    NOTE(review): which Python scopes DuckDB searches for such dataframes
    presumably depends on the DuckDB version/settings -- confirm that a
    dataframe defined in the caller's local scope is still visible through
    this wrapper.

    Args:
        query: SQL text to execute with ``duckdb.sql``.

    Returns:
        The query result converted via the relation's ``.pl()`` method.
    """
    # Kept as one expression so this frame holds no locals besides ``query``.
    return duckdb.sql(query).pl()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment