Skip to content

Instantly share code, notes, and snippets.

@koenvo
Last active October 20, 2023 12:52
Show Gist options
  • Save koenvo/e7d521e2867b710a47a8f1255c2d7894 to your computer and use it in GitHub Desktop.
Save koenvo/e7d521e2867b710a47a8f1255c2d7894 to your computer and use it in GitHub Desktop.
import time
# Make sure you have duckdb==0.7.0. Earlier versions might fail with GIL problems ( https://twitter.com/mr_le_fox/status/1620535141675433986 )
import duckdb
import s3fs
from fsspec.implementations.cached import SimpleCacheFileSystem
# Build the plain S3 file system: anonymous access, 100 MiB default block
# size, us-east-1 region. This layer performs no caching by itself.
s3_file_system = s3fs.S3FileSystem(
    anon=True,
    default_block_size=100 * 1024 * 1024,
    client_kwargs={"region_name": "us-east-1"},
)

# Put a local cache (stored under ./tmp/) in front of the S3 file system.
# Note 1: `SimpleCacheFileSystem` is chosen because it caches whole files
#         rather than individual blocks. The block-wise `CachingFileSystem`
#         did not seem to work when the same query is run twice from the
#         same connection.
# Note 2: Some calls on the cache still reach the underlying file system even
#         when a cached item exists. Here that shows up as a few HEAD requests
#         to S3, which take some time; the data itself comes from the cache.
fs = SimpleCacheFileSystem(fs=s3_file_system, cache_storage="./tmp/")

# Open a fresh DuckDB connection and register the cached file system on it.
# DuckDB's fsspec support is described at:
# https://duckdb.org/docs/guides/python/filesystems.html
con = duckdb.connect()
con.register_filesystem(fs)
# The benchmark query, defined once so both timed runs execute exactly the
# same SQL. It is a plain string: there is nothing to interpolate, so no
# f-string prefix is needed.
QUERY = '''
select *
from read_csv_auto('s3://anaconda-public-datasets/gdelt/csv/20150906.export.csv')
limit 10
'''

# Run the query the first time.
# `time.perf_counter()` is used instead of `time.time()` because it is a
# monotonic, high-resolution clock meant for measuring elapsed intervals;
# wall-clock time can jump if the system clock is adjusted.
t0 = time.perf_counter()
# NOTE(review): despite the name, `df` holds whatever `execute` returns, not
# a DataFrame; the DuckDB docs list the return type as the connection itself.
# Call `.df()` or `.fetchall()` on it if the rows are needed - confirm against
# the installed duckdb version.
df = con.execute(QUERY)
t1 = time.perf_counter()

# Same query a second time, on the same connection. This run is expected to
# be served from the local file cache rather than downloaded again.
df = con.execute(QUERY)
t2 = time.perf_counter()

print(f"First run took: {(t1 - t0) * 1000:.1f}ms")
print(f"Second run took: {(t2 - t1) * 1000:.1f}ms")

# Example output recorded by the original author:
#   First run took: 16892.7ms
#   Second run took: 341.2ms
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment