Last active
October 20, 2023 12:52
-
-
Save koenvo/e7d521e2867b710a47a8f1255c2d7894 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Benchmark: run the same DuckDB query twice against a CSV file on S3 through
# a whole-file fsspec cache, and print how long each run took.
import time

# Make sure you have duckdb==0.7.0. Earlier versions might fail with GIL problems
# ( https://twitter.com/mr_le_fox/status/1620535141675433986 )
import duckdb
import s3fs
from fsspec.implementations.cached import SimpleCacheFileSystem

# The query is identical for both runs, so it is defined once. It is a plain
# string (no f-prefix) because it contains no placeholders.
QUERY = '''
select *
from read_csv_auto('s3://anaconda-public-datasets/gdelt/csv/20150906.export.csv')
limit 10
'''

# Create the S3 file system. This one does not have caching.
s3_file_system = s3fs.S3FileSystem(
    anon=True,  # the bucket is public, so no credentials are supplied
    default_block_size=100 * 1024 * 1024,  # 100 MiB
    client_kwargs={
        'region_name': 'us-east-1',
    },
)

# Wrap the S3 file system in a caching file system.
# Note 1: We use `SimpleCacheFileSystem` because it caches the entire file
#         instead of caching per block. The regular `CachingFileSystem`
#         (block-wise cache) doesn't seem to work when the same query is
#         run twice from the same connection.
# Note 2: Some method calls on the cache still reach the original file system
#         even when a cache item exists. Here that causes a few HEAD requests
#         to S3, which take some time. The data itself is served from cache.
fs = SimpleCacheFileSystem(
    fs=s3_file_system,
    cache_storage="./tmp/",
)

# Create a new duckdb connection.
con = duckdb.connect()

# Register the cached file system. Read more about duckdb fsspec support:
# https://duckdb.org/docs/guides/python/filesystems.html
con.register_filesystem(fs)

# Run the query the first time (cold cache: the file is downloaded from S3).
# `perf_counter` is used instead of `time.time()` because it is monotonic and
# intended for measuring short intervals.
# NOTE: `execute` alone does not produce a DataFrame; call `.df()` or
# `.fetchall()` on the result if the rows themselves are needed.
t0 = time.perf_counter()
df = con.execute(QUERY)
t1 = time.perf_counter()

# Same query a second time (warm cache).
df = con.execute(QUERY)
t2 = time.perf_counter()

print(f"First run took: {(t1 - t0) * 1000:.1f}ms")
print(f"Second run took: {(t2 - t1) * 1000:.1f}ms")

"""
First run took: 16892.7ms
Second run took: 341.2ms
"""
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment