koenvo · October 20, 2023 12:52
diff --git a/duckdb_cached.py b/duckdb_cached.py
 import time

 # Make sure you have duckdb==0.7.0. Earlier versions might fail with GIL problems ( https://twitter.com/mr_le_fox/status/1620535141675433986 )
 import duckdb

 import s3fs

 from fsspec.implementations.cached import SimpleCacheFileSystem

 # Create the S3 file system. This layer performs no caching of its own.
 s3_file_system = s3fs.S3FileSystem(
    anon=True,
    default_block_size=100 * 1024 * 1024,
    client_kwargs={
        'region_name': 'us-east-1'
    }
 )

 # Wrap the S3 file system in a caching file system.
 # Note 1: `SimpleCacheFileSystem` is used because it caches the entire file instead of individual blocks.
 #         The block-wise `CachingFileSystem` doesn't seem to work when the same query is run twice on the same connection.
 # Note 2: Some method calls on the cache still reach the original fs even when a cached item exists.
 #         Here that causes a few HEAD requests to S3, which take some time. The data itself is cached.
 fs = SimpleCacheFileSystem(
    fs=s3_file_system,
    cache_storage="./tmp/"
 )

 # Create a new duckdb connection
 con = duckdb.connect()

 # Register the cached file system. Read more about duckdb fsspec support: https://duckdb.org/docs/guides/python/filesystems.html
 con.register_filesystem(fs)

 # Both runs execute exactly the same statement, so it is defined once.
 # It is a plain string: there is nothing to interpolate, so no f-prefix is needed.
 QUERY = '''
    select *
    from read_csv_auto('s3://anaconda-public-datasets/gdelt/csv/20150906.export.csv')
    limit 10
    '''

 # Run the query the first time.
 # perf_counter() is the clock intended for measuring short durations;
 # time.time() follows the wall clock and can jump if the system time is adjusted.
 # NOTE(review): `df` holds whatever `con.execute` returns; no rows are fetched here.
 #               Confirm that this is the work the benchmark is meant to time.
 t0 = time.perf_counter()
 df = con.execute(QUERY)
 t1 = time.perf_counter()

 # Same query second time; t1 doubles as the start mark for this run.
 df = con.execute(QUERY)
 t2 = time.perf_counter()


 # Report both durations in milliseconds.
 print(f"First run took: {(t1 - t0) * 1000:.1f}ms")
 print(f"Second run took: {(t2 - t1) * 1000:.1f}ms")

 # Sample output:
 """
 First run took: 16892.7ms
 Second run took: 341.2ms
 """
	import time

	# Make sure you have duckdb==0.7.0. Earlier versions might fail with GIL problems ( https://twitter.com/mr_le_fox/status/1620535141675433986 )
	import duckdb

	import s3fs

	from fsspec.implementations.cached import SimpleCacheFileSystem

	# Create the S3 file system. This layer performs no caching of its own.
	s3_file_system = s3fs.S3FileSystem(
	    anon=True,
	    default_block_size=100 * 1024 * 1024,
	    client_kwargs={
	        'region_name': 'us-east-1'
	    }
	)

	# Wrap the S3 file system in a caching file system.
	# Note 1: `SimpleCacheFileSystem` is used because it caches the entire file instead of individual blocks.
	# The block-wise `CachingFileSystem` doesn't seem to work when the same query is run twice on the same connection.
	# Note 2: Some method calls on the cache still reach the original fs even when a cached item exists.
	# Here that causes a few HEAD requests to S3, which take some time. The data itself is cached.
	fs = SimpleCacheFileSystem(
	    fs=s3_file_system,
	    cache_storage="./tmp/"
	)

	# Create a new duckdb connection
	con = duckdb.connect()

	# Register the cached file system. Read more about duckdb fsspec support: https://duckdb.org/docs/guides/python/filesystems.html
	con.register_filesystem(fs)

	# Both runs execute exactly the same statement, so it is defined once.
	# It is a plain string: there is nothing to interpolate, so no f-prefix is needed.
	QUERY = '''
	select *
	from read_csv_auto('s3://anaconda-public-datasets/gdelt/csv/20150906.export.csv')
	limit 10
	'''

	# Run the query the first time.
	# perf_counter() is the clock intended for measuring short durations;
	# time.time() follows the wall clock and can jump if the system time is adjusted.
	# NOTE(review): `df` holds whatever `con.execute` returns; no rows are fetched here.
	# Confirm that this is the work the benchmark is meant to time.
	t0 = time.perf_counter()
	df = con.execute(QUERY)
	t1 = time.perf_counter()

	# Same query second time; t1 doubles as the start mark for this run.
	df = con.execute(QUERY)
	t2 = time.perf_counter()


	# Report both durations in milliseconds.
	print(f"First run took: {(t1 - t0) * 1000:.1f}ms")
	print(f"Second run took: {(t2 - t1) * 1000:.1f}ms")

	# Sample output:
	"""
	First run took: 16892.7ms
	Second run took: 341.2ms
	"""