Skip to content

Instantly share code, notes, and snippets.

@ian-r-rose
Created April 5, 2022 18:33
Show Gist options
  • Save ian-r-rose/2f518380e186f5787d1ad6f298cddd9a to your computer and use it in GitHub Desktop.
Save ian-r-rose/2f518380e186f5787d1ad6f298cddd9a to your computer and use it in GitHub Desktop.
import os
import dask
import s3fs
# Run dask on its threaded scheduler with num_workers set to 2.
dask.config.set({"num_workers": 2, "scheduler": "threads"})

# Anonymous S3 access (anon=True), so no credentials are supplied.
fs = s3fs.S3FileSystem(anon=True)

# Every parquet object matched under the 2009 prefix of the taxi bucket.
paths = fs.glob("s3://ursa-labs-taxi-data/2009/**.parquet")
@dask.delayed
def download(path):
    """Fetch one S3 object to a temporary local file, then delete it.

    The function is wrapped in ``dask.delayed``, so calling it only builds
    a lazy task; the transfer happens when the task is computed.

    Args:
        path: Object path without the ``s3://`` scheme (the scheme is
            prepended here), e.g. an entry returned by ``fs.glob``.

    Raises:
        Whatever ``fs.download`` raises; any partially written local file
        is removed before the exception propagates.
    """
    # Local name is the last two path components joined with a dot, so
    # files with the same basename in different folders do not collide.
    out = ".".join(path.split("/")[-2:])
    try:
        fs.download("s3://" + path, out)
    except Exception:
        # Do not leave a partial file behind when the transfer fails.
        try:
            os.remove(out)
        except FileNotFoundError:
            pass
        raise
    # The data is only needed for timing the transfer, not kept on disk.
    os.remove(out)
    print(f"Downloaded {path} to {out}")
if __name__ == "__main__":
    import time

    # Build one lazy task per remote file up front, so task construction
    # is not included in the timed region below.
    delayeds = [download(path) for path in paths]

    # perf_counter is monotonic, so the measured interval cannot be skewed
    # by system clock adjustments the way time.time() can.
    start = time.perf_counter()
    dask.compute(*delayeds)
    elapsed = time.perf_counter() - start

    # Hard-coded byte count; presumably the combined size of the files
    # matched by the glob -- verify against the bucket before trusting
    # the reported throughput.
    size = 5627404555
    print(f"Speed (MB/s): {size / elapsed / 10**6}")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment