Created
January 19, 2024 18:13
-
-
Save vdavez/44013a19e7bcfb5d8d93d0ae9c6bfd09 to your computer and use it in GitHub Desktop.
Download CSV directly to parquet
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import aiohttp | |
import asyncio | |
import polars as pl | |
import tempfile | |
async def convert_csv_to_parquet(
    url: str, output_file: str, chunk_size: int = 1 << 20
) -> None:
    """Download a CSV from a URL and write it out as a parquet file.

    The response body is streamed to a temporary file in fixed-size chunks,
    so the whole download is never held in memory at once. The temporary
    file is then lazily scanned by polars and sunk to parquet.

    Args:
        url: a string representing the URL of the CSV to be downloaded
        output_file: the location where the parquet file should be saved
            (be sure to include .parquet at the end)
        chunk_size: number of bytes requested from the response stream per
            read while downloading (defaults to 1 MiB)

    Raises:
        aiohttp.ClientResponseError: if the server answers with an HTTP
            error status (4xx/5xx), instead of parsing the error page as CSV.
    """
    # NOTE(review): polars re-opens the temporary file by name while it is
    # still open here. The tempfile docs say this works on Unix but may not
    # on Windows -- confirm before relying on this on Windows.
    with tempfile.NamedTemporaryFile() as fp:
        async with aiohttp.ClientSession() as session:
            async with session.get(url) as resp:
                # Fail fast on HTTP errors rather than converting an error body.
                resp.raise_for_status()
                # Stream the body chunk by chunk instead of resp.read(),
                # which would buffer the entire CSV in memory.
                async for chunk in resp.content.iter_chunked(chunk_size):
                    fp.write(chunk)
        # The HTTP connection is closed at this point; it is not needed for
        # the conversion step.
        # Push buffered bytes to disk so the by-name reader sees the full file.
        fp.flush()
        pl.scan_csv(fp.name, encoding="utf8-lossy").sink_parquet(output_file)
if __name__ == "__main__":
    # Command-line entry point:
    #     python <this_script> <csv_url> <output_file.parquet>
    # Guarded so that importing this module does not trigger a download.
    import sys

    if len(sys.argv) != 3:
        sys.exit(f"usage: {sys.argv[0]} <csv_url> <output_file.parquet>")
    url, output_file = sys.argv[1:3]
    # convert_csv_to_parquet requires both the source URL and the output path.
    asyncio.run(convert_csv_to_parquet(url, output_file))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment