Created November 14, 2022 at 15:19
Save avriiil/2bf65bfcecaa66d5d870a502ad28e1ec to your computer and use it in GitHub Desktop.
Download Kaggle NYC bike data for Dask benchmarking
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import json | |
from pathlib import Path | |
import duckdb | |
import kaggle | |
from loguru import logger | |
from tqdm import tqdm | |
# In order to access this data, you must create a Kaggle account and obtain an API key. | |
# You can obtain a key by clicking on your icon on the upper right of the homepage, | |
# clicking Account, and then clicking to create a new API token. | |
# This will allow you to download a kaggle.json credentials file which contains the API key. | |
# Move the kaggle.json file that you downloaded during the Kaggle setup to ~/.kaggle/kaggle.json. | |
def csv_to_parquet(infile: str, outfile: str, sample_size: int = 512_000) -> None:
    """Convert a single CSV file to a Parquet file using DuckDB.

    Parameters
    ----------
    infile:
        Path to the input CSV file.
    outfile:
        Path where the Parquet file is written.
    sample_size:
        Row sample size passed to DuckDB's ``read_csv``. Column types are
        pinned below, so this mainly affects CSV dialect sniffing.
    """
    # We cannot autodetect columns due to some messiness in the data. For example,
    # station_id is mostly an integer except for some weird stations that have a long
    # UUID-esque ID. These types of stations do not appear in every file.
    columns = {
        "station_id": "VARCHAR",
        "num_bikes_available": "INTEGER",
        "num_ebikes_available": "VARCHAR",
        "num_bikes_disabled": "VARCHAR",
        "num_docks_available": "INTEGER",
        "num_docks_disabled": "VARCHAR",
        "is_installed": "VARCHAR",
        "is_renting": "VARCHAR",
        "is_returning": "VARCHAR",
        "station_status_last_reported": "INTEGER",
        "station_name": "VARCHAR",
        "lat": "VARCHAR",
        "lon": "VARCHAR",
        "region_id": "VARCHAR",
        "capacity": "VARCHAR",
        "has_kiosk": "VARCHAR",
        "station_information_last_updated": "VARCHAR",
        "missing_station_information": "BOOLEAN",
    }
    # DuckDB's read_csv expects a struct literal with single-quoted strings,
    # so serialize the dict to JSON and swap the quote characters.
    # (Keep the string under a new name instead of shadowing the dict.)
    columns_sql = json.dumps(columns, indent=2).replace('"', "'")
    # NOTE(review): the file paths are interpolated directly into the SQL
    # string — fine for this local benchmarking script, but paths containing
    # a single quote would break the query.
    query = f"""
    COPY (
        SELECT *
        FROM read_csv(
            '{infile}',
            columns={columns_sql},
            sample_size={sample_size},
            header=True,
            auto_detect=False
        )
    )
    TO '{outfile}'
    (FORMAT 'PARQUET');
    """
    # Always close the connection, even if the COPY fails.
    conn = duckdb.connect(":memory:")
    try:
        conn.execute(query)
    finally:
        conn.close()
def construct_dataset(root: Path, kaggle_dataset_name: str) -> None:
    """Download a Kaggle dataset and convert each of its CSV files to Parquet.

    CSVs are downloaded into ``root/csv`` and Parquet files are written to
    ``root/parquet`` with the same stem. Requires Kaggle API credentials in
    ``~/.kaggle/kaggle.json`` (see the setup comments at the top of the file).

    Parameters
    ----------
    root:
        Directory under which the ``csv`` and ``parquet`` subdirectories
        are created. A leading ``~`` is expanded.
    kaggle_dataset_name:
        Kaggle dataset identifier, e.g. ``"owner/dataset-name"``.
    """
    # pathlib does NOT expand "~" automatically; without this, a literal
    # "~" directory is created relative to the current working directory.
    root = root.expanduser()

    # Download the dataset from Kaggle.
    logger.info("Downloading {!r} dataset from kaggle.", kaggle_dataset_name)
    download_path = root / "csv"
    download_path.mkdir(parents=True, exist_ok=True)
    kaggle.api.dataset_download_files(kaggle_dataset_name, download_path, unzip=True)

    # Convert the dataset to parquet. Glob only *.csv so stray files (e.g. a
    # leftover archive) are not fed to the CSV reader; sort for a stable order.
    logger.info("Converting dataset from CSV to parquet.")
    parquet_path = root / "parquet"
    parquet_path.mkdir(parents=True, exist_ok=True)
    csvs = sorted(download_path.glob("*.csv"))
    for csv in tqdm(csvs):
        outfile = parquet_path / csv.with_suffix(".parquet").name
        csv_to_parquet(csv.as_posix(), outfile.as_posix())
# Generate the dataset when run as a script.
if __name__ == "__main__":
    # Path("~/...") is not expanded by pathlib; expanduser() is required or a
    # literal "~" directory gets created in the current working directory.
    # construct_dataset returns None, so there is nothing useful to assign.
    construct_dataset(
        Path("~/Desktop/data_kaggle").expanduser(),
        "rosenthal/citi-bike-stations",
    )
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment.