@jacobtomlinson
Last active January 15, 2025 16:01
Using Coiled to run cudf.pandas workloads on the cloud

cudf.pandas Coiled Demo

The RAPIDS cudf.pandas accelerator allows you to leverage the power of NVIDIA GPU acceleration in your pandas workflows.

Scripts that use pandas can be run via the cudf.pandas module to accelerate your code with zero code changes.

python my_code.py  # Uses the CPU
python -m cudf.pandas my_code.py  # Same pandas code uses the GPU

But what if you don't have a GPU? That's where Coiled comes in. With the coiled run tool you can execute scripts from your local machine on a cloud VM with whatever hardware you choose. You will be billed only for what you use and the VM will shut down again when the script completes.

coiled run python my_code.py  # Boots a VM on the cloud, runs the script, then shuts down again

We can tie these two tools together to GPU-accelerate your code on the cloud without having to change your code or where it lives.

To demonstrate this, let's run the attached pandas code to load some data and perform some standard dataframe operations including value_counts, groupby, and sort_values.
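To get a flavour of the kind of operations involved, here is a minimal sketch on a tiny synthetic dataframe (the column names mirror the attached script; the data itself is made up):

```python
import pandas as pd

# Tiny synthetic stand-in for the NYC parking violations data (made up).
df = pd.DataFrame({
    "Registration State": ["NY", "NY", "NJ", "NJ", "NY"],
    "Violation Description": ["A", "A", "B", "A", "B"],
})

# Most common violation per state: count (state, violation) pairs,
# then keep the top row within each state.
top = (df[["Registration State", "Violation Description"]]
       .value_counts()
       .groupby("Registration State")
       .head(1)
       .sort_index()
       .reset_index())
print(top)
```

The same chain appears in the attached script, just over ~10 million real rows instead of five synthetic ones.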

$ coiled run --gpu --name rapids-demo --keepalive 5m --container nvcr.io/nvidia/rapidsai/base:24.10-cuda12.5-py3.12 -- python cudf_pandas_coiled_demo.py
╭────────────────── Running python cudf_pandas_coiled_demo.py ─────────────────╮
│                                                                              │
│ Details: https://cloud.coiled.io/clusters/xxxxxx?account=xxxxxxxx            │
│                                                                              │
│ Ready  ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━          │
│                                                                              │
│ Environment:                                                                 │
│ base_24_10-cuda12_5-py3_12-x86_64-xxxxxx                                     │
│ Region:   us-east-1                 Uptime:                           3m 19s │
│ VM Type:  g4dn.xlarge               Approx cloud cost:              $0.53/hr │
│                                                                              │
╰──────────────────────────────────────────────────────────────────────────────╯

Output
------

This container image and its contents are governed by the NVIDIA Deep Learning Container License.
By pulling and using the container, you accept the terms and conditions of this license:
https://developer.download.nvidia.com/licenses/NVIDIA_Deep_Learning_Container_License.pdf

Calculate violations by state took: 25.128 seconds
Calculate violations by vehicle type took: 7.279 seconds
Calculate violations by day of week took: 22.253 seconds

In our coiled run command we specified --gpu to select a GPU VM type; this chose a g4dn.xlarge on AWS, but we could also have specified a VM type manually. We set --keepalive 5m to tell Coiled to keep our VM around for five minutes after the script completes, which makes it easy to reuse the VM by running another script. We also explicitly specified the latest RAPIDS container with --container nvcr.io/nvidia/rapidsai/base:24.10-cuda12.5-py3.12. By default Coiled syncs your local software environment to the remote machine, but in this case we explicitly want a GPU software environment rather than our local one.

The first time we run this script it takes a couple of minutes to boot the VM, but after that our pandas computations run in just under a minute. We can run the script as many times as we like within the next five minutes and it will reuse the same VM.

Next let's run the script via python -m cudf.pandas to tell pandas to use the GPU.

$ coiled run --gpu --name rapids-demo --keepalive 5m --container nvcr.io/nvidia/rapidsai/base:24.10-cuda12.5-py3.12 -- python -m cudf.pandas cudf_pandas_coiled_demo.py
╭────────── Running python -m cudf.pandas cudf_pandas_coiled_demo.py ──────────╮
│                                                                              │
│ Details: https://cloud.coiled.io/clusters/xxxxxx?account=xxxxxxxx            │
│                                                                              │
│ Ready  ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━          │
│                                                                              │
│ Environment:                                                                 │
│ base_24_10-cuda12_5-py3_12-x86_64-xxxxxx                                     │
│ Region:   us-east-1                 Uptime:                           8m 55s │
│ VM Type:  g4dn.xlarge               Approx cloud cost:              $0.53/hr │
│                                                                              │
╰──────────────────────────────────────────────────────────────────────────────╯

Output
------

This container image and its contents are governed by the NVIDIA Deep Learning Container License.
By pulling and using the container, you accept the terms and conditions of this license:
https://developer.download.nvidia.com/licenses/NVIDIA_Deep_Learning_Container_License.pdf

Calculate violations by state took: 3.470 seconds
Calculate violations by vehicle type took: 0.145 seconds
Calculate violations by day of week took: 1.238 seconds

This time we can see that our code took less than 5 seconds to run!

coiled run ... -- python cudf_pandas_coiled_demo.py  # 60 seconds of computation
coiled run --gpu ... -- python -m cudf.pandas cudf_pandas_coiled_demo.py  # 5 seconds of computation
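For reference, a quick back-of-the-envelope comparison of the per-operation timings reported in the two runs above:

```python
# Timings copied from the two runs above (seconds).
cpu = {"state": 25.128, "vehicle type": 7.279, "day of week": 22.253}
gpu = {"state": 3.470, "vehicle type": 0.145, "day of week": 1.238}

for op in cpu:
    speedup = cpu[op] / gpu[op]
    print(f"violations by {op}: {speedup:.1f}x faster on GPU")
```

The groupby-count over vehicle body types sees the largest speedup, at roughly 50x.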

cudf_pandas_coiled_demo.py
--------------------------

import pandas as pd
from time import perf_counter
from contextlib import contextmanager


@contextmanager
def timeit(name):
    """Utility context manager to print out how long things take."""
    start = perf_counter()
    yield lambda: perf_counter() - start
    print(f"{name} took: {perf_counter() - start:.3f} seconds")


# Read in the NYC parking violations dataset for 2022
df = pd.read_parquet(
    "s3://rapidsai-data/datasets/nyc_parking/nyc_parking_violations_2022.parquet",
    columns=["Registration State", "Violation Description", "Vehicle Body Type", "Issue Date", "Summons Number"],
)

with timeit("Calculate violations by state"):
    for _ in range(10):
        (df[["Registration State", "Violation Description"]]  # get only these two columns
         .value_counts()  # get the count of offences per state and per type of offence
         .groupby("Registration State")  # group by state
         .head(1)  # get the first row in each group (the type of offence with the largest count)
         .sort_index()  # sort by state name
         .reset_index()
         )

with timeit("Calculate violations by vehicle type"):
    for _ in range(10):
        (df
         .groupby(["Vehicle Body Type"])
         .agg({"Summons Number": "count"})
         .rename(columns={"Summons Number": "Count"})
         .sort_values(["Count"], ascending=False)
         )

with timeit("Calculate violations by day of week"):
    for _ in range(10):
        weekday_names = {
            0: "Monday",
            1: "Tuesday",
            2: "Wednesday",
            3: "Thursday",
            4: "Friday",
            5: "Saturday",
            6: "Sunday",
        }
        df["Issue Date"] = df["Issue Date"].astype("datetime64[ms]")
        df["issue_weekday"] = df["Issue Date"].dt.weekday.map(weekday_names)
        df.groupby(["issue_weekday"])["Summons Number"].count().sort_values()
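The timeit helper in the script also yields a callable for reading the elapsed time mid-block, which the demo itself doesn't use. A minimal standalone sketch of that pattern (stdlib only):

```python
from time import perf_counter, sleep
from contextlib import contextmanager


@contextmanager
def timeit(name):
    """Print how long the enclosed block took."""
    start = perf_counter()
    yield lambda: perf_counter() - start  # callable for mid-block elapsed time
    print(f"{name} took: {perf_counter() - start:.3f} seconds")


with timeit("sleep demo") as elapsed:
    sleep(0.1)
    mid = elapsed()  # elapsed time so far, in seconds
```

Because the context manager prints on exit, any block wrapped in `with timeit(...)` reports its wall-clock duration automatically, which is how the per-operation timings above were produced.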