haydenflinner · January 17, 2024 12:38
diff --git a/stats.py b/stats.py
 import pandas as pd

 from dataclasses import dataclass

 @dataclass
 class PositionSample:
     """One observation: the running position of one rider on one lap."""

     lap: int  # lap number the sample belongs to
     rider: str  # rider identifier
     pos: int  # running position on that lap; lower values are further ahead

 # Short alias so the literal sample data stays compact.
 Sample = PositionSample

 # Synthetic 5-rider, 4-lap race. Each string below is one lap's running order,
 # leader first; laps and positions are numbered from 1.
 # Real data should reveal more noise / trends.
 df = pd.DataFrame([
     Sample(lap, rider, pos)
     for lap, running_order in enumerate(
         (
             "abcde",
             "abced",  # <-- d and e traded places
             "adecb",  # b took out c and himself. or c took out b and himself.
             "adecb",  # finishing lap unchanged from prior lap
         ),
         start=1,
     )
     for pos, rider in enumerate(running_order, start=1)
 ])

 # Our goal here is to find if there's a correlation between being near certain riders
 # and changes in Position. For example, maybe a certain rider is known to ride
 # a wide bike and so being just behind him means you have a below average
 # chance of passing. To really weigh that you'd need to correct for speed with
 # something like an Elo rating or maybe just finishing position in the current race.
 # Another example would be that having Jett Lawrence behind you is a recipe
 # for losing one spot by the end of the lap.
 # This may also reveal riders who have a tendency
 # to put other riders far down the results sheet.

 # Neighbours within a lap: order every lap by running position so that a
 # one-row shift inside the lap yields the rider directly ahead / behind.
 # The leader gets NaN for rider_ahead, the last rider NaN for rider_behind.
 df = df.sort_values(by=["lap", "pos"]).assign(
     rider_ahead=lambda d: d.groupby("lap")["rider"].shift(1),
     rider_behind=lambda d: d.groupby("lap")["rider"].shift(-1),
 )

 # History per rider: order every rider's rows by lap so that a one-row shift
 # inside the rider pulls values forward from that rider's previous sampled lap.
 df = df.sort_values(by=["rider", "lap"]).assign(
     prev_pos=lambda d: d.groupby("rider")["pos"].shift(1),
     rider_ahead_last_lap=lambda d: d.groupby("rider")["rider_ahead"].shift(1),
 )

 # Debug aid: display(df.sort_values(by=["lap", "pos"])) to inspect the table here.
 # Rows from a rider's first sampled lap have no prev_pos, so discard them
 # (not a big loss since first laps are especially hectic). Then record the
 # lap-over-lap movement: pos_change < 0 means the rider moved forward,
 # > 0 means places were lost. Finally renumber the index from 0.
 df = (
     df.dropna(subset=["prev_pos"])
     .assign(pos_change=lambda d: d["pos"] - d["prev_pos"])
     .reset_index(drop=True)
 )


 # Show the table in race order. sort_values returns a new frame rather than
 # sorting in place, so the result has to be handed to display() to be seen;
 # df itself keeps its current row order.
 # NOTE(review): display is not imported anywhere in view -- presumably the
 # IPython/Jupyter builtin; confirm this runs in a notebook.
 display(df.sort_values(by=["lap", "pos"]))

 # For each rider X: the largest pos_change (most places lost in one lap)
 # among rows whose rider was directly behind X on the previous lap.
 display(df.groupby("rider_ahead_last_lap")["pos_change"].max())

 import plotly.express as px
 # Box plot of pos_change grouped by who was directly ahead on the previous lap.
 # NOTE(review): nothing calls .show(); this presumably relies on being the
 # final expression of a notebook cell to render -- confirm.
 px.box(df, x='rider_ahead_last_lap', y='pos_change')
	import pandas as pd

	from dataclasses import dataclass

	@dataclass
	class PositionSample:
	    """One observation: the running position of one rider on one lap."""

	    lap: int  # lap number the sample belongs to
	    rider: str  # rider identifier
	    pos: int  # running position on that lap; lower values are further ahead

	# Short alias so the literal sample data stays compact.
	Sample = PositionSample

	# Synthetic 5-rider, 4-lap race. Each string below is one lap's running order,
	# leader first; laps and positions are numbered from 1.
	# Real data should reveal more noise / trends.
	df = pd.DataFrame([
	    Sample(lap, rider, pos)
	    for lap, running_order in enumerate(
	        (
	            "abcde",
	            "abced",  # <-- d and e traded places
	            "adecb",  # b took out c and himself. or c took out b and himself.
	            "adecb",  # finishing lap unchanged from prior lap
	        ),
	        start=1,
	    )
	    for pos, rider in enumerate(running_order, start=1)
	])

	# Our goal here is to find if there's a correlation between being near certain riders
	# and changes in Position. For example, maybe a certain rider is known to ride
	# a wide bike and so being just behind him means you have a below average
	# chance of passing. To really weigh that you'd need to correct for speed with
	# something like an Elo rating or maybe just finishing position in the current race.
	# Another example would be that having Jett Lawrence behind you is a recipe
	# for losing one spot by the end of the lap.
	# This may also reveal riders who have a tendency
	# to put other riders far down the results sheet.

	# Neighbours within a lap: order every lap by running position so that a
	# one-row shift inside the lap yields the rider directly ahead / behind.
	# The leader gets NaN for rider_ahead, the last rider NaN for rider_behind.
	df = df.sort_values(by=["lap", "pos"]).assign(
	    rider_ahead=lambda d: d.groupby("lap")["rider"].shift(1),
	    rider_behind=lambda d: d.groupby("lap")["rider"].shift(-1),
	)

	# History per rider: order every rider's rows by lap so that a one-row shift
	# inside the rider pulls values forward from that rider's previous sampled lap.
	df = df.sort_values(by=["rider", "lap"]).assign(
	    prev_pos=lambda d: d.groupby("rider")["pos"].shift(1),
	    rider_ahead_last_lap=lambda d: d.groupby("rider")["rider_ahead"].shift(1),
	)

	# Debug aid: display(df.sort_values(by=["lap", "pos"])) to inspect the table here.
	# Rows from a rider's first sampled lap have no prev_pos, so discard them
	# (not a big loss since first laps are especially hectic). Then record the
	# lap-over-lap movement: pos_change < 0 means the rider moved forward,
	# > 0 means places were lost. Finally renumber the index from 0.
	df = (
	    df.dropna(subset=["prev_pos"])
	    .assign(pos_change=lambda d: d["pos"] - d["prev_pos"])
	    .reset_index(drop=True)
	)


	# Show the table in race order. sort_values returns a new frame rather than
	# sorting in place, so the result has to be handed to display() to be seen;
	# df itself keeps its current row order.
	# NOTE(review): display is not imported anywhere in view -- presumably the
	# IPython/Jupyter builtin; confirm this runs in a notebook.
	display(df.sort_values(by=["lap", "pos"]))

	# For each rider X: the largest pos_change (most places lost in one lap)
	# among rows whose rider was directly behind X on the previous lap.
	display(df.groupby("rider_ahead_last_lap")["pos_change"].max())

	import plotly.express as px
	# Box plot of pos_change grouped by who was directly ahead on the previous lap.
	# NOTE(review): nothing calls .show(); this presumably relies on being the
	# final expression of a notebook cell to render -- confirm.
	px.box(df, x='rider_ahead_last_lap', y='pos_change')