julian-west · June 19, 2022 12:11
diff --git a/hash_split.py b/hash_split.py
 # Number of buckets hashed values are spread across; used as the default
 # modulus when assigning a bucket.
 BUCKETS = 10
 # Fraction of the buckets reserved for the test set.
 TEST_RATIO = 0.1

 def assign_hash_bucket(value: Any, total_buckets: int = BUCKETS) -> int:
     """Map an arbitrary value onto a bucket index via hashing.

     The value is converted to its string form, fingerprinted with
     ``farmhash.fingerprint64`` and reduced modulo ``total_buckets``.

     Args:
         value: the item to place in a bucket; only ``str(value)`` is hashed
         total_buckets: the modulus applied to the fingerprint
     Returns:
         The fingerprint of ``str(value)`` modulo ``total_buckets``
     """
     text = str(value)
     return farmhash.fingerprint64(text) % total_buckets


 def test_set_check(bucket: int) -> bool:
     """Decide whether a bucket belongs to the test set.

     The rule is arbitrary and can be swapped for one that suits your own
     requirements. Here a bucket is part of the test set when its number is
     strictly below ``TEST_RATIO * BUCKETS``.

     Args:
         bucket: bucket number to check
     Returns:
         True when the bucket is below the cutoff, False otherwise
     """
     cutoff = TEST_RATIO * BUCKETS
     return bucket < cutoff
  
  
 def hash_train_test_split(
     df: pd.DataFrame,
     split_col: str,
     approx_test_ratio: float,
 ) -> tuple[pd.DataFrame, pd.DataFrame]:
     """Split the data into a training and test set based on a specific column

     Every value in ``split_col`` is mapped to a bucket with
     ``assign_hash_bucket``. A row goes to the test set when its bucket lies in
     the lowest ``approx_test_ratio`` fraction of the ``BUCKETS`` buckets.

     This function adds an additional ``"bucket"`` column to ``df`` in place.
     This is for demonstration purposes and is not required; the test set
     check could be completed without storing the column.

     Args:
         df: original dataset (modified in place: gains a "bucket" column)
         split_col: name of the column to use for hashing which uniquely
             identifies a datapoint
         approx_test_ratio: float between 0-1. This is an approximate ratio as
             the hashing algo will not necessarily provide a uniform bucket
             distribution for small datasets
     Returns:
         A tuple of two dataframes, the first is the training set and the second
         is the test set
     Raises:
         ValueError: if ``approx_test_ratio`` is not between 0 and 1
     """
     if not 0 <= approx_test_ratio <= 1:
         raise ValueError(
             f"approx_test_ratio must be between 0 and 1, got {approx_test_ratio!r}"
         )

     # assign bucket
     df["bucket"] = df[split_col].apply(assign_hash_bucket)

     # Build the boolean train/test mask from the caller's ratio. Compare
     # bucket / BUCKETS with the ratio instead of bucket with ratio * BUCKETS:
     # 0.3 * 10 evaluates to 3.0000000000000004, which would wrongly admit
     # bucket 3, whereas 3 / 10 compares equal to 0.3.
     in_test_set = df["bucket"] / BUCKETS < approx_test_ratio

     return df[~in_test_set], df[in_test_set]
	# Number of buckets hashed values are spread across; used as the default
	# modulus when assigning a bucket.
	BUCKETS = 10
	# Fraction of the buckets reserved for the test set.
	TEST_RATIO = 0.1

	def assign_hash_bucket(value: Any, total_buckets: int = BUCKETS) -> int:
	    """Assign a bucket to an input value using a hashing algorithm

	    The value is converted to a string, fingerprinted with
	    ``farmhash.fingerprint64`` and reduced modulo ``total_buckets``.

	    Args:
	        value: the item to place in a bucket; only ``str(value)`` is hashed
	        total_buckets: the modulus applied to the fingerprint
	    Returns:
	        The fingerprint of ``str(value)`` modulo ``total_buckets``
	    """
	    hashed_value = farmhash.fingerprint64(str(value))
	    return hashed_value % total_buckets


	def test_set_check(bucket: int) -> bool:
	    """Check if the bucket should be included in the test set

	    This is an arbitrary function, you could change this for your own
	    requirements.

	    In this case, the datapoint is assigned to the test set if the bucket
	    number is less than the test ratio x total buckets
	    (``TEST_RATIO * BUCKETS``).

	    Args:
	        bucket: bucket number to check
	    Returns:
	        True when the bucket is below the cutoff, False otherwise
	    """
	    return bucket < TEST_RATIO * BUCKETS


	def hash_train_test_split(
	    df: pd.DataFrame,
	    split_col: str,
	    approx_test_ratio: float,
	) -> tuple[pd.DataFrame, pd.DataFrame]:
	    """Split the data into a training and test set based on a specific column

	    Every value in ``split_col`` is mapped to a bucket with
	    ``assign_hash_bucket``. A row goes to the test set when its bucket lies in
	    the lowest ``approx_test_ratio`` fraction of the ``BUCKETS`` buckets.

	    This function adds an additional ``"bucket"`` column to ``df`` in place.
	    This is for demonstration purposes and is not required; the test set
	    check could be completed without storing the column.

	    Args:
	        df: original dataset (modified in place: gains a "bucket" column)
	        split_col: name of the column to use for hashing which uniquely
	            identifies a datapoint
	        approx_test_ratio: float between 0-1. This is an approximate ratio as
	            the hashing algo will not necessarily provide a uniform bucket
	            distribution for small datasets
	    Returns:
	        A tuple of two dataframes, the first is the training set and the second
	        is the test set
	    Raises:
	        ValueError: if ``approx_test_ratio`` is not between 0 and 1
	    """
	    if not 0 <= approx_test_ratio <= 1:
	        raise ValueError(
	            f"approx_test_ratio must be between 0 and 1, got {approx_test_ratio!r}"
	        )

	    # assign bucket
	    df["bucket"] = df[split_col].apply(assign_hash_bucket)

	    # Build the boolean train/test mask from the caller's ratio. Compare
	    # bucket / BUCKETS with the ratio instead of bucket with ratio * BUCKETS:
	    # 0.3 * 10 evaluates to 3.0000000000000004, which would wrongly admit
	    # bucket 3, whereas 3 / 10 compares equal to 0.3.
	    in_test_set = df["bucket"] / BUCKETS < approx_test_ratio

	    return df[~in_test_set], df[in_test_set]