Skip to content

Instantly share code, notes, and snippets.

@grej
Created November 13, 2018 15:28
Show Gist options
  • Save grej/6ea218e6921011d10d0aa7ac844ac21e to your computer and use it in GitHub Desktop.
Save grej/6ea218e6921011d10d0aa7ac844ac21e to your computer and use it in GitHub Desktop.
Rank encode a dataframe column
"""
USAGE:
>>> replace_with_rank_indexes(dataframe, column_name)
or
>>> df.pipe(replace_with_rank_indexes, column='column_name')
Returns a copy of the dataframe in which the designated numeric column is
replaced by the dense rank of each value (1 = smallest; equal values share a rank)
"""
import pandas as pd
import numpy as np
import numba
@numba.njit
def _replace_over_vector(rank_map):
    """
    Assign dense, 1-based ranks from a (position, value) table.

    :param rank_map: 2-D array with one row per element. Column 0 holds the
        slot in the output that the row's rank is written to (truncated to
        an int before use); column 1 holds the value being ranked. Rows are
        expected in ascending order of column 1, because the rank only
        advances when a value exceeds the previous row's value.
    :return: 1-D int32 array with one entry per row of ``rank_map``. Equal
        values share a rank and each new larger value raises it by one.
    """
    n_rows = rank_map.shape[0]
    ranks = np.empty(n_rows, np.int32)
    if n_rows == 0:
        # Nothing to rank; returning here avoids reading rank_map[0] on an
        # empty table.
        return ranks
    # initialize to lowest value (first row of the sorted table)
    prev_val = rank_map[0, 1]
    current_rank_idx = 1
    for i in range(n_rows):
        value = rank_map[i, 1]
        if value > prev_val:
            prev_val = value
            current_rank_idx += 1
        # Column 0 is stored as a float; int() truncates it back to the slot.
        ranks[int(rank_map[i, 0])] = current_rank_idx
    return ranks
def replace_with_rank_indexes(df, column=None):
    """
    Return a copy of ``df`` with ``column`` replaced by dense integer ranks.

    The smallest value gets rank 1, equal values share a rank, and every
    further distinct value raises the rank by one. NaN values are ranked
    after all other values (``na_option='bottom'``) so the result can always
    be stored as integers.
    -------------------------------------------------
    :param df: a pandas dataframe; it is not modified
    :param column: the column to replace with rank values. MUST be numeric
    :return: a copy of the dataframe in which the designated column holds
             the int32 rank of each of its original values
    :raises ValueError: if ``column`` is None
    __________________________________________________
    """
    if column is None:
        raise ValueError('method requires column value')
    df = df.copy()
    # Series.rank ranks rows by position, so the result does not depend on
    # the index labels being 0..n-1, and the values are never mixed into a
    # float array together with the labels.
    dense_ranks = df[column].rank(method='dense', na_option='bottom')
    # Assign the raw array so the ranks are written back positionally.
    df[column] = dense_ranks.astype(np.int32).values
    return df
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment