Skip to content

Instantly share code, notes, and snippets.

@Jdoz
Last active July 28, 2021 20:40
Show Gist options
  • Save Jdoz/a3fff7b3cd59a56a3aa4227a77a05835 to your computer and use it in GitHub Desktop.
Save Jdoz/a3fff7b3cd59a56a3aa4227a77a05835 to your computer and use it in GitHub Desktop.
[Clean Names] Clean Pandas column names #python #pandas #data-cleansing
import re
import unicodedata
from typing import Collection, Dict, Hashable, Iterable, List, Tuple, Union

import pandas as pd
_underscorer1 = re.compile(r"(.)([A-Z][a-z]+)")
_underscorer2 = re.compile("([a-z0-9])([A-Z])")
def _camel2snake(col_name: str) -> str:
"""
Convert camelcase names to snake case.
Implementation taken from: https://gist.github.com/jaytaylor/3660565
by @jtaylor
"""
subbed = _underscorer1.sub(r"\1_\2", col_name)
return _underscorer2.sub(r"\1_\2", subbed).lower()
FIXES = [(r"[ /:,?()\.-]", "_"), (r"['’]", "")]
def _normalize_1(col_name: Hashable) -> str:
result = str(col_name)
for search, replace in FIXES:
result = re.sub(search, replace, result)
return result
def _remove_special(col_name: Hashable) -> str:
"""Remove special characters from column name."""
return "".join(item for item in str(col_name) if item.isalnum() or "_" in item)
def _strip_accents(col_name: str) -> str:
"""
Removes accents from a DataFrame column name.
.. _StackOverflow:
https://stackoverflow.com/questions/517923/what-is-the-best-way-to-remove-accents-in-a-python-unicode-string
""" # noqa: E501
return "".join(
l
for l in unicodedata.normalize("NFD", col_name)
if not unicodedata.combining(l)
)
def _clean_unders(col_name: str) -> str:
col_name = col_name.replace("__", "_")
col_name = col_name[1:] if col_name.startswith("_") else col_name
col_name = col_name[:-1] if col_name.endswith("_") else col_name
return col_name.strip()
def _clean_col_names(
    columns: Iterable[Hashable],
) -> Tuple[List[str], Dict[Hashable, str]]:
    """
    Run every column label through the cleaning pipeline.

    The steps are applied in this order: camelCase to snake_case,
    punctuation normalisation, special-character removal, accent
    stripping, underscore tidy-up.

    :param columns: the original column labels; iterated exactly once, so
        a one-shot iterator is accepted as well as a list.
    :return: ``(cleaned_names, col_map)`` where ``cleaned_names`` lists the
        cleaned name of each label in input order and ``col_map`` maps each
        original label to its cleaned name (for duplicate labels the last
        occurrence wins).
    """
    pipeline = (
        _camel2snake,
        _normalize_1,
        _remove_special,
        _strip_accents,
        _clean_unders,
    )
    cleaned_names: List[str] = []
    col_map: Dict[Hashable, str] = {}
    for original in columns:
        # Labels may be any hashable (ints, tuples, ...); the pipeline steps
        # operate on text, so convert once up front.
        name = str(original)
        for step in pipeline:
            name = step(name)
        cleaned_names.append(name)
        col_map[original] = name
    return cleaned_names, col_map
def clean_names(
    df: pd.DataFrame, orig_cols: bool = False
) -> Union[pd.DataFrame, Tuple[pd.DataFrame, Dict[Hashable, str]]]:
    """
    Clean pandas data column names; this is the function entry point.

    :param df: frame whose column labels should be cleaned. It is not
        modified.
    :param orig_cols: when true, also return the mapping from each original
        label to its cleaned name.
    :return: the renamed frame, or ``(renamed_frame, col_map)`` when
        ``orig_cols`` is true.
    """
    _, col_map = _clean_col_names(df.columns.tolist())
    # ``rename`` already returns a new DataFrame, so no explicit ``copy()``
    # of the input is needed first.
    renamed = df.rename(columns=col_map)
    if orig_cols:
        return renamed, col_map
    return renamed
@baggiponte
Copy link

Congratulations! This should definitely be integrated into pandas (in R there's the great {janitor} package for doing these operations very quickly).

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment