Last active
July 28, 2021 20:40
-
-
Save Jdoz/a3fff7b3cd59a56a3aa4227a77a05835 to your computer and use it in GitHub Desktop.
[Clean Names] Clean Pandas column names #python #pandas #data-cleansing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import re
import unicodedata
from typing import Collection, Dict, Hashable, List, Tuple, Union

import pandas as pd
# Inserts "_" before a capital letter that is followed by lowercase letters
# ("fooBar" -> "foo_Bar", "HTTPResponse" -> "HTTP_Response").
_underscorer1 = re.compile(r"(.)([A-Z][a-z]+)")
# Inserts "_" between a lowercase letter or digit and a following capital
# ("fooB" -> "foo_B").
_underscorer2 = re.compile(r"([a-z0-9])([A-Z])")


def _camel2snake(col_name: Hashable) -> str:
    """
    Convert a camelCase / PascalCase name to lower-case snake_case.

    Implementation taken from: https://gist.github.com/jaytaylor/3660565
    by @jtaylor

    Args:
        col_name: The column label. Any value is accepted; it is converted
            with ``str()`` before the regular expressions are applied.

    Returns:
        The lower-cased name with underscores inserted at case boundaries.
    """
    # Column labels are not guaranteed to be strings (e.g. integer labels);
    # the compiled patterns raise TypeError on anything but str.
    subbed = _underscorer1.sub(r"\1_\2", str(col_name))
    return _underscorer2.sub(r"\1_\2", subbed).lower()
# (pattern, replacement) pairs applied in order by ``_normalize_1``:
# separators/punctuation become "_", apostrophes are dropped.
FIXES = [(r"[ /:,?()\.-]", "_"), (r"['’]", "")]


def _normalize_1(col_name: Hashable) -> str:
    """Apply each ``FIXES`` substitution, in order, to the stringified name."""
    text = str(col_name)
    for pattern, replacement in FIXES:
        text = re.compile(pattern).sub(replacement, text)
    return text
def _remove_special(col_name: Hashable) -> str:
    """Keep only alphanumeric characters and underscores of the stringified name."""
    kept = []
    for char in str(col_name):
        if char == "_" or char.isalnum():
            kept.append(char)
    return "".join(kept)
def _strip_accents(col_name: str) -> str:
    """
    Return the column name with Unicode combining marks (accents) removed.

    The name is decomposed with NFD normalisation, then every character that
    ``unicodedata.combining`` reports as a combining mark is dropped.

    .. _StackOverflow:
    https://stackoverflow.com/questions/517923/what-is-the-best-way-to-remove-accents-in-a-python-unicode-string
    """  # noqa: E501
    decomposed = unicodedata.normalize("NFD", col_name)
    base_chars = filter(lambda char: unicodedata.combining(char) == 0, decomposed)
    return "".join(base_chars)
def _clean_unders(col_name: str) -> str:
    """
    Tidy the underscores in a column name.

    Every run of two or more underscores is collapsed to a single one, all
    leading and trailing underscores are removed, and surrounding whitespace
    is stripped.

    Args:
        col_name: The name to tidy.

    Returns:
        The name with no repeated, leading or trailing underscores.
    """
    # A single ``replace("__", "_")`` pass only halves each run, so "a___b"
    # would come out as "a__b"; the regex collapses a run of any length.
    collapsed = re.sub(r"_{2,}", "_", col_name)
    return collapsed.strip("_").strip()
def _clean_col_names(
    columns: Collection[Hashable],
) -> Tuple[List[str], Dict[Hashable, str]]:
    """
    Run every column label through the cleaning pipeline.

    Each label is converted to ``str`` and then passed, in order, through
    ``_camel2snake``, ``_normalize_1``, ``_remove_special``,
    ``_strip_accents`` and ``_clean_unders``.

    Args:
        columns: The original column labels.

    Returns:
        A ``(cleaned_names, col_map)`` pair. ``cleaned_names`` lists the
        cleaned names in input order; ``col_map`` maps each original label
        to its cleaned name.
    """
    # Materialise once so a one-shot iterable is not consumed twice.
    original_columns = list(columns)
    pipeline = (
        _camel2snake,
        _normalize_1,
        _remove_special,
        _strip_accents,
        _clean_unders,
    )
    cleaned_names = []
    col_map = {}
    for original in original_columns:
        # Labels may be non-string (e.g. ints); the pipeline works on str.
        cleaned = str(original)
        for step in pipeline:
            cleaned = step(cleaned)
        cleaned_names.append(cleaned)
        col_map[original] = cleaned
    return cleaned_names, col_map
def clean_names(
    df: pd.DataFrame, orig_cols: bool = False
) -> Union[pd.DataFrame, Tuple[pd.DataFrame, Dict[Hashable, str]]]:
    """
    Clean pandas data column names, this is the function entry point.

    Args:
        df: The DataFrame whose column names should be cleaned. It is not
            modified.
        orig_cols: When true, also return the mapping from each original
            column label to its cleaned name.

    Returns:
        A new DataFrame with renamed columns, or a
        ``(DataFrame, col_map)`` tuple when ``orig_cols`` is true.
    """
    _, col_map = _clean_col_names(df.columns.tolist())
    # ``rename`` returns a new DataFrame, so the caller's frame stays
    # untouched without a separate up-front copy.
    cleaned = df.rename(columns=col_map)
    if orig_cols:
        return cleaned, col_map
    return cleaned
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Congratulations! This should definitely be integrated into Pandas (in R there's this great
{janitor}
package for doing these operations very quickly)