Last active
October 19, 2022 14:09
-
-
Save krassowski/159bb0c76ff47edb031710e2cff6862f to your computer and use it in GitHub Desktop.
Simple pandas DataFrame explorer for JupyterLab (using sidecar)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from pandas import read_csv

from pandas_explorer import pandas_explorer

# Load the iris sample dataset and open it in an explorer sidecar titled "Iris".
iris = read_csv('https://raw.githubusercontent.com/mwaskom/seaborn-data/master/iris.csv')
pandas_explorer(iris, title='Iris')
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Copyright (c) 2021 Michał Krassowski.
# Distributed under the terms of the Modified BSD License.
from sidecar import Sidecar | |
from ipywidgets import widgets | |
from IPython.display import display, update_display, HTML | |
from types import SimpleNamespace | |
from pandas import DataFrame, option_context | |
import string | |
import re | |
def pandas_explorer(data: DataFrame, title='Explorer', default_rows=30, drop_index=False):
    """Open an interactive explorer for ``data`` in a JupyterLab sidecar.

    The explorer shows a text filter (optionally "fuzzy"), sliders limiting
    the number of displayed rows and columns, a sort selector and a checkbox
    toggling the index. The table is re-rendered whenever a control changes.

    Args:
        data: Frame to explore. Its index is reset before display.
        title: Title of the sidecar panel.
        default_rows: Initial value of the "Max rows" slider.
        drop_index: Passed as ``drop`` to ``DataFrame.reset_index``; when
            false the original index is kept as a regular column.
    """
    data = data.reset_index(drop=drop_index)
    # Any punctuation character or a space separates fuzzy search terms.
    separators = re.compile(
        '|'.join(re.escape(x) for x in string.punctuation + ' ')
    )
    table_widgets = SimpleNamespace(
        row_filter=widgets.Text(description='Rows filter'),
        row_filter_fuzzy=widgets.Checkbox(description='Fuzzy', value=True),
        show_index=widgets.Checkbox(description='Index', value=False),
        max_rows=widgets.IntSlider(value=default_rows, description='Max rows', min=0, max=len(data)),
        max_columns=widgets.IntSlider(value=10, description='Max columns', min=0, max=len(data.columns)),
        sort_column=widgets.Dropdown(options=[None, *data.columns], description='Sort'),
        sort_ascending=widgets.Checkbox(description='Ascending')
    )
    # Value of the "Max rows" slider before a filter shrank the table below
    # it; used to restore the slider once more rows are available again.
    filtered_from = None

    def split(text: str):
        """Split ``text`` on punctuation/spaces, dropping empty pieces."""
        return [v for v in separators.split(text) if v]

    def search_terms(row_filter: str, fuzzy: bool):
        """Return the terms a cell has to contain to match ``row_filter``."""
        if fuzzy:
            parts = split(row_filter)
            if parts:
                return parts
        # Exact mode, or a fuzzy filter made of separators only:
        # search for the text as typed.
        return [row_filter]

    def show_frame(
        row_filter: str, row_filter_fuzzy: bool,
        show_index: bool, max_rows: int, max_columns: int,
        sort_column: str, sort_ascending: bool
    ):
        """Render the table for the current widget values and return the display result."""
        nonlocal filtered_from
        df = data
        terms = search_terms(row_filter, row_filter_fuzzy) if row_filter else []

        if terms:
            # todo case sensitivity option?
            lowered = [term.lower() for term in terms]

            def cell_matches(cell: str):
                cell = cell.lower()
                return all(term in cell for term in lowered)

            # Keep a row when at least one of its cells contains every term.
            mask = (
                df.astype(str)
                .apply(lambda column: column.map(cell_matches))
                .any(axis=1)
                .astype(bool)
            )
            df = df[mask]
            if len(df) < max_rows and not filtered_from:
                filtered_from = max_rows

            # One case-insensitive pass over all terms (longest first), so the
            # highlighting agrees with the matching above and never touches
            # the markup it has just inserted.
            pattern = re.compile(
                '|'.join(
                    re.escape(term)
                    for term in sorted(terms, key=len, reverse=True)
                ),
                re.IGNORECASE
            )

            def highlight_matches(value):
                return pattern.sub(r'<b>\g<0></b>', str(value))
        else:
            def highlight_matches(value):
                return value

        hidden_columns = max(len(df.columns) - max_columns, 0)
        notes = []
        if max_rows < len(df):
            notes.append(f'{len(df) - max_rows} rows hidden')
        if hidden_columns:
            notes.append(f'{hidden_columns} columns hidden')

        if sort_column is not None:
            df = df.sort_values(sort_column, ascending=sort_ascending)

        # Positional slicing keeps exactly the first `max_columns` columns,
        # including when column labels are not unique.
        visible = df.head(max_rows).iloc[:, :max_columns]

        try:
            styled = (
                visible.style
                .format(highlight_matches)
                .set_caption(' '.join(notes))
            )
            if not show_index:
                # Styler.hide_index() was removed in pandas 2.0;
                # Styler.hide() replaces it since pandas 1.4.
                if hasattr(styled, 'hide'):
                    styled = styled.hide(axis='index')
                else:
                    styled = styled.hide_index()
            displayed = display(styled)
        except ValueError as e:
            if 'style is not supported for non-unique indices' not in str(e):
                raise
            # Styling is unavailable for this frame: fall back to the plain
            # representation of the same visible rows and columns.
            with option_context('display.max_rows', max_rows):
                displayed = display(visible)

        if filtered_from is not None and len(df) != 0:
            current_value = min(filtered_from, len(df))
            if len(df) > filtered_from:
                filtered_from = None
            # Raise the upper bound first, otherwise the slider would clamp
            # the restored value.
            table_widgets.max_rows.max = current_value
            table_widgets.max_rows.value = current_value
        # NOTE(review): kept unconditional so the slider range always follows
        # the number of rows left after filtering - confirm this is intended.
        table_widgets.max_rows.max = len(df)
        return displayed

    sc = Sidecar(title=title)
    # https://github.com/jupyter-widgets/jupyterlab-sidecar/issues/25
    sc_out = widgets.Output(layout={'overflow': 'scroll', 'max_width': '100%'})
    with sc:
        display(sc_out)
    with sc_out:
        out = widgets.interactive_output(
            show_frame,
            vars(table_widgets)
        )
        ui = widgets.VBox([
            widgets.HBox([table_widgets.row_filter, table_widgets.row_filter_fuzzy]),
            widgets.HBox([table_widgets.max_rows, table_widgets.max_columns]),
            widgets.HBox([
                table_widgets.sort_column,
                table_widgets.sort_ascending,
                table_widgets.show_index,
            ]),
        ])
        out_box = widgets.Box(
            [out],
            # sadly hard-coded to allow for scroll as sidecar has layout issues
            layout=widgets.Layout(max_width='600px', max_height='1300px')
        )
        display(
            widgets.VBox([
                ui,
                out_box
            ])
        )
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment