Skip to content

Instantly share code, notes, and snippets.

@asehmi
Created December 14, 2021 15:18
Show Gist options
  • Save asehmi/6c69c8db650dd561f48fca855f4de91f to your computer and use it in GitHub Desktop.
Save asehmi/6c69c8db650dd561f48fca855f4de91f to your computer and use it in GitHub Desktop.
Chaining Streamlit forms and passing values using session state API
import streamlit as st
from streamlit_pandas_profiling import st_profile_report
from streamlit import session_state as session
import numpy as np
import pandas as pd
from pandas_profiling import ProfileReport
# Seed the per-session "form submitted" flags on the first run only, so that
# values set by an earlier form submission survive later script reruns.
for _flag in ('data_uploader_submitted', 'cluster_duplicates_submitted'):
    if _flag not in session:
        session[_flag] = False
def cluster_duplicates(df, col_name, dis_num, dis_non_alphanum, sim, aff):
    """Display the selected column name, a preview of *df* and the column's unique values.

    Args:
        df: Data frame to inspect.
        col_name: Name of the column in *df* whose unique values are shown.
        dis_num: Accepted but not used by this implementation.
        dis_non_alphanum: Accepted but not used by this implementation.
        sim: Accepted but not used by this implementation.
        aff: Accepted but not used by this implementation.
    """
    st.write(col_name)

    preview = df.head()
    st.write(preview)

    distinct_values = df[col_name].unique()
    st.write(distinct_values)
@st.experimental_memo(show_spinner=True, persist='disk')
def get_profile_report(file_info, df):
    """Return a ``ProfileReport`` for *df* built with explorative, lazy and minimal enabled.

    Args:
        file_info: Not used in the body.
            NOTE(review): presumably passed so that it contributes to the
            memo cache key - confirm.
        df: Data frame to profile.
    """
    return ProfileReport(df, explorative=True, lazy=True, minimal=True)
def profiler(file, delim):
    """Load the uploaded file, show its profile report and render the clustering form.

    Args:
        file: Uploaded file object (from the ``upload`` widget), or ``None``
            when nothing has been uploaded yet. Its ``name``, ``type`` and
            ``size`` attributes are displayed.
        delim: Delimiter label formatted as ``"<name> (<separator>)"``,
            e.g. ``"comma (,)"``. Only used for non-``.xlsx`` files.

    Side effects:
        Stores the parsed frame in ``session['df']`` and sets
        ``session.cluster_duplicates_submitted`` to ``True`` when the
        "Cluster Duplicates" button is pressed.
    """
    # The upload form can be submitted (or the file removed later) without a
    # file being present; bail out instead of crashing on ``None``.
    if file is None:
        st.warning("Please upload a file before profiling.")
        return

    if file.name.lower().endswith(".xlsx"):
        # The uploader accepts .xlsx, which read_csv cannot parse.
        df = pd.read_excel(file)
    else:
        # Take the text between the parentheses of the label. For the tab
        # label this is the two characters backslash + "t", which the python
        # engine interprets as a regular expression matching a tab.
        delimiter = delim.split(" ")[1][1:-1]
        df = pd.read_csv(file, sep=delimiter, engine="python")
    session['df'] = df

    file_info = {"Filename": file.name, "FileType": file.type, "FileSize": file.size}
    st.write(file_info)

    pr = get_profile_report(file_info, df)
    st_profile_report(pr)

    # The widget values are not captured here: each widget has a key, and the
    # caller reads the selections back from session state under those keys.
    cluster_duplicates_form = st.form(key="cluster_duplicates")
    with cluster_duplicates_form:
        st.selectbox("Select column for clustering", list(df.columns), key="col_name")
        st.checkbox("discard_numeric", key="dis_num")
        st.checkbox("discard_nonalpha_numeric", key="dis_non_alphanum")
        st.radio(
            label="Select Similarity Measure",
            options=[
                "levenshtein (recommended)",
                "cosine",
                "jaro_winkler",
                "trigram",
                "levenshtein_partial",
            ],
            key="similarity",
        )
        st.radio(
            label="Select Distance Measure",
            options=["euclidean", "l1", "l2", "manhattan", "cosine", "precomputed"],
            key="affinity",
        )

    if cluster_duplicates_form.form_submit_button(label="Cluster Duplicates"):
        session.cluster_duplicates_submitted = True
def data_uploader_form():
    """Render the file-upload form and record its submission in session state.

    The uploaded file and the chosen delimiter label are stored by their
    widget keys (``upload`` and ``delim``). Pressing "Profile Data" sets
    ``session.data_uploader_submitted`` to ``True``.
    """
    delimiter_labels = ["pipe (|)", r"tab (\t)", "comma (,)", "semicolon (;)"]

    upload_form = st.form(key="file_upload")
    with upload_form:
        st.file_uploader("Upload File", type=['csv', 'xlsx'], key="upload")
        st.selectbox("Select File Seperator/Delimiter", delimiter_labels, key="delim")

    submitted = upload_form.form_submit_button(label='Profile Data')
    if submitted:
        session.data_uploader_submitted = True
if __name__ == "__main__":
    # Optional wide layout, left disabled:
    # st.set_page_config(layout="wide")
    st.write("Data Profiler :wave:")

    # Step 1: upload form. Step 2 (profiling + clustering form) is shown only
    # once step 1 has been submitted; step 3 only once step 2 has been.
    data_uploader_form()

    if session.data_uploader_submitted:
        profiler(session.upload, session.delim)

        if session.cluster_duplicates_submitted:
            # Collect the clustering inputs from session state, in the
            # positional order cluster_duplicates expects.
            arg_keys = ("df", "col_name", "dis_num", "dis_non_alphanum", "similarity", "affinity")
            method_args = tuple(session[key] for key in arg_keys)

            # Debug output to the server console.
            print(method_args)
            print(list(session.items()))

            cluster_duplicates(*method_args)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment