Skip to content

Instantly share code, notes, and snippets.

@davidgilbertson
Created January 13, 2025 00:32
Show Gist options
  • Save davidgilbertson/e81e206998b4dc753002294affe78098 to your computer and use it in GitHub Desktop.
Save davidgilbertson/e81e206998b4dc753002294affe78098 to your computer and use it in GitHub Desktop.
import pandas as pd
import numpy as np
import streamlit as st
# Page-level settings: browser tab title and a full-width layout.
st.set_page_config(page_title="SDC selector", layout="wide")
@st.cache_data
def get_data():
    """Load the messages and their embeddings, standardising the embeddings.

    Reads ``messages.csv`` and ``embeddings.npy`` from the working directory,
    sets the ``Target`` column of the messages to ``False`` for every row, and
    rescales each embedding column by subtracting its mean and dividing by its
    standard deviation.

    Returns:
        A ``(df, embeddings)`` tuple: the messages DataFrame and the
        standardised embedding array, one embedding row per message row.

    Raises:
        ValueError: If the two files do not hold the same number of rows.
    """
    df = pd.read_csv("messages.csv")
    embeddings = np.load("embeddings.npy")
    # Explicit check instead of `assert`, which is stripped under `python -O`.
    if len(df) != len(embeddings):
        raise ValueError(
            f"messages.csv has {len(df)} rows but embeddings.npy has "
            f"{len(embeddings)}; they must match one-to-one"
        )
    df["Target"] = False  # reset for testing
    centred = embeddings - embeddings.mean(0)
    spread = embeddings.std(0)
    # A constant column has zero spread; dividing by it would yield NaN and
    # turn every downstream dot-product score into NaN. Divide by 1 instead,
    # which leaves such a column at zero after centring.
    spread = np.where(spread == 0, 1.0, spread)
    embeddings = centred / spread
    return df, embeddings
# Populate the session on its first run only; later reruns reuse the same
# objects, so label changes made to `df` persist between reruns.
if "df" not in st.session_state:
    st.session_state.df, st.session_state.embeddings = get_data()

df = st.session_state.df
embeddings = st.session_state.embeddings
# Scores can only be computed once at least one row is flagged as Target.
any_selected = df.Target.any()
if any_selected:
    # Mean embedding of the rows currently flagged as Target.
    positive_rows = embeddings[df.Target.to_numpy()]
    weights = positive_rows.mean(axis=0)
    # Score each row by its dot product with that mean embedding.
    score = embeddings.dot(weights)
    df["Score"] = score
# Optional case-insensitive, literal (non-regex) substring filter on Message.
text = st.text_input("Filter")

# Only rows not yet flagged as Target are offered for labelling.
visible = ~df.Target
if text:
    # na=False: a missing Message counts as "no match". Without it the mask
    # contains NaN and boolean indexing raises.
    visible &= df.Message.str.contains(text, case=False, regex=False, na=False)

# Boolean indexing returns a new frame, so `df` itself is never edited here.
df_filtered = df[visible]
if any_selected:
    # Highest-scoring rows first.
    df_filtered = df_filtered.sort_values("Score", ascending=False)
# Presentation settings for each column shown in the editor.
editor_columns = {
    "Score": st.column_config.NumberColumn(
        label="Score",
        width="small",
        format="%.1f",
    ),
    "Target": st.column_config.CheckboxColumn(label="Target", width="small"),
    "Message": st.column_config.TextColumn(label="Message", width="large"),
}

# Score and Message are disabled, leaving the Target checkbox as the only
# editable column. `new_df` is the table as returned by the editor.
new_df = st.data_editor(
    df_filtered,
    column_config=editor_columns,
    column_order=["Score", "Target", "Message"],
    disabled=["Score", "Message"],
    hide_index=True,
    use_container_width=True,
)
def update_df():
    """Copy ticked rows from the editor into ``df`` and save it to CSV.

    Rows whose ``Target`` is set in ``new_df`` are marked ``True`` in ``df``
    (matched by index label), then ``df`` is written to
    ``messages_labelled.csv`` without its index.
    """
    ticked = new_df.loc[new_df.Target].index
    df.loc[ticked, "Target"] = True
    df.to_csv("messages_labelled.csv", index=False)
# Clicking the button runs `update_df` as its on_click callback.
st.button("Save and update", on_click=update_df)

# Collapsible list of every row currently flagged as Target.
if any_selected:
    positives = df[df.Target]
    with st.expander(f"Show {df.Target.sum()} positive samples"):
        st.dataframe(positives)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment