Skip to content

Instantly share code, notes, and snippets.

@ltbringer
Last active July 28, 2022 17:17
Show Gist options
  • Save ltbringer/b1dfdccc2e0449cb2276f4f742a1d413 to your computer and use it in GitHub Desktop.
Save ltbringer/b1dfdccc2e0449cb2276f4f742a1d413 to your computer and use it in GitHub Desktop.
tsne_class_errors
import json
import string
import random
import colorsys
import operator as op
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from matplotlib import offsetbox
# Set of single characters: the ASCII lowercase letters plus every ASCII
# whitespace character. Not referenced by the code visible in this file.
lower_case = set(string.ascii_lowercase) | set(string.whitespace)
def hsv2rgb(h, s, v):
    """Convert an HSV colour to a ``#rrggbb`` hex string.

    Args:
        h: Hue, expected in the range 0..1.
        s: Saturation, expected in the range 0..1.
        v: Value (brightness), expected in the range 0..1.

    Returns:
        A lowercase hex colour string such as ``"#ff0000"``.
    """
    channels = []
    for component in colorsys.hsv_to_rgb(h, s, v):
        # Clamp to one byte so out-of-range inputs cannot produce a string
        # with extra digits or a minus sign, which is not a valid colour.
        channels.append(min(255, max(0, int(round(component * 255)))))
    red, green, blue = channels
    return f"#{red:02x}{green:02x}{blue:02x}"
def label_color_generator(labels):
    """Assign a random hex colour to every label.

    Hue is drawn from [0, 0.99), saturation from [0.4, 0.99) and value from
    [0.60, 0.99) using NumPy's global random state, so results differ between
    runs unless that state is seeded by the caller.

    Args:
        labels: Iterable of hashable labels. A label that appears more than
            once keeps the colour drawn for its last occurrence.

    Returns:
        dict mapping each label to the string returned by ``hsv2rgb``.
    """
    colmap = {}
    for label in labels:
        # Draw order (h, s, v) is kept so a seeded run yields the same colours.
        h = np.random.uniform(low=0, high=0.99)
        s = np.random.uniform(low=0.4, high=0.99)
        v = np.random.uniform(low=0.60, high=0.99)
        colmap[label] = hsv2rgb(h, s, v)
    return colmap
def first_transcript(text):
    """Return the transcript of the first alternative of the first utterance.

    Args:
        text: JSON document encoding a list of utterances, each of which is a
            list of objects carrying a ``"transcript"`` key.

    Returns:
        The first transcript, or ``""`` when ``text`` is missing, blank, not a
        string (e.g. a NaN cell), or contains no alternatives.

    Raises:
        json.JSONDecodeError: If ``text`` is non-blank but not valid JSON.
        KeyError: If the first alternative has no ``"transcript"`` key.
    """
    # Missing cells can arrive as None or a float NaN rather than an empty
    # string; none of those can be parsed, so treat them all as "no transcript".
    if not isinstance(text, (str, bytes, bytearray)) or not text.strip():
        return ""
    utterances = json.loads(text)
    # Guard both levels: the outer list and the first utterance may be empty.
    if not utterances or not utterances[0]:
        return ""
    return utterances[0][0]["transcript"]
def label2color(label):
    """Look up the colour assigned to ``label`` in the module-level ``colmap``.

    Raises:
        KeyError: If ``label`` has no entry in ``colmap``.
    """
    color = colmap[label]
    return color
def fix_tags(tag):
    """Strip one wrapping pair of underscores from a tag.

    Args:
        tag: Tag string, e.g. ``"_greeting_"``.

    Returns:
        The tag without its leading and trailing underscore when it has both;
        otherwise the tag unchanged.
    """
    # Require at least two characters: a lone "_" both starts and ends with an
    # underscore but is not a wrapped tag, and would otherwise become "".
    is_wrapped = len(tag) >= 2 and tag.startswith("_") and tag.endswith("_")
    return tag[1:-1] if is_wrapped else tag
def _join_transcripts(text):
    """Join every transcript of every utterance in ``text`` with spaces.

    Returns ``""`` for missing, blank or non-string input, matching how
    ``first_transcript`` treats empty cells.
    """
    if not isinstance(text, (str, bytes, bytearray)) or not text.strip():
        return ""
    utterances = json.loads(text) or []
    return " ".join(
        alternative["transcript"]
        for utterance in utterances
        for alternative in utterance
    )


def process(df):
    """Add derived columns to ``df`` in place.

    Reads the ``tag`` and ``alternatives`` columns and writes:

    * ``color`` - result of ``label2color`` on the original tag,
    * ``first_transcript`` - result of ``first_transcript`` on the raw JSON,
    * ``alternatives`` - replaced by all transcripts joined with spaces,
    * ``lang`` - result of ``lang_detect`` on the joined transcripts,
    * ``tag`` - tags with wrapping underscores removed by ``fix_tags``.

    Args:
        df: DataFrame with ``tag`` and ``alternatives`` columns; the latter
            holds JSON text encoding a list of lists of objects with a
            ``"transcript"`` key.

    Returns:
        None. ``df`` is modified in place.
    """
    # Colours are looked up before the tags are rewritten below.
    # NOTE(review): presumably colmap is keyed by the raw tags - confirm.
    df["color"] = df.tag.apply(label2color)
    # Must run before "alternatives" is overwritten with plain text.
    df["first_transcript"] = df.alternatives.apply(first_transcript)
    df["alternatives"] = df.alternatives.apply(_join_transcripts)
    # NOTE(review): lang_detect is not defined in the visible part of this
    # file; it has to be provided elsewhere before process() is called.
    df["lang"] = df.alternatives.apply(lang_detect)
    df["tag"] = df.tag.apply(fix_tags)
def df2tsne(
    df,
    feat_col,
    label_col,
    *,
    n_components=50,
    perplexity=40,
    random_state=None,
    verbose=2,
):
    """Embed a text column in 2-D and store the coordinates on ``df``.

    The text is vectorised with TF-IDF, reduced with ``TruncatedSVD`` and then
    projected to two dimensions with t-SNE. The result is written to the
    ``x`` and ``y`` columns of ``df`` in place.

    Args:
        df: DataFrame holding the text to embed.
        feat_col: Name of the column containing the text.
        label_col: Unused; kept so existing callers keep working.
        n_components: Number of SVD components fed into t-SNE.
        perplexity: t-SNE perplexity.
        random_state: Seed for t-SNE. The default ``None`` keeps the original
            non-reproducible behaviour; the SVD step is always seeded with 0.
        verbose: Verbosity level passed to t-SNE.

    Returns:
        None. ``df`` is modified in place.
    """
    tfidf = TfidfVectorizer().fit_transform(df[feat_col])
    svd = TruncatedSVD(n_components=n_components, random_state=0)
    reduced = svd.fit_transform(tfidf)
    tsne = TSNE(
        n_components=2,
        perplexity=perplexity,
        verbose=verbose,
        random_state=random_state,
    )
    embeddings = tsne.fit_transform(reduced)
    df["x"] = embeddings[:, 0]
    df["y"] = embeddings[:, 1]
def plot_tsne(df, label_col, text_col, color_col):
    """Scatter-plot the ``x``/``y`` columns of ``df`` by label and annotate samples.

    One scatter series is drawn per distinct label. For every label, one row
    per 100 occurrences (at least one) is picked at random and annotated with
    its text, placed 2 units away from the point in a random diagonal
    direction.

    Args:
        df: DataFrame with ``x`` and ``y`` columns plus the columns named below.
        label_col: Column holding the class label of each row.
        text_col: Column holding the text used for annotations.
        color_col: Column holding the colour of each row.

    Returns:
        None. The plot is drawn on a new 10x10 matplotlib figure.
    """
    sampled = {}
    for label, freq in df[label_col].value_counts().items():
        if freq == 0:
            # Categorical columns can report unused categories; nothing to sample.
            continue
        candidates = df[df[label_col] == label].index
        # replace=False so the same row is never annotated twice; the sample
        # size never exceeds freq, so this cannot run out of candidates.
        sampled[label] = np.random.choice(
            candidates, max(freq // 100, 1), replace=False
        )

    # Creates the current figure; the axes below attach to it.
    plt.figure(figsize=(10, 10))
    ax = plt.axes(frameon=False)
    plt.setp(ax, xticks=(), yticks=())
    plt.subplots_adjust(
        left=0.0, bottom=0.0, right=1.0, top=0.9, wspace=0.0, hspace=0.0
    )

    for label in df[label_col].unique():
        label_slice = df[df[label_col] == label]
        plt.scatter(
            label_slice["x"],
            label_slice["y"],
            c=label_slice[color_col],
            marker="x",
            label=label,
        )
    ax.legend()

    # Randomly add or subtract the offset so labels spread in all directions.
    operators = [op.add, op.sub]
    for indexes in sampled.values():
        for idx in indexes:
            text = df[text_col].loc[idx]
            x = df["x"].loc[idx]
            y = df["y"].loc[idx]
            x_op = random.choice(operators)
            y_op = random.choice(operators)
            ax.annotate(
                text,
                xy=(x, y),
                xytext=(x_op(x, 2), y_op(y, 2)),
                arrowprops={"arrowstyle": "->", "color": "gray"},
            )
# Module-level label -> colour table read by label2color.
# NOTE(review): relies on a DataFrame named `df` already existing at this
# point; it is not created in the visible part of this file - confirm.
colmap = label_color_generator(df["tag"].unique())
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment