import re

import numpy as np
from langdetect import detect
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel
from toolz.functoolz import pipe

# save_to_s3, ct (a custom text-processing module providing ct.lematize), and
# all_stop_words are project-level helpers defined outside this gist.

def find_non_sentences(sentences: list, nlp):
    non_sentences = []
    for sentence in sentences:
        spacy_text = nlp(sentence.strip())
        # Count verb-like tokens; VERB and AUX are the relevant POS tags
        # ("ROOT" and "pcomp" are dependency labels, so checking them against
        # token.pos_ can never match)
        verb_count = np.sum(
            [token.pos_ in ("VERB", "AUX") for token in spacy_text]
        )
        try:
            prob = float(verb_count) / len(spacy_text)
        except ZeroDivisionError:
            prob = 1.0
        # If 5% or less of a sentence is verbs, it's probably not a real sentence
        if prob <= 0.05:
            non_sentences.append(sentence)
    return non_sentences

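# A minimal usage sketch (assumes the en_core_web_sm spaCy model is installed
# via `python -m spacy download en_core_web_sm`); a verbless fragment like
# "Best wishes" should be flagged, while a full sentence should not:
#
#   import spacy
#
#   nlp = spacy.load("en_core_web_sm")
#   find_non_sentences(["Best wishes", "She replied to the ticket."], nlp)
#   # -> ["Best wishes"]
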
def clean_text(df):
    df["cleaned_body"] = [
        str(row)
        # Remove strange encoding
        .encode("ascii", errors="ignore")
        .decode()
        .lower()
        # Remove replied-to emails
        .split("> wrote:")[0]
        # Remove text populated from the contact form
        .split("what do you need help with")[0]
        # Remove signatures
        .split("thanks,")[0]
        .split("regards,")[0]
        for row in df["body"]
    ]
    # Remove HTML tags
    df["cleaned_body"] = [re.sub("<.*?>", "", row) for row in df["cleaned_body"]]
    return df

def drop_ineligible_tickets(df):
    # Billing-related tickets (trials, refunds, charges) are out of scope
    ineligible = [
        "trial",
        "extend",
        "refund",
        "charge",
    ]
    df = df[~df["cleaned_body"].str.contains("|".join(ineligible), na=False)]
    return df

def drop_rows_low_word_count(df):
    df["word_count"] = df["cleaned_body"].apply(
        lambda x: len(str(x).strip().split(" "))
    )
    # Tickets under three words rarely carry enough signal to match on
    tickets_no_low_word_count = df[df["word_count"] >= 3]
    return tickets_no_low_word_count

def detect_with_exception(row):
    """
    Wraps langdetect's detect so that .apply doesn't error out on
    undetectable rows (e.g., empty or emoji-only text).
    """
    try:
        language = detect(row)
    except Exception:
        language = "unknown"
    return language

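# Quick check of the fallback path: detect raises on empty input, so the
# wrapper returns "unknown" instead of crashing the .apply:
#
#   detect_with_exception("")  # -> "unknown"
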
def remove_non_english_tickets(df):
    df["language"] = df["cleaned_body"].apply(lambda x: detect_with_exception(str(x)))
    tickets_only_english = df[df["language"] == "en"]
    return tickets_only_english

def save_historical_data(df):
    # Run the full cleaning pipeline left to right, then persist the result
    cleaned_df = pipe(
        df,
        clean_text,
        drop_ineligible_tickets,
        drop_rows_low_word_count,
        remove_non_english_tickets,
    )
    save_to_s3(cleaned_df)

def vectorize_save_historical_data(df):
    # Learn TF-IDF weights over 1- to 3-grams; terms must appear in at least
    # 2 tickets (min_df) and at most 40% of tickets (max_df) to be kept.
    # ct.lematize and all_stop_words are project-level helpers (see the
    # module comment above).
    vectorizer = TfidfVectorizer(
        tokenizer=ct.lematize,
        ngram_range=(1, 3),
        stop_words=all_stop_words,
        min_df=2,
        max_df=0.4,
    )
    vectors = vectorizer.fit_transform(df["cleaned_body"])
    save_to_s3(vectorizer)
    save_to_s3(vectors)

def vectorize_get_similarities_one_incoming_ticket(
    incoming_ticket, vectors_past_tickets, saved_vectorizer
):
    vectors = saved_vectorizer.transform(incoming_ticket["cleaned_body"])
    # linear_kernel on L2-normalized TF-IDF vectors equals cosine similarity
    cos_sims = linear_kernel(vectors, vectors_past_tickets)
    return cos_sims

def get_most_similar_ticket(incoming_ticket, cos_sims, past_tickets, past_responses):
    # Negate so that argsort/sort run in descending-similarity order
    cos_sims_indices_descending = (-cos_sims).argsort()
    cos_sims_values_descending = -np.sort(-cos_sims, axis=1)
    # Take only the top match (column 0) for each incoming ticket
    incoming_ticket["most_similar"] = [
        past_tickets["cleaned_body"].iloc[x[0]]
        for x in cos_sims_indices_descending[:, :1]
    ]
    incoming_ticket["most_similar_conv_id"] = [
        past_tickets["conv_id"].iloc[x[0]]
        for x in cos_sims_indices_descending[:, :1]
    ]
    incoming_ticket["most_similar_score"] = [x[0] for x in cos_sims_values_descending]
    # Join all past responses from the matched conversation into one string
    incoming_ticket["most_similar_response"] = [
        " ".join(past_responses[past_responses["conv_id"] == value]["body"].values)
        for value in incoming_ticket["most_similar_conv_id"]
    ]
    return incoming_ticket

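# A minimal end-to-end sketch with made-up toy data. The real pipeline would
# load the fitted vectorizer and vectors back from S3 and use the custom
# tokenizer/stop words; a plain TfidfVectorizer stands in for both here.
if __name__ == "__main__":
    import pandas as pd

    past = pd.DataFrame(
        {
            "body": ["How do I reset my password?", "The app crashes on launch."],
            "conv_id": [1, 2],
        }
    )
    responses = pd.DataFrame(
        {"conv_id": [1, 2], "body": ["Use the reset link.", "Please reinstall."]}
    )
    past = clean_text(past)
    vec = TfidfVectorizer()  # simplified stand-in for the saved vectorizer
    past_vectors = vec.fit_transform(past["cleaned_body"])

    incoming = clean_text(pd.DataFrame({"body": ["I forgot my password, help!"]}))
    sims = vectorize_get_similarities_one_incoming_ticket(incoming, past_vectors, vec)
    suggested = get_most_similar_ticket(incoming, sims, past, responses)
    print(suggested["most_similar_response"])  # -> "Use the reset link."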