This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import torch
import torch.nn as nn
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence

# Toy input: three strings of different lengths, used as character sequences.
seqs = ['gigantic_string', 'tiny_str', 'medium_str']

# make <pad> idx 0
# The rest of the vocabulary is every distinct character found in seqs,
# in sorted order.
vocab = ['<pad>'] + sorted(set(''.join(seqs)))

# make model
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from sklearn.metrics import pairwise_distances_argmin_min

# Build the summary by picking one representative sentence per k-means
# cluster. Reads np, n_clusters, kmeans, encoded and sentences, which are
# produced by the other steps of the pipeline.

# n_clusters may have been computed with np.ceil (a float); range() needs an int.
cluster_count = int(n_clusters)

# For every cluster, the mean index of the rows of `encoded` assigned to it.
avg = [
    np.mean(np.where(kmeans.labels_ == j)[0])
    for j in range(cluster_count)
]

# closest[j] is the index of the row of `encoded` nearest to the centre of
# cluster j.
closest, _ = pairwise_distances_argmin_min(kmeans.cluster_centers_, encoded)

# Emit the clusters in order of the mean position of their members.
ordering = sorted(range(cluster_count), key=lambda k: avg[k])

# Index the list of sentences, not the raw email string: indexing the string
# would return single characters instead of sentences.
# Assumes `encoded` holds one row per entry of `sentences`, in the same
# order -- TODO confirm against the encoder.
summary = ' '.join(sentences[closest[idx]] for idx in ordering)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import numpy as np
from sklearn.cluster import KMeans

# Number of clusters = ceil(sqrt(len(encoded))).
# np.ceil returns a float, while KMeans(n_clusters=...) and any later
# range(n_clusters) call require an int, hence the explicit cast.
n_clusters = int(np.ceil(len(encoded) ** 0.5))

# Fit k-means on the encoded vectors; fit() returns the estimator itself.
kmeans = KMeans(n_clusters=n_clusters)
kmeans = kmeans.fit(encoded)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# The 'skipthoughts' module can be found at the root of the GitHub repository linked above
import skipthoughts

# You would need to download pre-trained models first
model = skipthoughts.load_model()
encoder = skipthoughts.Encoder(model)

# Encode the tokenized sentences; the result is what the clustering step
# consumes as `encoded`.
encoded = encoder.encode(sentences)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from nltk.tokenize import sent_tokenize

# `lang` is a language code such as 'en', whereas sent_tokenize() expects the
# name of a Punkt model such as 'english'. Translate the codes that have a
# Punkt model; any other value (including an already-spelled-out name) is
# passed through unchanged.
_PUNKT_LANGUAGES = {
    'cs': 'czech',
    'da': 'danish',
    'de': 'german',
    'el': 'greek',
    'en': 'english',
    'es': 'spanish',
    'et': 'estonian',
    'fi': 'finnish',
    'fr': 'french',
    'it': 'italian',
    'nl': 'dutch',
    'no': 'norwegian',
    'pl': 'polish',
    'pt': 'portuguese',
    'ru': 'russian',
    'sl': 'slovene',
    'sv': 'swedish',
    'tr': 'turkish',
}

# NOTE(review): this tokenizes `email`, not `cleaned_email` -- confirm that
# the raw text is the intended input.
sentences = sent_tokenize(email, language=_PUNKT_LANGUAGES.get(lang, lang))
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from langdetect import detect

# Detect the language of the cleaned email text.
lang = detect(cleaned_email)  # lang = 'en' for an English email
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from talon.signature.bruteforce import extract_signature

# extract_signature() returns a pair; only the first element (the text kept
# as `cleaned_email`) is used here, the second is discarded.
cleaned_email, _ = extract_signature(email)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# clean() is a modified version of extract_signature() found in bruteforce.py in the GitHub repository linked above
cleaned_email, _ = clean(email)

# Drop empty lines, then collapse the remaining lines into a single
# space-separated string.
lines = [line for line in cleaned_email.split('\n') if line]
cleaned_email = ' '.join(lines)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python
# -*- coding: utf-8 -*-
""" | |
Module for E-mail Summarization | |
***************************************************************************** | |
Input Parameters: | |
emails: A list of strings containing the emails | |
Returns: | |
summary: A list of strings containing the summaries. | |
***************************************************************************** |