
Denis gaphex

  • Moscow
@gaphex
gaphex / reuters_embeddings.py
Created June 23, 2019 12:06
Generate embeddings for articles from the Reuters news corpus
import nltk
from nltk.corpus import reuters

nltk.download("reuters")
nltk.download("punkt")

max_samples = 256
categories = ['wheat', 'tea', 'strategic-metal',
              'housing', 'money-supply', 'fuel']

S, X, Y = [], [], []  # containers for the collected samples and their labels
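The gist goes on to fill these containers; a hedged sketch of one way to collect per-category samples with the standard NLTK Reuters API (the loop itself is not part of this fragment, and S is left untouched here):

# Illustrative only: gather up to max_samples raw articles per category
# into X, with the matching category labels in Y.
for category in categories:
    for fileid in reuters.fileids(category)[:max_samples]:
        X.append(reuters.raw(fileid))   # raw article text
        Y.append(category)              # its category label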
Q = tf.placeholder("float", [dim])            # query vector
S = tf.placeholder("float", [None, dim])      # matrix of document vectors, one per row
S_norm = tf.placeholder("float", [None, 1])   # precomputed ||s_i||^2 for each row of S
Qr = tf.reshape(Q, (1, -1))

PP = S_norm                                   # ||s_i||^2
QQ = tf.matmul(Qr, tf.transpose(Qr))          # ||q||^2
PQ = tf.matmul(S, tf.transpose(Qr))           # s_i . q
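These three terms implement the expansion ||s - q||^2 = ||s||^2 - 2*(s.q) + ||q||^2, so the full vector of squared distances is PP - 2*PQ + QQ. A minimal NumPy check of the same algebra (variable names here are illustrative, not part of the gist):

import numpy as np
S_np = np.random.rand(5, 8).astype("float32")    # 5 documents, dim = 8
q_np = np.random.rand(8).astype("float32")       # one query
PP = np.sum(S_np ** 2, axis=1, keepdims=True)    # ||s_i||^2
QQ = np.dot(q_np, q_np)                          # ||q||^2
PQ = S_np @ q_np.reshape(-1, 1)                  # s_i . q
sq_dist = PP - 2 * PQ + QQ                       # ||s_i - q||^2
assert np.allclose(sq_dist.ravel(), np.sum((S_np - q_np) ** 2, axis=1))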
class L2Retriever:
    """Brute-force top-k retrieval by L2 distance over dense vectors (TF 1.x)."""

    def __init__(self, dim, top_k=3, use_norm=False, use_gpu=True):
        self.dim = dim
        self.top_k = top_k
        self.use_norm = use_norm

        config = tf.ConfigProto(
            device_count={'GPU': (1 if use_gpu else 0)}
        )
        config.gpu_options.allow_growth = True
        self.session = tf.Session(config=config)
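The __init__ above only configures the session; a hedged TF1 sketch of how the placeholders from the earlier fragment could be turned into a top-k lookup (illustrative only, not the gist's own predict/build_graph code):

import tensorflow as tf  # TF 1.x API, as in the gist
dim, top_k = 768, 3      # assumed values for the sketch
Q = tf.placeholder("float", [dim])
S = tf.placeholder("float", [None, dim])
S_norm = tf.placeholder("float", [None, 1])
Qr = tf.reshape(Q, (1, -1))
distance = S_norm - 2 * tf.matmul(S, tf.transpose(Qr)) + tf.matmul(Qr, tf.transpose(Qr))
# the k smallest distances are the k largest negated distances
_, top_indices = tf.nn.top_k(tf.negative(tf.reshape(distance, [-1])), k=top_k)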
import pandas as pd
import json

# Download and unpack the CMU Movie Summary Corpus
!wget http://www.cs.cmu.edu/~ark/personas/data/MovieSummaries.tar.gz
!tar -xvzf MovieSummaries.tar.gz

plots_df = pd.read_csv('MovieSummaries/plot_summaries.txt', sep='\t', header=None)
meta_df = pd.read_csv('MovieSummaries/movie.metadata.tsv', sep='\t', header=None)

plot = {}
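With the two frames loaded, the empty plot dict can be filled by joining summaries to titles; a hedged sketch, assuming column 0 of both files is the Wikipedia movie ID and column 2 of movie.metadata.tsv is the title (per the corpus README), with movie_names as an assumed helper list used further down:

id_to_title = dict(zip(meta_df[0], meta_df[2]))
for movie_id, summary in zip(plots_df[0], plots_df[1]):
    title = id_to_title.get(movie_id)
    if title is not None:
        plot[title] = summary            # title -> plot summary text
movie_names = list(plot.keys())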
X_vect = bert_vectorizer(X, verbose=True)  # embed each text with the BERT vectorizer
import numpy as np

def buildMovieRecommender(movie_names, vectorized_plots, top_k=10):
    retriever = L2Retriever(vectorized_plots.shape[1], use_norm=True, top_k=top_k, use_gpu=False)
    vectorized_norm = np.sum(vectorized_plots**2, axis=1).reshape((-1, 1))

    def recommend(query):
        try:
            idx = retriever.predict(vectorized_plots,
                                    vectorized_plots[movie_names.index(query)],
                                    vectorized_norm)[0][1:]
            for i in idx:
                print(movie_names[i])   # assumed completion: print each recommended title
        except ValueError:              # assumed completion of the truncated snippet
            print("{} not found".format(query))

    return recommend
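Hypothetical usage, assuming X_vect holds the BERT-embedded plots in the same order as movie_names (the query title is just an example):

recommend = buildMovieRecommender(movie_names, X_vect)
recommend("The Matrix")   # prints the titles of the top-10 closest plots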
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

# Train/test split plus a linear classifier as a quick evaluation of the features
Xtr, Xts, Ytr, Yts = train_test_split(X, Y, random_state=34)

mlp = LogisticRegression()
mlp.fit(Xtr, Ytr)
print(classification_report(Yts, mlp.predict(Xts)))
@gaphex
gaphex / build_bert_module.py
Last active January 11, 2021 13:43
Spec function for BERT token embedding module
def build_module_fn(config_path, vocab_path, do_lower_case=True):

    def bert_module_fn(is_training):
        """Spec function for a token embedding module."""

        input_ids = tf.placeholder(shape=[None, None], dtype=tf.int32, name="input_ids")
        input_mask = tf.placeholder(shape=[None, None], dtype=tf.int32, name="input_mask")
        token_type = tf.placeholder(shape=[None, None], dtype=tf.int32, name="segment_ids")

        config = BertConfig.from_json_file(config_path)