Skip to content

Instantly share code, notes, and snippets.

'''
Merge/combine courses in the OpenedX OLX format.
'''
import sys
import os
from distutils.dir_util import copy_tree
import json
# Example:
@aneesha
aneesha / SiameseBERT_SemanticSearch.ipynb
Last active August 9, 2023 00:48
Semantic Search with Sentence-BERT
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
# --- Dask array example: wrap a dataset stored in an HDF5 file as a chunked dask array ---
import h5py
# NOTE(review): no file mode is passed and the handle `f` is never closed in this snippet.
f = h5py.File('myhdf5file.hdf5')
dset = f['/data/path']
import dask.array as da
# Chunk shape is (5000, 5000) -- presumably the dataset is 2-D; confirm against the file.
x = da.from_array(dset, chunks=(5000, 5000))
# --- Dask bag example: JSON records read from text files matching a glob ---
import dask.bag as db
import json
# Each element read from the matching files is passed through json.loads.
records = db.read_text('data/2018-*-*.json').map(json.loads)
# Keep records whose 'username' is 'Aneesha', pluck their 'id' and take frequencies.
# NOTE(review): the resulting expression is neither assigned nor computed here.
records.filter(lambda d: d['username'] == 'Aneesha').pluck('id').frequencies()
# --- Dask dataframe example: CSV logs with a parsed 'timestamp' column ---
import dask.dataframe as dd
df = dd.read_csv('logs/2018-*.*.csv', parse_dates=['timestamp'])
# Mean of the `value` column grouped by the hour of `timestamp`; the computed
# result is not stored in a variable.
df.groupby(df.timestamp.dt.hour).value.mean().compute()
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
# Load the original word vectors and the retrofitted word vectors as two separate
# gensim KeyedVectors models. Both files are read as text (binary=False) in
# word2vec format.
# NOTE(review): `gensim` is not imported anywhere in the visible snippet -- confirm
# an `import gensim` exists before these lines run.
original_glove_model = gensim.models.KeyedVectors.load_word2vec_format('glove.6B.50d.word2vec.txt', binary=False)
retrofitted_glove_model = gensim.models.KeyedVectors.load_word2vec_format('retrofittedglove.word2vec.txt', binary=False)
# Plot the words closest to 'happy' using the original GloVe vectors.
# Positional arguments follow the helper's signature:
# (model, word, word_vector_dimension=50, no_similar_words=10, plot_title).
display_closestwords_tsnescatterplot(original_glove_model, 'happy', 50, 10, "Original Glove Word Vectors - 'Happy'")
# Plot the words closest to 'happy' using the GloVe vectors retrofitted with the
# paraphrase lexicon.
# NOTE(review): the title string below spells "Retroffited" (sic); it is left
# unchanged here because it is runtime text.
display_closestwords_tsnescatterplot(retrofitted_glove_model, 'happy', 50, 10, "Retroffited Glove Word Vectors - 'Happy'")
# Step 1: fetch the retrofitting tool:
#   git clone https://github.com/mfaruqui/retrofitting.git
# Step 2: run retrofit.py, passing the word-vector file (-i), the lexicon file (-l),
# the number of iterations (-n) and the output word-vector file (-o).
# The word vectors must be in text format.
# Usage:
#   python retrofit.py -i word_vec_file -l lexicon_file -n num_iter -o out_vec_file
# Example:
#   python retrofit.py -i /data/glove.6B.50d.txt -l /retrofitting/lexicons/ppdb-xl.txt -n 10 -o retrofittedglove.txt
# Step 3: convert the text-based GloVe word vectors to word2vec format.
from gensim.scripts.glove2word2vec import glove2word2vec
# Reads /data/glove.6B.50d.txt and writes glove.6B.50d.word2vec.txt to the current
# working directory.
# NOTE(review): only the original GloVe file is converted here; the retrofitted
# output presumably needs the same conversion to produce
# 'retrofittedglove.word2vec.txt' -- confirm.
glove2word2vec(glove_input_file="/data/glove.6B.50d.txt", word2vec_output_file="glove.6B.50d.word2vec.txt")
# Method to plot the top no_similar_words in 2D using TSNE
def display_closestwords_tsnescatterplot(model, word, word_vector_dimension, no_similar_words, plot_title):
arr = np.empty((0,word_vector_dimension), dtype='f')
word_labels = [word]
# get close words
close_words = model.similar_by_word(word, topn=no_similar_words)
# add the vector for each of the closest words to the array