This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
'''
Merge/combine courses in the OpenedX OLX format.
'''
import sys
import os
import json
# NOTE(review): distutils was deprecated (PEP 632) and removed in Python 3.12.
# shutil.copytree(src, dst, dirs_exist_ok=True) is the stdlib replacement for
# copy_tree — confirm the call site below before swapping, since copy_tree
# returns the list of copied files while copytree returns the destination path.
from distutils.dir_util import copy_tree
# Example:
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Wrap an on-disk HDF5 dataset in a chunked dask array so it can be
# processed out of core (nothing is loaded until the array is computed).
import h5py
# Explicit read-only mode: older h5py versions defaulted to append mode,
# which could create/modify the file; this snippet only reads it.
f = h5py.File('myhdf5file.hdf5', 'r')
dset = f['/data/path']
import dask.array as da
# chunks=(5000, 5000) sets the dask block size; assumes the dataset is 2-D
# and chunking is tuned to the workload — TODO confirm against the data.
x = da.from_array(dset, chunks=(5000, 5000))
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Stream newline-delimited JSON records with dask.bag and count how often
# each 'id' occurs among records whose username is 'Aneesha'.
import dask.bag as db
import json
records = db.read_text('data/2018-*-*.json').map(json.loads)
# .compute() added: dask collections are lazy, so without it the pipeline
# builds a task graph but never executes (cf. the dataframe example, which
# does call .compute()).
records.filter(lambda d: d['username'] == 'Aneesha').pluck('id').frequencies().compute()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Load a glob of CSV logs as a dask dataframe and compute the mean of the
# 'value' column grouped by hour of day.
import dask.dataframe as dd
df = dd.read_csv('logs/2018-*.*.csv', parse_dates=['timestamp'])
# .compute() materializes the lazy result as a pandas Series.
df.groupby(df.timestamp.dt.hour).value.mean().compute()
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Load the original word vectors and the retrofitted word vectors as
# separate gensim models (both files are in word2vec text format).
original_glove_model = gensim.models.KeyedVectors.load_word2vec_format('glove.6B.50d.word2vec.txt', binary=False)
retrofitted_glove_model = gensim.models.KeyedVectors.load_word2vec_format('retrofittedglove.word2vec.txt', binary=False)
# Display the words closest to 'happy' using the original GLOVE vectors
# (50 = vector dimensionality, 10 = number of neighbours to plot).
display_closestwords_tsnescatterplot(original_glove_model, 'happy', 50, 10, "Original Glove Word Vectors - 'Happy'")
# Display the words closest to 'happy' using the GLOVE vectors retrofitted
# with the Paraphrase lexicons. (Typo fixed in the displayed plot title:
# "Retroffited" -> "Retrofitted".)
display_closestwords_tsnescatterplot(retrofitted_glove_model, 'happy', 50, 10, "Retrofitted Glove Word Vectors - 'Happy'")
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# git clone https://github.com/mfaruqui/retrofitting.git
# Run retrofit.py with arguments to set the word vectors file, the lexicon
# file, the number of iterations and the output word vectors. The word
# vectors must be in text format.
# Eg:
# python retrofit.py -i word_vec_file -l lexicon_file -n num_iter -o out_vec_file
# python retrofit.py -i /data/glove.6B.50d.txt -l /retrofitting/lexicons/ppdb-xl.txt -n 10 -o retrofittedglove.txt

# Convert txt-based GLOVE word vectors to word2vec format.
# NOTE(review): glove2word2vec is deprecated in gensim 4.x —
# KeyedVectors.load_word2vec_format(..., no_header=True) is the modern
# equivalent; confirm the installed gensim version before migrating.
from gensim.scripts.glove2word2vec import glove2word2vec
glove2word2vec(glove_input_file="/data/glove.6B.50d.txt", word2vec_output_file="glove.6B.50d.word2vec.txt")
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Method to plot the top no_similar_words in 2D using TSNE | |
def display_closestwords_tsnescatterplot(model, word, word_vector_dimension, no_similar_words, plot_title): | |
arr = np.empty((0,word_vector_dimension), dtype='f') | |
word_labels = [word] | |
# get close words | |
close_words = model.similar_by_word(word, topn=no_similar_words) | |
# add the vector for each of the closest words to the array |
NewerOlder