Last active
September 23, 2018 06:38
-
-
Save tslumley/c36db10b4b316277482619451675f580 to your computer and use it in GitHub Desktop.
Read GloVe word embeddings
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Based on https://gist.github.com/tjvananne/8b0e7df7dcad414e8e6d5bf3947439a9
# Rewritten to work chunk by chunk, so I can read the 42B file with only 8GB memory
# Input: a GloVe .txt file. Output: a data.frame with one column per word (named by the word) holding that word's values
#' Read a GloVe-style text embedding file chunk by chunk
#'
#' Each non-blank line of the file is split on single spaces: the first
#' field is taken as the word, the remaining fields are converted with
#' `as.numeric()` and stored as that word's values.
#'
#' @param filename Path to the text file; opened as a text-mode connection.
#' @param chunksize Maximum number of lines read from the connection per
#'   iteration.
#' @param guess_size Initial number of slots preallocated for words. Storage
#'   is doubled whenever it turns out to be too small.
#' @return A data.frame with one column per word read; column names are the
#'   words and each column holds that word's numeric values.
proc_pretrained_vec <- function(filename, chunksize = 1000,
                                guess_size = 100000) {
  # At least one slot, otherwise doubling could never grow the storage.
  size <- max(1, guess_size)
  here <- 0

  # Preallocate space for the values and the name of each word in the vocab.
  vals <- vector(mode = "list", length = size)
  words <- character(size)

  filecon <- file(filename, open = "rt")
  # Release the connection on every exit path, including errors.
  on.exit(close(filecon), add = TRUE)

  repeat {
    chunk <- readLines(filecon, n = chunksize, warn = FALSE)
    # readLines() returns zero lines only once the connection is exhausted,
    # so blank lines inside the file cannot end the loop early.
    if (length(chunk) == 0) {
      break
    }
    chunk <- chunk[nzchar(chunk)]
    n_read <- length(chunk)
    if (n_read == 0) {
      next
    }

    # Too small: double the storage until this chunk fits.
    while (here + n_read > size) {
      vals <- c(vals, vector(mode = "list", length = size))
      words <- c(words, character(size))
      size <- 2 * size
    }

    # Split the whole chunk at once; field 1 is the word, the rest its values.
    fields <- strsplit(chunk, " ", fixed = TRUE)
    idx <- here + seq_len(n_read)
    vals[idx] <- lapply(fields, function(f) as.numeric(f[-1]))
    words[idx] <- vapply(fields, function(f) f[1], character(1))

    here <- here + n_read
    print(here)
  }

  # Drop the preallocated slots that were never filled.
  vals <- vals[seq_len(here)]
  words <- words[seq_len(here)]

  # Convert the list to a data.frame and attach the words as column names.
  glove <- data.frame(vals)
  names(glove) <- words
  glove
}
# using the function ----------------------------------------------------------
# Here we read in the unzipped, raw, GloVe pre-trained word vector file (.txt).
# All you have to change is the file path to where your GloVe file has been
# unzipped. The argument is spelled out as `filename` rather than relying on
# partial matching of `file`. The variable name is kept as-is even though the
# path below points at the 42B, 300d file.
g6b_300 <- proc_pretrained_vec(filename = "WORDS/glove.42B.300d.txt")
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment