Created
December 5, 2016 00:49
Revisions
-
ili3p created this gist
Dec 5, 2016. There are no files selected for viewing.
-- Load pretrained word vectors from a space-separated text file into a dense
-- lookup tensor indexed by vocabulary id.
--
-- Reads (all defined outside this snippet):
--   opt.words    path given to torch.load; the loaded object is indexed as
--                words['word2id'][token] to obtain a row index
--   opt.vocabsz  number of rows of the lookup tensor
--   opt.dim      number of columns, i.e. values expected per vector
--   opt.input    path of the text file holding one "<token> <v1> ... <vN>"
--                entry per line
--   opt.nvec     total handed to xlua.progress
-- Produces:
--   word2vec     FloatTensor(opt.vocabsz, opt.dim); rows whose token never
--                appears in the file stay zero
--   unk          FloatTensor built from the 'UNK' line, or nil if none is seen
--   done         number of lines processed
-- NOTE(review): relies on string.split being installed by the Torch stack
-- (it is not part of stock Lua) -- confirm it is loaded before this runs.

local words = torch.load(opt.words) -- it's a tds.Hash

-- A freshly sized tensor is not initialized; zero it so that vocabulary
-- entries missing from the vector file do not end up with garbage values.
local word2vec = torch.FloatTensor(opt.vocabsz, opt.dim):zero()
local buffsz = 2^13 -- == 8k

-- Open a dedicated handle instead of redirecting the process default input,
-- and fail immediately with the OS error message if the file cannot be opened.
local f = assert(io.open(opt.input, 'r'))
local done = 0
local unk

-- Split one text line into its token and its numeric vector.
-- The last `dim` fields are taken as the vector and everything before them as
-- the token, so a token that itself contains spaces does not shift the values.
-- Returns nil when the line is blank, too short, or has a non-numeric value.
local function parse_vector_line(text, dim)
  local fields = text:split(' ')
  local ntoken = #fields - dim -- number of fields that make up the token
  if ntoken < 1 then
    return nil
  end
  local values = {}
  for k = 1, dim do
    local v = tonumber(fields[ntoken + k])
    if not v then
      return nil
    end
    values[k] = v
  end
  return table.concat(fields, ' ', 1, ntoken), values
end

-- read huge word2vec file with 2,196,017 lines
while true do
  -- Read a block of about buffsz bytes plus the remainder of the line the
  -- block ended in, so that every processed chunk holds whole lines only.
  local lines, leftover = f:read(buffsz, '*line')
  if not lines then break end -- no more lines
  if leftover then lines = lines .. leftover .. '\n' end -- join the leftover
  lines = lines:split('\n')

  for i = 1, #lines do
    if done % 1000 == 0 then
      xlua.progress(done, opt.nvec)
    end

    local word, values = parse_vector_line(lines[i], opt.dim)
    if word == 'UNK' then
      unk = torch.FloatTensor(values)
    elseif word then
      local index = words['word2id'][word]
      if index then
        word2vec[index] = torch.FloatTensor(values)
      end
    end
    -- Malformed lines are skipped but still counted as processed.
    done = done + 1
  end
end

f:close()