Created
December 5, 2016 00:49
-
-
Save ili3p/23d682b03a602cd2359a01656d111540 to your computer and use it in GitHub Desktop.
Reading 5.3GB text file with LuaJIT
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
-- Fill `word2vec` with the vectors found in the text file at opt.input.
--
-- Each input line is split on single spaces: the first field is the word and
-- the remaining fields are passed to torch.FloatTensor as the vector.
--   * a line whose word is 'UNK' is stored in `unk`;
--   * a line whose word has an entry in words['word2id'] is written to the
--     row of `word2vec` given by that entry;
--   * every other line is skipped.
-- Reads: opt.words, opt.vocabsz, opt.dim, opt.input, opt.nvec.
local words = torch.load(opt.words) -- it's a tds.Hash
local word2id = words['word2id'] -- loop-invariant, so look it up only once
local word2vec = torch.FloatTensor(opt.vocabsz, opt.dim)
local buffsz = 2^13 -- == 8k
-- io.open instead of io.input: the process-wide default input stays untouched
-- and the handle can be closed once the file has been consumed.
local f = assert(io.open(opt.input, 'r'))
local done = 0 -- number of input lines processed so far
local unk -- vector of the 'UNK' line, if one is present; not read in this block

-- read huge word2vec file with 2,196,017 lines
while true do
  -- Read a block of up to buffsz bytes plus the rest of the line the block
  -- stopped in, so the text handled per iteration ends on a line boundary.
  local lines, leftover = f:read(buffsz, '*line')
  if not lines then break end -- no more lines
  if leftover then lines = lines .. leftover .. '\n' end -- join the leftover

  lines = lines:split('\n')
  for i = 1, #lines do
    if done % 1000 == 0 then
      xlua.progress(done, opt.nvec)
    end

    local line = lines[i]:split(' ')
    local word = line[1]
    if word == 'UNK' then
      table.remove(line, 1) -- remove the word
      unk = torch.FloatTensor(line)
    elseif word ~= nil then -- blank line: no word, so nothing to look up
      local index = word2id[word]
      if index then
        table.remove(line, 1) -- remove the word
        word2vec[index] = torch.FloatTensor(line)
      end
    end
    done = done + 1
  end
end

f:close()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment