ili3p · December 5, 2016 00:49
diff --git a/process_word2vec.lua b/process_word2vec.lua
 -- Build the embedding matrix for the vocabulary from a text vector file.
 -- Inputs read from `opt`: words (serialized vocabulary), input (vector file),
 -- vocabsz x dim (matrix size), nvec (line count used only for the progress bar).
 -- Each line of the vector file is split on single spaces: the first field is
 -- the word, the remaining fields are the vector components.
 local words = torch.load(opt.words) -- it's a tds.Hash
 local word2id = words['word2id'] -- loop-invariant lookup, done once instead of per line
 -- Zero-fill: the Tensor constructor does not initialize its storage, so rows of
 -- vocabulary words that never occur in the vector file would otherwise keep
 -- arbitrary memory contents.
 local word2vec = torch.FloatTensor(opt.vocabsz, opt.dim):zero()
 local buffsz = 2^13 -- == 8k
 local f = io.input(opt.input)
 local done = 0 -- number of lines processed so far
 local unk -- vector of the 'UNK' entry, if the file contains one

 -- read huge word2vec file with 2,196,017 lines
 while true do
    -- Read a fixed-size chunk plus the remainder of the line the chunk stopped
    -- in, so that no line is ever split across two iterations.
    local lines, leftover = f:read(buffsz, '*line')
    if not lines then break end  -- no more lines
    if leftover then lines = lines .. leftover .. '\n' end -- join the leftover
    lines = lines:split('\n')
    for i=1, #lines do
        if done % 1000 == 0 then
            xlua.progress(done, opt.nvec)
        end
        local line = lines[i]:split(' ')
        if line[1] == 'UNK' then
            table.remove(line, 1) -- remove the word
            unk = torch.FloatTensor(line)
        else
            local index = word2id[line[1]]
            -- Only words present in the vocabulary are kept; the word is
            -- stripped lazily so skipped lines cost no table shifting.
            if index then
                table.remove(line, 1) -- remove the word
                word2vec[index] = torch.FloatTensor(line)
            end
        end
        done = done + 1
    end
 end
	-- Build the embedding matrix for the vocabulary from a text vector file.
	-- Inputs read from `opt`: words (serialized vocabulary), input (vector file),
	-- vocabsz x dim (matrix size), nvec (line count used only for the progress bar).
	-- Each line of the vector file is split on single spaces: the first field is
	-- the word, the remaining fields are the vector components.
	local words = torch.load(opt.words) -- it's a tds.Hash
	local word2id = words['word2id'] -- loop-invariant lookup, done once instead of per line
	-- Zero-fill: the Tensor constructor does not initialize its storage, so rows of
	-- vocabulary words that never occur in the vector file would otherwise keep
	-- arbitrary memory contents.
	local word2vec = torch.FloatTensor(opt.vocabsz, opt.dim):zero()
	local buffsz = 2^13 -- == 8k
	local f = io.input(opt.input)
	local done = 0 -- number of lines processed so far
	local unk -- vector of the 'UNK' entry, if the file contains one

	-- read huge word2vec file with 2,196,017 lines
	while true do
		-- Read a fixed-size chunk plus the remainder of the line the chunk stopped
		-- in, so that no line is ever split across two iterations.
		local lines, leftover = f:read(buffsz, '*line')
		if not lines then break end -- no more lines
		if leftover then lines = lines .. leftover .. '\n' end -- join the leftover
		lines = lines:split('\n')
		for i=1, #lines do
			if done % 1000 == 0 then
				xlua.progress(done, opt.nvec)
			end
			local line = lines[i]:split(' ')
			if line[1] == 'UNK' then
				table.remove(line, 1) -- remove the word
				unk = torch.FloatTensor(line)
			else
				local index = word2id[line[1]]
				-- Only words present in the vocabulary are kept; the word is
				-- stripped lazily so skipped lines cost no table shifting.
				if index then
					table.remove(line, 1) -- remove the word
					word2vec[index] = torch.FloatTensor(line)
				end
			end
			done = done + 1
		end
	end