Skip to content

Instantly share code, notes, and snippets.

@ili3p
Created December 5, 2016 00:49

Revisions

  1. ili3p created this gist Dec 5, 2016.
    31 changes: 31 additions & 0 deletions process_word2vec.lua
    Original file line number Diff line number Diff line change
    @@ -0,0 +1,31 @@
    -- Load pretrained word vectors for the known vocabulary from a text file.
    --
    -- Inputs (all defined outside this block):
    --   opt.words    path handed to torch.load; the result is indexed below as
    --                words['word2id'][word] to get a row index
    --   opt.input    text file with one entry per line: "<word> <v1> <v2> ..."
    --   opt.vocabsz  number of rows of the output tensor
    --   opt.dim      number of columns of the output tensor
    --   opt.nvec     total passed to xlua.progress for the progress bar
    -- Results:
    --   word2vec     FloatTensor(opt.vocabsz, opt.dim); the row of every
    --                vocabulary word found in the file is overwritten. Rows of
    --                words that never occur in the file are not initialised
    --                by this block.
    --   unk          vector taken from the line whose word is 'UNK', or nil
    --                when the file has no such line
    --
    -- NOTE(review): the word is taken to be the first space-separated field,
    -- so an entry whose word itself contains a space would be mis-parsed.
    local words = torch.load(opt.words) -- it's a tds.Hash
    local word2vec = torch.FloatTensor(opt.vocabsz, opt.dim)
    local buffsz = 2^13 -- == 8k
    -- io.open (instead of io.input) leaves the process-wide default input
    -- alone and yields a handle owned by this block, which is closed below.
    local f = assert(io.open(opt.input, 'r'))
    local done = 0
    local unk

    -- The lookup table does not change while reading; fetch it once.
    local word2id = words['word2id']

    -- read huge word2vec file with 2,196,017 lines
    while true do
      -- Read a fixed-size chunk plus the rest of the line it ends in, so that
      -- every chunk holds only complete lines.
      local lines, leftover = f:read(buffsz, '*line')
      if not lines then break end -- no more lines
      if leftover then lines = lines .. leftover .. '\n' end -- join the leftover
      lines = lines:split('\n')
      for i = 1, #lines do
        if done % 1000 == 0 then
          xlua.progress(done, opt.nvec)
        end
        local line = lines[i]:split(' ')
        local word = line[1]
        if word == 'UNK' then
          table.remove(line, 1) -- remove the word, keep only the values
          unk = torch.FloatTensor(line)
        else
          local index = word2id[word]
          if index then
            -- Only in-vocabulary lines pay for the shift and the conversion.
            table.remove(line, 1) -- remove the word, keep only the values
            word2vec[index] = torch.FloatTensor(line)
          end
        end
        done = done + 1
      end
    end

    f:close()
    -- The in-loop update fires before the increment, so the final count was
    -- never displayed; show it once reading has finished.
    xlua.progress(done, opt.nvec)