Test code of `_ElmoSoftmax`.
""" | |
To use it inside ELMo script | |
To get the embeddings: | |
allennlp elmo sample_sents.txt out1.hdf5 --top | |
python -c "import h5py; f = h5py.File('out1.hdf5'); print(f['0'][:], f['0'].shape)" | |
To get probabilities: | |
allennlp elmo sample_sents.txt out2.hdf5 --top \ | |
--softmax-weight-file https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x4096_512_2048cnn_2xhighway/elmo_2x4096_512_2048cnn_2xhighway_softmax_weights.hdf5 \ | |
--softmax-vocab-file https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x4096_512_2048cnn_2xhighway/vocab-2016-09-10.txt | |
python -c "import h5py; f = h5py.File('out2.hdf5'); print(f['0'][:], f['0'].shape)" | |

To save a new example softmax weight file:

    import h5py
    import numpy as np

    e = h5py.File('./allennlp/tests/fixtures/elmo/elmo_token_embeddings.hdf5', 'r')
    W = e['embedding'][:]
    b = np.zeros_like(W[:, 0])

    with h5py.File('elmo_softmax_weights.hdf5', 'w') as f:
        grp = f.create_group('softmax')
        grp.create_dataset('W', W.shape, dtype='float32', data=W)
        grp.create_dataset('b', b.shape, dtype='float32', data=b)
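
A minimal sanity check on the file just written (assumes it was saved
exactly as above, with `W` and `b` under the `softmax` group):

    import h5py

    with h5py.File('elmo_softmax_weights.hdf5', 'r') as f:
        W = f['softmax/W'][:]
        b = f['softmax/b'][:]
        # b was created as zeros over W's first axis, so the sizes must match.
        assert b.shape[0] == W.shape[0]
        print(W.shape, b.shape)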

To run the tests:

    pytest allennlp/tests/modules/elmo_test.py
"""
from allennlp.modules.elmo import _ElmoCharacterEncoder, _ElmoBiLm, _ElmoSoftmax, Elmo, batch_to_ids
from allennlp.commands.elmo import DEFAULT_OPTIONS_FILE, DEFAULT_WEIGHT_FILE, DEFAULT_SOFTMAX_FILE, DEFAULT_VOCAB_FILE


def _tokenize(text):
    """Whitespace tokenizer for the sample sentences."""
    return text.split()


if __name__ == '__main__':
    # Load the pretrained biLM and the softmax layer over its vocabulary.
    elmo_bilm = _ElmoBiLm(DEFAULT_OPTIONS_FILE, DEFAULT_WEIGHT_FILE)
    elmo_softmax = _ElmoSoftmax(DEFAULT_SOFTMAX_FILE, DEFAULT_VOCAB_FILE)

    sentences = [
        'How are you ?',
        'how are you ?',
        'How are you .',
        'You are how ?',
    ]
    sentences = [_tokenize(i) for i in sentences]

    char_ids, word_ids = batch_to_ids(sentences, elmo_softmax.vocab)
    bilm_outputs = elmo_bilm(char_ids)

    # aggregation_fun='mean' averages the backward and forward log probs.
    softmax_log_probs, softmax_mask = elmo_softmax(
        bilm_outputs, word_ids, aggregation_fun='mean')

    print(softmax_log_probs.shape)
    print(softmax_mask.shape)
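
    # A follow-up sketch: turn the masked log probabilities into a
    # per-sentence perplexity. This assumes (an assumption, not stated
    # above) that `softmax_log_probs` has shape (batch_size, num_timesteps)
    # and that `softmax_mask` is a 0/1 tensor of the same shape.
    import torch

    mask = softmax_mask.float()
    # Masked mean of the per-token log probs, then exponentiate its negation.
    avg_log_prob = (softmax_log_probs * mask).sum(dim=-1) / mask.sum(dim=-1)
    perplexity = torch.exp(-avg_log_prob)
    print(perplexity)  # one value per sentence; lower means more probable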