Last active
July 31, 2018 10:04
-
-
Save l1m2p3/fe2e355d5af19e17e5a21bcf356b3d45 to your computer and use it in GitHub Desktop.
Functions for updating/accessing word vectors on DynamoDB (updated to use spaCy to find tokens; see https://spacy.io/usage/ for how to install spaCy).
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import boto3 | |
import numpy | |
import pickle | |
import spacy | |
# Name of the DynamoDB table that holds the word -> vector lookup.
table_name = 'wordvec'

# Maximum number of items sent per batch request; these sizes are dictated
# by DynamoDB (see the DynamoDB documentation for details).
write_batch_size = 25
read_batch_size = 100

# DynamoDB client shared by the functions in this module.
client = boto3.client('dynamodb')

# spaCy pipeline: turns a word into tokens, from which the word vector is read.
tokenizer = spacy.load('en')
# helper function to divide list into sublists | |
def sublist(l, batch_size):
    """Split `l` into consecutive slices of at most `batch_size` elements.

    Args:
        l: a sliceable sequence (anything supporting `len()` and slicing).
        batch_size: maximum length of each slice; must be a positive integer.

    Returns:
        A list of slices of `l`, in order. Every slice has `batch_size`
        elements except possibly the last. An empty `l` yields `[]`.

    Raises:
        ValueError: if `batch_size` is zero or negative.
    """
    # A negative step would make range() empty and silently drop every
    # element, so reject it explicitly (zero already raised ValueError).
    if batch_size <= 0:
        raise ValueError(
            'batch_size must be a positive integer, got %r' % (batch_size,))
    return [l[i:i + batch_size] for i in range(0, len(l), batch_size)]
# helper function to convert a word to a put request | |
def word_to_put_req(word):
    """Build the DynamoDB `PutRequest` entry for a single word.

    The word is run through the module-level spaCy `tokenizer` and the vector
    of its first token is stored alongside the word.

    Args:
        word: the word to upload, either as text or as UTF-8 encoded bytes
            (a plain `str` on Python 2).

    Returns:
        A dict of the form `{'PutRequest': {'Item': {...}}}` whose item has a
        string attribute `word` and a list attribute `vector` holding one
        number attribute per vector component.

    Raises:
        UnicodeDecodeError: if `word` is a byte string that is not valid UTF-8.
        IndexError: presumably, if the tokenizer yields no tokens for `word`
            (e.g. an empty string), since the first token is indexed directly.
    """
    # Only byte strings need decoding. The previous unconditional
    # `unicode(word, ...)` call does not exist on Python 3 and rejected
    # already-decoded text on Python 2.
    text = word.decode('utf-8') if isinstance(word, bytes) else word
    vector = tokenizer(text)[0].vector
    return {
        'PutRequest': {
            'Item': {
                'word': {'S': text},
                # DynamoDB number attributes are transmitted as strings.
                'vector': {'L': [{'N': str(n)} for n in vector]},
            }
        }
    }
# | |
# upload lookup for word in `words` | |
# | |
def put_words(words):
    """Upload the word -> vector lookup for every word in `words`.

    Duplicates are removed first because a single batch request cannot contain
    duplicate keys. The words are then written in batches of
    `write_batch_size`. Items that DynamoDB reports back as unprocessed are
    re-sent with an increasing delay instead of being silently dropped.

    Args:
        words: iterable of hashable words accepted by `word_to_put_req`.

    Raises:
        RuntimeError: if a batch still has unprocessed items after the
            maximum number of attempts.
    """
    import time  # local import: only needed for the retry back-off

    max_attempts = 8
    unique_words = list(set(words))
    for batch in sublist(unique_words, write_batch_size):
        pending = {table_name: [word_to_put_req(word) for word in batch]}
        attempt = 0
        while pending:
            response = client.batch_write_item(RequestItems=pending)
            # `UnprocessedItems` has the same shape as `RequestItems`, so it
            # can be fed straight back into the next call.
            pending = response.get('UnprocessedItems') or {}
            if not pending:
                break
            attempt += 1
            if attempt >= max_attempts:
                raise RuntimeError(
                    '%d write request(s) for table %r still unprocessed '
                    'after %d attempts'
                    % (len(pending.get(table_name, [])), table_name, attempt))
            # Exponential back-off, capped so a long outage does not stall
            # for minutes between attempts.
            time.sleep(min(0.05 * (2 ** attempt), 2.0))
# | |
# returns a lookup for word in `words` | |
# if a word is absent on DynamoDB, it won't be in the returned lookup's keys | |
# | |
def get_words(words):
    """Fetch the stored vectors for every word in `words`.

    Duplicates are removed first because a single batch request cannot contain
    duplicate keys. The words are then read in batches of `read_batch_size`.
    Keys that DynamoDB reports back as unprocessed are requested again with an
    increasing delay, so a throttled key is not mistaken for a missing word.

    Args:
        words: iterable of hashable words to look up.

    Returns:
        A dict mapping each word found in the table to its stored `vector`
        attribute, returned as the raw DynamoDB list value (the content of
        `item['vector']['L']`). Words absent from the table are not present
        in the returned dict's keys.

    Raises:
        RuntimeError: if a batch still has unprocessed keys after the maximum
            number of attempts.
    """
    import time  # local import: only needed for the retry back-off

    max_attempts = 8
    lookup = {}
    unique_words = list(set(words))
    for batch in sublist(unique_words, read_batch_size):
        pending = {
            table_name: {
                'Keys': [{'word': {'S': word}} for word in batch]
            }
        }
        attempt = 0
        while pending:
            response = client.batch_get_item(RequestItems=pending)
            for item in response.get('Responses', {}).get(table_name, []):
                lookup[item['word']['S']] = item['vector']['L']
            # `UnprocessedKeys` has the same shape as `RequestItems`, so it
            # can be fed straight back into the next call.
            pending = response.get('UnprocessedKeys') or {}
            if not pending:
                break
            attempt += 1
            if attempt >= max_attempts:
                raise RuntimeError(
                    'keys for table %r still unprocessed after %d attempts'
                    % (table_name, attempt))
            # Exponential back-off, capped so a long outage does not stall
            # for minutes between attempts.
            time.sleep(min(0.05 * (2 ** attempt), 2.0))
    return lookup
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment