Created
July 9, 2019 02:08
-
-
Save RoaldSchuring/7ddbdf3359ff1d0ed9b3fa2c6aafa746 to your computer and use it in GitHub Desktop.
extracting_info_from_vectorstxt
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from sklearn.preprocessing import normalize

# NOTE(review): `np`, `pd`, `boto3` and `bucket` are not defined in this
# snippet; they are presumably provided by earlier cells -- confirm.

# Open the vectors.txt file containing all the trained word embeddings and
# extract the descriptors & embeddings. The file is read in a single pass
# inside a context manager so the handle is always closed.
with open('vectors.txt', 'r') as f:
    # The second field of the header line is the embedding dimensionality.
    header = f.readline().split()
    if len(header) < 2:
        raise ValueError('vectors.txt: header line must contain at least two fields')
    dim = int(header[1])
    # Tokenise every remaining line; blank lines (e.g. a trailing newline)
    # carry no word and are dropped so they cannot raise or leave zero rows.
    rows = [parts for parts in (line.split() for line in f) if parts]

# One matrix row per word actually present in the file.
num_points = len(rows)
index_to_word = [parts[0] for parts in rows]
word_vecs = np.zeros((num_points, dim), dtype=float)
for row, parts in enumerate(rows):
    values = [float(vec_val) for vec_val in parts[1:]]
    # A line with fewer than `dim` values leaves the remaining entries at 0.
    word_vecs[row, :len(values)] = values

# Rescale each row in place (sklearn's defaults: L2 norm, per sample).
word_vecs = normalize(word_vecs, copy=False, return_norm=False)
names_vecs = list(zip(index_to_word, word_vecs))

# Eliminate any words that are not in our overview of accepted wine
# descriptors (contained in the dataframe descriptor_mapping).
descriptor_mapping = pd.read_csv('s3://{}/descriptor_mapping.csv'.format(bucket)).set_index('raw descriptor')
# Build the lookup once; a set gives O(1) membership tests per word.
accepted_descriptors = set(descriptor_mapping['level_3'])
names_vecs_filtered = [n for n in names_vecs if n[0] in accepted_descriptors]

# Save the descriptor names and the corresponding word vectors in a csv file
# in our S3 bucket.
names_vecs_df = pd.DataFrame(names_vecs_filtered, columns=['word', 'vector'])
names_vecs_df.sort_values(by=['word'], inplace=True)
names_vecs_df.to_csv('word_vectors.csv')
boto3.Session().resource('s3').Bucket(bucket).Object('word_vectors.csv').upload_file('word_vectors.csv')
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment