Last active
October 9, 2019 17:14
-
-
Save Puzer/432e224a56f49ae0c52d91ea4b919e6b to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Shared Russian BPEmb model (50-dimensional vectors) used by embed_text below.
bpemb_ru = BPEmb(
    lang="ru",
    dim=50,
)
def extract_text(json_data):
    """Pull the Russian-language string fragments out of raw JSON text.

    The input is not parsed as JSON: it is split on double quotes and every
    resulting fragment that contains at least one Russian Cyrillic letter is
    kept. A string value containing an escaped quote is therefore split into
    several fragments.

    Args:
        json_data: Raw JSON document as a ``str``.

    Returns:
        The kept fragments joined with single spaces, in their original
        order; an empty string when no fragment matches.
    """
    def has_russian_letter(fragment):
        # U+0410..U+044F is the contiguous range "А".."я". The letters
        # "Ё" (U+0401) and "ё" (U+0451) lie outside it and need an explicit
        # check, otherwise fragments made only of them are dropped.
        return any(
            '\u0410' <= ch <= '\u044f' or ch in '\u0401\u0451'
            for ch in fragment
        )

    return ' '.join(
        fragment
        for fragment in json_data.split('"')
        if has_russian_letter(fragment)
    )
def embed_text(text):
    """Embed ``text`` as the sum of its BPE subword vectors.

    Args:
        text: String to encode with the module-level ``bpemb_ru`` model.

    Returns:
        The result of summing the selected rows of ``bpemb_ru.emb.vectors``
        along axis 0 (presumably a 1-D array of the model's embedding
        dimension -- confirm against the BPEmb version in use).
    """
    token_ids = bpemb_ru.encode_ids(text)
    # NOTE: a whitelist filter over the ids (``ids_white_list``) was left
    # disabled in the original code; every encoded id contributes to the sum.
    token_vectors = bpemb_ru.emb.vectors[token_ids]
    return token_vectors.sum(axis=0)
def preprocess_stories_description(df):
    """Build a frame of summed BPE embeddings, one row per story.

    Each ``story_json`` value is reduced to its Russian text with
    ``extract_text`` and embedded with ``embed_text``; the resulting vectors
    are stacked row-wise.

    Args:
        df: DataFrame with a ``story_json`` column (raw JSON text) and a
            ``story_id`` column.

    Returns:
        A DataFrame with a fresh 0..n-1 index, one ``story_bpe__<i>`` column
        per embedding component and a ``story_id`` column, with rows in the
        same order as ``df``.

    Note:
        ``np.vstack`` raises ``ValueError`` when ``df`` has no rows.
    """
    vectors = df['story_json'].map(extract_text).map(embed_text)
    story_emb_df = pd.DataFrame(np.vstack(vectors.tolist())).add_prefix('story_bpe__')
    # Assign by position, not by label: story_emb_df has a new RangeIndex,
    # so assigning the Series itself would align on df's index and produce
    # NaN / mismatched ids whenever df is filtered, shuffled or id-indexed.
    story_emb_df['story_id'] = df['story_id'].values
    return story_emb_df
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment