@hamelsmu
Last active August 27, 2019 01:27
#!/usr/bin/env python
# coding: utf-8
# This notebook illustrates the use of a utility, `InferenceWrapper.df_to_emb` that can be used to perform inference in bulk.
# - **checkpointed model** (2.29 GB):
# `https://storage.googleapis.com/issue_label_bot/model/lang_model/models_22zkdqlr/best_22zkdqlr.pth`
# This file imports https://github.com/kubeflow/code-intelligence/blob/master/Issue_Embeddings/flask_app/inference.py
from inference import InferenceWrapper, pass_through
import pandas as pd
import numpy as np
# !wget https://storage.googleapis.com/issue_label_bot/model/lang_model/models_22zkdqlr/trained_model_22zkdqlr.pkl
# #### Create an `InferenceWrapper` object:
wrapper = InferenceWrapper(model_path='/ds/notebooks',
                           model_file_name='trained_model_22zkdqlr.pkl')
# #### Load the GFI Dataset
gfidf = pd.read_csv('gfi_data_all.csv')
train_mask = gfidf.split_name != 'test'
test_mask = gfidf.split_name == 'test'
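The two boolean masks above partition the DataFrame by its `split_name` column: everything that is not labeled `test` is treated as training data. A minimal sketch of the same idea on a toy frame (column names follow the notebook; the data is illustrative):

```python
import pandas as pd

# Toy frame mimicking the GFI dataset's split_name column
df = pd.DataFrame({
    'split_name': ['train', 'valid', 'test', 'train'],
    'body': ['a', 'b', 'c', 'd'],
})

train_mask = df.split_name != 'test'
test_mask = df.split_name == 'test'

# Every row lands in exactly one partition
assert train_mask.sum() + test_mask.sum() == len(df)
print(df[train_mask].body.tolist())  # ['a', 'b', 'd']
```

Note that `valid` rows fall into the training partition here, which matches the notebook's `!= 'test'` definition.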
# Truncate unusually long issue bodies at the training set's 95th-percentile length
len_cutoff = int(gfidf[train_mask].body.str.len().quantile(.95))
print(f'95th percentile body length: {len_cutoff:,}')
gfidf.body = gfidf.body.str[:len_cutoff]
assert gfidf.body.str.len().max() <= len_cutoff
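The truncation step can be isolated as follows: compute the 95th-percentile body length, then slice every string to that cutoff. A self-contained sketch with toy strings (the real notebook computes the cutoff on the training split only):

```python
import pandas as pd

# Toy issue bodies: two short, one pathologically long
bodies = pd.Series(['short', 'medium length text', 'x' * 1000])

# 95th percentile of character lengths (linear interpolation by default)
cutoff = int(bodies.str.len().quantile(0.95))

# String slicing is NaN-safe and leaves shorter bodies untouched
truncated = bodies.str[:cutoff]
assert truncated.str.len().max() <= cutoff
```

Computing the cutoff on the training split and applying it everywhere avoids leaking test-set statistics into preprocessing.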
# # Perform Batch Inference To Get Embeddings
# This retrieves a document embedding for each issue.
train_embeddings = wrapper.df_to_emb(gfidf[train_mask])
test_embeddings = wrapper.df_to_emb(gfidf[test_mask])
with open('gfi_train_emb.npy', 'wb') as f:
    np.save(f, train_embeddings)
with open('gfi_test_emb.npy', 'wb') as f:
    np.save(f, test_embeddings)
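The saved `.npy` files can be read back later with `np.load`, which restores the array's shape and dtype exactly. A quick round-trip sketch (the random array stands in for the output of `wrapper.df_to_emb`; the temp path is illustrative):

```python
import os
import tempfile
import numpy as np

# Stand-in for an embedding matrix: (n_issues, embedding_dim)
emb = np.random.rand(4, 8).astype(np.float32)

# Round-trip through an .npy file, as the notebook does with gfi_train_emb.npy
path = os.path.join(tempfile.gettempdir(), 'emb_roundtrip.npy')
with open(path, 'wb') as f:
    np.save(f, emb)
loaded = np.load(path)

assert loaded.shape == emb.shape
assert loaded.dtype == emb.dtype
```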