This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def tokenize_docstring(text):
    """Apply tokenization using spacy to docstrings.

    Parameters
    ----------
    text : str
        The docstring to tokenize.

    Returns
    -------
    list of str
        The lower-cased text of every token produced by ``EN.tokenizer``,
        excluding tokens flagged as whitespace (``token.is_space``).
    """
    # NOTE(review): `EN` is not defined in this snippet; presumably a loaded
    # spaCy English pipeline -- confirm against the surrounding module.
    tokens = EN.tokenizer(text)
    return [token.text.lower() for token in tokens if not token.is_space]
def tokenize_code(text):
    """A very basic procedure for tokenizing code strings.

    Parameters
    ----------
    text : str
        Source code to split into tokens.

    Returns
    -------
    The result of ``RegexpTokenizer.tokenize`` for a pattern that matches
    runs of word characters; punctuation and whitespace never match the
    pattern, so they do not appear as tokens.
    """
    # NOTE(review): `RegexpTokenizer` is not imported in this snippet;
    # presumably nltk.tokenize.RegexpTokenizer -- confirm.
    return RegexpTokenizer(r'\w+').tokenize(text)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# create data loaders (training and validation share the same batch size
# `bs` and the same `bptt` value)
trn_dl = LanguageModelLoader(trn_indexed, bs, bptt)
val_dl = LanguageModelLoader(val_indexed, bs, bptt)

# create lang model data
# NOTE(review): the meaning of the positional `1` is not visible here --
# confirm against the LanguageModelData signature.
md = LanguageModelData(mpath, 1, vocab_size, trn_dl, val_dl, bs=bs, bptt=bptt)

# build learner. some hyper-params borrowed from fast.ai examples
opt_fn = partial(optim.Adam, betas=(0.8, 0.99))
# five base drop values, all scaled by a common factor of 0.7
drops = np.array([0.25, 0.1, 0.2, 0.02, 0.15]) * 0.7
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# extract encoder
encoder_model = extract_encoder_model(seq2seq_Model)

# Freeze Encoder Model: mark every layer of the extracted encoder as
# non-trainable
for l in encoder_model.layers:
    l.trainable = False

#### Build Model Architecture For Fine-Tuning ####
# the input is declared with shape (doc_length,) and fed through the
# frozen encoder
encoder_inputs = Input(shape=(doc_length,), name='Encoder-Input')
enc_out = encoder_model(encoder_inputs)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# transform raw input with the encoder pre-processor
# NOTE(review): the original comment said "docstring input", but the
# variable name suggests these are functions WITHOUT docstrings -- confirm.
encinp = enc_pp.transform_parallel(no_docstring_funcs)

# vectorize code using the code2emb model
nodoc_vecs = code2emb_model.predict(encinp, batch_size=20000)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# create an nmslib index using the 'hnsw' method over the 'cosinesimil' space
search_index = nmslib.init(method='hnsw', space='cosinesimil')

# add all vectors in one batch, then build the index
search_index.addDataPointBatch(numpy_vectors)
search_index.createIndex({'post': 2}, print_progress=True)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
class search_engine: | |
"""Organizes all the necessary elements we need to make a semantic search tool.""" | |
def __init__(self, | |
nmslib_index, | |
ref_df, | |
query2emb_func): | |
""" | |
Parameters | |
========== | |
nmslib_index : nmslib object |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def dot_product(x, kernel): | |
""" | |
Wrapper for dot product operation, in order to be compatible with both | |
Theano and Tensorflow | |
Args: | |
x (): input | |
kernel (): weights | |
Returns: | |
""" | |
if K.backend() == 'tensorflow': |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"action": "edited", | |
"issue": { | |
"html_url": "https://github.com/Codertocat/Hello-World/issues/2", | |
"id": 327883527, | |
"number": 2, | |
"title": "Spelling error in the README file", | |
"user": { | |
"login": "Codertocat", | |
"type": "User", |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from pathlib import Path

import pandas as pd
from fastai.text import TextLMDataBunch as lmdb, load_data
from fastai.text.transform import Tokenizer
def pass_through(x):
    """Return ``x`` unchanged (identity function).

    The very same object that was passed in is returned; nothing is copied
    or modified.
    """
    return x
# read the pre-processed CSV directly from its remote URL
valid_df = pd.read_csv('https://storage.googleapis.com/issue_label_bot/pre_processed_data/processed_part0000.csv')
# NOTE: train_df is bound to the same DataFrame object as valid_df (no copy),
# so training and validation data are identical here.
train_df = valid_df
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from pathlib import Path

import pandas as pd
from fastai.text import TextLMDataBunch as lmdb
from fastai.text.transform import Tokenizer

# keep only the first 1000 rows of the remote CSV for a small test run
small_df = pd.read_csv('https://storage.googleapis.com/issue_label_bot/pre_processed_data/processed_part0000.csv').head(1000)

# NOTE(review): `pass_through` is not defined in this snippet and must
# already be in scope; n_cpus=30 is hard-coded -- confirm it suits the host.
stokenizer = Tokenizer(pre_rules=[pass_through], n_cpus=30)
spath = Path('lang_model_test/')