Hamel Husain hamelsmu

💻
Always learning.
@hamelsmu
hamelsmu / get_code_docstring_pairs.py
Last active May 28, 2018 16:58
Get code and comment pairs - for tutorial
from nltk.tokenize import RegexpTokenizer
import spacy

EN = spacy.load('en')  # spaCy English model used for docstring tokenization

def tokenize_docstring(text):
    """Apply tokenization using spacy to docstrings."""
    tokens = EN.tokenizer(text)
    return [token.text.lower() for token in tokens if not token.is_space]

def tokenize_code(text):
    """A very basic procedure for tokenizing code strings."""
    return RegexpTokenizer(r'\w+').tokenize(text)
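A quick usage sketch of the two tokenizers (the example strings and expected outputs are illustrative):

print(tokenize_docstring("Applies a function to each element."))
# roughly: ['applies', 'a', 'function', 'to', 'each', 'element', '.']
print(tokenize_code("df = pd.read_csv(path)"))
# roughly: ['df', 'pd', 'read_csv', 'path']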
@hamelsmu
hamelsmu / language_model.py
Last active May 29, 2018 21:11
language model used in code search
# fastai v0.7-era API; trn_indexed, val_indexed, bs, bptt, mpath and vocab_size are defined earlier in the tutorial
from functools import partial
import numpy as np
from torch import optim
from fastai.text import LanguageModelLoader, LanguageModelData

# create data loaders
trn_dl = LanguageModelLoader(trn_indexed, bs, bptt)
val_dl = LanguageModelLoader(val_indexed, bs, bptt)
# create lang model data
md = LanguageModelData(mpath, 1, vocab_size, trn_dl, val_dl, bs=bs, bptt=bptt)
# build learner. some hyper-params borrowed from fast.ai examples
opt_fn = partial(optim.Adam, betas=(0.8, 0.99))
drops = np.array([0.25, 0.1, 0.2, 0.02, 0.15]) * 0.7
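A sketch of how the learner might then be built and fit with the fastai v0.7 API; em_sz, nh, nl and the learning-rate/weight-decay values are assumed hyper-parameters, not from the gist:

learner = md.get_model(opt_fn, em_sz, nh, nl,
                       dropouti=drops[0], dropout=drops[1], wdrop=drops[2],
                       dropoute=drops[3], dropouth=drops[4])
learner.fit(1e-3, 1, wds=1e-6, cycle_len=1)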
from keras.layers import Input

# extract the encoder from the trained seq2seq model (extract_encoder_model is a helper from the tutorial code)
encoder_model = extract_encoder_model(seq2seq_Model)

# Freeze Encoder Model
for l in encoder_model.layers:
    l.trainable = False

#### Build Model Architecture For Fine-Tuning ####
encoder_inputs = Input(shape=(doc_length,), name='Encoder-Input')
enc_out = encoder_model(encoder_inputs)
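A sketch of how the fine-tuning head on top of the frozen encoder might look; the layer sizes, names, and the cosine-proximity loss are assumptions rather than the gist's exact architecture:

from keras.layers import BatchNormalization, Dense
from keras.models import Model

x = BatchNormalization(name='bn-first')(enc_out)
out = Dense(500, name='final-output')(x)  # map encoder output into the 500-d docstring-embedding space
code2emb_model = Model(inputs=encoder_inputs, outputs=out)
code2emb_model.compile(optimizer='adam', loss='cosine_proximity')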
# transform the raw function text (functions without docstrings) with the fitted preprocessor enc_pp
encinp = enc_pp.transform_parallel(no_docstring_funcs)
# vectorize code using the code2emb model
nodoc_vecs = code2emb_model.predict(encinp, batch_size=20000)
import nmslib

# build an approximate-nearest-neighbor index (HNSW, cosine similarity) over the code vectors
search_index = nmslib.init(method='hnsw', space='cosinesimil')
search_index.addDataPointBatch(numpy_vectors)
search_index.createIndex({'post': 2}, print_progress=True)
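Once built, the index can be queried with knnQuery; the query vector below is a hypothetical embedding in the same space as numpy_vectors:

idxs, dists = search_index.knnQuery(query_vector, k=5)  # row ids into numpy_vectors and their cosine distances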
class search_engine:
    """Organizes all the necessary elements we need to make a semantic search tool."""
    def __init__(self,
                 nmslib_index,
                 ref_df,
                 query2emb_func):
        """
        Parameters
        ==========
        nmslib_index : nmslib object
            pre-built search index over the code vectors
        ref_df : pandas.DataFrame
            reference DataFrame with the code and metadata shown in results
        query2emb_func : callable
            maps a raw query string to a vector in the same space as the index
        """
        # attribute names below are assumed; the gist preview is cut off here
        self.search_index = nmslib_index
        self.ref_df = ref_df
        self.query2emb_func = query2emb_func
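A hedged sketch of how these pieces can be wired into a search call (the function and its argument names are illustrative, not from the gist):

def semantic_search(query, nmslib_index, ref_df, query2emb_func, k=5):
    # embed the free-text query, then look up its nearest code vectors
    emb = query2emb_func(query)
    idxs, dists = nmslib_index.knnQuery(emb, k=k)
    # return the matching reference rows, closest first, with their distances
    return ref_df.iloc[idxs].assign(distance=dists)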
@hamelsmu
hamelsmu / AttentionWithContext.py
Created June 30, 2018 03:51 — forked from cbaziotis/AttentionWithContext.py
Keras Layer that implements an Attention mechanism, with a context/query vector, for temporal data. Supports Masking. Follows the work of Yang et al. [https://www.cs.cmu.edu/~diyiy/docs/naacl16.pdf] "Hierarchical Attention Networks for Document Classification"
from keras import backend as K

def dot_product(x, kernel):
    """
    Wrapper for dot product operation, in order to be compatible with both
    Theano and Tensorflow.
    Args:
        x: input tensor
        kernel: weights tensor
    Returns:
        dot product of x and kernel
    """
    if K.backend() == 'tensorflow':
        # TensorFlow needs an explicit expand/squeeze to match Theano's vector dot
        return K.squeeze(K.dot(x, K.expand_dims(kernel)), axis=-1)
    else:
        return K.dot(x, kernel)
{
"action": "edited",
"issue": {
"html_url": "https://github.com/Codertocat/Hello-World/issues/2",
"id": 327883527,
"number": 2,
"title": "Spelling error in the README file",
"user": {
"login": "Codertocat",
"type": "User",
import pandas as pd
from pathlib import Path
from fastai.text import TextLMDataBunch as lmdb, load_data
from fastai.text.transform import Tokenizer

def pass_through(x):
    # identity pre-rule that disables fastai's default text pre-processing
    return x

valid_df = pd.read_csv('https://storage.googleapis.com/issue_label_bot/pre_processed_data/processed_part0000.csv')
train_df = valid_df
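A sketch of how a language-model DataBunch might be built from these frames with the aliased TextLMDataBunch; the text_cols value and the path are assumptions:

tokenizer = Tokenizer(pre_rules=[pass_through], n_cpus=8)
path = Path('lang_model/')
data_lm = lmdb.from_df(path, train_df=train_df, valid_df=valid_df,
                       text_cols='body', tokenizer=tokenizer)
data_lm.save('data_lm.pkl')  # reload later with load_data(path, 'data_lm.pkl')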
@hamelsmu
hamelsmu / wandb_fastai_troubleshooting.py
Created May 14, 2019 21:09
For troubleshooting wandb
from fastai.text import TextLMDataBunch as lmdb
from fastai.text.transform import Tokenizer
import pandas as pd
from pathlib import Path

# small sample of the pre-processed GitHub issues data for a quick repro
small_df = pd.read_csv('https://storage.googleapis.com/issue_label_bot/pre_processed_data/processed_part0000.csv').head(1000)
# pass_through is the identity pre-rule defined in the gist above
stokenizer = Tokenizer(pre_rules=[pass_through], n_cpus=30)
spath = Path('lang_model_test/')
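A sketch of how the troubleshooting script might continue: build a small DataBunch, then attach the wandb callback to a language-model learner. The wandb.fastai callback import, project name, and text_cols value are assumptions here:

import wandb
from wandb.fastai import WandbCallback
from fastai.text import language_model_learner, AWD_LSTM

wandb.init(project='wandb-fastai-troubleshooting')  # hypothetical project name
small_data = lmdb.from_df(spath, train_df=small_df, valid_df=small_df,
                          text_cols='body', tokenizer=stokenizer)
learn = language_model_learner(small_data, AWD_LSTM, callback_fns=[WandbCallback])
learn.fit_one_cycle(1)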