snipped_201712
# https://stackoverflow.com/questions/10622179/how-to-find-identify-large-files-commits-in-git-history
git rev-list --objects --all \
  | git cat-file --batch-check='%(objecttype) %(objectname) %(objectsize) %(rest)' \
  | awk '/^blob/ {print substr($0,6)}' \
  | sort --numeric-sort --key=2 \
  | cut --complement --characters=13-40 \
  | numfmt --field=2 --to=iec-i --suffix=B --padding=7 --round=nearest
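For a rough Python take on the same idea, the sketch below feeds the object list through git cat-file with subprocess and sorts blobs by size. The function and variable names are my own, and it must be run from inside a git working tree.

import subprocess

def largest_blobs(top=20):
    """Print the largest blobs in the repository history, biggest last."""
    # Every object reachable from any ref, one "sha [path]" line per object.
    objects = subprocess.run(["git", "rev-list", "--objects", "--all"],
                             capture_output=True, text=True, check=True).stdout
    # Ask cat-file for type and size; %(rest) echoes the path back out.
    batch = subprocess.run(
        ["git", "cat-file",
         "--batch-check=%(objecttype) %(objectname) %(objectsize) %(rest)"],
        input=objects, capture_output=True, text=True, check=True).stdout
    # Keep blobs only and sort numerically by size.
    blobs = [line.split(maxsplit=3) for line in batch.splitlines()
             if line.startswith("blob ")]
    blobs.sort(key=lambda parts: int(parts[2]))
    for parts in blobs[-top:]:
        sha, size = parts[1], parts[2]
        path = parts[3] if len(parts) > 3 else ""
        print(sha[:12], size.rjust(10), path)

largest_blobs()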
# fabfile_listdir.py (Fabric 1.x)
from fabric.api import env, run, hide

env.hosts = ["localhost"]

def list_dir1(dir=None):
    """Return a list of files in a directory (dir) as absolute paths."""
    dir = dir or env.cwd
    string = run("for i in %s*; do echo $i; done" % dir)
    files = string.replace("\r", "").split("\n")
    return files

def list_dir2(dir_=None):
    """Return a list of files in a directory (dir_) as absolute paths."""
    with hide('output'):
        dir_ = dir_ or env.cwd
        if not dir_.endswith("/"):
            dir_ += "/"
        string_ = run("for i in %s*; do echo $i; done" % dir_)
        files = string_.replace("\r", "").split("\n")
    return files
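A minimal usage sketch, assuming Fabric 1.x (where fabric.api and execute() live); the /tmp/ path is only an example.

from fabric.api import execute
results = execute(list_dir1, dir="/tmp/")   # {host: list of paths}
print(results["localhost"])
# Or from the command line: fab -f fabfile_listdir.py list_dir1:dir=/tmp/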
# Keras callback that prints the running loss every `display` samples
# (written against the Keras 1.x API, as params['nb_sample'] implies;
# newer Keras calls that key 'samples').
from keras.callbacks import Callback

class NBatchLogger(Callback):
    def __init__(self, display=100):
        '''
        display: number of samples to wait before outputting the loss
        '''
        self.seen = 0
        self.display = display

    def on_batch_end(self, batch, logs={}):
        self.seen += logs.get('size', 0)
        if self.seen % self.display == 0:
            print('\n{0}/{1} - Batch Loss: {2}'.format(
                self.seen, self.params['nb_sample'], logs.get('loss')))
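A minimal sketch wiring the callback into fit(); the layer sizes and random data are made up, and verbose=0 keeps Keras's own progress bar from interleaving with the callback's output.

import numpy as np
from keras.models import Sequential
from keras.layers import Dense

X = np.random.rand(1000, 20)
y = np.random.randint(2, size=(1000, 1))
model = Sequential([Dense(1, input_dim=20, activation='sigmoid')])
model.compile(optimizer='sgd', loss='binary_crossentropy')
# batch_size divides display evenly so the modulo check actually fires.
model.fit(X, y, batch_size=50, verbose=0, callbacks=[NBatchLogger(display=200)])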
# Custom scikit-learn vectorizers (needs scikit-learn and nltk).
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
import nltk.stem

english_stemmer = nltk.stem.SnowballStemmer('english')

class StemmedCountVectorizer(CountVectorizer):
    def build_analyzer(self):
        analyzer = super(StemmedCountVectorizer, self).build_analyzer()
        return lambda doc: (english_stemmer.stem(w) for w in analyzer(doc))

class StemmedTfidfVectorizer(TfidfVectorizer):
    def build_analyzer(self):
        analyzer = super(StemmedTfidfVectorizer, self).build_analyzer()
        return lambda doc: (english_stemmer.stem(w) for w in analyzer(doc))

#
from nltk import word_tokenize
from nltk.stem import WordNetLemmatizer

class LemmaTokenizer(object):
    def __init__(self):
        self.wnl = WordNetLemmatizer()
    def __call__(self, doc):
        return [self.wnl.lemmatize(t) for t in word_tokenize(doc)]

vect = CountVectorizer(tokenizer=LemmaTokenizer())

#
import re

def to_british(tokens):
    # Despite the name, this maps British spellings to American ones
    # (colour -> color, centre -> center, organise -> organize, catalogue -> catalog).
    for t in tokens:
        t = re.sub(r"(...)our$", r"\1or", t)
        t = re.sub(r"([bt])re$", r"\1er", t)
        t = re.sub(r"([iy])s(e$|ing|ation)", r"\1z\2", t)
        t = re.sub(r"ogue$", "og", t)
        yield t

class CustomVectorizer(CountVectorizer):
    def build_tokenizer(self):
        tokenize = super(CustomVectorizer, self).build_tokenizer()
        return lambda doc: list(to_british(tokenize(doc)))

print(CustomVectorizer().build_analyzer()(u"color colour"))

#
# https://stackoverflow.com/questions/45196312/spacy-and-scikit-learn-vectorizer
def number_normalizer(tokens):
    """Map all numeric tokens to a placeholder.

    For many applications, tokens that begin with a number are not directly
    useful, but the fact that such a token exists can be relevant. By applying
    this form of dimensionality reduction, some methods may perform better.
    """
    return ("#NUMBER" if token[0].isdigit() else token for token in tokens)

class NumberNormalizingVectorizer(TfidfVectorizer):
    def build_tokenizer(self):
        tokenize = super(NumberNormalizingVectorizer, self).build_tokenizer()
        return lambda doc: list(number_normalizer(tokenize(doc)))
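A quick, self-contained check of two of the vectorizers above on a made-up corpus (note that get_feature_names() was renamed get_feature_names_out() in newer scikit-learn).

docs = ["A cat runs 25 miles", "running cats", "3 dogs ran"]
# '25' becomes '#NUMBER'; '3' is dropped earlier by the default
# two-character token pattern, so it never reaches the normalizer.
print(sorted(NumberNormalizingVectorizer().fit(docs).get_feature_names()))
# Stemming collapses 'runs'/'running' to 'run' and 'cats' to 'cat'.
print(sorted(StemmedCountVectorizer().fit(docs).get_feature_names()))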
import string
import pandas as pd
import spacy
from spacy.lang.en.stop_words import STOP_WORDS as stopwords

nlp = spacy.load('en_core_web_sm')

# Clean text before feeding it to spaCy
punctuations = string.punctuation

# Clean up text by removing personal pronouns, stop words, and punctuation
def cleanup_text(docs, logging=False):
    texts = []
    counter = 1
    for doc in docs:
        if counter % 1000 == 0 and logging:
            print("Processed %d out of %d documents." % (counter, len(docs)))
        counter += 1
        # spaCy 2.x call signature; '-PRON-' is spaCy 2.x's pronoun lemma.
        doc = nlp(doc, disable=['parser', 'ner'])
        tokens = [tok.lemma_.lower().strip() for tok in doc if tok.lemma_ != '-PRON-']
        tokens = [tok for tok in tokens if tok not in stopwords and tok not in punctuations]
        tokens = ' '.join(tokens)
        texts.append(tokens)
    return pd.Series(texts)
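A small, hypothetical usage example; the spaCy model must already be installed (python -m spacy download en_core_web_sm).

reviews = ["The movies were not bad!", "He loved his new running shoes."]
# Yields lemmatized, lowercased text with pronouns, stop words,
# and punctuation removed.
print(cleanup_text(reviews).tolist())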
# Common text-preprocessing one-liners; df, review_text, wordList, and
# text6 are stand-ins for your own data.
import re
import nltk
from nltk.corpus import stopwords

# Punctuation
# https://stackoverflow.com/questions/39782418/remove-punctuations-in-pandas
df["new_column"] = df['review'].str.replace(r'[^\w\s]', '')

# Remove non-letters
review_text = re.sub("[^a-zA-Z]", " ", review_text)

# Stop words
eng_stopwords = stopwords.words('english')
wordList = [word for word in wordList if word not in eng_stopwords]
normalized = [w for w in text6 if w.lower() not in stopwords.words('english')]

# Word tokenizer
tokenization_pattern = r'''(?x)      # set flag to allow verbose regexps
      ([A-Z]\.)+                     # abbreviations, e.g. U.S.A.
    | \w+(-\w+)*                     # words with optional internal hyphens
    | \$?\d+(\.\d+)?%?               # currency and percentages, e.g. $12.40, 82%
    | \w+[\x90-\xff]                 # words followed by escaped emoji bytes
    | [][.,;"'?():-_`]               # these are separate tokens
'''
word_tokenizer = nltk.tokenize.regexp.RegexpTokenizer(tokenization_pattern)

# Stemming and lemmatization
pstemmer = nltk.PorterStemmer()
lstemmer = nltk.LancasterStemmer()
wnlemmatizer = nltk.WordNetLemmatizer()
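A side-by-side look at the three normalizers above on a few sample words; the word list is my own, and the lemmatizer needs a one-time nltk.download('wordnet').

for w in ["running", "flies", "generously", "maximum"]:
    # Porter is conservative, Lancaster aggressive; the lemmatizer
    # returns dictionary forms (here treating each word as a verb).
    print(w, pstemmer.stem(w), lstemmer.stem(w), wnlemmatizer.lemmatize(w, pos='v'))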