print("Hello World")
def get_non_empty_lines(lines):
    """
    Returns the non-empty lines from a list of lines.
    """
    clean_lines = []
    for line in lines:
        str_line = line.strip()
        if str_line:
            clean_lines.append(str_line)
    return clean_lines
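
A quick usage sketch (the input lines below are made up purely for illustration):

lines = ['keep me\n', '   \n', '', 'and me']
get_non_empty_lines(lines)  # -> ['keep me', 'and me']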
# punctuation and special characters to strip from the text with cuDF
filters = ['!', '"', '#', '$', '%', '&', '(', ')', '*', '+', '-', '.', '/', '\\', ':', ';', '<', '=', '>',
           '?', '@', '[', ']', '^', '_', '`', '{', '|', '}', '~', '\t', '\n', "'", ',', '—']
text_col_sample = df.head(5)
text_col_sample['text'].to_pandas()

# replace every filter character with a space, e.g. "Hello, World!" -> "Hello  World "
text_col_sample['text_clean'] = text_col_sample['text'].str.replace_multi(filters, ' ', regex=False)
# lower-case the cleaned text
text_col_sample['text_clean'] = text_col_sample['text_clean'].str.lower()
text_col_sample['text_clean'].to_pandas()
import nltk
import nvstrings, nvtext
# NLTK English stop words, copied to device memory for nvtext
STOPWORDS = nltk.corpus.stopwords.words('english')
STOPWORDS = nvstrings.to_device(STOPWORDS)
# replace each stop-word token with a space
text_col_sample['text_clean'] = nvtext.replace_tokens(text_col_sample['text_clean'].data, STOPWORDS, ' ')
text_col_sample['text_clean'].to_pandas()
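
For intuition, a small illustrative call on a throwaway string (the sentence is made up; 'is', 'the', and 'of' are in NLTK's English stop-word list, and the output spacing shown is approximate since the leftover whitespace is cleaned up in the next step):

sample = nvstrings.to_device(['gpu is the future of data science'])
nvtext.replace_tokens(sample, STOPWORDS, ' ')  # roughly: 'gpu     future   data science'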
# collapse the runs of whitespace left behind by the replacements, then trim the ends
text_col_sample['text_clean'] = text_col_sample['text_clean'].str.replace(r"\s+", ' ', regex=True)
text_col_sample['text_clean'] = text_col_sample['text_clean'].str.strip(' ')
text_col_sample['text_clean'].to_pandas()
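
As a quick sanity check of the whitespace cleanup on its own (a throwaway Series, assuming the same cuDF string API used above):

import cudf
s = cudf.Series(['gpu     future   data science '])
s.str.replace(r'\s+', ' ', regex=True).str.strip(' ').to_pandas()  # -> 'gpu future data science'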
# the same steps, rolled into a single reusable helper
STOPWORDS = nltk.corpus.stopwords.words('english')
filters = ['!', '"', '#', '$', '%', '&', '(', ')', '*', '+', '-', '.', '/', '\\', ':', ';', '<', '=', '>',
           '?', '@', '[', ']', '^', '_', '`', '{', '|', '}', '~', '\t', '\n', "'", ',', '—']
def preprocess_text(input_strs, filters=None, stopwords=STOPWORDS):
    """
    * filter punctuation
    * to_lower
    * remove stop words (from nltk corpus)