Skip to content

Instantly share code, notes, and snippets.

@manashmandal
Created November 13, 2017 15:48
Show Gist options
  • Save manashmandal/389dcc0577cbeafb9f922c46f38e7fb2 to your computer and use it in GitHub Desktop.
Save manashmandal/389dcc0577cbeafb9f922c46f38e7fb2 to your computer and use it in GitHub Desktop.
NLP
import spacy
import pandas as pd
import argparse
import re
# Parse the command-line arguments first so that a usage error (or --help)
# is reported before the spaCy model is loaded.
parser = argparse.ArgumentParser()
parser.add_argument("file_path", help="Enter csv filepath here")
parser.add_argument("write_file", help="Enter name of the written file here")
# Let argparse do the integer conversion so a bad value yields a usage
# message instead of a ValueError traceback.
parser.add_argument("verbose_step", type=int, help="Limit verbose")
args = parser.parse_args()
file_path = args.file_path
write_file = args.write_file
verbose_step = args.verbose_step
if verbose_step == 0:
    # The step is used as a modulus when reporting progress; zero would
    # raise ZeroDivisionError on the first row.
    parser.error("verbose_step must be non-zero")
print(file_path)
print(write_file)

# Language pipeline used to tokenise tweets and flag stop words/punctuation.
# NOTE(review): the 'en' shortcut name is not accepted by spaCy 3+, which
# expects a full package name such as 'en_core_web_sm' - confirm against the
# installed spaCy version before changing it.
nlp = spacy.load('en')
# Regular expressions for the parts of a tweet that are stripped before
# tokenising. Raw strings keep the backslashes literal, so "\S" is no longer
# an invalid string escape (which newer Python versions warn about); the
# resulting pattern text is unchanged.
link_re = r"http\S+"    # links: "http" followed by any run of non-whitespace
hash_re = r"#\b\w+\b"   # hashtags: "#" followed by a word
tag_re = r"@\b\w+\b"    # mentions: "@" followed by a word
# Substrings deleted verbatim before tokenising: the retweet prefix left over
# once the @mention has been stripped, plus a few contraction/possessive
# suffixes (both straight and curly apostrophes).
_REMOVED_FRAGMENTS = ("RT : ", "’s", "n't", "’ve", "'s", "'re")


def clean_tweet(input_text):
    """Return a lowercased, space-joined token string for one tweet.

    Links, hashtags and @mentions are stripped with the module-level
    patterns, a few fixed fragments are deleted, and the remainder is run
    through the spaCy pipeline ``nlp``. Punctuation tokens, stop words and
    the stand-alone retweet marker "rt" are dropped.

    Args:
        input_text: Raw tweet text. Anything that is not a ``str`` (for
            example a missing value) is treated as empty.

    Returns:
        The cleaned text, or "" when nothing is left.
    """
    if not isinstance(input_text, str):
        return ""

    for pattern in (link_re, hash_re, tag_re):
        input_text = re.sub(pattern, "", input_text)

    input_text = input_text.strip()
    for fragment in _REMOVED_FRAGMENTS:
        input_text = input_text.replace(fragment, "")

    # Converts to lowercase and removes all stop words and punctuation.
    # "rt" is dropped only when it is a whole token; deleting the substring
    # from the joined text would also mangle words such as "start" or "party".
    kept = [
        tok.lower_
        for tok in nlp(input_text)
        if not (tok.is_punct or tok.is_stop) and tok.lower_ != "rt"
    ]
    return " ".join(kept)
# dataframe to line
def df2txt(fp, write_to):
    """Clean every tweet in a tab-separated file and write one per line.

    Rows whose cleaned text is empty are skipped. Progress is printed for
    every row index that is a multiple of the module-level ``verbose_step``.

    Args:
        fp: Path of the tab-delimited input file; it must contain a
            ``text`` column.
        write_to: Path of the output text file (opened in write mode, so an
            existing file is overwritten).
    """
    df = pd.read_csv(fp, delimiter="\t")
    # The cleaned text can contain non-ASCII characters, so pin the output
    # encoding instead of relying on the platform default. The ``with``
    # block closes the file, so no explicit close() is needed.
    with open(write_to, 'w', encoding="utf-8") as f:
        for i, text in enumerate(df.text):
            ct = clean_tweet(text)
            if ct:
                f.write(ct + '\n')
            if i % verbose_step == 0:
                print("Wrote: {}".format(i))
    print("DONE")
# Run the conversion using the input and output paths parsed from the command line.
df2txt(file_path, write_file)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment