Created
November 13, 2017 15:48
-
-
Save manashmandal/389dcc0577cbeafb9f922c46f38e7fb2 to your computer and use it in GitHub Desktop.
NLP
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import spacy | |
import pandas as pd | |
import argparse | |
import re | |
# Load the English spaCy pipeline once; clean_tweet() reuses it for every row.
# NOTE(review): the 'en' shortcut name is only resolvable on older spaCy
# releases -- confirm the installed version; newer releases need the full
# model package name instead.
nlp = spacy.load('en')

# Command-line interface: input file, output file and progress interval.
parser = argparse.ArgumentParser()
parser.add_argument("file_path", help="Enter csv filepath here")
parser.add_argument("write_file", help="Enter name of the written file here")
# type=int lets argparse reject a non-numeric value with a usage message
# instead of the script dying later with a ValueError traceback.
parser.add_argument("verbose_step", type=int, help="Limit verbose")
args = parser.parse_args()

file_path = args.file_path
write_file = args.write_file
verbose_step = args.verbose_step

print(file_path)
print(write_file)

# Regular expressions for cleaning the text. Raw strings keep the backslash
# escapes intact for the regex engine (a plain "\S" is an invalid string
# escape and triggers a warning on recent Python versions).
link_re = r"http\S+"   # anything starting with "http" up to the next whitespace
hash_re = r"#\b\w+\b"  # "#" immediately followed by a word (hashtag)
tag_re = r"@\b\w+\b"   # "@" immediately followed by a word (mention)
def clean_tweet(input_text):
    """Strip tweet noise from *input_text* and return the remaining words.

    Links, hashtags and @mentions are removed with the module-level
    ``link_re``, ``hash_re`` and ``tag_re`` patterns, a handful of contraction
    suffixes are dropped, and the result is tokenized with the module-level
    spaCy pipeline ``nlp``.  Tokens that are punctuation, stop words,
    whitespace or a bare retweet marker ("rt") are discarded.

    Returns the surviving tokens, lowercased and joined by single spaces;
    an empty string when nothing survives.
    """
    for pattern in (link_re, hash_re, tag_re):
        input_text = re.sub(pattern, "", input_text)

    # With the @mention already removed, "RT @user: ..." has become "RT : ...".
    input_text = input_text.strip().replace("RT : ", "")
    for suffix in ("’s", "n't", "’ve", "'s", "'re"):
        input_text = input_text.replace(suffix, "")

    doc = nlp(input_text)

    # The retweet marker is filtered per token rather than with a substring
    # replace on the joined text, which would also eat the "rt" inside
    # ordinary words (e.g. "party" -> "pay").  Whitespace tokens are what the
    # tokenizer emits for the gaps left behind by removed links and tags.
    kept = [
        tok.lower_
        for tok in doc
        if not (tok.is_punct or tok.is_stop or tok.is_space)
        and tok.lower_ != "rt"
    ]
    return " ".join(kept)
def df2txt(fp, write_to):
    """Clean every tweet in a tab-separated file and write one per line.

    Reads *fp* with pandas (tab delimiter), runs each value of its ``text``
    column through ``clean_tweet`` and writes every non-empty result to
    *write_to*, one cleaned tweet per line.  Progress is printed every
    ``verbose_step`` rows (module-level setting); a step of 0 disables the
    progress output.

    Parameters:
        fp: path of the tab-separated input file; it must have a ``text``
            column.
        write_to: path of the text file to create (overwritten if present).
    """
    df = pd.read_csv(fp, delimiter="\t")

    # Explicit UTF-8 so non-ASCII tweet text does not depend on the platform's
    # default encoding.  The ``with`` block closes the file on exit.
    with open(write_to, "w", encoding="utf-8") as out:
        for i, text in enumerate(df["text"]):
            # pandas yields NaN (a float) for empty cells; only real strings
            # can be cleaned.
            if isinstance(text, str):
                cleaned = clean_tweet(text)
                if cleaned:
                    out.write(cleaned + "\n")
            # Guard against a zero step, which would raise ZeroDivisionError.
            if verbose_step and i % verbose_step == 0:
                print("Wrote: {}".format(i))

    print("DONE")
# Script entry point: convert the input file given on the command line.
df2txt(file_path, write_file)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment