Last active
November 26, 2021 15:09
-
-
Save revox/49975a50d0b96cf580ca966d157186f6 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import nltk, sys, csv | |
from nltk.tokenize import TweetTokenizer | |
from nltk.corpus import stopwords | |
from datetime import datetime | |
import pandas as pd | |
import numpy as np | |
import string | |
from collections import Counter | |
def process(text, tokenizer=None, stopwords=None):
    """Tokenize a tweet: lowercase, tokenize, then drop stop words and pure digits.

    Parameters:
        text: the raw tweet text (str).
        tokenizer: object with a ``tokenize(str) -> list[str]`` method;
            defaults to a fresh ``TweetTokenizer`` when not supplied.
        stopwords: collection of tokens to discard; defaults to empty.

    Returns:
        list[str] of lowercased tokens with stop words and digit-only
        tokens removed.
    """
    # Lazy defaults: avoids the mutable-default-argument pitfall for
    # `stopwords` and defers TweetTokenizer construction to call time.
    if tokenizer is None:
        tokenizer = TweetTokenizer()
    if stopwords is None:
        stopwords = []
    tokens = tokenizer.tokenize(text.lower())
    return [word for word in tokens if word not in stopwords and not word.isdigit()]
# *** word frequency mining ****
# Reads '|'-delimited tweets from brexit_data.csv, counts word frequencies
# (minus stop words, punctuation, and digits), and writes the sorted
# word/count pairs to text_data.csv.
# NOTE(review): ported from Python 2 (`print` statement, 'rU' open mode) to
# Python 3; files are now closed via context managers and opened with an
# explicit UTF-8 encoding — confirm the input file is UTF-8.

# tokenizer shared across all rows
tweet_tokenizer = TweetTokenizer()
# punctuation list
punct = list(string.punctuation)
# download the 127 English stop words
nltk.download('stopwords')
# list of stop words, punctuation, and Twitter-specific noise tokens
stopword_list = stopwords.words('english') + punct + ['rt', 'via']
# record the number of occurrences for each word
tf = Counter()
all_dates = []
# 'rU' was removed in Python 3.11; newline='' is the csv-module convention
with open('brexit_data.csv', newline='', encoding='utf-8') as inputfile:
    tweetreader = csv.reader(inputfile, delimiter='|')
    # get the text (column 2) and the time (column 1)
    for row in tweetreader:
        message = row[2]
        tokens = process(text=message, tokenizer=tweet_tokenizer, stopwords=stopword_list)
        all_dates.append(row[1])
        # update word frequency
        tf.update(tokens)
# convert the counter to a sorted list (tf_list_sorted is a list of 2-tuples)
tf_list_sorted = sorted(tf.items(), key=lambda pair: pair[1], reverse=True)
# print each word and its frequency, and persist them as CSV
with open('text_data.csv', 'w', newline='', encoding='utf-8') as csvfile:
    csvwriter = csv.writer(csvfile)
    for word, count in tf_list_sorted:
        print(word, count)
        csvwriter.writerow([word, count])
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment