Skip to content

Instantly share code, notes, and snippets.

@revox
Last active November 26, 2021 15:09
Show Gist options
  • Save revox/49975a50d0b96cf580ca966d157186f6 to your computer and use it in GitHub Desktop.
Save revox/49975a50d0b96cf580ca966d157186f6 to your computer and use it in GitHub Desktop.
import nltk, sys, csv
from nltk.tokenize import TweetTokenizer
from nltk.corpus import stopwords
from datetime import datetime
import pandas as pd
import numpy as np
import string
from collections import Counter
# function to tokenize a text: 1. lowercase, 2. tokenize, 3. stopwords removal, 4. digits removal
def process(text, tokenizer=None, stopwords=()):
    """Tokenize a tweet: lowercase, tokenize, drop stop words and digits.

    Parameters
    ----------
    text : str
        Raw tweet text.
    tokenizer : object, optional
        Any object exposing ``tokenize(str) -> list[str]``.  Defaults to a
        lazily created ``TweetTokenizer`` — the original eagerly built one
        instance at import time, which both shared state across all calls
        and made the module unimportable without NLTK.
    stopwords : collection of str, optional
        Tokens to discard (membership is tested after lowercasing).
        NOTE: this parameter name shadows the ``nltk.corpus.stopwords``
        module import inside this function; kept for caller compatibility.

    Returns
    -------
    list of str
        Lower-cased tokens with stop words and pure-digit tokens removed.
    """
    # Lazy default replaces the mutable/eager defaults of the original;
    # behavior for all existing call sites is unchanged.
    if tokenizer is None:
        tokenizer = TweetTokenizer()
    text = text.lower()
    tokens = tokenizer.tokenize(text)
    return [word for word in tokens if word not in stopwords and not word.isdigit()]
# *** word frequency mining ***
# Tweet-aware tokenizer (handles @mentions, #hashtags, emoticons).
tweet_tokenizer = TweetTokenizer()
# Individual punctuation characters to discard as tokens.
punct = list(string.punctuation)
# Fetch the English stop-word list (no-op if already downloaded).
nltk.download('stopwords')
# Tokens to drop: English stop words, punctuation, and Twitter noise words.
stopword_list = stopwords.words('english') + punct + ['rt', 'via']
# Running occurrence count for each word across the whole corpus.
tf = Counter()
all_dates = []
# newline='' is what the csv module expects; the source file is '|'-delimited.
# ('rU' mode from the original was removed in Python 3.11.)
with open('brexit_data.csv', newline='', encoding='utf-8') as inputfile:
    tweetreader = csv.reader(inputfile, delimiter='|')
    for row in tweetreader:
        # Column 2 holds the tweet text, column 1 its timestamp.
        message = row[2]
        tokens = process(text=message, tokenizer=tweet_tokenizer, stopwords=stopword_list)
        all_dates.append(row[1])
        # Fold this tweet's tokens into the frequency table.
        tf.update(tokens)
# Sort (word, count) pairs by descending frequency.
tf_list_sorted = sorted(tf.items(), key=lambda pair: pair[1], reverse=True)
# Write each word and its frequency; 'with' guarantees the output file is
# closed (the original leaked the handle).  Writing str directly replaces
# the Python 2-era manual .encode('utf-8'), which under Python 3 would
# emit b'...' byte reprs into the CSV.
with open('text_data.csv', 'w', newline='', encoding='utf-8') as csvfile:
    csvwriter = csv.writer(csvfile)
    for word, count in tf_list_sorted:
        print(word, count)
        csvwriter.writerow([word, count])
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment