Created
October 9, 2013 03:43
-
-
Save brianly/6895863 to your computer and use it in GitHub Desktop.
Some sort of word count script I wrote to process a Yammer data export.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import sys | |
import csv | |
import argparse | |
import re | |
import nltk | |
from operator import itemgetter | |
from string import punctuation | |
def replace_topic(matched_topic):
    """Return the bare topic name for a matched ``[Tag:<id>:<name>]`` marker.

    Shaped to be used as the replacement callback of ``re.sub``.

    Args:
        matched_topic: Regex match object whose ``group(0)`` is the full tag
            markup, e.g. ``[Tag:123:python]``.

    Returns:
        The matched text with the ``[Tag:<id>:`` prefix removed and a single
        trailing ``]`` dropped, e.g. ``python``.
    """
    tag = re.sub(r'\[Tag:\d{1,9}:', '', matched_topic.group(0))
    # Remove only the closing bracket. Replacing every occurrence of the last
    # character would also eat matching characters inside the name, and
    # indexing the last character fails when nothing is left after the prefix.
    if tag.endswith(']'):
        tag = tag[:-1]
    return tag
# Matches "[Tag:<1-9 digit id>:<ASCII letters>]" and captures the name.
_TAG_MARKUP = re.compile(r'\[Tag:\d{1,9}:([a-zA-Z]*)\]')


def convert_message(raw_message):
    """Replace ``[Tag:<id>:<name>]`` markers in a message with the bare name.

    Only markers whose id is 1-9 digits and whose name consists solely of
    ASCII letters are recognised; anything else is left as written.

    Args:
        raw_message: Message body text, possibly containing tag markers.

    Returns:
        The message with every recognised marker replaced by its name.
        Text without markers comes back unchanged.
    """
    # A single sub() with a capture group is enough: it leaves the text
    # untouched when nothing matches, so no separate search() is required.
    return _TAG_MARKUP.sub(r'\1', raw_message)
def tokenize(message):
    """Break a message into tokens using NLTK's word tokenizer.

    No lower-casing or punctuation stripping is done here; the token list is
    returned exactly as ``nltk.word_tokenize`` produces it.
    """
    tokens = nltk.word_tokenize(message)
    return tokens
# Characters trimmed from the right / left edge of each token before counting.
_TRAILING_PUNCTUATION = '.,:?;!\')"'
_LEADING_PUNCTUATION = '"\'()'


def _clean_word(token):
    """Trim edge punctuation from a token, leaving interior characters alone."""
    return token.rstrip(_TRAILING_PUNCTUATION).lstrip(_LEADING_PUNCTUATION)


def main(args=None):
    """Run the command selected on the command line.

    For the ``frequency`` command, reads the CSV export named by
    ``args.export``, converts and tokenizes the ``body`` column of every row,
    and reports one ``word<TAB>count`` line per distinct word, least frequent
    first.

    Args:
        args: Parsed argument namespace. ``command`` selects the action;
            ``export`` is the CSV path to read; ``file``, when set, is the
            path the report is written to instead of standard output.
            ``None`` or a namespace without a recognised command is a no-op.

    Returns:
        0, suitable for passing to ``sys.exit``.

    Raises:
        KeyError: If the export has no ``body`` column.
    """
    from collections import Counter

    if getattr(args, 'command', None) != 'frequency':
        return 0

    # The csv module wants a binary file on Python 2 and a text file opened
    # with newline='' on Python 3.
    if sys.version_info[0] < 3:
        export = open(args.export, 'rb')
    else:
        export = open(args.export, 'r', newline='')

    counts = Counter()
    with export as fi:
        for row in csv.DictReader(fi):
            # DictReader fills missing trailing fields with None.
            message = convert_message(row['body'] or '')
            for token in tokenize(message):
                word = _clean_word(token)
                # Tokens that were pure punctuation collapse to '' and are
                # not words; skip them rather than counting the empty string.
                if word:
                    counts[word] += 1

    # Ascending by count; ties are broken by the word itself so the output is
    # stable from run to run.
    report = ['%s\t%s' % (word, count)
              for word, count in sorted(counts.items(), key=itemgetter(1, 0))]

    out_path = getattr(args, 'file', None)
    if out_path:
        with open(out_path, 'w') as out:
            out.writelines(line + '\n' for line in report)
    else:
        for line in report:
            print(line)
    return 0
if __name__ == "__main__":
    # Command-line entry point: one 'frequency' sub-command that takes the
    # export path and an optional output file.
    parser = argparse.ArgumentParser(description='Process words in a Yammer message export.')
    # Python 3's argparse treats sub-commands as optional unless told
    # otherwise, which would hand main() a namespace with no 'command'.
    # Naming the destination and marking it required yields a usage error
    # instead; on Python 2 sub-commands are already mandatory.
    subparsers = parser.add_subparsers(dest='command')
    subparsers.required = True
    parser_install = subparsers.add_parser('frequency')
    parser_install.add_argument('--file', type=str, help='Optional output file')
    parser_install.add_argument('export', type=str, help='Message export file')
    parser_install.set_defaults(command='frequency')
    args = parser.parse_args()
    sys.exit(main(args))
Yammer |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment