Skip to content

Instantly share code, notes, and snippets.

@brianly
Created October 9, 2013 03:43
Show Gist options
  • Save brianly/6895863 to your computer and use it in GitHub Desktop.
Save brianly/6895863 to your computer and use it in GitHub Desktop.
Some sort of word count script I wrote to process a Yammer data export.
import sys
import csv
import argparse
import re
import nltk
from operator import itemgetter
from string import punctuation
# Opening part of a topic tag: "[Tag:" + 1-9 digits + ":".
_TAG_PREFIX = re.compile(r'\[Tag:\d{1,9}:')


def replace_topic(matched_topic):
    """Return the plain topic name carried by a matched tag.

    Intended as the replacement callback for ``re.sub``: the whole matched
    text (``matched_topic.group(0)``) has its ``[Tag:<digits>:`` prefix
    removed and then its closing ``]`` dropped.

    Args:
        matched_topic: a regex match object whose ``group(0)`` is the tag text.

    Returns:
        The text left once the tag markup is removed. If the remainder does
        not end in ``]`` it is returned unchanged.
    """
    tag = _TAG_PREFIX.sub('', matched_topic.group(0))
    # Remove only the final bracket. str.replace() on the last character
    # would delete every occurrence of that character and fails outright on
    # an empty string.
    if tag.endswith(']'):
        return tag[:-1]
    return tag
def convert_message(raw_message):
"""Converts tag syntax to plain text message"""
pattern = re.compile('\[Tag:\d{1,9}:([a-zA-Z])*\]')
if re.search(pattern, raw_message):
return re.sub(pattern, replace_topic, raw_message)
return raw_message
def tokenize(message):
    """Split a message into a list of tokens.

    Tokenisation is delegated entirely to ``nltk.word_tokenize``; its result
    is returned without further processing.

    NOTE(review): punctuation is presumably returned as tokens of its own
    rather than removed - callers are expected to clean tokens up themselves.
    """
    tokens = nltk.word_tokenize(message)
    return tokens
# Characters trimmed from both ends of every token before it is counted.
_EDGE_PUNCTUATION = '.,:;?!\'"()'


def _open_text(path, mode):
    """Open *path* in the form the running interpreter's csv module expects."""
    if sys.version_info[0] >= 3:
        # Python 3: csv works on text and wants newline translation disabled.
        # NOTE(review): assumes the files are UTF-8 encoded - confirm.
        return open(path, mode, newline='', encoding='utf-8')
    # Python 2: csv works on byte strings.
    return open(path, mode + 'b')


def _count_words(rows):
    """Return a ``{word: occurrences}`` dict for the 'body' field of *rows*."""
    counts = {}
    for row in rows:
        body = row['body']
        if not body:
            # Short rows leave the field as None; nothing to count.
            continue
        # str.strip()/rstrip() treat their argument as a set of characters,
        # so stripping '&quot' would also eat the letters q, u, o and t from
        # real words. The entity is substituted instead.
        message = convert_message(body).replace('&quot;', '"')
        for token in tokenize(message):
            word = token.strip(_EDGE_PUNCTUATION)
            if not word:
                # The token was punctuation only.
                continue
            counts[word] = counts.get(word, 0) + 1
    return counts


def main(args=None):
    """Run the command selected on the command line.

    For the ``frequency`` command, the CSV export named by ``args.export`` is
    read, the words in each row's ``body`` column are counted, and one
    ``word<TAB>count`` line per word is written, least frequent first (ties
    ordered by word). Output goes to ``args.file`` when that is set, otherwise
    to standard output.

    Args:
        args: parsed arguments with a ``command`` attribute and, for
            ``frequency``, ``export`` and optionally ``file``.

    Returns:
        0 on success (including an unrecognised command, which is a no-op),
        2 when no command is available at all.

    Raises:
        KeyError: if the export has no ``body`` column.
    """
    command = getattr(args, 'command', None)
    if command is None:
        sys.stderr.write('No command given; expected "frequency".\n')
        return 2

    if command == 'frequency':
        with _open_text(args.export, 'r') as export:
            counts = _count_words(csv.DictReader(export))

        ranked = sorted(counts.items(), key=itemgetter(1, 0))
        lines = ['%s\t%s\n' % (word, count) for word, count in ranked]

        out_path = getattr(args, 'file', None)
        if out_path:
            with _open_text(out_path, 'w') as out:
                out.writelines(lines)
        else:
            sys.stdout.writelines(lines)
    return 0
if __name__ == "__main__":
    # Command-line entry point: build the parser, parse sys.argv and exit
    # with the status returned by main().
    parser = argparse.ArgumentParser(description='Process words in a Yammer message export.')
    # Sub-commands are optional by default on Python 3. Marking them required
    # turns a missing sub-command into a normal usage error instead of a
    # namespace that lacks the 'command' attribute.
    subparsers = parser.add_subparsers(dest='command')
    subparsers.required = True
    frequency_parser = subparsers.add_parser('frequency')
    frequency_parser.add_argument('--file', type=str, help='Optional output file')
    frequency_parser.add_argument('export', type=str, help='Message export file')
    frequency_parser.set_defaults(command='frequency')
    sys.exit(main(parser.parse_args()))
Yammer
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment