Skip to content

Instantly share code, notes, and snippets.

@hachibeeDI
Last active November 1, 2016 13:11
Show Gist options
  • Save hachibeeDI/6716011 to your computer and use it in GitHub Desktop.
Save hachibeeDI/6716011 to your computer and use it in GitHub Desktop.
Pythonで単語の数え上げとかするならCounterを使うと便利なはなし ref: http://qiita.com/hatchinee/items/a904c1f8d732a4686c9d
data = ['aaa', 'bbb', 'ccc', 'aaa', 'ddd']

# Tally occurrences by hand with a plain dict (the approach without Counter).
word_and_counts = {}
for word in data:
    # `in` replaces dict.has_key(), which is deprecated and absent in Python 3.
    if word in word_and_counts:
        word_and_counts[word] += 1
    else:
        word_and_counts[word] = 1

# Sort the (word, count) pairs by count, highest first.
# .items() and the single-argument print(...) form behave the same on
# Python 2 and Python 3.
for w, c in sorted(word_and_counts.items(), key=lambda x: x[1], reverse=True):
    print('%s %d' % (w, c))  # =>
# aaa 2
# bbb 1
# ccc 1
# ddd 1
from collections import Counter

data = ['aaa', 'bbb', 'ccc', 'aaa', 'ddd']
# Counter tallies every element of the iterable in a single call.
counter = Counter(data)
# most_common() without an argument yields all (word, count) pairs,
# highest count first.
for word, cnt in counter.most_common():
    # Single-argument print(...) prints the same text on Python 2 and 3.
    print('%s %d' % (word, cnt))  # =>
# aaa 2
# bbb 1
# ccc 1
# ddd 1
from collections import Counter

dataA = ['aaa', 'bbb', 'ccc', 'aaa', 'ddd']
dataB = ['aaa', 'bbb', 'bbb', 'bbb', 'abc']
counterA, counterB = Counter(dataA), Counter(dataB)

# Adding two Counters gives a new Counter with the frequencies summed.
counter = counterA + counterB
# subtract() takes counterB's counts away from counterA; it is a
# destructive method that mutates counterA in place.
counterA.subtract(counterB)
# Fetch the top three elements. As in the earlier example, leaving out the
# argument n returns every element in descending order of count.
counter.most_common(3)
# There are a few other helpers as well.
# -*- coding: utf-8 -*-
from collections import Counter
import codecs
import json
import MeCab
# HACK: this smells like bad know-how, but the output has to be
# redirectable, so the default encoding is forced to UTF-8 (Python 2 only).
import sys
reload(sys)
sys.setdefaultencoding("utf-8")

# codecs.open decodes while reading, so json.load is handed unicode text.
# The first line of the data file carries extra text that is not JSON; this
# is throwaway test code, so delete that line from the file beforehand.
# The with-block closes the file as soon as it has been parsed.
with codecs.open('./data/js/tweets/2013_09.js', 'r', 'sjis') as _tweetfile:
    tweets = json.load(_tweetfile)

# MeCab only accepts str (byte strings), so encode every tweet text to UTF-8.
texts = (tw['text'].encode('utf-8') for tw in tweets)
tagger = MeCab.Tagger('-Ochasen')

# Count how often each noun appears across all tweets.
counter = Counter()
for text in texts:
    node = tagger.parseToNode(text)
    # parseToNode returns the head of a chain; follow .next until it ends.
    while node:
        # The first comma-separated field of `feature` is compared against
        # the noun tag so that only nouns are counted.
        if node.feature.split(',')[0] == '名詞':
            # Decode back to unicode so the Counter keys are text, not bytes.
            word = node.surface.decode('utf-8')
            counter[word] += 1
        node = node.next

# Print every noun with its count, most frequent first.
for word, cnt in counter.most_common():
    print('%s %d' % (word, cnt))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment