Last active
November 1, 2016 13:11
-
-
Save hachibeeDI/6716011 to your computer and use it in GitHub Desktop.
Pythonで単語の数え上げとかするならCounterを使うと便利なはなし ref: http://qiita.com/hatchinee/items/a904c1f8d732a4686c9d
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Count word frequencies by hand with a plain dict. This is the manual
# baseline that collections.Counter makes unnecessary.
data = ['aaa', 'bbb', 'ccc', 'aaa', 'ddd']

word_and_counts = {}
for word in data:
    # dict.get() with a default of 0 covers both the "first sighting" and
    # "seen before" cases; dict.has_key() was removed in Python 3.
    word_and_counts[word] = word_and_counts.get(word, 0) + 1

# Order the (word, count) pairs by count, largest first.
for w, c in sorted(
        word_and_counts.items(), key=lambda x: x[1], reverse=True):
    # A single pre-formatted argument prints the same text on Python 2 and 3.
    print('%s %s' % (w, c))
# =>
# aaa 2
# bbb 1
# ccc 1
# ddd 1
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from collections import Counter

data = ['aaa', 'bbb', 'ccc', 'aaa', 'ddd']

# Counter tallies the items of an iterable in a single call.
counter = Counter(data)

# most_common() without an argument yields every (item, count) pair,
# ordered from the highest count to the lowest.
for word, cnt in counter.most_common():
    # A single pre-formatted argument prints the same text on Python 2 and 3.
    print('%s %s' % (word, cnt))
# =>
# aaa 2
# bbb 1
# ccc 1
# ddd 1
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from collections import Counter

dataA = ['aaa', 'bbb', 'ccc', 'aaa', 'ddd']
dataB = ['aaa', 'bbb', 'bbb', 'bbb', 'abc']
counterA = Counter(dataA)
counterB = Counter(dataB)

# "+" returns a new Counter whose counts are the per-key sums.
counter = counterA + counterB

# subtract() takes the per-key difference IN PLACE (it mutates counterA).
# Unlike the "-" operator, it keeps keys whose count drops to zero or below.
counterA.subtract(counterB)

# most_common(3) returns the top three (item, count) pairs; omitting the
# argument n returns every element in descending order of count.
# The result is not stored here; the call is shown for illustration only.
counter.most_common(3)

# Counter offers several other helpers as well.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# -*- coding: utf-8 -*-
from collections import Counter
import codecs
import json
import MeCab

# HACK (Python 2 only): force the process-wide default encoding to UTF-8 so
# that printing unicode still works when stdout is redirected to a file.
# This feels like a bad practice, but the output needs to be redirectable.
import sys
reload(sys)
sys.setdefaultencoding("utf-8")

# codecs.open() decodes while reading, so json.load() is handed unicode text.
# NOTE: the archive file starts with an extra non-JSON first line; since this
# is only test code, that line is expected to be deleted by hand beforehand.
# The "with" block closes the file handle once parsing is done.
with codecs.open('./data/js/tweets/2013_09.js', 'r', 'sjis') as _tweetfile:
    tweets = json.load(_tweetfile)

# MeCab only accepts str (byte strings) here, so encode each tweet's text.
texts = (tw['text'].encode('utf-8') for tw in tweets)

tagger = MeCab.Tagger('-Ochasen')
counter = Counter()
for text in texts:
    nodes = tagger.parseToNode(text)
    # Walk the linked list of nodes until .next yields a falsy value.
    while nodes:
        # The first comma-separated feature field is compared with
        # '名詞' (noun), so only nouns are counted.
        if nodes.feature.split(',')[0] == '名詞':
            # Decode back to unicode so the Counter keys are text, not bytes.
            word = nodes.surface.decode('utf-8')
            counter[word] += 1
        nodes = nodes.next

# Print every noun with its frequency, most frequent first.
for word, cnt in counter.most_common():
    print('%s %s' % (word, cnt))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment