hachibeeDI · November 1, 2016 13:11
diff --git a/file0.txt b/file0.txt

 # Sample input: 'aaa' appears twice, every other word once.
 data = ['aaa', 'bbb', 'ccc', 'aaa', 'ddd']

 # Hand-rolled frequency table: word -> number of occurrences in `data`.
 word_and_counts = {}
 for word in data:
    # `in` is the supported membership test; dict.has_key() is deprecated
    # and no longer exists on Python 3.
    if word in word_and_counts:
        word_and_counts[word] += 1
    else:
        word_and_counts[word] = 1
 # Highest count first.  sorted() is stable, so words with equal counts come
 # out in the dict's own iteration order.
 for w, c in sorted(word_and_counts.items(), key=lambda x: x[1], reverse=True):
    # A single pre-formatted argument prints the same "word count" text
    # whether print is a statement (Python 2) or a function (Python 3).
    print('%s %s' % (w, c))  # =>
                             #   aaa 2
                             #   bbb 1
                             #   ccc 1
                             #   ddd 1
diff --git a/file1.txt b/file1.txt
 from collections import Counter

 # Sample input: 'aaa' appears twice, every other word once.
 data = ['aaa', 'bbb', 'ccc', 'aaa', 'ddd']
 # Counter tallies the occurrences of every element in a single call.
 counter = Counter(data)
 # most_common() without an argument yields every (word, count) pair,
 # ordered from the highest count to the lowest.
 for word, cnt in counter.most_common():
    # A single pre-formatted argument prints the same "word count" text
    # whether print is a statement (Python 2) or a function (Python 3).
    print('%s %s' % (word, cnt))  # =>
                                  #   aaa 2
                                  #   bbb 1
                                  #   ccc 1
                                  #   ddd 1
diff --git a/file2.txt b/file2.txt
 from collections import Counter

 # Two sample word lists whose frequencies are combined below.
 dataA = ['aaa', 'bbb', 'ccc', 'aaa', 'ddd']
 dataB = ['aaa', 'bbb', 'bbb', 'bbb', 'abc']

 counterA, counterB = Counter(dataA), Counter(dataB)

 # Adding two Counters produces a new Counter with the frequencies summed.
 counter = counterA + counterB
 # subtract() takes the per-element difference and mutates counterA in place.
 counterA.subtract(counterB)
 # most_common(3) returns the three most frequent elements; called without n
 # it returns every element in descending order of count.
 counter.most_common(3)
 # Counter offers several other operations as well.
diff --git a/file3.txt b/file3.txt
 # -*- coding: utf-8 -*-

 from collections import Counter
 import codecs
 import json

 import MeCab


 # バッドノウハウ感あるけど、出力結果をリダイレクトしたいし
 import sys
 reload(sys)
 sys.setdefaultencoding("utf-8")

 # codecsはunicodeを返す
 # 一行目に余計な記述があってだるいしテストコードだし面倒なので事前に消しておこう
 _tweetfile = codecs.open('./data/js/tweets/2013_09.js', 'r', 'sjis')
 tweets = json.load(_tweetfile)
 # Mecabはstr型しか受け付けないのでエンコード
 texts = (tw['text'].encode('utf-8') for tw in tweets)

 tagger = MeCab.Tagger('-Ochasen')
 counter = Counter()
 for text in texts:
    nodes = tagger.parseToNode(text)
    while nodes:
        if nodes.feature.split(',')[0] == '名詞':
            word = nodes.surface.decode('utf-8')
            counter[word] += 1
        nodes = nodes.next
 for word, cnt in counter.most_common():
    print word, cnt

	# Sample input: 'aaa' appears twice, every other word once.
	data = ['aaa', 'bbb', 'ccc', 'aaa', 'ddd']

	# Hand-rolled frequency table: word -> number of occurrences in `data`.
	word_and_counts = {}
	for word in data:
	    # `in` is the supported membership test; dict.has_key() is deprecated
	    # and no longer exists on Python 3.
	    if word in word_and_counts:
	        word_and_counts[word] += 1
	    else:
	        word_and_counts[word] = 1
	# Highest count first.  sorted() is stable, so words with equal counts come
	# out in the dict's own iteration order.
	for w, c in sorted(word_and_counts.items(), key=lambda x: x[1], reverse=True):
	    # A single pre-formatted argument prints the same "word count" text
	    # whether print is a statement (Python 2) or a function (Python 3).
	    print('%s %s' % (w, c))  # =>
	    # aaa 2
	    # bbb 1
	    # ccc 1
	    # ddd 1
	from collections import Counter

	# Sample input: 'aaa' appears twice, every other word once.
	data = ['aaa', 'bbb', 'ccc', 'aaa', 'ddd']
	# Counter tallies the occurrences of every element in a single call.
	counter = Counter(data)
	# most_common() without an argument yields every (word, count) pair,
	# ordered from the highest count to the lowest.
	for word, cnt in counter.most_common():
	    # A single pre-formatted argument prints the same "word count" text
	    # whether print is a statement (Python 2) or a function (Python 3).
	    print('%s %s' % (word, cnt))  # =>
	    # aaa 2
	    # bbb 1
	    # ccc 1
	    # ddd 1
	from collections import Counter

	# Two sample word lists whose frequencies are combined below.
	dataA = ['aaa', 'bbb', 'ccc', 'aaa', 'ddd']
	dataB = ['aaa', 'bbb', 'bbb', 'bbb', 'abc']

	counterA, counterB = Counter(dataA), Counter(dataB)

	# Adding two Counters produces a new Counter with the frequencies summed.
	counter = counterA + counterB
	# subtract() takes the per-element difference and mutates counterA in place.
	counterA.subtract(counterB)
	# most_common(3) returns the three most frequent elements; called without n
	# it returns every element in descending order of count.
	counter.most_common(3)
	# Counter offers several other operations as well.
	# -*- coding: utf-8 -*-

	from collections import Counter
	import codecs
	import json

	import MeCab


	# バッドノウハウ感あるけど、出力結果をリダイレクトしたいし
	import sys
	reload(sys)
	sys.setdefaultencoding("utf-8")

	# codecsはunicodeを返す
	# 一行目に余計な記述があってだるいしテストコードだし面倒なので事前に消しておこう
	_tweetfile = codecs.open('./data/js/tweets/2013_09.js', 'r', 'sjis')
	tweets = json.load(_tweetfile)
	# Mecabはstr型しか受け付けないのでエンコード
	texts = (tw['text'].encode('utf-8') for tw in tweets)

	tagger = MeCab.Tagger('-Ochasen')
	counter = Counter()
	for text in texts:
	nodes = tagger.parseToNode(text)
	while nodes:
	if nodes.feature.split(',')[0] == '名詞':
	word = nodes.surface.decode('utf-8')
	counter[word] += 1
	nodes = nodes.next
	for word, cnt in counter.most_common():
	print word, cnt