Skip to content

Instantly share code, notes, and snippets.

@tondol
Last active January 6, 2017 18:11
Show Gist options
  • Save tondol/dcee7222acb637612d355d2192011518 to your computer and use it in GitHub Desktop.
Save tondol/dcee7222acb637612d355d2192011518 to your computer and use it in GitHub Desktop.
Aqours楽曲の歌詞をテキストマイニングする

README

tfdf.py

  • 各単語のTerm Frequency, Document Frequencyを計算する。
  • MeCab, mecab-ipadic-neologdによる分かち書きを行う。
  • 動詞は原形に変換してから集計する。
  • アルファベットはスペースで分割し、小文字に変換する。
  • 記号のみの単語や、平仮名・片仮名のみからなる1文字の単語は削除する。

tfdf_kana.py

  • tfdf.py の原形変換部分を、元の表現のまま平仮名に変換する処理に置換してから集計したもの。

df_song.py

  • tfdf.py により求めたdf上位150件の単語を多く含む楽曲を集計する。

df_song_kana.py

  • df_song.py のdf集計時の処理を tfdf_kana.py と同じロジックにしたもの。

スクリプトの説明

実行準備

歌詞テキストの整形(テキストの文字コードがUTF-8かつスペースを含まないファイル名なら不要)

$ brew install nkf rename
$ find . -name "*.txt" -print0 | xargs -0 nkf -w --overwrite
$ rename 's/ /_/g' *

MeCabのインストール

$ brew install mecab
$ pip3.4 install mecab-python3

MeCab用辞書のインストール

$ brew install git curl xz
$ git clone --depth 1 git@github.com:neologd/mecab-ipadic-neologd.git
$ cd mecab-ipadic-neologd
$ ./bin/install-mecab-ipadic-neologd -n

実行

$ ls
aqours_heroes.txt		mattete.txt			tfdf.py
...
$ python3.4 tfdf.py tf > tf.md
$ python3.4 tfdf.py df > df.md
#-*- encoding: utf-8 -*-
import glob
import re
import sys
import MeCab
# MeCab tagger that segments lyrics with the mecab-ipadic-neologd dictionary.
mecab = MeCab.Tagger("-Owakati -d /usr/local/lib/mecab/dic/mecab-ipadic-neologd")
# Document frequency: word -> number of songs (files) containing the word.
df_dict = {}
# Vocabulary per song: lyrics file name -> set of words found in that file.
song_dict = {}

# Characters deleted before tokenizing: ASCII and full-width parentheses,
# exclamation marks and question marks, plus the (full/half-width) middle dot.
# The full-width forms are written as escapes so that they cannot be silently
# folded into their ASCII look-alikes.
_PUNCTUATION_TABLE = str.maketrans(
    "", "", "()!?\uff08\uff09\uff01\uff1f\u30fb\uff65")
# Patterns are compiled once instead of being looked up for every token.
_RE_WHITESPACE = re.compile(r'\s+')
_RE_ALNUM = re.compile(r'^[A-Za-z0-9 ]+$')
_RE_ASCII_SYMBOL = re.compile(r'[!-/:-@\[-`{-~]')
_RE_KANA_ONLY = re.compile(r'^[ぁ-んァ-ン]+$')


def _clean_lyrics(text):
    """Return *text* without punctuation and with whitespace runs collapsed.

    Line breaks count as whitespace, so the result is a single line whose
    words are separated by exactly one space.
    """
    return _RE_WHITESPACE.sub(" ", text.translate(_PUNCTUATION_TABLE)).strip()


def _extract_words(text):
    """Tokenize *text* with the module-level tagger and return countable words.

    Alphanumeric tokens are split on whitespace and lower-cased; every other
    token is replaced by its base form when the dictionary provides one.
    """
    words = []
    node = mecab.parseToNode(text)
    while node:
        surface = node.surface
        if _RE_ALNUM.search(surface) is not None:
            # split() without an argument never yields empty strings, so a
            # stray or doubled space cannot be counted as a word.
            words.extend(part.lower() for part in surface.split())
        elif _RE_ASCII_SYMBOL.search(surface) is not None:
            # Skip the token.
            # NOTE(review): the README describes dropping symbol-only tokens,
            # but this pattern is unanchored, so any token that merely
            # contains an ASCII symbol is dropped.  Behaviour kept as-is to
            # avoid changing existing counts; confirm the intent.
            pass
        else:
            meta = node.feature.split(",")
            # Field 6 of the feature string is used as the base form; "*"
            # means the dictionary has none, so fall back to the surface.
            if len(meta) >= 7 and meta[6] != "*":
                word = meta[6]
            else:
                word = surface
            # Keep words of two or more characters, and single characters
            # unless they are a lone hiragana/katakana.
            if len(word) >= 2:
                words.append(word)
            elif len(word) == 1 and _RE_KANA_ONLY.search(word) is None:
                words.append(word)
        node = node.next
    return words


# sorted() makes the processing order independent of the directory listing.
for filename in sorted(glob.glob('./*.txt')):
    # The README converts every lyrics file to UTF-8, so decode explicitly
    # instead of relying on the locale; "with" closes the file right away.
    with open(filename, encoding="utf-8") as lyrics_file:
        data = _clean_lyrics(lyrics_file.read())
    words = _extract_words(data)
    unique_words = set(words)
    # A word counts once per song no matter how often it is repeated.
    for word in unique_words:
        df_dict[word] = df_dict.get(word, 0) + 1
    song_dict[filename] = unique_words
# Rank words by document frequency (highest first) and keep the top 150.
# Ties are broken by the word itself so that the cut-off at 150 no longer
# depends on dict or directory iteration order.
items = sorted(df_dict.items(), key=lambda p: (-p[1], p[0]))[:150]
freq_words = set(word for word, _count in items)
# For every song, the part of its vocabulary that belongs to the top words.
freq_songs = [(name, vocab.intersection(freq_words))
              for name, vocab in song_dict.items()]
# Songs sharing the most top words come first; equal counts are ordered by
# file name for the same reproducibility reason.
freq_songs.sort(key=lambda pair: (-len(pair[1]), pair[0]))
for k, v in freq_songs:
    print("'%s', %d" % (k, len(v)))
    # Printed as a set; the element order inside the braces is unspecified.
    print(v)
#-*- encoding: utf-8 -*-
import glob
import re
import sys
import MeCab
# MeCab tagger that segments lyrics with the mecab-ipadic-neologd dictionary.
mecab = MeCab.Tagger(
    "-Owakati -d /usr/local/lib/mecab/dic/mecab-ipadic-neologd"
)
# Document frequency: word -> number of songs (files) containing the word.
df_dict = dict()
# Vocabulary per song: lyrics file name -> set of words found in that file.
song_dict = dict()
# http://d.hatena.ne.jp/mohayonao/20101213/1292237816
def make_function_hiragana():
    """Build and return a function that converts katakana to hiragana.

    The returned function replaces every character in the range ァ-ン with
    the character 0x60 code points below it and leaves everything else
    untouched.
    """
    katakana_char = re.compile(r'[ァ-ン]')

    def _shift_to_hiragana(match):
        # The matched katakana and its hiragana twin differ by exactly 0x60.
        return chr(ord(match.group(0)) - 0x60)

    def hiragana(s):
        """Return *s* with katakana in ァ-ン rewritten as hiragana."""
        return katakana_char.sub(_shift_to_hiragana, s)

    return hiragana


hiragana = make_function_hiragana()
def make_function_katakana():
    """Build and return a function that converts hiragana to katakana.

    The returned function replaces every character in the range ぁ-ん with
    the character 0x60 code points above it and leaves everything else
    untouched.
    """
    hiragana_char = re.compile(r'[ぁ-ん]')

    def _shift_to_katakana(match):
        # The matched hiragana and its katakana twin differ by exactly 0x60.
        return chr(ord(match.group(0)) + 0x60)

    def katakana(s):
        """Return *s* with hiragana in ぁ-ん rewritten as katakana."""
        return hiragana_char.sub(_shift_to_katakana, s)

    return katakana


katakana = make_function_katakana()
# Characters deleted before tokenizing: ASCII and full-width parentheses,
# exclamation marks and question marks, plus the (full/half-width) middle dot.
# The full-width forms are written as escapes so that they cannot be silently
# folded into their ASCII look-alikes.
_PUNCTUATION_TABLE = str.maketrans(
    "", "", "()!?\uff08\uff09\uff01\uff1f\u30fb\uff65")
# Patterns are compiled once instead of being looked up for every token.
_RE_WHITESPACE = re.compile(r'\s+')
_RE_ALNUM = re.compile(r'^[A-Za-z0-9 ]+$')
_RE_ASCII_SYMBOL = re.compile(r'[!-/:-@\[-`{-~]')
_RE_KANA_ONLY = re.compile(r'^[ぁ-んァ-ン]+$')


def _clean_lyrics(text):
    """Return *text* without punctuation and with whitespace runs collapsed.

    Line breaks count as whitespace, so the result is a single line whose
    words are separated by exactly one space.
    """
    return _RE_WHITESPACE.sub(" ", text.translate(_PUNCTUATION_TABLE)).strip()


def _extract_words(text):
    """Tokenize *text* with the module-level tagger and return countable words.

    Alphanumeric tokens are split on whitespace and lower-cased; every other
    token is replaced by its reading, written in hiragana, when the
    dictionary provides one.
    """
    words = []
    node = mecab.parseToNode(text)
    while node:
        surface = node.surface
        if _RE_ALNUM.search(surface) is not None:
            # split() without an argument never yields empty strings, so a
            # stray or doubled space cannot be counted as a word.
            words.extend(part.lower() for part in surface.split())
        elif _RE_ASCII_SYMBOL.search(surface) is not None:
            # Skip the token.
            # NOTE(review): the README describes dropping symbol-only tokens,
            # but this pattern is unanchored, so any token that merely
            # contains an ASCII symbol is dropped.  Behaviour kept as-is to
            # avoid changing existing counts; confirm the intent.
            pass
        else:
            meta = node.feature.split(",")
            # Field 7 of the feature string is used as the reading; "*" means
            # the dictionary has none, so fall back to the surface.
            if len(meta) >= 8 and meta[7] != "*":
                word = hiragana(meta[7])
            else:
                # Normalize the fallback to hiragana as well, matching
                # tfdf_kana.py, so a katakana surface and a katakana reading
                # of the same word end up under one key.
                word = hiragana(surface)
            # Keep words of two or more characters, and single characters
            # unless they are a lone hiragana/katakana.
            if len(word) >= 2:
                words.append(word)
            elif len(word) == 1 and _RE_KANA_ONLY.search(word) is None:
                words.append(word)
        node = node.next
    return words


# sorted() makes the processing order independent of the directory listing.
for filename in sorted(glob.glob('./*.txt')):
    # The README converts every lyrics file to UTF-8, so decode explicitly
    # instead of relying on the locale; "with" closes the file right away.
    with open(filename, encoding="utf-8") as lyrics_file:
        data = _clean_lyrics(lyrics_file.read())
    words = _extract_words(data)
    unique_words = set(words)
    # A word counts once per song no matter how often it is repeated.
    for word in unique_words:
        df_dict[word] = df_dict.get(word, 0) + 1
    song_dict[filename] = unique_words
# Rank words by document frequency (highest first) and keep the top 150.
# Ties are broken by the word itself so that the cut-off at 150 no longer
# depends on dict or directory iteration order.
items = sorted(df_dict.items(), key=lambda p: (-p[1], p[0]))[:150]
freq_words = set(word for word, _count in items)
# For every song, the part of its vocabulary that belongs to the top words.
freq_songs = [(name, vocab.intersection(freq_words))
              for name, vocab in song_dict.items()]
# Songs sharing the most top words come first; equal counts are ordered by
# file name for the same reproducibility reason.
freq_songs.sort(key=lambda pair: (-len(pair[1]), pair[0]))
for k, v in freq_songs:
    print("'%s', %d" % (k, len(v)))
    # Printed as a set; the element order inside the braces is unspecified.
    print(v)
#-*- encoding: utf-8 -*-
import glob
import re
import sys
import MeCab
# MeCab tagger that segments lyrics with the mecab-ipadic-neologd dictionary.
mecab = MeCab.Tagger("-Owakati -d /usr/local/lib/mecab/dic/mecab-ipadic-neologd")
# Term frequency: word -> total number of occurrences over all songs.
tf_dict = {}
# Document frequency: word -> number of songs (files) containing the word.
df_dict = {}

# Characters deleted before tokenizing: ASCII and full-width parentheses,
# exclamation marks and question marks, plus the (full/half-width) middle dot.
# The full-width forms are written as escapes so that they cannot be silently
# folded into their ASCII look-alikes.
_PUNCTUATION_TABLE = str.maketrans(
    "", "", "()!?\uff08\uff09\uff01\uff1f\u30fb\uff65")
# Patterns are compiled once instead of being looked up for every token.
_RE_WHITESPACE = re.compile(r'\s+')
_RE_ALNUM = re.compile(r'^[A-Za-z0-9 ]+$')
_RE_ASCII_SYMBOL = re.compile(r'[!-/:-@\[-`{-~]')
_RE_KANA_ONLY = re.compile(r'^[ぁ-んァ-ン]+$')


def _clean_lyrics(text):
    """Return *text* without punctuation and with whitespace runs collapsed.

    Line breaks count as whitespace, so the result is a single line whose
    words are separated by exactly one space.
    """
    return _RE_WHITESPACE.sub(" ", text.translate(_PUNCTUATION_TABLE)).strip()


def _extract_words(text):
    """Tokenize *text* with the module-level tagger and return countable words.

    Alphanumeric tokens are split on whitespace and lower-cased; every other
    token is replaced by its base form when the dictionary provides one.
    """
    words = []
    node = mecab.parseToNode(text)
    while node:
        surface = node.surface
        if _RE_ALNUM.search(surface) is not None:
            # split() without an argument never yields empty strings, so a
            # stray or doubled space cannot be counted as a word.
            words.extend(part.lower() for part in surface.split())
        elif _RE_ASCII_SYMBOL.search(surface) is not None:
            # Skip the token.
            # NOTE(review): the README describes dropping symbol-only tokens,
            # but this pattern is unanchored, so any token that merely
            # contains an ASCII symbol is dropped.  Behaviour kept as-is to
            # avoid changing existing counts; confirm the intent.
            pass
        else:
            meta = node.feature.split(",")
            # Field 6 of the feature string is used as the base form; "*"
            # means the dictionary has none, so fall back to the surface.
            if len(meta) >= 7 and meta[6] != "*":
                word = meta[6]
            else:
                word = surface
            # Keep words of two or more characters, and single characters
            # unless they are a lone hiragana/katakana.
            if len(word) >= 2:
                words.append(word)
            elif len(word) == 1 and _RE_KANA_ONLY.search(word) is None:
                words.append(word)
        node = node.next
    return words


# sorted() makes the processing order independent of the directory listing.
for filename in sorted(glob.glob('./*.txt')):
    # The README converts every lyrics file to UTF-8, so decode explicitly
    # instead of relying on the locale; "with" closes the file right away.
    with open(filename, encoding="utf-8") as lyrics_file:
        data = _clean_lyrics(lyrics_file.read())
    words = _extract_words(data)
    # Term frequency counts every occurrence ...
    for word in words:
        tf_dict[word] = tf_dict.get(word, 0) + 1
    # ... while document frequency counts each word once per song.
    for word in set(words):
        df_dict[word] = df_dict.get(word, 0) + 1
# Report term frequencies when the first argument is "tf"; anything else
# (including no argument at all) reports document frequencies.
if len(sys.argv) >= 2 and sys.argv[1] == "tf":
    counts = tf_dict
else:
    counts = df_dict
# Highest count first.  Equal counts are ordered by the word itself so that
# repeated runs over the same lyrics print the lines in the same order
# instead of following dict or directory iteration order.
items = sorted(counts.items(), key=lambda p: (-p[1], p[0]))
for k, v in items:
    print("'%s' %d" % (k, v))
#-*- encoding: utf-8 -*-
import glob
import re
import sys
import MeCab
# MeCab tagger that segments lyrics with the mecab-ipadic-neologd dictionary.
mecab = MeCab.Tagger(
    "-Owakati -d /usr/local/lib/mecab/dic/mecab-ipadic-neologd"
)
# Term frequency: word -> total number of occurrences over all songs.
tf_dict = dict()
# Document frequency: word -> number of songs (files) containing the word.
df_dict = dict()
# http://d.hatena.ne.jp/mohayonao/20101213/1292237816
def make_function_hiragana():
    """Create the katakana-to-hiragana converter used for readings.

    Returns a one-argument function that rewrites each character in ァ-ン
    as the character 0x60 code points lower; other characters pass through.
    """
    pattern = re.compile(r'[ァ-ン]')

    def _lower_by_0x60(found):
        # Hiragana sit 0x60 below the corresponding katakana.
        code_point = ord(found.group(0))
        return chr(code_point - 0x60)

    def hiragana(s):
        """Return *s* with katakana in ァ-ン rewritten as hiragana."""
        return pattern.sub(_lower_by_0x60, s)

    return hiragana


hiragana = make_function_hiragana()
def make_function_katakana():
    """Create the hiragana-to-katakana converter.

    Returns a one-argument function that rewrites each character in ぁ-ん
    as the character 0x60 code points higher; other characters pass through.
    """
    pattern = re.compile(r'[ぁ-ん]')

    def _raise_by_0x60(found):
        # Katakana sit 0x60 above the corresponding hiragana.
        code_point = ord(found.group(0))
        return chr(code_point + 0x60)

    def katakana(s):
        """Return *s* with hiragana in ぁ-ん rewritten as katakana."""
        return pattern.sub(_raise_by_0x60, s)

    return katakana


katakana = make_function_katakana()
# Characters deleted before tokenizing: ASCII and full-width parentheses,
# exclamation marks and question marks, plus the (full/half-width) middle dot.
# The full-width forms are written as escapes so that they cannot be silently
# folded into their ASCII look-alikes.
_PUNCTUATION_TABLE = str.maketrans(
    "", "", "()!?\uff08\uff09\uff01\uff1f\u30fb\uff65")
# Patterns are compiled once instead of being looked up for every token.
_RE_WHITESPACE = re.compile(r'\s+')
_RE_ALNUM = re.compile(r'^[A-Za-z0-9 ]+$')
_RE_ASCII_SYMBOL = re.compile(r'[!-/:-@\[-`{-~]')
_RE_KANA_ONLY = re.compile(r'^[ぁ-んァ-ン]+$')


def _clean_lyrics(text):
    """Return *text* without punctuation and with whitespace runs collapsed.

    Line breaks count as whitespace, so the result is a single line whose
    words are separated by exactly one space.
    """
    return _RE_WHITESPACE.sub(" ", text.translate(_PUNCTUATION_TABLE)).strip()


def _extract_words(text):
    """Tokenize *text* with the module-level tagger and return countable words.

    Alphanumeric tokens are split on whitespace and lower-cased; every other
    token is replaced by its reading (or, failing that, its surface form),
    written in hiragana.
    """
    words = []
    node = mecab.parseToNode(text)
    while node:
        surface = node.surface
        if _RE_ALNUM.search(surface) is not None:
            # split() without an argument never yields empty strings, so a
            # stray or doubled space cannot be counted as a word.
            words.extend(part.lower() for part in surface.split())
        elif _RE_ASCII_SYMBOL.search(surface) is not None:
            # Skip the token.
            # NOTE(review): the README describes dropping symbol-only tokens,
            # but this pattern is unanchored, so any token that merely
            # contains an ASCII symbol is dropped.  Behaviour kept as-is to
            # avoid changing existing counts; confirm the intent.
            pass
        else:
            meta = node.feature.split(",")
            # Field 7 of the feature string is used as the reading; "*" means
            # the dictionary has none, so fall back to the surface.
            if len(meta) >= 8 and meta[7] != "*":
                word = hiragana(meta[7])
            else:
                word = hiragana(surface)
            # Keep words of two or more characters, and single characters
            # unless they are a lone hiragana/katakana.
            if len(word) >= 2:
                words.append(word)
            elif len(word) == 1 and _RE_KANA_ONLY.search(word) is None:
                words.append(word)
        node = node.next
    return words


# sorted() makes the processing order independent of the directory listing.
for filename in sorted(glob.glob('./*.txt')):
    # The README converts every lyrics file to UTF-8, so decode explicitly
    # instead of relying on the locale; "with" closes the file right away.
    with open(filename, encoding="utf-8") as lyrics_file:
        data = _clean_lyrics(lyrics_file.read())
    words = _extract_words(data)
    # Term frequency counts every occurrence ...
    for word in words:
        tf_dict[word] = tf_dict.get(word, 0) + 1
    # ... while document frequency counts each word once per song.
    for word in set(words):
        df_dict[word] = df_dict.get(word, 0) + 1
# Report term frequencies when the first argument is "tf"; anything else
# (including no argument at all) reports document frequencies.
if len(sys.argv) >= 2 and sys.argv[1] == "tf":
    counts = tf_dict
else:
    counts = df_dict
# Highest count first.  Equal counts are ordered by the word itself so that
# repeated runs over the same lyrics print the lines in the same order
# instead of following dict or directory iteration order.
items = sorted(counts.items(), key=lambda p: (-p[1], p[0]))
for k, v in items:
    print("'%s' %d" % (k, v))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment