Skip to content

Instantly share code, notes, and snippets.

@GINK03
Last active January 31, 2022 09:21
Show Gist options
  • Save GINK03/afd3de494605cc98e6d1bec949f1db6f to your computer and use it in GitHub Desktop.
Save GINK03/afd3de494605cc98e6d1bec949f1db6f to your computer and use it in GitHub Desktop.
ネット上での血液型のイメージ
import glob
import gzip
import io
import zlib
from multiprocessing import Pool, cpu_count
from pathlib import Path

import fire
import MeCab
import mojimoji
import numpy as np
import pandas as pd
import regex
from loguru import logger
from tqdm.auto import tqdm
# (label, pattern) pairs applied to the lower-cased text, in output order.
# The negative lookbehind keeps "ab型" from also counting as "b型" while still
# matching a text that *starts* with "b型" (which "[^a]b型" could not).
_BLOOD_TYPE_PATTERNS = (
    ("a型", r"a型"),
    ("b型", r"(?<!a)b型"),
    ("o型", r"o型"),
    ("ab型", r"ab型"),
)


def _agg(chunk):
    """Collect rows mentioning a blood type from a chunk of gzipped CSV files.

    Each file is decompressed once. Files that fail to decompress are deleted
    from disk and skipped. Files whose raw content does not contain "血液型"
    (blood type) are skipped without being parsed. For the remaining files the
    ``text`` column is lower-cased and every row is tested against each blood
    type pattern; a row matching several types is emitted once per type.

    Args:
        chunk: Iterable of paths to gzip-compressed CSV files that have
            ``date`` and ``text`` columns.

    Returns:
        A DataFrame with columns ``date``, ``text`` and ``bt`` (the matched
        blood type label), de-duplicated on (``text``, ``bt``), or ``None``
        when nothing matched.
    """
    matches = []
    for f in chunk:
        try:
            # newline="" keeps line endings inside quoted fields untouched.
            with gzip.open(f, "rt", encoding="utf-8", newline="") as fp:
                raw = fp.read()
        except (zlib.error, gzip.BadGzipFile, EOFError):
            # Corrupt archive: remove it so later runs do not retry it.
            Path(f).unlink()
            continue
        if "血液型" not in raw:
            continue
        # Parse the text already in memory instead of decompressing again.
        df = pd.read_csv(io.StringIO(raw), usecols=["date", "text"])
        df["text"] = df["text"].astype(str).str.lower()
        for label, pattern in _BLOOD_TYPE_PATTERNS:
            hit = df[df["text"].str.contains(pattern, regex=True,
                                             na=False)].copy()
            if len(hit):
                hit["bt"] = label
                matches.append(hit)
    if not matches:
        return None
    res = pd.concat(matches)
    res.drop_duplicates(subset=["text", "bt"], inplace=True)
    return res
def agg(num=None):
    """Aggregate blood-type mentions from ``../csv/*`` into one CSV file.

    The input files are split into 10000 chunks, the first ``num`` chunks are
    processed in parallel by ``_agg``, and the combined rows are sorted by
    ``date``, de-duplicated on (``text``, ``bt``) and written to
    ``var/blood-type-agg.csv``.

    Args:
        num: Number of leading chunks to process. ``None`` (the default)
            processes every chunk.
    """
    fs = np.array(glob.glob("../csv/*"))
    chunks = np.array_split(fs, 10000)
    logger.info(f"chunk size = {len(chunks)}, on size = {len(chunks[0])}")
    selected = chunks[:num]
    with Pool(cpu_count()) as p:
        ret = list(tqdm(p.imap(_agg, selected),
                        total=len(selected),
                        desc="work1"))
    # _agg returns None for chunks without matches; pd.concat raises if every
    # element is None, so filter explicitly and bail out when nothing is left.
    frames = [r for r in ret if r is not None]
    if not frames:
        logger.warning("no blood-type rows found; nothing written")
        return
    df = pd.concat(frames)
    df.sort_values(by=["date"], inplace=True)
    df.drop_duplicates(subset=["text", "bt"], inplace=True)
    logger.info(df)
    # Make sure the output directory exists before writing.
    Path("var").mkdir(parents=True, exist_ok=True)
    df.to_csv("var/blood-type-agg.csv", index=None)
def calc_values():
    """Log the relative frequency of each blood type label in the aggregate CSV."""
    aggregated = pd.read_csv("var/blood-type-agg.csv")
    ratios = aggregated["bt"].value_counts(normalize=True)
    logger.info(ratios)
def _relative_word_freq(pairs, top_n):
    """Score words per blood type relative to their overall frequency.

    Args:
        pairs: List of ``(bt, word)`` tuples.
        top_n: Only the ``top_n`` most frequent words overall are scored.

    Returns:
        A list of ``(bt, frame)`` tuples, where ``frame`` has columns ``word``
        and ``nfreq`` (the word's share within that blood type divided by its
        share among the top words overall), sorted by ``nfreq`` descending.
    """
    words = pd.DataFrame(pairs, columns=["bt", "word"])
    # value_counts already sorts by count, descending.
    overall = words["word"].value_counts().head(top_n)
    overall_share = overall / overall.sum()
    words = words[words["word"].isin(overall_share.index)]

    scored = []
    # Group by a scalar key so the group label is a plain string, not a tuple.
    for bt, group in words.groupby("bt"):
        share = group["word"].value_counts(normalize=True)
        lift = share / overall_share.reindex(share.index)
        lift = lift.sort_values(ascending=False)
        scored.append(
            (bt, pd.DataFrame({"word": lift.index, "nfreq": lift.to_numpy()})))
    return scored


def calc_words(
        top_n=200,
        dicdir="/usr/lib/x86_64-linux-gnu/mecab/dic/mecab-ipadic-neologd/"):
    """Write per-blood-type adjective scores to ``var/bt_<bt>.csv``.

    Reads ``var/blood-type-agg.csv``, tokenizes every text with MeCab (chasen
    output format) and keeps the base form of tokens whose part of speech
    contains "形容" (adjective-like). Base forms that are a single character
    or consist only of hiragana are dropped. Each remaining word is scored per
    blood type relative to its overall frequency.

    Args:
        top_n: Number of most frequent words overall to score.
        dicdir: MeCab dictionary directory passed to the tagger via ``-d``.
    """
    parser = MeCab.Tagger(f"-Ochasen -d {dicdir}")
    df = pd.read_csv("var/blood-type-agg.csv")
    texts = df["text"].astype(str).str.lower().apply(
        lambda x: mojimoji.han_to_zen(x, ascii=False))
    # Raw string: "\p" is not a valid escape in a normal string literal.
    hiragana_only = regex.compile(r"^\p{Hiragana}{1,}$")

    pairs = []
    for bt, text in tqdm(zip(df["bt"], texts), total=len(df)):
        for line in parser.parse(text).strip().split("\n"):
            entities = line.split("\t")
            if len(entities) <= 4:
                # Not a full token line (e.g. the trailing "EOS" marker).
                continue
            orig = entities[2]  # uninflected base form
            pos = entities[3]  # part of speech
            if "形容" not in pos:
                continue
            if len(orig) == 1 or hiragana_only.search(orig) is not None:
                continue
            pairs.append((bt, orig))

    for bt, sub in _relative_word_freq(pairs, top_n):
        sub.to_csv(f"var/bt_{bt}.csv", index=None)
if __name__ == "__main__":
    # Expose each pipeline stage as a sub-command of the command-line tool.
    commands = {
        "agg": agg,
        "calc_values": calc_values,
        "calc_words": calc_words,
    }
    fire.Fire(commands)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment