Last active
January 31, 2022 09:21
-
-
Save GINK03/afd3de494605cc98e6d1bec949f1db6f to your computer and use it in GitHub Desktop.
ネット上での血液型のイメージ
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import fire | |
import glob | |
import gzip | |
import pandas as pd | |
from tqdm.auto import tqdm | |
import regex | |
from multiprocessing import Pool, cpu_count | |
import zlib | |
from pathlib import Path | |
from loguru import logger | |
import numpy as np | |
import MeCab | |
import mojimoji | |
def _has_standalone_b(text):
    """Return True when *text* contains "b型" that is not the tail of "ab型".

    Every "ab型" embeds exactly one "b型", so a surplus of "b型" occurrences
    means at least one of them is not preceded by "a".  Unlike the pattern
    "[^a]b型", this also detects a "b型" at the very start of the text,
    where there is no preceding character to match.
    """
    return text.count("b型") > text.count("ab型")


# (label stored in the "bt" column, predicate over the lower-cased text),
# listed in the order the per-type frames are appended to the result.
_BLOOD_TYPE_MATCHERS = (
    ("a型", lambda text: "a型" in text),
    ("b型", _has_standalone_b),
    ("o型", lambda text: "o型" in text),
    ("ab型", lambda text: "ab型" in text),
)


def _agg(chunk):
    """Collect rows that mention a blood type from gzipped CSV files.

    Args:
        chunk: Iterable of paths to gzip-compressed CSV files that have at
            least the columns "date" and "text".

    Returns:
        A DataFrame with the columns "date", "text" (lower-cased) and "bt"
        (one of "a型", "b型", "o型", "ab型"), de-duplicated on
        ("text", "bt"); or None when no row in the chunk matched.  A row
        that mentions several blood types appears once per type.

    Side effects:
        A file that cannot be decompressed is deleted from disk and skipped.
    """
    frames = []
    for f in chunk:
        try:
            # Decode explicitly as UTF-8 so the pre-filter sees the same text
            # that pd.read_csv (UTF-8 by default) parses below.
            with gzip.open(f, "rt", encoding="utf-8") as fp:
                raw = fp.read()
        except (zlib.error, gzip.BadGzipFile, EOFError):
            # Corrupt or truncated archive: remove it and move on.
            Path(f).unlink()
            continue

        # Cheap pre-filter: only parse files that mention "血液型" somewhere.
        if "血液型" not in raw:
            continue

        df = pd.read_csv(f, usecols=["date", "text"], compression="gzip")
        df["text"] = df["text"].astype(str).str.lower()
        for label, matches in _BLOOD_TYPE_MATCHERS:
            # str() guards against missing values surviving astype(str);
            # the explicit bool dtype keeps an empty mask a valid row filter.
            mask = pd.Series(
                [matches(str(value)) for value in df["text"]],
                index=df.index,
                dtype=bool,
            )
            hits = df[mask]
            if len(hits):
                frames.append(hits.assign(bt=label))

    if not frames:
        return None
    res = pd.concat(frames)
    return res.drop_duplicates(subset=["text", "bt"])
def agg(num):
    """Scan "../csv/*" in parallel and write the blood-type mentions to CSV.

    The file list is split into 10000 chunks; the first ``num`` chunks are
    handed to ``_agg`` in a process pool.  The combined rows are sorted by
    "date", de-duplicated on ("text", "bt") and written to
    "var/blood-type-agg.csv".

    Args:
        num: Number of leading chunks to process.  It is used as a slice
            bound, so ``None`` processes every chunk.

    Returns:
        None.  When no chunk produced any row, a warning is logged and no
        file is written.
    """
    fs = np.array(glob.glob("../csv/*"))
    chunks = np.array_split(fs, 10000)
    logger.info(f"chunk size = {len(chunks)}, on size = {len(chunks[0])}")

    selected = chunks[:num]
    with Pool(cpu_count()) as p:
        results = list(
            tqdm(p.imap(_agg, selected), total=len(selected), desc="work1"))

    # _agg returns None for chunks without matches; pd.concat raises
    # ValueError when given nothing but None (or nothing at all).
    frames = [frame for frame in results if frame is not None]
    if not frames:
        logger.warning("no blood-type rows found; nothing written")
        return

    df = pd.concat(frames)
    df = df.sort_values(by=["date"])
    df = df.drop_duplicates(subset=["text", "bt"])
    logger.info(df)

    out_path = Path("var/blood-type-agg.csv")
    # Make sure the output directory exists before writing.
    out_path.parent.mkdir(parents=True, exist_ok=True)
    df.to_csv(out_path, index=False)
def calc_values():
    """Log the relative frequency of each "bt" value in the aggregated CSV.

    Reads "var/blood-type-agg.csv" and logs the normalized value counts of
    its "bt" column.
    """
    aggregated = pd.read_csv("var/blood-type-agg.csv")
    shares = aggregated["bt"].value_counts(normalize=True)
    logger.info(shares)
def _rank_words_by_blood_type(pairs, top_n=200):
    """Score how characteristic each frequent word is for each blood type.

    Args:
        pairs: DataFrame with the columns "bt" and "word", one row per word
            occurrence.
        top_n: Number of most frequent words (over all blood types) to keep.

    Returns:
        Dict mapping each "bt" value to a DataFrame with the columns "word"
        and "nfreq", sorted by "nfreq" descending.  "nfreq" is the word's
        share among the kept words of that blood type divided by its share
        among the kept words overall.  Empty input yields an empty dict.
    """
    if pairs.empty:
        return {}

    # value_counts() is sorted by descending count, so head() is the top N.
    top_counts = pairs["word"].value_counts().head(top_n)
    base = top_counts / top_counts.sum()
    kept = pairs[pairs["word"].isin(base.index)]

    ranked = {}
    # Group by the column name (not a one-element list) so keys are scalars.
    for bt, group in kept.groupby("bt"):
        share = group["word"].value_counts(normalize=True)
        lift = share / base.loc[share.index]
        lift = lift.sort_values(ascending=False)
        # Name the columns explicitly instead of relying on the defaults of
        # reset_index(), which differ between pandas versions.
        ranked[bt] = lift.rename_axis("word").reset_index(name="nfreq")
    return ranked


def calc_words():
    """Write, per blood type, the adjectives that are over-represented.

    Reads "var/blood-type-agg.csv", tokenizes every text with MeCab (chasen
    output format), keeps the base form of tokens whose part of speech
    contains "形容", and writes one "var/bt_<bt>.csv" per blood type with
    the columns "word" and "nfreq".  Base forms that are a single character
    or consist only of hiragana are ignored.
    """
    parser = MeCab.Tagger(
        "-Ochasen -d /usr/lib/x86_64-linux-gnu/mecab/dic/mecab-ipadic-neologd/"
    )
    # Raw string: "\p" is not a valid Python escape sequence.  Compiled once
    # because it is applied to every candidate token.
    hiragana_only = regex.compile(r"^\p{Hiragana}{1,}$")

    df = pd.read_csv("var/blood-type-agg.csv")
    texts = df["text"].astype(str).str.lower().apply(
        lambda x: mojimoji.han_to_zen(x, ascii=False))

    rows = []
    for bt, text in tqdm(zip(df["bt"], texts), total=len(df)):
        for line in parser.parse(text).strip().split("\n"):
            fields = line.split("\t")
            # Lines with too few tab-separated fields are skipped.
            if len(fields) <= 4:
                continue
            # Chasen columns: 0 = surface form, 1 = reading,
            # 2 = uninflected base form, 3 = part of speech.
            lemma = fields[2]
            pos = fields[3]
            if "形容" not in pos:
                continue
            if len(lemma) == 1 or hiragana_only.search(lemma) is not None:
                continue
            rows.append((bt, lemma))

    # Passing the column names here also handles an empty result.
    pairs = pd.DataFrame(rows, columns=["bt", "word"])
    for bt, ranked in _rank_words_by_blood_type(pairs).items():
        ranked.to_csv(f"var/bt_{bt}.csv", index=False)
if __name__ == "__main__":
    # Expose each pipeline stage as a sub-command of the command-line
    # interface built by python-fire.
    commands = {
        "agg": agg,
        "calc_values": calc_values,
        "calc_words": calc_words,
    }
    fire.Fire(commands)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment