Graphs your comprehension of a Chinese text file at every step, based on your known vocabulary, the number of new words you estimate you learn per step, and an optional character cutoff.
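Example invocation, assuming the script is saved as comprehension_graph.py (hypothetical name) next to a target text book.txt and a known-word list known.txt:

python comprehension_graph.py -t book.txt -k known.txt -s 1000 -l 5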
# generates a graph of the proportion of known words in a text at every step
import argparse
from LAC import LAC
from plotly.subplots import make_subplots
import plotly.graph_objects as go
from collections import Counter
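# NOTE: requires the third-party LAC segmenter (Baidu) and plotly; assuming the
# standard package names, `pip install lac plotly` should cover both.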
parser = argparse.ArgumentParser(
    description="Display a graph of your comprehension every *x* words/characters of a Chinese text file."
)
parser.add_argument("-t", "--target", required=True, help="Path to target file.")
parser.add_argument(
    "-s", "--step", required=False, default=1000, help="Every x characters/words."
)
parser.add_argument(
    "-k", "--known", required=False, help="Path to your known word list."
)
parser.add_argument(
    "-l",
    "--learned",
    required=False,
    default=-1,
    help="The number of new words you estimate you will learn for every step (float).",
)
parser.add_argument(
    "-c",
    "--cutoff",
    required=False,
    help="How many characters of the file to read up to.",
    default=-1,
)
args = parser.parse_args()
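# NOTE on -k/--known: the known word list is read via split(), so it should be
# a plain text file of whitespace- or newline-separated words, e.g. (made-up
# entries):
#   我们
#   喜欢
#   看书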
class GraphAnalyzer:
    """
    Graphs the proportion of known words and characters for every step of the text.
    """

    # punctuation and symbols stripped before counting words and characters
    ignore_chars = ",.:()!@[]+/\\!??。。\"#$%&'()*+,-/:;<=>@[\\]^_`{|}~⦅⦆「」、、〃《》「」『』【】〔〕〖〗〘〙〚〛〜〝〞〟〰〾〿–—‘’‛“”„‟…‧﹏.?;﹔|.-·-*─''\""
    # LAC (Baidu's Lexical Analysis of Chinese) in segmentation-only mode
    segmenter = LAC(mode="seg")
    def __init__(self, target_path, step, knownfile_path, cutoff, learned_per_step):
        self.target_path = target_path
        self.step = step
        self.knownfile_path = knownfile_path
        self.cutoff = cutoff
        self.learned_per_step = learned_per_step
        self.runner()
    def runner(self):
        # read the text, dropping all whitespace
        with open(self.target_path, "r", encoding="utf-8") as f:
            text = "".join(f.read().split())
        if self.cutoff != -1:
            text = text[: self.cutoff]
        # segment the text into words with LAC
        text_words = self.segmenter.run(text)
        # strip the undesired punctuation from every element of text_words
        for i in range(len(text_words)):
            text_words[i] = text_words[i].translate(
                str.maketrans("", "", self.ignore_chars)
            )
        # remove elements that are just empty strings
        self.text_words = [x for x in text_words if x != ""]
        # create a list of single characters from the text, excluding ignore chars
        self.text_chars = [x for x in text if x not in self.ignore_chars]
        # generate the data
        self.data_generator()
        # graph the data
        self.graph()
    def data_generator(self):
        # build the lists of known words and known characters
        if self.knownfile_path:
            with open(self.knownfile_path, "r", encoding="utf-8") as f:
                known_words = f.read().split()
            # every character that appears in a known word counts as known
            known_chars = [
                char
                for word in known_words
                for char in word
                if char not in self.ignore_chars
            ]
        else:
            known_words = []
            known_chars = []
        # trackers that carry the fractional part of the learned-per-step estimate
        word_tracker = 0.0
        char_tracker = 0.0
        # proportion of known words at every step
        self.new_words_step_counter = []
        current_possible_new_known_words = Counter()
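        # Illustrative example of the tracker logic below: with --learned 2.5, the
        # first step adds int(2.5) = 2 of the most frequent not-yet-known words and
        # carries 0.5 over; the next step accumulates to 3.0 and adds 3, and so on.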
        for i in range(0, len(self.text_words), self.step):
            chunk = self.text_words[i : i + self.step]
            current_new_words = 0
            # count words in this chunk that are not yet known
            for word in chunk:
                if word not in known_words:
                    current_new_words += 1
            # update known words based on the estimated gain in knowledge
            if self.learned_per_step != -1:
                for word in chunk:
                    current_possible_new_known_words[word] += 1
                # accumulate the (possibly fractional) number of words learned
                word_tracker += self.learned_per_step
                remainder = word_tracker - int(word_tracker)
                # add the most frequent unknown words to the known list
                added = 0
                for word, count in current_possible_new_known_words.most_common():
                    if added < int(word_tracker):
                        if word not in known_words:
                            known_words.append(word)
                            added += 1
                    else:
                        break
                word_tracker = remainder
            else:
                # -l not given: assume you learn every word you encounter
                for word in chunk:
                    known_words.append(word)
            # record the proportion of known words in this chunk
            self.new_words_step_counter.append(1 - current_new_words / len(chunk))
        #########
        # now do the same for characters
        self.new_chars_step_counter = []
        current_possible_new_known_chars = Counter()
        for i in range(0, len(self.text_chars), self.step):
            chunk = self.text_chars[i : i + self.step]
            current_new_chars = 0
            # count characters in this chunk that are not yet known
            for char in chunk:
                if char not in known_chars:
                    current_new_chars += 1
            # update known chars based on the estimated gain in knowledge
            if self.learned_per_step != -1:
                for char in chunk:
                    current_possible_new_known_chars[char] += 1
                # accumulate the (possibly fractional) number of chars learned
                char_tracker += self.learned_per_step
                remainder = char_tracker - int(char_tracker)
                # add the most frequent unknown chars to the known list
                added = 0
                for char, count in current_possible_new_known_chars.most_common():
                    if added < int(char_tracker):
                        if char not in known_chars:
                            known_chars.append(char)
                            added += 1
                    else:
                        break
                char_tracker = remainder
            else:
                # -l not given: assume you learn every char you encounter
                for char in chunk:
                    known_chars.append(char)
            # record the proportion of known chars in this chunk
            self.new_chars_step_counter.append(1 - current_new_chars / len(chunk))
    def graph(self):
        # plot the known-word and known-char proportions per step side by side
        fig = make_subplots(rows=1, cols=2)
        fig.add_trace(
            go.Scatter(
                x=list(range(0, len(self.new_words_step_counter))),
                y=self.new_words_step_counter,
                name="Known Words",
                mode="lines",
                line=dict(color="blue"),
            ),
            row=1,
            col=1,
        )
        fig.add_trace(
            go.Scatter(
                x=list(range(0, len(self.new_chars_step_counter))),
                y=self.new_chars_step_counter,
                name="Known Chars",
                mode="lines",
                line=dict(color="red"),
            ),
            row=1,
            col=2,
        )
        fig.update_layout(
            height=600,
            width=1440,
            title_text=f"Known Chars and Words by Step - {self.target_path[:-4]}",
        )
        fig.update_yaxes(showgrid=True, row=1, col=1)
        fig["layout"]["xaxis"].update(title=f"Words * {self.step}")
        fig["layout"]["yaxis"].update(title="Known Words Proportion")
        fig["layout"]["xaxis2"].update(title=f"Chars * {self.step}")
        fig["layout"]["yaxis2"].update(title="Known Chars Proportion")
        fig.show()
if __name__ == "__main__":
    GraphAnalyzer(
        args.target, int(args.step), args.known, int(args.cutoff), float(args.learned)
    )
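# Example of calling the analyzer directly in Python (hypothetical file names):
# GraphAnalyzer("book.txt", 1000, "known.txt", -1, 5.0)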