uhfx · February 22, 2021 03:31
diff --git a/20210218-mtg.md b/20210218-mtg.md
diff --git a/all.py b/all.py
 import pandas as pd
 import MeCab
 tagger = MeCab.Tagger("-Ochasen")
 from sklearn.feature_extraction.text import TfidfVectorizer
 from sklearn.metrics.pairwise import cosine_similarity
 import numpy as np

 from collections import Counter
 import collections
 import itertools

 def q_get(text_paths):
    """Read the question text out of each CSV-like file.

    Every file is read whole and split on each comma; the piece at index 8
    is taken as the question. Newlines are removed and leading/trailing
    double quotes are stripped.

    Args:
        text_paths: Iterable of file paths to read.

    Returns:
        list[str]: One question string per path, in input order. A file
        with fewer than nine comma-separated pieces yields ``''``.
    """
    texts = []
    for text_path in text_paths:
        # Context manager so the handle is closed even if read() raises.
        with open(text_path, 'r') as f:
            raw = f.read()
        # NOTE: a plain split(',') also splits on commas inside quoted cells.
        cells = raw.split(',')
        text = ' '.join(cells[8:9])  # slicing (not indexing) tolerates short files
        text = text.replace('\n', '')  # drop line breaks inside the question
        text = text.strip('"')  # drop the CSV cell quotes
        texts.append(text)

    return texts

 def a_get(text_paths):
    """Read the answer text out of each CSV-like file.

    Every file is read whole and split on each comma; the piece at index 16
    is taken as the answer. Leading/trailing double quotes are stripped.
    Unlike ``q_get``, line breaks inside the answer are kept.

    Args:
        text_paths: Iterable of file paths to read.

    Returns:
        list[str]: One answer string per path, in input order. A file with
        fewer than seventeen comma-separated pieces yields ``''``.
    """
    a_texts = []
    for text_path in text_paths:
        # Context manager so the handle is closed even if read() raises.
        with open(text_path, 'r') as f:
            raw = f.read()
        # NOTE: a plain split(',') also splits on commas inside quoted cells.
        cells = raw.split(',')
        a_text = ' '.join(cells[16:17])  # answer part; slice tolerates short files
        a_text = a_text.strip('"')  # drop the CSV cell quotes
        a_texts.append(a_text)

    return a_texts

 def load_stopwords(path="data/jp_stop_words.txt"):
    """Load the stop-word list from a local text file.

    The file is parsed with ``pandas.read_csv`` without a header row and the
    first column is returned as a plain list.

    The download step that used to be sketched here (commented out) pointed at
    http://svn.sourceforge.jp/svnroot/slothlib/CSharp/Version1/SlothLib/NLP/Filter/StopWord/word/Japanese.txt
    This function itself never touches the network.

    Args:
        path: Location of the stop-word file.

    Returns:
        list: Values of the first column, in file order.
    """
    return pd.read_csv(path, header=None)[0].tolist()

 # def preprocess(series, flags = ['名詞', '固有名詞', '動詞', '形容詞']): # 前処理
 #     stop_words = load_stopwords() # ストップワードの削除
 #     def tokenizer_func(text): # MeCab で名詞，動詞，形容動詞のみを残す処理する部分
 #         tokens = []
 #         node = tagger.parseToNode(str(text))
 #         while node:
 #             features = node.feature.split(',') # MeCab 辞書はコンマ区切りなので，コンマで分割
 #             surface = features[6] # MeCab 辞書の6番目の言葉の原型を抽出
 #             if (surface == '*') or (len(surface) < 2) or (surface in stop_words): # 知らない言葉は無視
 #                 node = node.next
 #                 continue
 #
 #             if (features[0] == '名詞') & ('名詞' in flags): # MeCab での名詞かつ名詞という flag の付いたものを抽出
 #                 tokens.append(surface)
 #             elif ((features[0] == '名詞') & (features[1] == '固有名詞')) & ('固有名詞' in flags): # MeCab での固有名詞かつ固有名詞という flag の付いたものを抽出
 #                 tokens.append(surface)
 #             elif ((features[0] == '動詞') & (features[1] == '自立')) & ('動詞' in flags): # MeCab での動詞（自立）かつ動詞という flag の付いたものを抽出
 #                 tokens.append(surface)
 #             elif ((features[0] == '形容詞') & (features[1] == '自立')) & ('形容詞' in flags): # MeCab での形容詞かつ形容詞という flag の付いたものを抽出
 #                 tokens.append(surface)
 #
 #             # noun_flag = (features[0] == '名詞')
 #             # proper_noun_flag = (features[0] == '名詞') & (features[1] == '固有名詞')
 #             # verb_flag = (features[0] == '動詞') & (features[1] == '自立')
 #             # adjective_flag = (features[0] == '形容詞') & (features[1] == '自立')
 #             # if proper_noun_flag:
 #             #     tokens.append(surface)
 #             # elif noun_flag:
 #             #     tokens.append(surface)
 #             # elif verb_flag:
 #             #     tokens.append(surface)
 #             # elif adjective_flag:
 #             #     tokens.append(surface)
 #
 #             node = node.next
 #         return " ".join(tokens)
 #
 #     series = series.map(tokenizer_func)
 #
 #     #---------------Normalization-----------#
 #     series = series.map(lambda x: x.lower()) # 小文字に統一
 #     # series = series.map(mojimoji.zen_to_han, kana=False) # 半角に（カタカナ除く）統一．なんか動かないし不要．
 #
 #     return series
 def preprocess(series, flags=('名詞', '固有名詞', '動詞', '形容詞')):
    """Tokenize each text with MeCab and keep the base forms of selected word classes.

    For every node MeCab produces, field 6 of the comma-separated feature
    string is used as the token. A token is dropped when that field is
    ``'*'``, shorter than two characters, or listed in the stop words.
    The remaining tokens are kept when their part of speech is enabled
    through *flags*:

    * ``'名詞'``     - any noun
    * ``'固有名詞'`` - proper nouns (only matters when ``'名詞'`` is absent)
    * ``'動詞'``     - verbs whose sub-category is ``'自立'``
    * ``'形容詞'``   - adjectives whose sub-category is ``'自立'``

    Args:
        series: pandas Series of raw texts (each value is passed through ``str``).
        flags: Container of part-of-speech labels to keep. The default is an
            immutable tuple so it cannot be shared and mutated across calls.

    Returns:
        pandas Series of space-joined, lower-cased tokens, one entry per input.
    """
    # A set gives O(1) membership tests; the list was scanned for every token.
    stop_words = set(load_stopwords())
    want_noun = '名詞' in flags
    want_proper_noun = '固有名詞' in flags
    want_verb = '動詞' in flags
    want_adjective = '形容詞' in flags

    def tokenizer_func(text):
        tokens = []
        node = tagger.parseToNode(str(text))
        while node:
            features = node.feature.split(',')  # MeCab features are comma separated
            node = node.next  # advance first so every path below may `continue`
            # Guard: a feature string with fewer than 7 fields is treated like
            # an unknown word instead of raising IndexError.
            surface = features[6] if len(features) > 6 else '*'
            if surface == '*' or len(surface) < 2 or surface in stop_words:
                continue
            pos, sub = features[0], features[1]
            if (
                (pos == '名詞' and (want_noun or (sub == '固有名詞' and want_proper_noun)))
                or (pos == '動詞' and sub == '自立' and want_verb)
                or (pos == '形容詞' and sub == '自立' and want_adjective)
            ):
                tokens.append(surface)
        return " ".join(tokens)

    # Normalisation: lower-case everything after tokenizing.
    return series.map(tokenizer_func).map(lambda x: x.lower())
 # query_preprocess は不要．
 # def query_preprocess(query_series): # 前処理
 #     stop_words = load_stopwords() # ストップワードの削除
 #     def tokenizer_func(text): # MeCab で名詞，動詞，形容動詞のみを残す処理する部分
 #         tokens = []
 #         node = tagger.parseToNode(str(text))
 #         while node:
 #             features = node.feature.split(',') # MeCab 辞書はコンマ区切りなので，コンマで分割
 #             surface = features[6] # MeCab 辞書の6番目の言葉の原型を抽出
 #             if (surface == '*') or (len(surface) < 2) or (surface in stop_words): # 知らない言葉は無視
 #                 node = node.next
 #                 continue
 #             noun_flag = (features[0] == '名詞')
 #             proper_noun_flag = (features[0] == '名詞') & (features[1] == '固有名詞')
 #             verb_flag = (features[0] == '動詞') & (features[1] == '自立')
 #             adjective_flag = (features[0] == '形容詞') & (features[1] == '自立')
 #             if proper_noun_flag:
 #                 tokens.append(surface)
 #             elif noun_flag:
 #                 tokens.append(surface)
 #             elif verb_flag:
 #                 tokens.append(surface)
 #             elif adjective_flag:
 #                 tokens.append(surface)
 #             node = node.next
 #         return " ".join(tokens)
 #
 #     query_series = query_series.map(tokenizer_func)
 #     # query_series = tokenizer_func(query_series)
 #
 #     #---------------Normalization-----------#
 #     query_series = query_series.map(lambda x: x.lower()) # 小文字に統一
 #     # series = series.map(mojimoji.zen_to_han, kana=False) # 半角に（カタカナ除く）統一．なんか動かないし不要．
 #     return query_series

 def question_vector(series, query_series=None):
    """Vectorize the documents (and optionally the query) with TF-IDF.

    The vectorizer is fitted on *series*; *query_series* is transformed with
    the same vocabulary.

    Previously this function read an undefined name ``query_series`` and
    therefore raised ``NameError`` on every call; the query is now an
    explicit, optional parameter.

    Args:
        series: Iterable of pre-tokenized documents (space separated).
        query_series: Iterable of pre-tokenized queries, or ``None``.

    Returns:
        tuple: ``(document_matrix, query_matrix)`` as dense arrays. When
        *query_series* is ``None`` the second item is an empty array with
        zero rows and one column per vocabulary term.
    """
    tfidf = TfidfVectorizer()
    document_matrix = tfidf.fit_transform(series).toarray()
    if query_series is None:
        query_matrix = np.empty((0, document_matrix.shape[1]))
    else:
        query_matrix = tfidf.transform(query_series).toarray()
    return document_matrix, query_matrix

 def get_cs(query_series, series):
    """Cosine similarity between the documents and the query under TF-IDF.

    A ``TfidfVectorizer`` is fitted on *series*; *query_series* is projected
    onto the same vocabulary. Both matrices are densified before comparison.

    Args:
        query_series: Iterable of pre-tokenized queries.
        series: Iterable of pre-tokenized documents.

    Returns:
        The result of ``cosine_similarity(documents, queries)``; callers in
        this project index it as ``cs[document_index][0]``.
    """
    vectorizer = TfidfVectorizer()
    document_matrix = vectorizer.fit_transform(series).toarray()
    query_matrix = vectorizer.transform(query_series).toarray()
    return cosine_similarity(document_matrix, query_matrix)

 def get_len_series(series):
    """Return the number of documents in *series*.

    The previous implementation fitted a TF-IDF model and returned the row
    count of the resulting matrix, which is just the number of input
    documents. Counting directly avoids the needless fit and no longer
    fails when the documents contain no usable vocabulary.

    Args:
        series: Collection (or plain iterable) of documents.

    Returns:
        int: Number of documents.
    """
    try:
        return len(series)
    except TypeError:
        # Plain iterators/generators have no len(); count by consuming them.
        return sum(1 for _ in series)

 def find_top_n(n, cs):
    """Return the indices of the *n* largest similarities, best first.

    *cs* is sorted as a flattened array (``axis=None``), so for an
    ``(n_documents, 1)`` matrix the returned values are document indices.

    Args:
        n: How many indices to return. Values larger than the number of
            entries return all of them; ``n <= 0`` returns an empty array
            (previously ``n == 0`` returned every index because ``[-0:]``
            selects the whole array).
        cs: Array of similarity scores.

    Returns:
        numpy.ndarray: Integer indices in descending order of score.
    """
    if n <= 0:
        return np.array([], dtype=np.intp)
    ascending = np.argsort(cs, axis=None)
    return ascending[-n:][::-1]  # keep the n best, then flip to descending

 def get_n_cs(cs, top_n_index, top_n_indices):
    """Return the similarity stored for document *top_n_index*.

    Args:
        cs: Similarity matrix indexed as ``cs[document][0]``.
        top_n_index: Row to read.
        top_n_indices: Unused; kept so existing callers keep working. The
            old code looped over it while assigning the same value each
            time, which cost O(n) per call and raised ``UnboundLocalError``
            when it was empty.

    Returns:
        The value ``cs[top_n_index][0]``.
    """
    return cs[top_n_index][0]

 ####
 # 今困ってるのはコサイン類似度0より大きいものの文書に出てくる単語を抽出し表示，その単語を選択させるプログラムが出来ない
 # 1. 単語の抽出
 # 2. 単語の表示
 # 3. 単語の選択（単語自体の入力 もしくは 番号で選択させる）
 # 4. 文書を1つに絞るまでやる

 # 同じ内容の文書（似たような文書）がある．その辺の扱いは一旦保留


 # def get_cs_words(query_series, series, texts):
 def get_new_words(cs, texts):
    """Extract nouns and proper nouns from every document with non-zero similarity.

    Args:
        cs: Similarity matrix; the row indices of its non-zero entries
            select the documents.
        texts: Raw question texts, indexable by those row indices.

    Returns:
        pandas Series produced by ``preprocess`` restricted to
        ``['名詞', '固有名詞']``, one entry per selected document.
    """
    # nonzero() returns one index array per axis; [0] holds the row indices.
    matching_rows = cs.nonzero()[0]
    new_texts = [texts[row] for row in matching_rows]
    # The leftover debug print of the result type was removed.
    return preprocess(pd.Series(new_texts), ['名詞', '固有名詞'])

    # new_q_series = pd.Series(texts[top_n_indices])

    # tfidf = TfidfVectorizer() # Tf-Idf 化関数に名前を付ける
    # question_vector = tfidf.fit_transform(series).toarray() # 質問文書を Tf-Idf を用いて数値化
    # query_vector = tfidf.transform(query_series).toarray() # 入力された質問を Tf-Idf を用いて数値化
    # # print(query_vector)
    # # cs_nonzero = query_vector.nonzero()
    # # nonzero_indices = np.argwhere(cs != 0)
    # # for nonzero_index in nonzero_indices:
    # #      # print(texts[nonzero_index])
    # #      print(nonzero_index)
    # # return series[nonzero_index]
    # nonzero_indices = tfidf.inverse_transform(query_vector)
    # print(nonzero_indices)
    # n_cs
    # return nonzero_indices

 # やること
 # 3. 単語の選択（単語自体の入力 もしくは 番号で選択させる）
 # 4. 文書を1つに絞るまでやる

 def select_topic(new_series):
    """Count the words of the candidate documents and show them to the user.

    Each entry of *new_series* is split on single spaces and the words of
    all entries are counted together. Empty tokens (produced by documents
    that contain no words at all) are skipped so that ``''`` is no longer
    offered as a choice.

    Args:
        new_series: pandas Series of space-joined tokens.

    Returns:
        collections.Counter: Occurrence count per word. The number of
        distinct words and the word set are also printed.
    """
    token_lists = new_series.str.split(' ')
    # chain.from_iterable flattens in linear time; sum(lists, []) was quadratic.
    words = [word for word in itertools.chain.from_iterable(token_lists) if word]
    new_series_count = collections.Counter(words)

    print(f"{len(new_series_count)}件のワードが見つかりました")
    print(f"{set(new_series_count)}")
    return new_series_count

 def new_input():
    """Ask the user to pick words from the suggestions and return the answer.

    Returns:
        str: The line typed by the user, converted to lower case.
    """
    prompt = '上の中から近いワードを選んでください．複数選択する場合は半角スペースで区切って入力してください．: '
    answer = input(prompt)
    return answer.lower()

 # def new_question(new_query_pd, str_query):
 def new_str_query(new_input, str_query):
    """Append the user's extra words to the original query.

    Args:
        new_input: Extra words typed by the user (a string).
        str_query: The processed original query; its first element is used.
            Taking the element (instead of ``str(str_query)``) avoids
            pulling the Series' ``dtype: object`` footer into the text.

    Returns:
        pandas Series with a single entry: ``"<first query> <new_input>"``.
    """
    # Positional access: plain [0] is label-based on a Series and fails when
    # the index does not contain the label 0.
    if hasattr(str_query, 'iloc'):
        first_query = str_query.iloc[0]
    else:
        first_query = str_query[0]
    return pd.Series(first_query + ' ' + new_input)

 def new_question_answer(new_str_query, processed_new_texts_series, n):
    """Rank the previously selected documents against the extended query.

    Args:
        new_str_query: Query with the user's extra words appended.
        processed_new_texts_series: Pre-tokenized documents chosen in the
            first round.
        n: Maximum number of indices to return.

    Returns:
        Indices into *processed_new_texts_series*, highest similarity first,
        as produced by ``find_top_n`` on the ``get_cs`` result.
    """
    return find_top_n(n, get_cs(new_str_query, processed_new_texts_series))

 def cs_selector(new_top_n_indices, new_cs, new_texts, new_cs_max, cs_value):
    """Print every document whose similarity is above *cs_value*, then a count.

    Args:
        new_top_n_indices: Document indices ordered by descending similarity;
            iteration stops at the first one that is not above the threshold.
        new_cs: Similarity matrix indexed as ``new_cs[document][0]``.
        new_texts: Document texts, indexable by the same indices.
        new_cs_max: Largest similarity value in *new_cs*. If it does not
            exceed *cs_value* a "nothing found" message is printed instead.
        cs_value: Threshold; a document is listed when its score is strictly
            greater than this value.

    Returns:
        None. All results are written to stdout.
    """
    print(f"コサイン類似度 {cs_value} 以上の文書は以下の通りです。")
    # This check does not depend on the loop variable, so do it once up front.
    if new_cs_max <= cs_value:
        print(f"コサイン類似度 {cs_value} の質問文書はありません。検索ワードを変えてやり直してください。")
        return

    counter = 0
    for new_top_n_index in new_top_n_indices:
        # Same lookup get_n_cs performs, done inline.
        new_n_cs = new_cs[new_top_n_index][0]
        if new_n_cs <= cs_value:
            break
        print(f"質問データ #{new_top_n_index}, コサイン類似度: {new_n_cs}, '{new_texts[new_top_n_index]}'")
        counter += 1
    # Printed after the loop so the summary also appears when every document
    # passed the threshold (it used to be skipped in that case).
    print(f"コサイン類似度 {cs_value} 以上の質問文書は {counter} 件です。\n")

 def print_new_words(new_top_n_indices, new_cs, new_texts, new_cs_max, cs_values=(0.2, 0.3, 0.4)):
    """Report the matching documents once per similarity threshold.

    Args:
        new_top_n_indices: Document indices ordered by descending similarity.
        new_cs: Similarity matrix indexed as ``new_cs[document][0]``.
        new_texts: Document texts, indexable by the same indices.
        new_cs_max: Largest similarity value in *new_cs*.
        cs_values: Thresholds to report, in order. The default reproduces
            the three values that used to be hard-coded.

    Returns:
        None. ``cs_selector`` prints the report for each threshold.
    """
    for cs_value in cs_values:
        cs_selector(new_top_n_indices, new_cs, new_texts, new_cs_max, cs_value)
    # if new_cs_max <= 0.2:
    #     print("検索ワードを変えてやり直してください")
    #     break
    # if new_n_cs > 0.2:
    #     print(f"質問データ #{new_top_n_index}, コサイン類似度: {new_n_cs}, '{new_texts[new_top_n_index]}'")
    # #     # print(f"#{new_top_n_index} コサイン類似度: {new_n_cs}")
    #     continue
    # if new_n_cs <= 0.2:
    #     print("以上です")
    #     break

 def listing_query(query):
    """Wrap a single query in a new one-element list.

    Args:
        query: The query object (typically the raw query string).

    Returns:
        list: ``[query]``.
    """
    return [query]
diff --git a/dic.csv b/dic.csv
diff --git a/test-query.csv b/test-query.csv
diff --git a/test8.py b/test8.py
 import importlib
 # importlib.reload()
 import argparse
 import numpy as np
 import glob
 from sklearn.feature_extraction.text import TfidfVectorizer
 from sklearn.metrics.pairwise import cosine_similarity

 from IPython.display import display

 import pandas as pd

 import MeCab
 tagger = MeCab.Tagger("-Ochasen")
 import mojimoji
 import os
 import urllib

 from collections import Counter
 import collections

 text_paths = glob.glob('data/ocu2/*.txt')
 from func import all
 # from func import q_get
 # from func import a_get
 # from func import load_stopwords
 # from func import preprocess
 # from func import get_cs
 # from func import find_top_n
 # from func import get_n_cs
 # from func import listing_query

 def main(args):
    """Run one interactive retrieval session for ``args.query``.

    Steps:
      1. Rank every question document against the query by TF-IDF cosine
         similarity and list those with a similarity above ``1e-10``.
      2. Show the nouns found in the matching documents and ask the user
         for additional words.
      3. Re-rank the listed documents against the extended query and print
         those scoring above 0.2.

    Prints ``NotFound`` when no document matches the initial query.

    Args:
        args: Parsed command-line namespace; only ``args.query`` is read.
    """
    pos_flags = ['名詞', '固有名詞', '動詞', '形容詞']
    texts = all.q_get(text_paths)
    query_texts = all.listing_query(args.query)
    str_data = all.preprocess(pd.Series(texts), pos_flags)
    str_query = all.preprocess(pd.Series(query_texts), pos_flags)
    cs = all.get_cs(str_query, str_data)
    n = all.get_len_series(str_data)  # number of documents in the data set
    top_n_indices = all.find_top_n(n, cs)  # document indices, best match first

    max_cs = np.max(cs)
    if max_cs <= 1e-10:
        print("NotFound")
        return

    new_texts = []  # documents presented to the user, in ranking order
    for top_n_index in top_n_indices:
        n_cs = all.get_n_cs(cs, top_n_index, top_n_indices)
        if n_cs > 1e-10:
            print(f"質問データ #{top_n_index}, コサイン類似度: {n_cs}, '{texts[top_n_index]}'")
            new_texts.append(texts[top_n_index])

    # Only needed once at least one document matched, so it is computed here
    # rather than before the NotFound check.
    new_series = all.get_new_words(cs, texts)
    new_series_count = all.select_topic(new_series)  # word frequencies
    print(new_series_count)

    processed_new_texts_series = all.preprocess(pd.Series(new_texts), pos_flags)
    new_input = all.new_input()  # extra words chosen by the user
    new_str_query = all.new_str_query(new_input, str_query)  # original query + extra words
    new_top_n_indices = all.new_question_answer(new_str_query, processed_new_texts_series, n)

    print(new_top_n_indices)
    new_cs = all.get_cs(new_str_query, processed_new_texts_series)
    # np.max gives the best similarity VALUE; np.argmax (used before) gave
    # its position, which was then wrongly compared with the 0.2 threshold.
    new_cs_max = np.max(new_cs)
    if new_cs_max <= 0.2:
        print("検索ワードを変えてやり直してください")
        return

    for new_top_n_index in new_top_n_indices:
        new_n_cs = all.get_n_cs(new_cs, new_top_n_index, new_top_n_indices)
        if new_n_cs <= 0.2:
            break
        print(f"質問データ #{new_top_n_index}, コサイン類似度: {new_n_cs}, '{new_texts[new_top_n_index]}'")
    # Printed after the loop so it also appears when every document passed.
    print("以上です")

 if __name__ == "__main__":
    # Command-line entry point: one required positional argument, the query text.
    cli = argparse.ArgumentParser()
    cli.add_argument("query", type=str)
    main(cli.parse_args())
diff --git a/test9.py b/test9.py
 import importlib
 # importlib.reload()
 import argparse
 import numpy as np
 import glob
 from sklearn.feature_extraction.text import TfidfVectorizer
 from sklearn.metrics.pairwise import cosine_similarity

 from IPython.display import display

 import pandas as pd

 import MeCab
 tagger = MeCab.Tagger("-Ochasen")
 import mojimoji
 import os
 import urllib

 from collections import Counter
 import collections

 text_paths = glob.glob('data/ocu2/*.txt')
 from func import all
 # from func import q_get
 # from func import a_get
 # from func import load_stopwords
 # from func import preprocess
 # from func import get_cs
 # from func import find_top_n
 # from func import get_n_cs
 # from func import listing_query

 def main(args):
    """Run one interactive retrieval session for ``args.query``.

    Steps:
      1. Rank every question document against the query by TF-IDF cosine
         similarity and list those with a similarity above ``1e-10``.
      2. Show the nouns found in the matching documents and ask the user
         for additional words.
      3. Re-rank the listed documents against the extended query and hand
         the result to ``all.print_new_words`` for the threshold report.

    Prints ``NotFound`` when no document matches the initial query.

    Args:
        args: Parsed command-line namespace; only ``args.query`` is read.
    """
    pos_flags = ['名詞', '固有名詞', '動詞', '形容詞']
    texts = all.q_get(text_paths)
    query_texts = all.listing_query(args.query)
    str_data = all.preprocess(pd.Series(texts), pos_flags)
    str_query = all.preprocess(pd.Series(query_texts), pos_flags)
    cs = all.get_cs(str_query, str_data)
    n = all.get_len_series(str_data)  # number of documents in the data set
    top_n_indices = all.find_top_n(n, cs)  # document indices, best match first

    max_cs = np.max(cs)
    if max_cs <= 1e-10:
        print("NotFound")
        return

    new_texts = []  # documents presented to the user, in ranking order
    for top_n_index in top_n_indices:
        n_cs = all.get_n_cs(cs, top_n_index, top_n_indices)
        if n_cs > 1e-10:
            print(f"質問データ #{top_n_index}, コサイン類似度: {n_cs}, '{texts[top_n_index]}'")
            new_texts.append(texts[top_n_index])

    # Only needed once at least one document matched, so it is computed here
    # rather than before the NotFound check.
    new_series = all.get_new_words(cs, texts)
    all.select_topic(new_series)  # prints the candidate words for the user

    processed_new_texts_series = all.preprocess(pd.Series(new_texts), pos_flags)
    new_input = all.new_input()  # extra words chosen by the user
    new_str_query = all.new_str_query(new_input, str_query)  # original query + extra words
    new_top_n_indices = all.new_question_answer(new_str_query, processed_new_texts_series, n)

    new_cs = all.get_cs(new_str_query, processed_new_texts_series)
    # np.max gives the best similarity VALUE; np.argmax (used before) gave
    # its position, which the report then compared with the thresholds.
    new_cs_max = np.max(new_cs)
    all.print_new_words(new_top_n_indices, new_cs, new_texts, new_cs_max)

 if __name__ == "__main__":
    # Command-line entry point: one required positional argument, the query text.
    cli = argparse.ArgumentParser()
    cli.add_argument("query", type=str)
    main(cli.parse_args())
	import pandas as pd
	import MeCab
	tagger = MeCab.Tagger("-Ochasen")
	from sklearn.feature_extraction.text import TfidfVectorizer
	from sklearn.metrics.pairwise import cosine_similarity
	import numpy as np

	from collections import Counter
	import collections
	import itertools

	def q_get(text_paths):
	    """Read the question text out of each CSV-like file.

	    Every file is read whole and split on each comma; the piece at index 8
	    is taken as the question. Newlines are removed and leading/trailing
	    double quotes are stripped.

	    Args:
	        text_paths: Iterable of file paths to read.

	    Returns:
	        list[str]: One question string per path, in input order. A file
	        with fewer than nine comma-separated pieces yields ``''``.
	    """
	    texts = []
	    for text_path in text_paths:
	        # Context manager so the handle is closed even if read() raises.
	        with open(text_path, 'r') as f:
	            raw = f.read()
	        # NOTE: a plain split(',') also splits on commas inside quoted cells.
	        cells = raw.split(',')
	        text = ' '.join(cells[8:9])  # slicing (not indexing) tolerates short files
	        text = text.replace('\n', '')  # drop line breaks inside the question
	        text = text.strip('"')  # drop the CSV cell quotes
	        texts.append(text)

	    return texts

	def a_get(text_paths):
	    """Read the answer text out of each CSV-like file.

	    Every file is read whole and split on each comma; the piece at index 16
	    is taken as the answer. Leading/trailing double quotes are stripped;
	    line breaks inside the answer are kept.

	    Args:
	        text_paths: Iterable of file paths to read.

	    Returns:
	        list[str]: One answer string per path, in input order. A file with
	        fewer than seventeen comma-separated pieces yields ``''``.
	    """
	    a_texts = []
	    for text_path in text_paths:
	        # Context manager so the handle is closed even if read() raises.
	        with open(text_path, 'r') as f:
	            raw = f.read()
	        # NOTE: a plain split(',') also splits on commas inside quoted cells.
	        cells = raw.split(',')
	        a_text = ' '.join(cells[16:17])  # answer part; slice tolerates short files
	        a_text = a_text.strip('"')  # drop the CSV cell quotes
	        a_texts.append(a_text)

	    return a_texts

	def load_stopwords(path="data/jp_stop_words.txt"):
	    """Load the stop-word list from a local text file.

	    The file is parsed with ``pandas.read_csv`` without a header row and the
	    first column is returned as a plain list.

	    The download step that used to be sketched here (commented out) pointed at
	    http://svn.sourceforge.jp/svnroot/slothlib/CSharp/Version1/SlothLib/NLP/Filter/StopWord/word/Japanese.txt
	    This function itself never touches the network.

	    Args:
	        path: Location of the stop-word file.

	    Returns:
	        list: Values of the first column, in file order.
	    """
	    return pd.read_csv(path, header=None)[0].tolist()

	# def preprocess(series, flags = ['名詞', '固有名詞', '動詞', '形容詞']): # 前処理
	# stop_words = load_stopwords() # ストップワードの削除
	# def tokenizer_func(text): # MeCab で名詞，動詞，形容動詞のみを残す処理する部分
	# tokens = []
	# node = tagger.parseToNode(str(text))
	# while node:
	# features = node.feature.split(',') # MeCab 辞書はコンマ区切りなので，コンマで分割
	# surface = features[6] # MeCab 辞書の6番目の言葉の原型を抽出
	# if (surface == '*') or (len(surface) < 2) or (surface in stop_words): # 知らない言葉は無視
	# node = node.next
	# continue
	#
	# if (features[0] == '名詞') & ('名詞' in flags): # MeCab での名詞かつ名詞という flag の付いたものを抽出
	# tokens.append(surface)
	# elif ((features[0] == '名詞') & (features[1] == '固有名詞')) & ('固有名詞' in flags): # MeCab での固有名詞かつ固有名詞という flag の付いたものを抽出
	# tokens.append(surface)
	# elif ((features[0] == '動詞') & (features[1] == '自立')) & ('動詞' in flags): # MeCab での動詞（自立）かつ動詞という flag の付いたものを抽出
	# tokens.append(surface)
	# elif ((features[0] == '形容詞') & (features[1] == '自立')) & ('形容詞' in flags): # MeCab での形容詞かつ形容詞という flag の付いたものを抽出
	# tokens.append(surface)
	#
	# # noun_flag = (features[0] == '名詞')
	# # proper_noun_flag = (features[0] == '名詞') & (features[1] == '固有名詞')
	# # verb_flag = (features[0] == '動詞') & (features[1] == '自立')
	# # adjective_flag = (features[0] == '形容詞') & (features[1] == '自立')
	# # if proper_noun_flag:
	# # tokens.append(surface)
	# # elif noun_flag:
	# # tokens.append(surface)
	# # elif verb_flag:
	# # tokens.append(surface)
	# # elif adjective_flag:
	# # tokens.append(surface)
	#
	# node = node.next
	# return " ".join(tokens)
	#
	# series = series.map(tokenizer_func)
	#
	# #---------------Normalization-----------#
	# series = series.map(lambda x: x.lower()) # 小文字に統一
	# # series = series.map(mojimoji.zen_to_han, kana=False) # 半角に（カタカナ除く）統一．なんか動かないし不要．
	#
	# return series
	def preprocess(series, flags=('名詞', '固有名詞', '動詞', '形容詞')):
	    """Tokenize each text with MeCab and keep the base forms of selected word classes.

	    For every node MeCab produces, field 6 of the comma-separated feature
	    string is used as the token. A token is dropped when that field is
	    ``'*'``, shorter than two characters, or listed in the stop words.
	    The remaining tokens are kept when their part of speech is enabled
	    through *flags*:

	    * ``'名詞'``     - any noun
	    * ``'固有名詞'`` - proper nouns (only matters when ``'名詞'`` is absent)
	    * ``'動詞'``     - verbs whose sub-category is ``'自立'``
	    * ``'形容詞'``   - adjectives whose sub-category is ``'自立'``

	    Args:
	        series: pandas Series of raw texts (each value is passed through ``str``).
	        flags: Container of part-of-speech labels to keep. The default is an
	            immutable tuple so it cannot be shared and mutated across calls.

	    Returns:
	        pandas Series of space-joined, lower-cased tokens, one entry per input.
	    """
	    # A set gives O(1) membership tests; the list was scanned for every token.
	    stop_words = set(load_stopwords())
	    want_noun = '名詞' in flags
	    want_proper_noun = '固有名詞' in flags
	    want_verb = '動詞' in flags
	    want_adjective = '形容詞' in flags

	    def tokenizer_func(text):
	        tokens = []
	        node = tagger.parseToNode(str(text))
	        while node:
	            features = node.feature.split(',')  # MeCab features are comma separated
	            node = node.next  # advance first so every path below may `continue`
	            # Guard: a feature string with fewer than 7 fields is treated like
	            # an unknown word instead of raising IndexError.
	            surface = features[6] if len(features) > 6 else '*'
	            if surface == '*' or len(surface) < 2 or surface in stop_words:
	                continue
	            pos, sub = features[0], features[1]
	            if (
	                (pos == '名詞' and (want_noun or (sub == '固有名詞' and want_proper_noun)))
	                or (pos == '動詞' and sub == '自立' and want_verb)
	                or (pos == '形容詞' and sub == '自立' and want_adjective)
	            ):
	                tokens.append(surface)
	        return " ".join(tokens)

	    # Normalisation: lower-case everything after tokenizing.
	    return series.map(tokenizer_func).map(lambda x: x.lower())
	# query_preprocess は不要．
	# def query_preprocess(query_series): # 前処理
	# stop_words = load_stopwords() # ストップワードの削除
	# def tokenizer_func(text): # MeCab で名詞，動詞，形容動詞のみを残す処理する部分
	# tokens = []
	# node = tagger.parseToNode(str(text))
	# while node:
	# features = node.feature.split(',') # MeCab 辞書はコンマ区切りなので，コンマで分割
	# surface = features[6] # MeCab 辞書の6番目の言葉の原型を抽出
	# if (surface == '*') or (len(surface) < 2) or (surface in stop_words): # 知らない言葉は無視
	# node = node.next
	# continue
	# noun_flag = (features[0] == '名詞')
	# proper_noun_flag = (features[0] == '名詞') & (features[1] == '固有名詞')
	# verb_flag = (features[0] == '動詞') & (features[1] == '自立')
	# adjective_flag = (features[0] == '形容詞') & (features[1] == '自立')
	# if proper_noun_flag:
	# tokens.append(surface)
	# elif noun_flag:
	# tokens.append(surface)
	# elif verb_flag:
	# tokens.append(surface)
	# elif adjective_flag:
	# tokens.append(surface)
	# node = node.next
	# return " ".join(tokens)
	#
	# query_series = query_series.map(tokenizer_func)
	# # query_series = tokenizer_func(query_series)
	#
	# #---------------Normalization-----------#
	# query_series = query_series.map(lambda x: x.lower()) # 小文字に統一
	# # series = series.map(mojimoji.zen_to_han, kana=False) # 半角に（カタカナ除く）統一．なんか動かないし不要．
	# return query_series

	def question_vector(series, query_series=None):
	    """Vectorize the documents (and optionally the query) with TF-IDF.

	    The vectorizer is fitted on *series*; *query_series* is transformed with
	    the same vocabulary. The query used to be read from an undefined name,
	    which raised ``NameError``; it is now an explicit, optional parameter.

	    Args:
	        series: Iterable of pre-tokenized documents (space separated).
	        query_series: Iterable of pre-tokenized queries, or ``None``.

	    Returns:
	        tuple: ``(document_matrix, query_matrix)`` as dense arrays. When
	        *query_series* is ``None`` the second item is an empty array with
	        zero rows and one column per vocabulary term.
	    """
	    tfidf = TfidfVectorizer()
	    document_matrix = tfidf.fit_transform(series).toarray()
	    if query_series is None:
	        query_matrix = np.empty((0, document_matrix.shape[1]))
	    else:
	        query_matrix = tfidf.transform(query_series).toarray()
	    return document_matrix, query_matrix

	def get_cs(query_series, series):
	    """Cosine similarity between the documents and the query under TF-IDF.

	    A ``TfidfVectorizer`` is fitted on *series*; *query_series* is projected
	    onto the same vocabulary. Both matrices are densified before comparison.
	    (The statements of this copy had lost their indentation; the function
	    structure is restored here.)

	    Args:
	        query_series: Iterable of pre-tokenized queries.
	        series: Iterable of pre-tokenized documents.

	    Returns:
	        The result of ``cosine_similarity(documents, queries)``.
	    """
	    tfidf = TfidfVectorizer()
	    question_matrix = tfidf.fit_transform(series).toarray()
	    query_matrix = tfidf.transform(query_series).toarray()
	    return cosine_similarity(question_matrix, query_matrix)

	def get_len_series(series):
	    """Return the number of documents in *series*.

	    The previous implementation fitted a TF-IDF model and returned the row
	    count of the resulting matrix, which is just the number of input
	    documents. Counting directly avoids the needless fit and no longer
	    fails when the documents contain no usable vocabulary.

	    Args:
	        series: Collection (or plain iterable) of documents.

	    Returns:
	        int: Number of documents.
	    """
	    try:
	        return len(series)
	    except TypeError:
	        # Plain iterators/generators have no len(); count by consuming them.
	        return sum(1 for _ in series)

	def find_top_n(n, cs):
	    """Return the indices of the *n* largest similarities, best first.

	    *cs* is sorted as a flattened array (``axis=None``).

	    Args:
	        n: How many indices to return. Values larger than the number of
	            entries return all of them; ``n <= 0`` returns an empty array
	            (``[-0:]`` would otherwise select the whole array).
	        cs: Array of similarity scores.

	    Returns:
	        numpy.ndarray: Integer indices in descending order of score.
	    """
	    if n <= 0:
	        return np.array([], dtype=np.intp)
	    ascending = np.argsort(cs, axis=None)
	    return ascending[-n:][::-1]  # keep the n best, then flip to descending

	def get_n_cs(cs, top_n_index, top_n_indices):
	    """Return the similarity stored for document *top_n_index*.

	    Args:
	        cs: Similarity matrix indexed as ``cs[document][0]``.
	        top_n_index: Row to read.
	        top_n_indices: Unused; kept so existing callers keep working. The
	            old loop over it re-assigned the same value on every pass.

	    Returns:
	        The value ``cs[top_n_index][0]``.
	    """
	    return cs[top_n_index][0]

	####
	# 今困ってるのはコサイン類似度0より大きいものの文書に出てくる単語を抽出し表示，その単語を選択させるプログラムが出来ない
	# 1. 単語の抽出
	# 2. 単語の表示
	# 3. 単語の選択（単語自体の入力もしくは番号で選択させる）
	# 4. 文書を1つに絞るまでやる

	# 同じ内容の文書（似たような文書）がある．その辺の扱いは一旦保留


	# def get_cs_words(query_series, series, texts):
	def get_new_words(cs, texts):
	    """Extract nouns and proper nouns from every document with non-zero similarity.

	    Args:
	        cs: Similarity matrix; the row indices of its non-zero entries
	            select the documents.
	        texts: Raw question texts, indexable by those row indices.

	    Returns:
	        pandas Series produced by ``preprocess`` restricted to
	        ``['名詞', '固有名詞']``, one entry per selected document.
	    """
	    # nonzero() returns one index array per axis; [0] holds the row indices.
	    matching_rows = cs.nonzero()[0]
	    new_texts = [texts[row] for row in matching_rows]
	    # The leftover debug print of the result type was removed.
	    return preprocess(pd.Series(new_texts), ['名詞', '固有名詞'])

	# new_q_series = pd.Series(texts[top_n_indices])

	# tfidf = TfidfVectorizer() # Tf-Idf 化関数に名前を付ける
	# question_vector = tfidf.fit_transform(series).toarray() # 質問文書を Tf-Idf を用いて数値化
	# query_vector = tfidf.transform(query_series).toarray() # 入力された質問を Tf-Idf を用いて数値化
	# # print(query_vector)
	# # cs_nonzero = query_vector.nonzero()
	# # nonzero_indices = np.argwhere(cs != 0)
	# # for nonzero_index in nonzero_indices:
	# # # print(texts[nonzero_index])
	# # print(nonzero_index)
	# # return series[nonzero_index]
	# nonzero_indices = tfidf.inverse_transform(query_vector)
	# print(nonzero_indices)
	# n_cs
	# return nonzero_indices

	# やること
	# 3. 単語の選択（単語自体の入力もしくは番号で選択させる）
	# 4. 文書を1つに絞るまでやる

	def select_topic(new_series):
	    """Count the words of the candidate documents and show them to the user.

	    Each entry of *new_series* is split on single spaces and the words of
	    all entries are counted together. Empty tokens (produced by documents
	    that contain no words at all) are skipped.

	    Args:
	        new_series: pandas Series of space-joined tokens.

	    Returns:
	        collections.Counter: Occurrence count per word. The number of
	        distinct words and the word set are also printed.
	    """
	    token_lists = new_series.str.split(' ')
	    # chain.from_iterable flattens in linear time; sum(lists, []) was quadratic.
	    words = [word for word in itertools.chain.from_iterable(token_lists) if word]
	    new_series_count = collections.Counter(words)

	    print(f"{len(new_series_count)}件のワードが見つかりました")
	    print(f"{set(new_series_count)}")
	    return new_series_count

	def new_input():
	    """Ask the user to choose from the suggested words and return the answer lower-cased.

	    Returns:
	        The line typed by the user, converted to lower case. Multiple words
	        are expected to be separated by half-width spaces (per the prompt).
	    """
	    prompt = '上の中から近いワードを選んでください．複数選択する場合は半角スペースで区切って入力してください．: '
	    answer = input(prompt)
	    return answer.lower()

	# def new_question(new_query_pd, str_query):
	def new_str_query(new_input, str_query):
	    """Build a new one-element query Series: the original query plus the added words.

	    Args:
	        new_input: Words chosen by the user, as a single string.
	        str_query: pandas Series whose first element is the original
	            (preprocessed) query string.

	    Returns:
	        pandas Series containing the single string
	        ``"<original query> <new_input>"``.
	    """
	    # Take the first element by position. Converting the whole Series with
	    # str() would drag in the "dtype: object" footer, and a label lookup
	    # (str_query[0]) fails when the index does not contain the label 0.
	    original_query = str_query.iloc[0]
	    combined = original_query + ' ' + new_input
	    return pd.Series(combined)

	def new_question_answer(new_str_query, processed_new_texts_series, n):
	    """Rank the previously selected question documents against the extended query.

	    Args:
	        new_str_query: Series holding the query with the added words.
	        processed_new_texts_series: Preprocessed Series of the question
	            documents selected in the first round.
	        n: Number of top entries requested from ``find_top_n``.

	    Returns:
	        Whatever ``find_top_n`` returns for the new similarities; the
	        callers treat it as document indices ordered by similarity.
	    """
	    similarities = get_cs(new_str_query, processed_new_texts_series)
	    return find_top_n(n, similarities)

	def cs_selector(new_top_n_indices, new_cs, new_texts, new_cs_max, cs_value):
	    """Print every question document whose cosine similarity exceeds ``cs_value``.

	    Prints a header, then either a "nothing found, try other words" message
	    (when even the best similarity does not exceed the threshold) or the
	    matching documents followed by a line stating how many were listed.

	    Args:
	        new_top_n_indices: Document indices to walk through; presumably
	            ordered from highest to lowest similarity (verify against
	            ``find_top_n``), since the walk stops at the first document
	            at or below the threshold.
	        new_cs: Cosine similarities, passed through to ``get_n_cs``.
	        new_texts: Question documents, indexed by the entries of
	            ``new_top_n_indices``.
	        new_cs_max: Highest cosine similarity among the documents.
	        cs_value: Threshold; only documents strictly above it are listed.

	    Returns:
	        None. All results are written to standard output.
	    """
	    print(f"コサイン類似度 {cs_value} 以上の文書は以下の通りです。")

	    # This check does not depend on the loop variable, so it is done once
	    # up front instead of on every iteration.
	    if new_cs_max <= cs_value:
	        print(f"コサイン類似度 {cs_value} の質問文書はありません。検索ワードを変えてやり直してください。")
	        return

	    counter = 0
	    for new_top_n_index in new_top_n_indices:
	        new_n_cs = get_n_cs(new_cs, new_top_n_index, new_top_n_indices)
	        if new_n_cs > cs_value:
	            print(f"質問データ #{new_top_n_index}, コサイン類似度: {new_n_cs}, '{new_texts[new_top_n_index]}'")
	            counter += 1
	        elif new_n_cs <= cs_value:
	            break

	    # Printed after the loop so the count is reported even when every
	    # document exceeded the threshold and the loop ran to the end.
	    print(f"コサイン類似度 {cs_value} 以上の質問文書は {counter} 件です。\n")

	def print_new_words(new_top_n_indices, new_cs, new_texts, new_cs_max):
	    """Report the matching question documents at thresholds 0.2, 0.3 and 0.4 in turn.

	    All arguments are forwarded unchanged to ``cs_selector``, which is
	    called once per threshold in ascending order.
	    """
	    for threshold in (0.2, 0.3, 0.4):
	        cs_selector(new_top_n_indices, new_cs, new_texts, new_cs_max, threshold)
	# if new_cs_max <= 0.2:
	# print("検索ワードを変えてやり直してください")
	# break
	# if new_n_cs > 0.2:
	# print(f"質問データ #{new_top_n_index}, コサイン類似度: {new_n_cs}, '{new_texts[new_top_n_index]}'")
	# # # print(f"#{new_top_n_index} コサイン類似度: {new_n_cs}")
	# continue
	# if new_n_cs <= 0.2:
	# print("以上です")
	# break

	def listing_query(query):
	    """Wrap a single query in a new one-element list.

	    Args:
	        query: The query text.

	    Returns:
	        A list whose only element is ``query``.
	    """
	    return [query]
ado	*	*	10	名詞	固有名詞	一般	*	*	*	ado	ado	ado
All	*	*	10	名詞	固有名詞	一般	*	*	*	all apps	all	all
apex	*	*	10	名詞	固有名詞	一般	*	*	*	Apex One	apex	apex
Apex	*	*	10	名詞	固有名詞	一般	*	*	*	Apex One	apex	apex
APEX	*	*	10	名詞	固有名詞	一般	*	*	*	Apex One	apex	apex
apexone	*	*	10	名詞	固有名詞	一般	*	*	*	Apex One	apex	apex
APEXONE	*	*	10	名詞	固有名詞	一般	*	*	*	Apex One	apex	apex
ApexOne	*	*	10	名詞	固有名詞	一般	*	*	*	Apex One	apex	apex
Apps	*	*	10	名詞	固有名詞	一般	*	*	*	Apex One	apex	apex
eduroam	*	*	10	名詞	固有名詞	一般	*	*	*	eduroam	eduroam	eduroam
Eduroam	*	*	10	名詞	固有名詞	一般	*	*	*	eduroam	eduroam	eduroam
EDUROAM	*	*	10	名詞	固有名詞	一般	*	*	*	eduroam	eduroam	eduroam
Forms	*	*	10	名詞	固有名詞	一般	*	*	*	Forms	forms	forms
list	*	*	10	名詞	固有名詞	一般	*	*	*	list	list	list
logout	*	*	10	名詞	一般	一般	*	*	*	ログアウト	ログアウト	ログアウト
mathmatica	*	*	10	名詞	固有名詞	一般	*	*	*	mathmatica	mathmatica	mathmatica
Mathmatica	*	*	10	名詞	固有名詞	一般	*	*	*	mathmatica	mathmatica	mathmatica
MATHMATICA	*	*	10	名詞	固有名詞	一般	*	*	*	mathmatica	mathmatica	mathmatica
matlab	*	*	10	名詞	固有名詞	一般	*	*	*	matlab	matlab	matlab
MATLAB	*	*	10	名詞	固有名詞	一般	*	*	*	matlab	matlab	matlab
Matlab	*	*	10	名詞	固有名詞	一般	*	*	*	matlab	matlab	matlab
office	*	*	10	名詞	固有名詞	一般	*	*	*	Office	オフィス	オフィス
Office	*	*	10	名詞	固有名詞	一般	*	*	*	Office	オフィス	オフィス
ocu	*	*	10	名詞	固有名詞	一般	*	*	*	ocu	ocu	ocu
Ocu	*	*	10	名詞	固有名詞	一般	*	*	*	ocu	ocu	ocu
OCU	*	*	10	名詞	固有名詞	一般	*	*	*	ocu	ocu	ocu
ocuid	*	*	10	名詞	固有名詞	一般	*	*	*	ocuid	ocuid	ocuid
OCUID	*	*	10	名詞	固有名詞	一般	*	*	*	ocuid	ocuid	ocuid
OCUNET	*	*	10	名詞	固有名詞	一般	*	*	*	ocunet	ocunet	ocunet
ocunet	*	*	10	名詞	固有名詞	一般	*	*	*	ocunet	ocunet	ocunet
Pro	*	*	10	名詞	固有名詞	一般	*	*	*	pro	pro	pro
pro	*	*	10	名詞	固有名詞	一般	*	*	*	pro	pro	pro
PRO	*	*	10	名詞	固有名詞	一般	*	*	*	pro	pro	pro
Publisher	*	*	10	名詞	固有名詞	一般	*	*	*	publisher	publisher	publisher
Teams	*	*	10	名詞	固有名詞	一般	*	*	*	teams	teams	teams
teams	*	*	10	名詞	固有名詞	一般	*	*	*	teams	teams	teams
TEAMS	*	*	10	名詞	固有名詞	一般	*	*	*	teams	teams	teams
TrendMicro	*	*	10	名詞	固有名詞	一般	*	*	*	トレンドマイクロ	トレンドマイクロ	トレンドマイクロ
trendmicro	*	*	10	名詞	固有名詞	一般	*	*	*	トレンドマイクロ	トレンドマイクロ	トレンドマイクロ
Trendmicro	*	*	10	名詞	固有名詞	一般	*	*	*	トレンドマイクロ	トレンドマイクロ	トレンドマイクロ
TRENDMICRO	*	*	10	名詞	固有名詞	一般	*	*	*	トレンドマイクロ	トレンドマイクロ	トレンドマイクロ
unipa	*	*	10	名詞	固有名詞	一般	*	*	*	ユニバーサルパスポート	ユニパ	ユニパ
Unipa	*	*	10	名詞	固有名詞	一般	*	*	*	ユニバーサルパスポート	ユニパ	ユニパ
UNIPA	*	*	10	名詞	固有名詞	一般	*	*	*	ユニバーサルパスポート	ユニパ	ユニパ
ユニパ	*	*	10	名詞	固有名詞	一般	*	*	*	ユニバーサルパスポート	ユニパ	ユニパ
ユニバーサルパスポート	*	*	10	名詞	固有名詞	一般	*	*	*	ユニバーサルパスポート	ユニバーサルパスポート	ユニバーサルパスポート
update	*	*	10	名詞	固有名詞	一般	*	*	*	アップデート	アップデート	アップデート
Update	*	*	10	名詞	固有名詞	一般	*	*	*	アップデート	アップデート	アップデート
UPDATE	*	*	10	名詞	固有名詞	一般	*	*	*	アップデート	アップデート	アップデート
WEB	*	*	10	名詞	一般	一般	*	*	*	web	ウェブ	ウェブ
Web	*	*	10	名詞	一般	一般	*	*	*	web	ウェブ	ウェブ
WebAuth	*	*	10	名詞	固有名詞	一般	*	*	*	webauth	webauth	webauth
webauth	*	*	10	名詞	固有名詞	一般	*	*	*	webauth	webauth	webauth
WebEx	*	*	10	名詞	固有名詞	一般	*	*	*	webex	webex	webex
webex	*	*	10	名詞	固有名詞	一般	*	*	*	webex	webex	webex
Wifi	*	*	10	名詞	一般	一般	*	*	*	Wi-Fi	ワイファイ	ワイファイ
WINDOWS	*	*	10	名詞	固有名詞	一般	*	*	*	Windows	ウィンドウズ	ウィンドウズ
WIndows	*	*	10	名詞	固有名詞	一般	*	*	*	Windows	ウィンドウズ	ウィンドウズ
windows	*	*	10	名詞	固有名詞	一般	*	*	*	Windows	ウィンドウズ	ウィンドウズ
ZOOM	*	*	10	名詞	固有名詞	一般	*	*	*	Zoom	ズーム	ズーム
聞蔵	*	*	10	名詞	固有名詞	一般	*	*	*	聞蔵	きくぞう	きくぞう
Questions	first query	second query	cs >= 0.2	satisfy(0.2)	cs >= 0.3	satisfy(0.3)	cs >= 0.4	satisfy(0.4)	最大CS
（非常勤講師）メールアドレスが欲しい	メールアドレス	付与	1	1	1	1	1	1	0.68
eduroamの認証方法が分からない	eduroam	接続方法	4	1	4	1	4	1	1
eduroamの認証方法が分からない	eduroam	設定	2	1	2	1	2	1	0.78
MACアドレスを調べる	MACアドレス	調べる	1	0	1	0	1	0	0.35
Officeのインストール方法が分からない	インストール出来ない	office	7	1	5	1	3	1	0.64
Officeのインストール方法が分からない	officeのインストール	方法	10	0	5	0	2	0	0.63
Teamsの設定方法がわからない	Teams	設定	6	1	3	1	0	1	0.39
ThunderbirdにOCUメールを設定したい	Thunderbird	設定方法	5	1	5	1	4	1	0.53
ThunderbirdにOCUメールを設定したい	Thunderbird	OCUメール	7	1	5	1	5	1	0.77
Unipaのマニュアルが欲しい	Unipa	マニュアル	1	1	1	1	1	1	0.53
VPNに接続できない	vpn	繋がらない	5	1	1	1	0	0	0.32
VPNに接続できない	仮想ネットワーク	接続	5	0	1	0	1	0	0.45
VPNの接続方法がわからない	vpn	接続方法	4	1	4	1	2	1	0.62
VPNの接続方法がわからない	仮想ネットワーク	登録方法	10	1	5	1	4	1	0.91
VPNの接続方法がわからない	vpn	登録方法	5	1	1	1	0	0	0.33
Wi-Fiに接続できない	wi-fi	接続	3	0	3	0	2	0	0.52
Wi-Fiに接続できない	wi-fi	繋がらない	0	0	0	0	0	0	0.6
Wi-Fiのパスワードがわからない	wi-fi	パスワード	4	1	4	1	3	1	0.57
Windows10Proが欲しい	Windows10Pro	欲しい	2	1	2	1	2	1	0.51
Cisco WebEXの有料アカウントが欲しい	WebEx	有料	1	1	1	1	1	1	0.63
Zoomの有料アカウントが欲しい	zoom	有料アカウント	7	1	3	1	2	1	0.49
Zoomの有料アカウントが欲しい	Zoom	有償アカウント	7	1	4	1	2	1	0.53
Zoomの有料アカウントが欲しい	zoom	学生	1	1	1	1	1	1	0.67
Zoomのログインする方法を知りたい	zoom	ログイン	5	1	5	1	4	1	0.72
ウイルス対策ソフト ApexOneのインストール方法が分からない	ウイルス対策ソフト	インストール	6	1	5	1	3	1	0.79
ウイルス対策ソフト ApexOneのインストール方法が分からない	Apexone	インストール	7	1	5	1	2	1	0.67
ウイルス対策ソフト ApexOneのインストール方法が分からない	TrendMicro	ダウンロード出来ない	3	1	3	1	2	1	0.6
共有PCにOfficeをインストールしたい	共有PC	office	4	1	3	1	3	1	0.67
固定IPアドレスについて	IPアドレス	固定	1	1	1	1	1	1	0.53
全学認証仮パスワードがわからない	全学認証	仮パスワード	6	1	5	1	5	1	0.67
ネットワークに接続できない	ネットワーク	使用出来ない	2	1	2	1	1	1	0.5
ネットワークに接続できない	ネットワーク	使えない	1	0	1	0	1	0	0.43
ネットワークに接続できない	ネットワーク	切れる	9	1	0	0	0	0	0.28
プリンターで印刷できない	印刷できない	プリンター	4	1	2	1	2	1	0.81
プリンターで印刷できない	繋がらない	プリンター	3	0	1	0	1	0	0.51
プリンターで印刷できない	接続出来ない	プリンター	15	1	4	0	1	0	0.63
プリンターで印刷できない	プリンター	印刷できない	1	1	1	1	1	1	0.87
プリンターで印刷できない	プリンター	繋がらない	6	1	2	1	1	1	0.4
プリンターで印刷できない	プリンター	接続出来ない	1	1	1	1	1	1	0.4
名誉教授がVPNを使用したい	名誉教授	vpn	1	1	1	1	1	1	0.66
名誉教授向けのOCUIDについて	名誉教授	OCUID	2	1	1	1	1	1	0.66
迷惑メールに分類されてしまう	迷惑メール	OCUメール	1	1	1	1	1	1	0.66
メーリングリストを差出人に設定したい	メーリングリスト	設定	3	1	3	1	3	1	0.63
メーリングリストを差出人に設定したい	メーリングリスト	Thunderbird	1	1	1	1	1	1	0.6
メーリングリストを差出人に設定したい	メーリングリスト	差出人	4	1	4	1	4	1	0.53
メールアドレス (@st.osaka-cu.ac.jp) が使用できない	メールアドレス	使用出来ない	2	1	2	1	1	1	0.52
リモートデスクトップに接続できない	リモートデスクトップ	繋がらない	1	1	1	1	1	1	0.54
リモートデスクトップに接続できない	リモートデスクトップ	接続出来ない	4	1	3	1	3	1	0.58
全学認証パスワードが分からない	全学認証パスワード	分からない	8	1	7	1	6	1	0.63
全学認証パスワードの初期パスワードが分からない	全学認証	初期パスワード	5	1	5	1	4	1	0.89
		Average	4.08	0.86	2.64	0.82	1.96	0.78	0.5966
	import importlib
	# importlib.reload()
	import argparse
	import numpy as np
	import glob
	from sklearn.feature_extraction.text import TfidfVectorizer
	from sklearn.metrics.pairwise import cosine_similarity

	from IPython.display import display

	import pandas as pd

	import MeCab
	tagger = MeCab.Tagger("-Ochasen")
	import mojimoji
	import os
	import urllib

	from collections import Counter
	import collections

	# Paths of all .txt files directly under data/ocu2/; main() reads this module-level name.
	text_paths = glob.glob('data/ocu2/*.txt')
	from func import all
	# from func import q_get
	# from func import a_get
	# from func import load_stopwords
	# from func import preprocess
	# from func import get_cs
	# from func import find_top_n
	# from func import get_n_cs
	# from func import listing_query

	def main(args):
	    """Run the two-round interactive question search for ``args.query``.

	    Round 1 compares the query with every question document and prints the
	    documents whose cosine similarity is above zero (or "NotFound" when
	    there is none). Round 2 shows the nouns found in those documents, asks
	    the user for additional words, re-ranks the round-1 documents against
	    the extended query, and prints those with similarity above 0.2.

	    Args:
	        args: Parsed command-line arguments; only ``args.query`` is read.

	    Returns:
	        None. All results are written to standard output.
	    """
	    # Part-of-speech filter passed to preprocess. A fresh list is handed
	    # over on every call, as the original code did with separate literals.
	    pos_flags = ('名詞', '固有名詞', '動詞', '形容詞')

	    texts = all.q_get(text_paths)
	    query_texts = all.listing_query(args.query)
	    str_data = all.preprocess(pd.Series(texts), list(pos_flags))
	    str_query = all.preprocess(pd.Series(query_texts), list(pos_flags))

	    cs = all.get_cs(str_query, str_data)
	    n = all.get_len_series(str_data)  # number of documents in the data set
	    top_n_indices = all.find_top_n(n, cs)

	    max_index = np.argmax(cs)
	    max_cs = cs[max_index][0]
	    if not max_cs > 1e-10:
	        print("NotFound")
	        return

	    # Nouns / proper nouns of the matching documents. Only needed once a
	    # match exists, so it is computed after the NotFound check.
	    new_series = all.get_new_words(cs, texts)

	    # Round 1: show every document with a non-zero similarity and keep it
	    # as a candidate for round 2.
	    new_texts = []
	    for top_n_index in top_n_indices:
	        n_cs = all.get_n_cs(cs, top_n_index, top_n_indices)
	        if n_cs > 1e-10:
	            print(f"質問データ #{top_n_index}, コサイン類似度: {n_cs}, '{texts[top_n_index]}'")
	            new_texts.append(texts[top_n_index])

	    new_series_count = all.select_topic(new_series)  # word occurrence counts
	    print(new_series_count)

	    # Round 2: extend the query with the user's words and re-rank the
	    # candidates collected above.
	    processed_new_texts_series = all.preprocess(pd.Series(new_texts), list(pos_flags))
	    new_input = all.new_input()
	    new_str_query = all.new_str_query(new_input, str_query)
	    new_top_n_indices = all.new_question_answer(new_str_query, processed_new_texts_series, n)
	    print(new_top_n_indices)

	    new_cs = all.get_cs(new_str_query, processed_new_texts_series)
	    # The highest similarity VALUE is needed here. np.argmax would give the
	    # position of that value, which must not be compared with 0.2.
	    new_cs_max = np.max(new_cs)
	    if new_cs_max <= 0.2:
	        print("検索ワードを変えてやり直してください")
	        return

	    for new_top_n_index in new_top_n_indices:
	        new_n_cs = all.get_n_cs(new_cs, new_top_n_index, new_top_n_indices)
	        if new_n_cs > 0.2:
	            print(f"質問データ #{new_top_n_index}, コサイン類似度: {new_n_cs}, '{new_texts[new_top_n_index]}'")
	        elif new_n_cs <= 0.2:
	            # Stop at the first document at or below the threshold;
	            # presumably the indices are ordered by descending similarity
	            # (verify against find_top_n).
	            break
	    # Printed after the loop so the list is always terminated, even when
	    # every document exceeded the threshold.
	    print("以上です")

	if __name__ == "__main__":
	    # Command-line entry point: exactly one positional argument, the query text.
	    cli = argparse.ArgumentParser()
	    cli.add_argument("query", type=str)
	    main(cli.parse_args())