kepoorz · April 5, 2025 11:25
diff --git a/summerize_notes.py b/summerize_notes.py
 #!/usr/bin/env python3
 # -*- coding: utf-8 -*-

 r"""
 Gemini APIを使用してObsidianのノートを要約するスクリプトです。

 使用には、Gemini APIが必要です。
 https://developers.google.com/generative-ai/gemini/get-started

 指定されたディレクトリ内のMarkdownファイルを処理し、各ファイルの簡潔な要約を生成して、事前定義されたテンプレート形式でファイルに追加します。
 自分用で、動作保証してないので、使い方がわかっている方のみ使用してください。

 ## 主な機能
 - **summarize_file**: Gemini APIを使用してMarkdownファイルの内容を要約し、その要約をファイルに追加します
 - **process_files**: ディレクトリ内のすべてのMarkdownファイルを処理し、各ファイルの要約を生成し、同時実行を管理します

 ## コマンドラインサンプル
 python summarize_notes.py --directory "/path/to/obsidian/vault" --api_key "your_gemini_api_key_here"
 python summarize_notes.py --directory "/path/to/obsidian/vault" --api_key "your_gemini_api_key_here" --max_concurrent 3 --max_files 50 --tag_pattern "project|work"
 windowsの場合
 python summarize_notes.py --directory "C:\Users\YourName\Documents\ObsidianVault" --api_key "your_gemini_api_key_here"

 ## コマンドライン引数
 - **--directory**: Obsidianノート（Markdownファイル）を含むディレクトリのパス
 - **--api_key**: 必須。Gemini APIにアクセスするためのAPIキー
 - **--max_concurrent**: オプション。同時API要求の最大数（デフォルト：5）
 - **--max_files**: オプション。処理するファイルの最大数
 - **--tag_pattern**: オプション。ファイル名でフィルタリングするための正規表現パターン

 ## 使用方法
 コマンドラインから必要な引数を指定してスクリプトを実行します。スクリプトは指定されたディレクトリ内のMarkdownファイルを処理し、Gemini APIを使用して要約を生成し、その要約をファイルに追加します。

 ## 注意点
 - 既に処理されたファイルや既に要約が含まれているファイルはスキップされます
 - APIコストは入力と出力のトークン数に基づいて計算されます
 - 処理済みファイルは重複処理を避けるために`processed_files.txt`ファイルに記録されます

 """

 import os
 import asyncio
 import aiohttp
 import google.generativeai as genai
 import argparse
 import time
 import logging
 from tqdm.asyncio import tqdm_asyncio
 import re

 # Logging setup: every record goes to both a log file and the console.
 logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=[
        # The log messages are Japanese, so pin the file encoding instead of
        # relying on the platform default (which may be unable to encode them).
        logging.FileHandler("summarize_notes.log", encoding="utf-8"),
        logging.StreamHandler(),
    ],
 )
 logger = logging.getLogger(__name__)

 # Wrapper for a generated summary: a "## 要約" heading followed by the text,
 # padded with blank lines so it stays separated from the surrounding note.
 SUMMARY_TEMPLATE = "\n\n## 要約\n{}\n\n"

 async def summarize_file(filepath, api_key, semaphore, model, processed_files, total_tokens):
    """Summarize one Markdown note with the model and insert the summary into it.

    The summary is wrapped in ``SUMMARY_TEMPLATE`` and placed directly after a
    leading YAML front matter block (``---`` ... ``---``) when the file starts
    with one, otherwise at the very top of the file.

    Args:
        filepath: Path of the Markdown file to read and rewrite in place.
        api_key: Not used by this function; kept so existing callers still work.
        semaphore: Async context manager that limits concurrent requests.
        model: Object exposing ``generate_content(prompt, generation_config=...)``.
        processed_files: Container of paths that must be skipped.
        total_tokens: Not used by this function; kept for call compatibility.

    Returns:
        ``(input_tokens, output_tokens)`` when a summary was written (the input
        count is always positive), or ``(0, 0)`` when the file was skipped or
        an ``Exception`` occurred. Such exceptions are logged, not raised.
    """
    async with semaphore:  # limit the number of simultaneous requests
        try:
            # Skip files that were recorded as done in an earlier run.
            if filepath in processed_files:
                logger.info(f"スキップ (既に処理済み): {filepath}")
                return 0, 0

            with open(filepath, 'r', encoding='utf-8') as f:
                content = f.read()

            # Skip notes that already contain a summary heading.
            if "## 要約" in content:
                logger.info(f"スキップ (既に要約あり): {filepath}")
                return 0, 0

            # An empty note has nothing to summarize; do not spend a request on it.
            if not content.strip():
                logger.info(f"スキップ (内容が空): {filepath}")
                return 0, 0

            # Same prompt text as before, spelled out explicitly so that its
            # whitespace does not depend on how this source file is indented.
            prompt = (
                "\n"
                "            以下のテキストを300文字程度で要約してください。要点を箇条書きではなく文章で簡潔にまとめてください。\n"
                "\n"
                f"            {content}\n"
                "            "
            )

            generation_config = {
                'max_output_tokens': 512,
                'temperature': 0.3,
                'top_p': 0.95,
                'top_k': 40,
            }

            # generate_content is a blocking call, so run it in a worker thread.
            response = await asyncio.to_thread(
                model.generate_content,
                prompt,
                generation_config=generation_config,
            )

            summary = response.text.strip()
            if not summary:
                # Never write an empty "## 要約" section into the note.
                raise ValueError("空の要約が返されました")

            # Prefer token counts reported on the response; otherwise fall back
            # to the rough "4 characters per token" estimate.
            # NOTE(review): the usage_metadata field names are taken from the
            # google-generativeai response object; confirm for the installed version.
            usage = getattr(response, "usage_metadata", None)
            input_tokens = getattr(usage, "prompt_token_count", 0) or len(prompt) // 4
            output_tokens = getattr(usage, "candidates_token_count", 0) or len(summary) // 4
            # The caller treats input_tokens > 0 as "summary written".
            input_tokens = max(1, input_tokens)

            summary_text = SUMMARY_TEMPLATE.format(summary)

            # Front matter must start at the first character of the file. The
            # closing "---" may be followed by a newline or by end of file, and
            # the block between the two markers may be empty.
            match = re.match(r'---\n(?:.*?\n)?---(?:\n|\Z)', content, re.DOTALL)
            insert_at = match.end() if match else 0
            updated_content = content[:insert_at] + summary_text + content[insert_at:]

            with open(filepath, 'w', encoding='utf-8') as f:
                f.write(updated_content)

            logger.info(f"要約を追加しました: {filepath}")
            return input_tokens, output_tokens

        except Exception as e:
            # Per-file boundary: one failing note must not abort the whole batch.
            logger.error(f"エラー ({filepath}): {e}")
            return 0, 0

 async def process_files(directory, api_key, max_concurrent=5, max_files=None, tag_pattern=None):
    """Summarize the Markdown files found under ``directory``.

    Files listed in ``processed_files.txt`` (in the current working directory)
    are excluded before ``max_files`` is applied, so repeated runs with a limit
    keep advancing through the vault. Files whose summary was written during
    this run are appended to that file.

    Args:
        directory: Root directory that is walked recursively for ``*.md`` files.
        api_key: API key passed to ``genai.configure``.
        max_concurrent: Upper bound on simultaneous requests (values below 1
            are treated as 1).
        max_files: Optional cap on the number of files handled in this run;
            ``None``, 0 or a negative value means no cap.
        tag_pattern: Optional regular expression searched in each file name.

    Returns:
        None. Progress, token totals and the estimated cost are logged.
    """
    if not directory or not os.path.isdir(directory):
        logger.error(f"ディレクトリが見つかりません: {directory}")
        return

    # Load the record of files finished in earlier runs.
    processed_files_path = "processed_files.txt"
    processed_files = set()
    if os.path.exists(processed_files_path):
        with open(processed_files_path, "r", encoding="utf-8") as f:
            processed_files = {line.strip() for line in f if line.strip()}

    # Semaphore(0) would block forever and a negative value raises, so clamp.
    semaphore = asyncio.Semaphore(max(1, max_concurrent or 1))

    genai.configure(api_key=api_key)
    model = genai.GenerativeModel('gemini-1.5-flash-002')

    # Collect candidate files; compile the name filter once instead of per file.
    name_filter = re.compile(tag_pattern) if tag_pattern else None
    markdown_files = []
    already_done = 0
    for root, _, files in os.walk(directory):
        for file in files:
            if not file.endswith(".md"):
                continue
            if name_filter and not name_filter.search(file):
                continue
            filepath = os.path.join(root, file)
            if filepath in processed_files:
                # Excluded here so it does not use up a max_files slot.
                already_done += 1
                continue
            markdown_files.append(filepath)

    # os.walk order is unspecified; sort so a limited run picks a stable subset.
    markdown_files.sort()

    if already_done:
        logger.info(f"スキップ (既に処理済み): {already_done} 件")

    if max_files and max_files > 0:
        markdown_files = markdown_files[:max_files]

    logger.info(f"処理対象ファイル数: {len(markdown_files)}")

    total_tokens = {"input": 0, "output": 0}
    tasks = [
        summarize_file(filepath, api_key, semaphore, model, processed_files, total_tokens)
        for filepath in markdown_files
    ]

    # Run everything concurrently behind a progress bar.
    results = await tqdm_asyncio.gather(*tasks, desc="ファイル処理中")

    # Record successes; summarize_file reports success as input_tokens > 0.
    with open(processed_files_path, "a", encoding="utf-8") as f:
        for filepath, (input_tokens, output_tokens) in zip(markdown_files, results):
            if input_tokens > 0:
                f.write(f"{filepath}\n")
                total_tokens["input"] += input_tokens
                total_tokens["output"] += output_tokens

    # NOTE(review): the per-1K-token USD rates below are hard-coded; confirm
    # that they match the current pricing of the model in use.
    input_cost = total_tokens["input"] / 1000 * 0.00025
    output_cost = total_tokens["output"] / 1000 * 0.00075
    total_cost = input_cost + output_cost

    logger.info("処理完了")
    logger.info(f"入力トークン: {total_tokens['input']} トークン (コスト: ${input_cost:.4f})")
    logger.info(f"出力トークン: {total_tokens['output']} トークン (コスト: ${output_cost:.4f})")
    logger.info(f"合計コスト: ${total_cost:.4f}")

 if __name__ == "__main__":
    # Command-line entry point: parse the options and run the async pipeline.
    parser = argparse.ArgumentParser(description="Obsidianのノートの要約を自動生成するスクリプト")
    # The directory is mandatory: process_files has nothing to walk without it.
    parser.add_argument("--directory", required=True, help="Obsidianのディレクトリパス")
    parser.add_argument("--api_key", required=True, help="Gemini API キー")
    parser.add_argument("--max_concurrent", type=int, default=5, help="同時リクエスト数の上限")
    parser.add_argument("--max_files", type=int, help="処理する最大ファイル数")
    parser.add_argument("--tag_pattern", help="処理対象のファイル名のパターン（正規表現）")

    args = parser.parse_args()

    asyncio.run(process_files(
        args.directory,
        args.api_key,
        args.max_concurrent,
        args.max_files,
        args.tag_pattern,
    ))
	#!/usr/bin/env python3
	# -*- coding: utf-8 -*-

	r"""
	Gemini APIを使用してObsidianのノートを要約するスクリプトです。

	使用には、Gemini APIが必要です。
	https://developers.google.com/generative-ai/gemini/get-started

	指定されたディレクトリ内のMarkdownファイルを処理し、各ファイルの簡潔な要約を生成して、事前定義されたテンプレート形式でファイルに追加します。
	自分用で、動作保証してないので、使い方がわかっている方のみ使用してください。

	## 主な機能
	- summarize_file: Gemini APIを使用してMarkdownファイルの内容を要約し、その要約をファイルに追加します
	- process_files: ディレクトリ内のすべてのMarkdownファイルを処理し、各ファイルの要約を生成し、同時実行を管理します

	## コマンドラインサンプル
	python summarize_notes.py --directory "/path/to/obsidian/vault" --api_key "your_gemini_api_key_here"
	python summarize_notes.py --directory "/path/to/obsidian/vault" --api_key "your_gemini_api_key_here" --max_concurrent 3 --max_files 50 --tag_pattern "project|work"
	windowsの場合
	python summarize_notes.py --directory "C:\Users\YourName\Documents\ObsidianVault" --api_key "your_gemini_api_key_here"

	## コマンドライン引数
	- --directory: Obsidianノート（Markdownファイル）を含むディレクトリのパス
	- --api_key: 必須。Gemini APIにアクセスするためのAPIキー
	- --max_concurrent: オプション。同時API要求の最大数（デフォルト：5）
	- --max_files: オプション。処理するファイルの最大数
	- --tag_pattern: オプション。ファイル名でフィルタリングするための正規表現パターン

	## 使用方法
	コマンドラインから必要な引数を指定してスクリプトを実行します。スクリプトは指定されたディレクトリ内のMarkdownファイルを処理し、Gemini APIを使用して要約を生成し、その要約をファイルに追加します。

	## 注意点
	- 既に処理されたファイルや既に要約が含まれているファイルはスキップされます
	- APIコストは入力と出力のトークン数に基づいて計算されます
	- 処理済みファイルは重複処理を避けるために`processed_files.txt`ファイルに記録されます

	"""

	import os
	import asyncio
	import aiohttp
	import google.generativeai as genai
	import argparse
	import time
	import logging
	from tqdm.asyncio import tqdm_asyncio
	import re

	# Logging setup: every record goes to both a log file and the console.
	logging.basicConfig(
	    level=logging.INFO,
	    format='%(asctime)s - %(levelname)s - %(message)s',
	    handlers=[
	        # The log messages are Japanese, so pin the file encoding instead of
	        # relying on the platform default (which may be unable to encode them).
	        logging.FileHandler("summarize_notes.log", encoding="utf-8"),
	        logging.StreamHandler(),
	    ],
	)
	logger = logging.getLogger(__name__)

	# Wrapper for a generated summary: a "## 要約" heading followed by the text,
	# padded with blank lines so it stays separated from the surrounding note.
	SUMMARY_TEMPLATE = "\n\n## 要約\n{}\n\n"

	async def summarize_file(filepath, api_key, semaphore, model, processed_files, total_tokens):
	    """Summarize one Markdown note with the model and insert the summary into it.

	    The summary is wrapped in ``SUMMARY_TEMPLATE`` and placed directly after a
	    leading YAML front matter block (``---`` ... ``---``) when the file starts
	    with one, otherwise at the very top of the file.

	    Args:
	        filepath: Path of the Markdown file to read and rewrite in place.
	        api_key: Not used by this function; kept so existing callers still work.
	        semaphore: Async context manager that limits concurrent requests.
	        model: Object exposing ``generate_content(prompt, generation_config=...)``.
	        processed_files: Container of paths that must be skipped.
	        total_tokens: Not used by this function; kept for call compatibility.

	    Returns:
	        ``(input_tokens, output_tokens)`` when a summary was written (the input
	        count is always positive), or ``(0, 0)`` when the file was skipped or
	        an ``Exception`` occurred. Such exceptions are logged, not raised.
	    """
	    async with semaphore:  # limit the number of simultaneous requests
	        try:
	            # Skip files that were recorded as done in an earlier run.
	            if filepath in processed_files:
	                logger.info(f"スキップ (既に処理済み): {filepath}")
	                return 0, 0

	            with open(filepath, 'r', encoding='utf-8') as f:
	                content = f.read()

	            # Skip notes that already contain a summary heading.
	            if "## 要約" in content:
	                logger.info(f"スキップ (既に要約あり): {filepath}")
	                return 0, 0

	            # An empty note has nothing to summarize; do not spend a request on it.
	            if not content.strip():
	                logger.info(f"スキップ (内容が空): {filepath}")
	                return 0, 0

	            # Prompt text spelled out explicitly so that its whitespace does
	            # not depend on how this source file happens to be indented.
	            prompt = (
	                "\n"
	                "            以下のテキストを300文字程度で要約してください。要点を箇条書きではなく文章で簡潔にまとめてください。\n"
	                "\n"
	                f"            {content}\n"
	                "            "
	            )

	            generation_config = {
	                'max_output_tokens': 512,
	                'temperature': 0.3,
	                'top_p': 0.95,
	                'top_k': 40,
	            }

	            # generate_content is a blocking call, so run it in a worker thread.
	            response = await asyncio.to_thread(
	                model.generate_content,
	                prompt,
	                generation_config=generation_config,
	            )

	            summary = response.text.strip()
	            if not summary:
	                # Never write an empty "## 要約" section into the note.
	                raise ValueError("空の要約が返されました")

	            # Prefer token counts reported on the response; otherwise fall back
	            # to the rough "4 characters per token" estimate.
	            # NOTE(review): the usage_metadata field names are taken from the
	            # google-generativeai response object; confirm for the installed version.
	            usage = getattr(response, "usage_metadata", None)
	            input_tokens = getattr(usage, "prompt_token_count", 0) or len(prompt) // 4
	            output_tokens = getattr(usage, "candidates_token_count", 0) or len(summary) // 4
	            # The caller treats input_tokens > 0 as "summary written".
	            input_tokens = max(1, input_tokens)

	            summary_text = SUMMARY_TEMPLATE.format(summary)

	            # Front matter must start at the first character of the file. The
	            # closing "---" may be followed by a newline or by end of file, and
	            # the block between the two markers may be empty.
	            match = re.match(r'---\n(?:.*?\n)?---(?:\n|\Z)', content, re.DOTALL)
	            insert_at = match.end() if match else 0
	            updated_content = content[:insert_at] + summary_text + content[insert_at:]

	            with open(filepath, 'w', encoding='utf-8') as f:
	                f.write(updated_content)

	            logger.info(f"要約を追加しました: {filepath}")
	            return input_tokens, output_tokens

	        except Exception as e:
	            # Per-file boundary: one failing note must not abort the whole batch.
	            logger.error(f"エラー ({filepath}): {e}")
	            return 0, 0

	async def process_files(directory, api_key, max_concurrent=5, max_files=None, tag_pattern=None):
	    """Summarize the Markdown files found under ``directory``.

	    Files listed in ``processed_files.txt`` (in the current working directory)
	    are excluded before ``max_files`` is applied, so repeated runs with a limit
	    keep advancing through the vault. Files whose summary was written during
	    this run are appended to that file.

	    Args:
	        directory: Root directory that is walked recursively for ``*.md`` files.
	        api_key: API key passed to ``genai.configure``.
	        max_concurrent: Upper bound on simultaneous requests (values below 1
	            are treated as 1).
	        max_files: Optional cap on the number of files handled in this run;
	            ``None``, 0 or a negative value means no cap.
	        tag_pattern: Optional regular expression searched in each file name.

	    Returns:
	        None. Progress, token totals and the estimated cost are logged.
	    """
	    if not directory or not os.path.isdir(directory):
	        logger.error(f"ディレクトリが見つかりません: {directory}")
	        return

	    # Load the record of files finished in earlier runs.
	    processed_files_path = "processed_files.txt"
	    processed_files = set()
	    if os.path.exists(processed_files_path):
	        with open(processed_files_path, "r", encoding="utf-8") as f:
	            processed_files = {line.strip() for line in f if line.strip()}

	    # Semaphore(0) would block forever and a negative value raises, so clamp.
	    semaphore = asyncio.Semaphore(max(1, max_concurrent or 1))

	    genai.configure(api_key=api_key)
	    model = genai.GenerativeModel('gemini-1.5-flash-002')

	    # Collect candidate files; compile the name filter once instead of per file.
	    name_filter = re.compile(tag_pattern) if tag_pattern else None
	    markdown_files = []
	    already_done = 0
	    for root, _, files in os.walk(directory):
	        for file in files:
	            if not file.endswith(".md"):
	                continue
	            if name_filter and not name_filter.search(file):
	                continue
	            filepath = os.path.join(root, file)
	            if filepath in processed_files:
	                # Excluded here so it does not use up a max_files slot.
	                already_done += 1
	                continue
	            markdown_files.append(filepath)

	    # os.walk order is unspecified; sort so a limited run picks a stable subset.
	    markdown_files.sort()

	    if already_done:
	        logger.info(f"スキップ (既に処理済み): {already_done} 件")

	    if max_files and max_files > 0:
	        markdown_files = markdown_files[:max_files]

	    logger.info(f"処理対象ファイル数: {len(markdown_files)}")

	    total_tokens = {"input": 0, "output": 0}
	    tasks = [
	        summarize_file(filepath, api_key, semaphore, model, processed_files, total_tokens)
	        for filepath in markdown_files
	    ]

	    # Run everything concurrently behind a progress bar.
	    results = await tqdm_asyncio.gather(*tasks, desc="ファイル処理中")

	    # Record successes; summarize_file reports success as input_tokens > 0.
	    with open(processed_files_path, "a", encoding="utf-8") as f:
	        for filepath, (input_tokens, output_tokens) in zip(markdown_files, results):
	            if input_tokens > 0:
	                f.write(f"{filepath}\n")
	                total_tokens["input"] += input_tokens
	                total_tokens["output"] += output_tokens

	    # NOTE(review): the per-1K-token USD rates below are hard-coded; confirm
	    # that they match the current pricing of the model in use.
	    input_cost = total_tokens["input"] / 1000 * 0.00025
	    output_cost = total_tokens["output"] / 1000 * 0.00075
	    total_cost = input_cost + output_cost

	    logger.info("処理完了")
	    logger.info(f"入力トークン: {total_tokens['input']} トークン (コスト: ${input_cost:.4f})")
	    logger.info(f"出力トークン: {total_tokens['output']} トークン (コスト: ${output_cost:.4f})")
	    logger.info(f"合計コスト: ${total_cost:.4f}")

	if __name__ == "__main__":
	    # Command-line entry point: parse the options and run the async pipeline.
	    parser = argparse.ArgumentParser(description="Obsidianのノートの要約を自動生成するスクリプト")
	    # The directory is mandatory: process_files has nothing to walk without it.
	    parser.add_argument("--directory", required=True, help="Obsidianのディレクトリパス")
	    parser.add_argument("--api_key", required=True, help="Gemini API キー")
	    parser.add_argument("--max_concurrent", type=int, default=5, help="同時リクエスト数の上限")
	    parser.add_argument("--max_files", type=int, help="処理する最大ファイル数")
	    parser.add_argument("--tag_pattern", help="処理対象のファイル名のパターン（正規表現）")

	    args = parser.parse_args()

	    asyncio.run(process_files(
	        args.directory,
	        args.api_key,
	        args.max_concurrent,
	        args.max_files,
	        args.tag_pattern,
	    ))