katoy · November 3, 2024 12:19
diff --git a/sample-with_usb_camera_and_speach-02.py b/sample-with_usb_camera_and_speach-02.py
 """
 このプログラムは、Raspberry Pi 上でカメラ映像から物体を検出し、検出結果に基づいた説明を生成し、
 音声で読み上げる機能を提供します。

 主な機能:
 1. カメラ映像から物体検出: YOLO モデルを使用して、カメラ映像から物体を検出し、各物体に枠とラベルを描画します。
 2. 検出結果の説明生成: OpenAI API を用いて、検出された物体についての説明文を生成します。
 3. 音声出力: gTTS (Google Text-to-Speech) を使い、生成された説明を音声で読み上げます。
 4. 操作方法:
   - SPACEキー: 検出された物体に関する説明を生成し、音声で読み上げます。
   - ESCキー: 音声の読み上げを停止します。
   - Qキー: プログラムを終了します。
 """

 import cv2
 import openai
 import os
 import sys
 import logging
 import pygame
 import threading
 from io import BytesIO
 from dotenv import load_dotenv
 from gtts import gTTS
 from ultralytics import YOLO
 from contextlib import contextmanager

 # Initial setup (runs at import time; order matters)
 load_dotenv()  # must run before the os.getenv call below so a .env-provided key is visible
 openai.api_key = os.getenv("OPENAI_API_KEY")  # os.getenv yields None when the variable is unset
 pygame.mixer.init()  # the mixer is used later by play_audio() and the key handlers in main()
 logging.getLogger('ultralytics').setLevel(logging.CRITICAL)  # drop ultralytics log records below CRITICAL

 @contextmanager
 def suppress_output():
    """Silence ``sys.stdout`` and ``sys.stderr`` for the duration of a ``with`` block.

    Both names are pointed at a writable handle on ``os.devnull`` on entry and
    restored to their previous objects on exit, even if the body raises.
    Only the Python-level ``sys`` attributes are swapped.
    """
    saved_streams = (sys.stdout, sys.stderr)
    sink = open(os.devnull, 'w')
    try:
        sys.stdout = sys.stderr = sink
        yield
    finally:
        # Restore first, then close, so nothing ever writes to a closed sink.
        sys.stdout, sys.stderr = saved_streams
        sink.close()

 # Load the YOLO model once at import time; detect_objects() reuses this instance for every frame.
 model = YOLO("yolov9s.pt")

 def detect_objects(frame):
    """Run the module-level YOLO ``model`` on one frame and list what it found.

    Returns a list of ``(label, bbox)`` tuples, one per detected box.
    ``label`` is looked up in the result's ``names`` mapping using the box's
    first class id, and ``bbox`` is ``box.xyxy[0]`` converted with ``tolist()``.
    """
    # Inference runs with stdout/stderr redirected so it cannot print to the terminal.
    with suppress_output():
        predictions = model(frame)

    found = []
    for prediction in predictions:
        for box in prediction.boxes:
            class_id = int(box.cls[0])
            found.append((prediction.names[class_id], box.xyxy[0].tolist()))
    return found

 def draw_boxes(frame, detections):
    """Draw a rectangle and a text label on ``frame`` for every detection.

    Args:
        frame: Image handed to ``cv2.rectangle`` / ``cv2.putText``; it is
            modified in place.
        detections: Iterable of ``(label, bbox)`` pairs where ``bbox`` holds
            four numbers unpacked as ``x1, y1, x2, y2``.

    Returns:
        None.
    """
    colour = (0, 0, 255)  # red, given OpenCV's default BGR channel order
    for label, bbox in detections:
        x1, y1, x2, y2 = map(int, bbox)
        cv2.rectangle(frame, (x1, y1), (x2, y2), colour, 2)
        # putText's origin is the text's bottom-left corner.  Placing it 10 px
        # above the box pushes the label off-screen for boxes that touch the
        # top of the frame, so in that case draw it just inside the box.
        label_y = y1 - 10
        if label_y < 15:
            label_y = y1 + 15
        cv2.putText(frame, label, (x1, label_y), cv2.FONT_HERSHEY_SIMPLEX, 0.5, colour, 2)

 def request_openai_completion(prompt):
    """Send ``prompt`` to the OpenAI chat completion endpoint and return the reply text.

    The request uses the ``gpt-3.5-turbo`` model, a fixed Japanese system
    message and ``max_tokens=100``.  The content of the first choice is
    returned with leading and trailing whitespace stripped.
    """
    system_message = {"role": "system", "content": "あなたは物体認識の結果を説明するアシスタントです。"}
    user_message = {"role": "user", "content": prompt}
    response = openai.ChatCompletion.create(
        model="gpt-3.5-turbo",
        messages=[system_message, user_message],
        max_tokens=100,
    )
    first_choice = response.choices[0]
    return first_choice.message['content'].strip()

 def trim_description(description, max_length=300):
    """Shorten ``description`` to at most ``max_length`` characters.

    Text that already fits is returned unchanged.  Otherwise the text is cut
    after the last sentence-ending mark found within the first ``max_length``
    characters; if there is none, it is cut hard at ``max_length``.

    Args:
        description: Text to shorten.
        max_length: Maximum number of characters to keep.

    Returns:
        The shortened (or original) string.
    """
    if len(description) <= max_length:
        return description

    # Same set of marks that is_sentence_complete() accepts, so text ending in
    # "!" or "." is not chopped mid-sentence just because it lacks a "。".
    sentence_endings = ('.', '!', '?', '。', '！', '？')
    window = description[:max_length]
    cut = max(window.rfind(mark) for mark in sentence_endings)
    if cut == -1:
        return window
    return window[:cut + 1]

 def generate_description(detections, max_length=300, max_rounds=5):
    """Ask the OpenAI API to describe the detected objects.

    The first request lists the detected labels.  If the reply does not end
    on a sentence boundary, follow-up "please continue" requests are sent and
    the replies are concatenated.

    Args:
        detections: Iterable of ``(label, bbox)`` pairs; only the labels are used.
        max_length: Once the accumulated text exceeds this many characters it
            is passed through ``trim_description`` and returned.
        max_rounds: Upper bound on the number of API requests.  Without a
            bound, a reply that is empty or never ends in punctuation would
            keep the loop (and the API billing) running forever.

    Returns:
        The description text, or the fixed failure message when the API call
        raises or no text was produced at all.
    """
    failure_message = "説明生成に失敗しました。"
    labels = [label for label, _ in detections]
    prompt = f"以下の物体を検出しました: {', '.join(labels)}. これらの物体について説明してください。"

    full_description = ""
    for _ in range(max_rounds):
        try:
            chunk = request_openai_completion(prompt)
        except Exception as e:
            # Boundary with an external service: report and fall back.
            print(f"OpenAI API エラー: {e}")
            return failure_message

        if not chunk:
            # An empty reply adds nothing; asking again would only repeat it.
            break
        full_description += chunk
        if len(full_description) > max_length:
            return trim_description(full_description, max_length)
        if is_sentence_complete(full_description):
            return full_description
        prompt = "続けてください: " + full_description

    return full_description or failure_message

 def is_sentence_complete(text):
    """Report whether ``text`` ends with a sentence-final punctuation mark.

    ASCII and full-width periods, exclamation marks and question marks count.
    Anything else, including an empty string or trailing whitespace, yields
    ``False``.
    """
    closing_marks = ('.', '!', '?', '。', '！', '？')
    return any(text.endswith(mark) for mark in closing_marks)

 def play_audio(description):
    """Synthesize ``description`` as Japanese speech and play it to completion.

    The audio is generated with gTTS into an in-memory buffer, loaded into
    ``pygame.mixer.music`` as MP3 and played.  The function returns once
    playback is no longer busy (for example after ``music.stop()`` is called
    elsewhere).

    Args:
        description: Text to read aloud.

    Returns:
        None.  Any exception is caught and reported on stdout rather than
        propagated.
    """
    try:
        tts = gTTS(description, lang="ja")
        audio_stream = BytesIO()
        tts.write_to_fp(audio_stream)
        audio_stream.seek(0)  # rewind so the mixer reads from the start

        pygame.mixer.music.load(audio_stream, "mp3")
        pygame.mixer.music.play()
        # Poll with a sleep instead of spinning: a bare `continue` loop pins a
        # CPU core and competes with the detection loop for the interpreter.
        while pygame.mixer.music.get_busy():
            pygame.time.wait(50)
    except Exception as e:
        print(f"音声出力エラー: {e}")

 def speak_description(description):
    """Start reading ``description`` aloud without blocking the caller.

    Playback is delegated to ``play_audio`` on a newly started thread.
    """
    worker = threading.Thread(target=play_audio, args=(description,))
    worker.start()

 def _stop_audio():
    """Stop music playback if the mixer is initialised and something is playing."""
    if pygame.mixer.get_init() and pygame.mixer.music.get_busy():
        pygame.mixer.music.stop()

 def main():
    """Run the capture / detect / display loop until the user quits.

    Opens camera device 0 and, for every frame, runs detection, draws the
    boxes and shows the frame in a window.  Keys read via ``cv2.waitKey``:

    - SPACE: generate a description of the current detections, print it and
      start reading it aloud.
    - ESC: stop audio playback.
    - q / Q: stop audio playback and leave the loop.

    The camera is released and all OpenCV windows are destroyed on the way
    out, including when an exception escapes the loop.
    """
    cap = cv2.VideoCapture(0)
    if not cap.isOpened():
        print("カメラにアクセスできません。カメラデバイス番号を確認してください。")
        return

    # Accept both cases so quitting still works with Caps Lock / Shift held.
    quit_keys = (ord('q'), ord('Q'))

    try:
        while True:
            ret, frame = cap.read()
            if not ret:
                print("フレームをキャプチャできませんでした。")
                break

            detections = detect_objects(frame)
            draw_boxes(frame, detections)
            cv2.imshow("USB Camera - YOLOv8 Object Detection", frame)
            # Keep only the low byte of the key code before comparing.
            key = cv2.waitKey(10) & 0xFF

            if key == ord(' '):
                print("SPACEキーが押されました。説明を生成します...")
                if detections:
                    # NOTE: this call is synchronous, so the window does not
                    # refresh until the description has been generated.
                    description = generate_description(detections)
                    print("AIからの説明:", description)
                    print("-----------------")
                    speak_description(description)
                else:
                    print("オブジェクトが検出されませんでした。")
                    print("-----------------")

            elif key == 27:  # ESC key
                print("ESCキーが押されました。音声を停止します...")
                _stop_audio()

            elif key in quit_keys:
                print("プログラムを終了します...")
                _stop_audio()
                break

    finally:
        cap.release()
        cv2.destroyAllWindows()

 # Start the camera loop only when this file is executed as a script, not when it is imported.
 if __name__ == "__main__":
    main()
	"""
	このプログラムは、Raspberry Pi 上でカメラ映像から物体を検出し、検出結果に基づいた説明を生成し、
	音声で読み上げる機能を提供します。

	主な機能:
	1. カメラ映像から物体検出: YOLO モデルを使用して、カメラ映像から物体を検出し、各物体に枠とラベルを描画します。
	2. 検出結果の説明生成: OpenAI API を用いて、検出された物体についての説明文を生成します。
	3. 音声出力: gTTS (Google Text-to-Speech) を使い、生成された説明を音声で読み上げます。
	4. 操作方法:
	- SPACEキー: 検出された物体に関する説明を生成し、音声で読み上げます。
	- ESCキー: 音声の読み上げを停止します。
	- Qキー: プログラムを終了します。
	"""

	import cv2
	import openai
	import os
	import sys
	import logging
	import pygame
	import threading
	from io import BytesIO
	from dotenv import load_dotenv
	from gtts import gTTS
	from ultralytics import YOLO
	from contextlib import contextmanager

	# Initial setup (runs at import time; order matters)
	load_dotenv()  # must run before the os.getenv call below so a .env-provided key is visible
	openai.api_key = os.getenv("OPENAI_API_KEY")  # os.getenv yields None when the variable is unset
	pygame.mixer.init()  # the mixer is used later by play_audio() and the key handlers in main()
	logging.getLogger('ultralytics').setLevel(logging.CRITICAL)  # drop ultralytics log records below CRITICAL

	@contextmanager
	def suppress_output():
	    """Silence ``sys.stdout`` and ``sys.stderr`` for the duration of a ``with`` block.

	    Both names are pointed at a writable handle on ``os.devnull`` on entry and
	    restored to their previous objects on exit, even if the body raises.
	    Only the Python-level ``sys`` attributes are swapped.
	    """
	    saved_streams = (sys.stdout, sys.stderr)
	    sink = open(os.devnull, 'w')
	    try:
	        sys.stdout = sys.stderr = sink
	        yield
	    finally:
	        # Restore first, then close, so nothing ever writes to a closed sink.
	        sys.stdout, sys.stderr = saved_streams
	        sink.close()

	# Load the YOLO model once at import time; detect_objects() reuses this instance for every frame.
	model = YOLO("yolov9s.pt")

	def detect_objects(frame):
	    """Run the module-level YOLO ``model`` on one frame and list what it found.

	    Returns a list of ``(label, bbox)`` tuples, one per detected box.
	    ``label`` is looked up in the result's ``names`` mapping using the box's
	    first class id, and ``bbox`` is ``box.xyxy[0]`` converted with ``tolist()``.
	    """
	    # Inference runs with stdout/stderr redirected so it cannot print to the terminal.
	    with suppress_output():
	        predictions = model(frame)

	    found = []
	    for prediction in predictions:
	        for box in prediction.boxes:
	            class_id = int(box.cls[0])
	            found.append((prediction.names[class_id], box.xyxy[0].tolist()))
	    return found

	def draw_boxes(frame, detections):
	    """Draw a rectangle and a text label on ``frame`` for every detection.

	    Args:
	        frame: Image handed to ``cv2.rectangle`` / ``cv2.putText``; it is
	            modified in place.
	        detections: Iterable of ``(label, bbox)`` pairs where ``bbox`` holds
	            four numbers unpacked as ``x1, y1, x2, y2``.

	    Returns:
	        None.
	    """
	    colour = (0, 0, 255)  # red, given OpenCV's default BGR channel order
	    for label, bbox in detections:
	        x1, y1, x2, y2 = map(int, bbox)
	        cv2.rectangle(frame, (x1, y1), (x2, y2), colour, 2)
	        # putText's origin is the text's bottom-left corner.  Placing it 10 px
	        # above the box pushes the label off-screen for boxes that touch the
	        # top of the frame, so in that case draw it just inside the box.
	        label_y = y1 - 10
	        if label_y < 15:
	            label_y = y1 + 15
	        cv2.putText(frame, label, (x1, label_y), cv2.FONT_HERSHEY_SIMPLEX, 0.5, colour, 2)

	def request_openai_completion(prompt):
	    """Send ``prompt`` to the OpenAI chat completion endpoint and return the reply text.

	    The request uses the ``gpt-3.5-turbo`` model, a fixed Japanese system
	    message and ``max_tokens=100``.  The content of the first choice is
	    returned with leading and trailing whitespace stripped.
	    """
	    system_message = {"role": "system", "content": "あなたは物体認識の結果を説明するアシスタントです。"}
	    user_message = {"role": "user", "content": prompt}
	    response = openai.ChatCompletion.create(
	        model="gpt-3.5-turbo",
	        messages=[system_message, user_message],
	        max_tokens=100,
	    )
	    first_choice = response.choices[0]
	    return first_choice.message['content'].strip()

	def trim_description(description, max_length=300):
	    """Shorten ``description`` to at most ``max_length`` characters.

	    Text that already fits is returned unchanged.  Otherwise the text is cut
	    after the last sentence-ending mark found within the first ``max_length``
	    characters; if there is none, it is cut hard at ``max_length``.

	    Args:
	        description: Text to shorten.
	        max_length: Maximum number of characters to keep.

	    Returns:
	        The shortened (or original) string.
	    """
	    if len(description) <= max_length:
	        return description

	    # Same set of marks that is_sentence_complete() accepts, so text ending in
	    # "!" or "." is not chopped mid-sentence just because it lacks a "。".
	    sentence_endings = ('.', '!', '?', '。', '！', '？')
	    window = description[:max_length]
	    cut = max(window.rfind(mark) for mark in sentence_endings)
	    if cut == -1:
	        return window
	    return window[:cut + 1]

	def generate_description(detections, max_length=300, max_rounds=5):
	    """Ask the OpenAI API to describe the detected objects.

	    The first request lists the detected labels.  If the reply does not end
	    on a sentence boundary, follow-up "please continue" requests are sent and
	    the replies are concatenated.

	    Args:
	        detections: Iterable of ``(label, bbox)`` pairs; only the labels are used.
	        max_length: Once the accumulated text exceeds this many characters it
	            is passed through ``trim_description`` and returned.
	        max_rounds: Upper bound on the number of API requests.  Without a
	            bound, a reply that is empty or never ends in punctuation would
	            keep the loop (and the API billing) running forever.

	    Returns:
	        The description text, or the fixed failure message when the API call
	        raises or no text was produced at all.
	    """
	    failure_message = "説明生成に失敗しました。"
	    labels = [label for label, _ in detections]
	    prompt = f"以下の物体を検出しました: {', '.join(labels)}. これらの物体について説明してください。"

	    full_description = ""
	    for _ in range(max_rounds):
	        try:
	            chunk = request_openai_completion(prompt)
	        except Exception as e:
	            # Boundary with an external service: report and fall back.
	            print(f"OpenAI API エラー: {e}")
	            return failure_message

	        if not chunk:
	            # An empty reply adds nothing; asking again would only repeat it.
	            break
	        full_description += chunk
	        if len(full_description) > max_length:
	            return trim_description(full_description, max_length)
	        if is_sentence_complete(full_description):
	            return full_description
	        prompt = "続けてください: " + full_description

	    return full_description or failure_message

	def is_sentence_complete(text):
	    """Report whether ``text`` ends with a sentence-final punctuation mark.

	    ASCII and full-width periods, exclamation marks and question marks count.
	    Anything else, including an empty string or trailing whitespace, yields
	    ``False``.
	    """
	    closing_marks = ('.', '!', '?', '。', '！', '？')
	    return any(text.endswith(mark) for mark in closing_marks)

	def play_audio(description):
	    """Synthesize ``description`` as Japanese speech and play it to completion.

	    The audio is generated with gTTS into an in-memory buffer, loaded into
	    ``pygame.mixer.music`` as MP3 and played.  The function returns once
	    playback is no longer busy (for example after ``music.stop()`` is called
	    elsewhere).

	    Args:
	        description: Text to read aloud.

	    Returns:
	        None.  Any exception is caught and reported on stdout rather than
	        propagated.
	    """
	    try:
	        tts = gTTS(description, lang="ja")
	        audio_stream = BytesIO()
	        tts.write_to_fp(audio_stream)
	        audio_stream.seek(0)  # rewind so the mixer reads from the start

	        pygame.mixer.music.load(audio_stream, "mp3")
	        pygame.mixer.music.play()
	        # Poll with a sleep instead of spinning: a bare `continue` loop pins a
	        # CPU core and competes with the detection loop for the interpreter.
	        while pygame.mixer.music.get_busy():
	            pygame.time.wait(50)
	    except Exception as e:
	        print(f"音声出力エラー: {e}")

	def speak_description(description):
	    """Start reading ``description`` aloud without blocking the caller.

	    Playback is delegated to ``play_audio`` on a newly started thread.
	    """
	    worker = threading.Thread(target=play_audio, args=(description,))
	    worker.start()

	def _stop_audio():
	    """Stop music playback if the mixer is initialised and something is playing."""
	    if pygame.mixer.get_init() and pygame.mixer.music.get_busy():
	        pygame.mixer.music.stop()

	def main():
	    """Run the capture / detect / display loop until the user quits.

	    Opens camera device 0 and, for every frame, runs detection, draws the
	    boxes and shows the frame in a window.  Keys read via ``cv2.waitKey``:

	    - SPACE: generate a description of the current detections, print it and
	      start reading it aloud.
	    - ESC: stop audio playback.
	    - q / Q: stop audio playback and leave the loop.

	    The camera is released and all OpenCV windows are destroyed on the way
	    out, including when an exception escapes the loop.
	    """
	    cap = cv2.VideoCapture(0)
	    if not cap.isOpened():
	        print("カメラにアクセスできません。カメラデバイス番号を確認してください。")
	        return

	    # Accept both cases so quitting still works with Caps Lock / Shift held.
	    quit_keys = (ord('q'), ord('Q'))

	    try:
	        while True:
	            ret, frame = cap.read()
	            if not ret:
	                print("フレームをキャプチャできませんでした。")
	                break

	            detections = detect_objects(frame)
	            draw_boxes(frame, detections)
	            cv2.imshow("USB Camera - YOLOv8 Object Detection", frame)
	            # Keep only the low byte of the key code before comparing.
	            key = cv2.waitKey(10) & 0xFF

	            if key == ord(' '):
	                print("SPACEキーが押されました。説明を生成します...")
	                if detections:
	                    # NOTE: this call is synchronous, so the window does not
	                    # refresh until the description has been generated.
	                    description = generate_description(detections)
	                    print("AIからの説明:", description)
	                    print("-----------------")
	                    speak_description(description)
	                else:
	                    print("オブジェクトが検出されませんでした。")
	                    print("-----------------")

	            elif key == 27:  # ESC key
	                print("ESCキーが押されました。音声を停止します...")
	                _stop_audio()

	            elif key in quit_keys:
	                print("プログラムを終了します...")
	                _stop_audio()
	                break

	    finally:
	        cap.release()
	        cv2.destroyAllWindows()

	# Start the camera loop only when this file is executed as a script, not when it is imported.
	if __name__ == "__main__":
	    main()