Last active
January 3, 2025 21:41
-
-
Save taktamur/873a6a8c55545b68bf0e4e94ed0896c9 to your computer and use it in GitHub Desktop.
SoftwareDesign総集編の各号見出しのpdfを切り出して、特集記事をテキスト起こしして、notionに放り込むまで。notionへの投入はn8nを使った
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [ | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"id": "bcb15066", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"!python -m pip install pillow pymupdf\n", | |
"!python -m pip install openai pydantic" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"id": "c07d7ff2", | |
"metadata": {}, | |
"source": [ | |
"やりたいこと:\n", | |
"\n", | |
"- SoftwareDesignの目録PDFから、各号の特集記事を抽出してNotionのDBに保存したい\n", | |
"\n", | |
"全体の仕様:\n", | |
"- 各号の特集記事は、./index.pdfに記載されている\n", | |
"- 処理はjupyter labで実装する\n", | |
" - 関数は細かめに分ける\n", | |
" - 各関数には日本語で説明コメントをつける\n", | |
"\n", | |
"index.pdfの仕様\n", | |
"- SoftwareDesignの各号の記事が紹介されている\n", | |
"- 3ページ目から最後まで\n", | |
"- 月刊誌なので、1年で12冊分ある\n", | |
"- 5年分なので60冊分の情報がある\n" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"id": "c2f4b72e", | |
"metadata": {}, | |
"source": [ | |
"## step1 PDFから画像を切り抜き\n", | |
"\n", | |
"やりたいこと:\n", | |
"\n", | |
"- index.pdfから、画像の切り抜きを行いたい。\n", | |
"- 3ページ目から最後まで処理を行う\n", | |
" - 各ページの上下左右の余白(20%)を除去する。\n", | |
" - 上下に切り分けてpng画像として保存する\n", | |
"- 出力ディレクトリは\"sd_img\"とする" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"id": "f42d5a26", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"import os\n", | |
"from PIL import Image\n", | |
"\n", | |
"import fitz # PyMuPDF\n", | |
"\n", | |
"# PDFファイルのパス\n", | |
"pdf_path = 'index.pdf'\n", | |
"# 出力ディレクトリ\n", | |
"output_dir = 'sd_img'\n", | |
"\n", | |
"# 出力ディレクトリを削除\n", | |
"if os.path.exists(output_dir):\n", | |
" import shutil\n", | |
" shutil.rmtree(output_dir)\n", | |
"\n", | |
"# 出力ディレクトリが存在しない場合は作成\n", | |
"if not os.path.exists(output_dir):\n", | |
" os.makedirs(output_dir)\n", | |
"\n", | |
"# PDFを開く\n", | |
"pdf_document = fitz.open(pdf_path)\n", | |
"\n", | |
"# 3ページ目から最後まで処理\n", | |
"for page_num in range(3, pdf_document.page_count):\n", | |
" page = pdf_document.load_page(page_num)\n", | |
" # 解像度を上げるためにスケーリングを設定\n", | |
" zoom_x = 4.0 # 横方向のスケーリング\n", | |
" zoom_y = 4.0 # 縦方向のスケーリング\n", | |
" matrix = fitz.Matrix(zoom_x, zoom_y)\n", | |
" \n", | |
" pix = page.get_pixmap(matrix=matrix)\n", | |
"\n", | |
"\n", | |
" # 画像をPILで開く\n", | |
" img = Image.frombytes(\"RGB\", [pix.width, pix.height], pix.samples)\n", | |
"\n", | |
" # 余白を除去するためのパラメータ\n", | |
" left = int(pix.width * 0.05)\n", | |
" top = int(pix.height * 0.05)\n", | |
" right = pix.width - left\n", | |
" bottom = pix.height - top\n", | |
"\n", | |
" # 画像を切り抜く\n", | |
" img_cropped = img.crop((left, top, right, bottom))\n", | |
"\n", | |
" # 上下に切り分ける\n", | |
" upper_half = img_cropped.crop((0, 0, img_cropped.width, img_cropped.height // 2))\n", | |
" lower_half = img_cropped.crop((0, img_cropped.height // 2, img_cropped.width, img_cropped.height))\n", | |
"\n", | |
" # 画像を保存\n", | |
" upper_half.save(os.path.join(output_dir, f'page_{page_num + 1}_1.png'))\n", | |
" lower_half.save(os.path.join(output_dir, f'page_{page_num + 1}_2.png'))\n", | |
"\n", | |
"print(\"画像の切り抜きと保存が完了しました。\")" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"id": "114854c9", | |
"metadata": {}, | |
"source": [ | |
"## 画像からテキスト情報を取得する\n", | |
"\n", | |
"やりたいこと:\n", | |
"\n", | |
"- sd_imgに保存した画像から、発行年月と記事内容をテキスト情報として取得したい\n", | |
"- 画像から記事内容を取得するには、OpenAIのAPIを使う\n", | |
" - OpenAIのAPIキーは、環境変数の OPENAI_API_KEY を使う" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"id": "211db959", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"from openai import OpenAI\n", | |
"from pydantic import BaseModel\n", | |
"\n", | |
"import os\n", | |
"import base64\n", | |
"\n", | |
"# OpenAI APIキーを設定\n", | |
"#openai.api_key = os.getenv('OPENAI_API_KEY')\n", | |
"client = OpenAI()\n", | |
"\n", | |
"# 画像ディレクトリ\n", | |
"image_dir = 'sd_img'\n", | |
"\n", | |
"# 画像ファイルのリストを取得\n", | |
"image_files = [f for f in os.listdir(image_dir) if f.endswith('.png')]\n", | |
"\n", | |
"# Structured-output schema passed to the OpenAI call as response_format.\n", | |
"class OcrResult(BaseModel):\n", | |
" # Issue label; the prompt asks for the publication date in the form 'xx号'.\n", | |
" title: str\n", | |
" # Feature-article content; the prompt asks for Japanese text within 500 characters, without author names.\n", | |
" text: str\n", | |
"\n", | |
"# 画像からテキストを抽出する関数\n", | |
"def extract_text_from_image(image_path) -> OcrResult:\n", | |
" with open(image_path, \"rb\") as image_file:\n", | |
" image_data = image_file.read()\n", | |
" \n", | |
" base64_image = base64.b64encode(image_data).decode('utf-8')\n", | |
"\n", | |
" # text = response['data']['text']\n", | |
" # beta.chat.completions.parse(\n", | |
" systemMessage = \"\"\"\\\n", | |
" この画像は、SoftwareDesginの各号の記事紹介です。\n", | |
" ここから発行年月と記事内容を教えてください。\n", | |
" - 出力はJSON形式です。\n", | |
" - JSONのtitleには発行年月日をxx号という形で入れてください。\n", | |
" - JSONのtextには特集記事の記事内容を入れてください。 \n", | |
" - JSONのtextには記事を書いた人の名前は不要です。\n", | |
" - JSONのtextは500文字以内に収めてください。\n", | |
" - それぞれ日本語でお願いします。\n", | |
" \"\"\"\n", | |
" completion = client.beta.chat.completions.parse(\n", | |
" model=\"gpt-4o\", # モデルの指定\n", | |
" response_format=OcrResult,\n", | |
" messages=[\n", | |
" {\"role\": \"system\", \"content\": systemMessage},\n", | |
" {\"role\": \"user\",\n", | |
" \"content\": [\n", | |
" {\n", | |
" \"type\": \"image_url\",\n", | |
" \"image_url\": {\n", | |
" \"url\": f\"data:image/png;base64,{base64_image}\"\n", | |
" },\n", | |
" },\n", | |
" ],\n", | |
" }\n", | |
" ]\n", | |
" )\n", | |
"\n", | |
" return completion.choices[0].message.content\n", | |
"\n", | |
"# sd_jsonディレクトリを削除\n", | |
"if os.path.exists('sd_json'):\n", | |
" import shutil\n", | |
" shutil.rmtree('sd_json')\n", | |
"\n", | |
"# sd_jsonディレクトリが存在しない場合は作成\n", | |
"if not os.path.exists('sd_json'):\n", | |
" os.makedirs('sd_json')\n", | |
"\n", | |
"\n", | |
"# 各画像からテキストを抽出してsd_jsonディレクトリに保存\n", | |
"for image_file in image_files:\n", | |
" image_path = os.path.join(image_dir, image_file)\n", | |
" result = extract_text_from_image(image_path)\n", | |
" print(result)\n", | |
" with open(f'sd_json/{image_file}.json', 'w') as f:\n", | |
" f.write(result)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"id": "04d7be3b", | |
"metadata": {}, | |
"outputs": [], | |
"source": [] | |
} | |
], | |
"metadata": { | |
"kernelspec": { | |
"display_name": "base", | |
"language": "python", | |
"name": "python3" | |
}, | |
"language_info": { | |
"codemirror_mode": { | |
"name": "ipython", | |
"version": 3 | |
}, | |
"file_extension": ".py", | |
"mimetype": "text/x-python", | |
"name": "python", | |
"nbconvert_exporter": "python", | |
"pygments_lexer": "ipython3", | |
"version": "3.12.4" | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 5 | |
} |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"name": "SoftwareDesignをnotionに", | |
"nodes": [ | |
{ | |
"parameters": {}, | |
"type": "n8n-nodes-base.manualTrigger", | |
"typeVersion": 1, | |
"position": [ | |
0, | |
0 | |
], | |
"id": "0a5bd26a-020e-4168-93ad-88fa499ec607", | |
"name": "When clicking ‘Test workflow’" | |
}, | |
{ | |
"parameters": { | |
"command": "ls -p \"/Users/tak/proj/jupyterlab/SoftwareDesign/sd_img\" | grep -v / | jq -R . | jq -s ." | |
}, | |
"type": "n8n-nodes-base.executeCommand", | |
"typeVersion": 1, | |
"position": [ | |
220, | |
0 | |
], | |
"id": "7d7469a2-53e1-4b9d-8dad-03cf5079a753", | |
"name": "Execute Command" | |
}, | |
{ | |
"parameters": { | |
"jsCode": "// Parse each input item's stdout as JSON and store the result in the item's 'files' field\nfor (const item of $input.all()) {\n item.json.files = JSON.parse(item.json.stdout);\n}\n\nreturn $input.all();" | |
}, | |
"type": "n8n-nodes-base.code", | |
"typeVersion": 2, | |
"position": [ | |
440, | |
0 | |
], | |
"id": "c57338b6-a625-445b-a8c1-97919f0112d6", | |
"name": "Code" | |
}, | |
{ | |
"parameters": { | |
"assignments": { | |
"assignments": [ | |
{ | |
"id": "af4e3561-11a8-4dbd-8b81-75c8a313dd4f", | |
"name": "files", | |
"value": "={{ $json.files }}", | |
"type": "array" | |
} | |
] | |
}, | |
"options": {} | |
}, | |
"type": "n8n-nodes-base.set", | |
"typeVersion": 3.4, | |
"position": [ | |
660, | |
0 | |
], | |
"id": "82246bb1-5016-46f2-9ff9-0d5524da5972", | |
"name": "Edit Fields" | |
}, | |
{ | |
"parameters": { | |
"options": {} | |
}, | |
"type": "n8n-nodes-base.splitInBatches", | |
"typeVersion": 3, | |
"position": [ | |
1320, | |
0 | |
], | |
"id": "465a899b-f179-492c-bd69-58c1af72c7b2", | |
"name": "Loop Over Items" | |
}, | |
{ | |
"parameters": { | |
"jsCode": "// Log all input items, then return them unchanged\nconsole.log($input.all());\n\nreturn $input.all();" | |
}, | |
"type": "n8n-nodes-base.code", | |
"typeVersion": 2, | |
"position": [ | |
1540, | |
120 | |
], | |
"id": "e2dc55f7-0ece-45dc-b15f-a6bb9f765d5d", | |
"name": "Code1" | |
}, | |
{ | |
"parameters": { | |
"fieldToSplitOut": "files", | |
"options": {} | |
}, | |
"type": "n8n-nodes-base.splitOut", | |
"typeVersion": 1, | |
"position": [ | |
880, | |
0 | |
], | |
"id": "da91398b-e12c-4c5e-a5bf-6ed5e5da4e05", | |
"name": "Split Out" | |
}, | |
{ | |
"parameters": { | |
"fileSelector": "=/Users/tak/proj/jupyterlab/SoftwareDesign/sd_img/{{ $json.files }}", | |
"options": {} | |
}, | |
"type": "n8n-nodes-base.readWriteFile", | |
"typeVersion": 1, | |
"position": [ | |
1760, | |
-20 | |
], | |
"id": "a6a3689f-0b57-4361-b880-a0a6bddd2a46", | |
"name": "Read/Write Files from Disk" | |
}, | |
{ | |
"parameters": { | |
"operation": "binaryToPropery", | |
"options": { | |
"keepSource": "json" | |
} | |
}, | |
"type": "n8n-nodes-base.extractFromFile", | |
"typeVersion": 1, | |
"position": [ | |
1980, | |
-20 | |
], | |
"id": "d38bf4ec-35fe-4005-892b-278f92bf03b2", | |
"name": "Extract from File" | |
}, | |
{ | |
"parameters": { | |
"fileSelector": "=/Users/tak/proj/jupyterlab/SoftwareDesign/sd_json/{{ $json.files }}.json", | |
"options": {} | |
}, | |
"type": "n8n-nodes-base.readWriteFile", | |
"typeVersion": 1, | |
"position": [ | |
1760, | |
160 | |
], | |
"id": "61110007-5308-457d-8116-5b431eec0272", | |
"name": "Read/Write Files from Disk1" | |
}, | |
{ | |
"parameters": { | |
"operation": "fromJson", | |
"options": { | |
"keepSource": "json" | |
} | |
}, | |
"type": "n8n-nodes-base.extractFromFile", | |
"typeVersion": 1, | |
"position": [ | |
1980, | |
160 | |
], | |
"id": "875dd841-464f-44ef-9762-87cf658d0cce", | |
"name": "Extract from File1" | |
}, | |
{ | |
"parameters": { | |
"assignments": { | |
"assignments": [ | |
{ | |
"id": "618e9fb5-73e1-416e-9d90-c777be15e5df", | |
"name": "data.img", | |
"value": "={{ $json.data }}", | |
"type": "string" | |
}, | |
{ | |
"id": "5ce3e2f8-45fa-4ae9-a3f8-20a58059b18c", | |
"name": "data.fileName", | |
"value": "={{ $json.fileName }}.json", | |
"type": "string" | |
} | |
] | |
}, | |
"options": {} | |
}, | |
"type": "n8n-nodes-base.set", | |
"typeVersion": 3.4, | |
"position": [ | |
2180, | |
-20 | |
], | |
"id": "bae6a33f-0fa4-47a1-a0cc-3d3e401d40da", | |
"name": "Edit Fields1" | |
}, | |
{ | |
"parameters": { | |
"assignments": { | |
"assignments": [ | |
{ | |
"id": "c9066bb7-6070-412b-b4f9-dd95e25a2da0", | |
"name": "data.title", | |
"value": "={{ $json.data.title }}", | |
"type": "string" | |
}, | |
{ | |
"id": "2fb5d8de-62ea-406d-8fe4-afa4cf0bb1c4", | |
"name": "data.text", | |
"value": "={{ $json.data.text }}", | |
"type": "string" | |
}, | |
{ | |
"id": "2bd73db4-6dd1-4529-8822-c8ae674bb05d", | |
"name": "data.fileName", | |
"value": "={{ $json.fileName }}", | |
"type": "string" | |
} | |
] | |
}, | |
"options": {} | |
}, | |
"type": "n8n-nodes-base.set", | |
"typeVersion": 3.4, | |
"position": [ | |
2200, | |
160 | |
], | |
"id": "57d18455-e1d6-405f-85df-f3c2df04d6df", | |
"name": "Edit Fields2" | |
}, | |
{ | |
"parameters": { | |
"mode": "combine", | |
"advanced": true, | |
"mergeByFields": { | |
"values": [ | |
{ | |
"field1": "data.fileName", | |
"field2": "data.fileName" | |
} | |
] | |
}, | |
"options": {} | |
}, | |
"type": "n8n-nodes-base.merge", | |
"typeVersion": 3, | |
"position": [ | |
2420, | |
120 | |
], | |
"id": "97864b7f-7e77-4e43-b504-190e8d656825", | |
"name": "Merge" | |
}, | |
{ | |
"parameters": { | |
"resource": "databasePage", | |
"databaseId": { | |
"__rl": true, | |
"value": "16b326f0-8e38-806e-9289-fc5503cc0c21", | |
"mode": "list", | |
"cachedResultName": "SoftwareDesign総集編見出し", | |
"cachedResultUrl": "https://www.notion.so/16b326f08e38806e9289fc5503cc0c21" | |
}, | |
"title": "={{ $json.data.title }}", | |
"blockUi": { | |
"blockValues": [ | |
{ | |
"textContent": "={{ $json.data.text }}" | |
} | |
] | |
}, | |
"options": {} | |
}, | |
"type": "n8n-nodes-base.notion", | |
"typeVersion": 2.2, | |
"position": [ | |
2640, | |
120 | |
], | |
"id": "a65851a6-1331-4c67-869f-e9d2f823dbdc", | |
"name": "Notion", | |
"credentials": { | |
"notionApi": { | |
"id": "TJiWF5R1JHLYHgB5", | |
"name": "Notion account" | |
} | |
} | |
} | |
], | |
"pinData": {}, | |
"connections": { | |
"When clicking ‘Test workflow’": { | |
"main": [ | |
[ | |
{ | |
"node": "Execute Command", | |
"type": "main", | |
"index": 0 | |
} | |
] | |
] | |
}, | |
"Execute Command": { | |
"main": [ | |
[ | |
{ | |
"node": "Code", | |
"type": "main", | |
"index": 0 | |
} | |
] | |
] | |
}, | |
"Code": { | |
"main": [ | |
[ | |
{ | |
"node": "Edit Fields", | |
"type": "main", | |
"index": 0 | |
} | |
] | |
] | |
}, | |
"Edit Fields": { | |
"main": [ | |
[ | |
{ | |
"node": "Split Out", | |
"type": "main", | |
"index": 0 | |
} | |
] | |
] | |
}, | |
"Loop Over Items": { | |
"main": [ | |
[], | |
[ | |
{ | |
"node": "Code1", | |
"type": "main", | |
"index": 0 | |
} | |
] | |
] | |
}, | |
"Code1": { | |
"main": [ | |
[ | |
{ | |
"node": "Read/Write Files from Disk", | |
"type": "main", | |
"index": 0 | |
}, | |
{ | |
"node": "Read/Write Files from Disk1", | |
"type": "main", | |
"index": 0 | |
} | |
] | |
] | |
}, | |
"Split Out": { | |
"main": [ | |
[ | |
{ | |
"node": "Loop Over Items", | |
"type": "main", | |
"index": 0 | |
} | |
] | |
] | |
}, | |
"Read/Write Files from Disk": { | |
"main": [ | |
[ | |
{ | |
"node": "Extract from File", | |
"type": "main", | |
"index": 0 | |
} | |
] | |
] | |
}, | |
"Read/Write Files from Disk1": { | |
"main": [ | |
[ | |
{ | |
"node": "Extract from File1", | |
"type": "main", | |
"index": 0 | |
} | |
] | |
] | |
}, | |
"Extract from File": { | |
"main": [ | |
[ | |
{ | |
"node": "Edit Fields1", | |
"type": "main", | |
"index": 0 | |
} | |
] | |
] | |
}, | |
"Extract from File1": { | |
"main": [ | |
[ | |
{ | |
"node": "Edit Fields2", | |
"type": "main", | |
"index": 0 | |
} | |
] | |
] | |
}, | |
"Edit Fields1": { | |
"main": [ | |
[ | |
{ | |
"node": "Merge", | |
"type": "main", | |
"index": 0 | |
} | |
] | |
] | |
}, | |
"Edit Fields2": { | |
"main": [ | |
[ | |
{ | |
"node": "Merge", | |
"type": "main", | |
"index": 1 | |
} | |
] | |
] | |
}, | |
"Merge": { | |
"main": [ | |
[ | |
{ | |
"node": "Notion", | |
"type": "main", | |
"index": 0 | |
} | |
] | |
] | |
}, | |
"Notion": { | |
"main": [ | |
[ | |
{ | |
"node": "Loop Over Items", | |
"type": "main", | |
"index": 0 | |
} | |
] | |
] | |
} | |
}, | |
"active": false, | |
"settings": { | |
"executionOrder": "v1" | |
}, | |
"versionId": "cedf3032-9378-44fc-a7a2-4bac27a77aff", | |
"meta": { | |
"instanceId": "6be6a7fcc827d9c757690034fe59425846319099aa90b9ddef9aeb85d9c582d2" | |
}, | |
"id": "Zfe6JJjYa8k5SN8n", | |
"tags": [] | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment