Created
May 19, 2023 07:35
-
-
Save sudodo/4b5f0875db177da4c45ba320e5ed10b5 to your computer and use it in GitHub Desktop.
pdf_page_del_ocr.ipynb
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"nbformat": 4, | |
"nbformat_minor": 0, | |
"metadata": { | |
"colab": { | |
"provenance": [], | |
"mount_file_id": "1GA2E_h_txus-SVRcfyWSYrZ0eGaxo1PE", | |
"authorship_tag": "ABX9TyMqVYN9trGM6cTRFb1BCJp2", | |
"include_colab_link": true | |
}, | |
"kernelspec": { | |
"name": "python3", | |
"display_name": "Python 3" | |
}, | |
"language_info": { | |
"name": "python" | |
} | |
}, | |
"cells": [ | |
{ | |
"cell_type": "markdown", | |
"metadata": { | |
"id": "view-in-github", | |
"colab_type": "text" | |
}, | |
"source": [ | |
"<a href=\"https://colab.research.google.com/gist/sudodo/4b5f0875db177da4c45ba320e5ed10b5/pdf_page_del_ocr.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"source": [ | |
"# pdf_page_del_ocr\n", | |
"以下の機能を提供します。\n", | |
"* PDFファイルから指定されたページを削除する\n", | |
"* PDFファイルにOCRをかける\n", | |
"\n" | |
], | |
"metadata": { | |
"id": "38_kzcsi7NOD" | |
} | |
}, | |
{ | |
"cell_type": "code", | |
"source": [ | |
"!pip install PyPDF2\n", | |
"!pip install pytesseract\n", | |
"!pip install pdf2image" | |
], | |
"metadata": { | |
"colab": { | |
"base_uri": "https://localhost:8080/" | |
}, | |
"id": "d1MNK3Nq0dp3", | |
"outputId": "f6d84064-50f0-47b0-d0a4-e4d4b05ae24f" | |
}, | |
"execution_count": null, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"name": "stdout", | |
"text": [ | |
"Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/\n", | |
"Requirement already satisfied: pytesseract in /usr/local/lib/python3.10/dist-packages (0.3.10)\n", | |
"Requirement already satisfied: packaging>=21.3 in /usr/local/lib/python3.10/dist-packages (from pytesseract) (23.1)\n", | |
"Requirement already satisfied: Pillow>=8.0.0 in /usr/local/lib/python3.10/dist-packages (from pytesseract) (8.4.0)\n", | |
"Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/\n", | |
"Collecting pdf2image\n", | |
" Downloading pdf2image-1.16.3-py3-none-any.whl (11 kB)\n", | |
"Requirement already satisfied: pillow in /usr/local/lib/python3.10/dist-packages (from pdf2image) (8.4.0)\n", | |
"Installing collected packages: pdf2image\n", | |
"Successfully installed pdf2image-1.16.3\n" | |
] | |
} | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"source": [ | |
"from google.colab import drive\n", | |
"drive.mount('/content/gdrive')" | |
], | |
"metadata": { | |
"colab": { | |
"base_uri": "https://localhost:8080/" | |
}, | |
"id": "p4SSooSZ2HlN", | |
"outputId": "ec9b5e6c-94bc-4243-a4e9-1d63372ccf59" | |
}, | |
"execution_count": null, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"name": "stdout", | |
"text": [ | |
"Mounted at /content/gdrive\n" | |
] | |
} | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"source": [ | |
"# 必要なライブラリをインポートします\n", | |
"import PyPDF2\n", | |
"import pytesseract\n", | |
"from PIL import Image\n", | |
"from pdf2image import convert_from_path\n" | |
], | |
"metadata": { | |
"id": "t_UjwyZB0UgY" | |
}, | |
"execution_count": null, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"id": "A_33kfwa0Phw" | |
}, | |
"outputs": [], | |
"source": [ | |
"\n", | |
"# 1. PDFのページを削除する\n", | |
"def delete_pages(pdf_file, pages_to_delete):\n", | |
" # PDFファイルを読み込みます\n", | |
" pdf_reader = PyPDF2.PdfReader(pdf_file)\n", | |
"\n", | |
" # 書き込み用のPDFを作成します\n", | |
" pdf_writer = PyPDF2.PdfWriter()\n", | |
"\n", | |
" # 削除するページ以外のページを新しいPDFに書き込みます\n", | |
" for page in range(len(pdf_reader.pages)):\n", | |
" if page not in pages_to_delete:\n", | |
" pdf_writer.add_page(pdf_reader.pages[page])\n", | |
"\n", | |
" # 新しいPDFをファイルに書き込みます\n", | |
" with open('/content/drive/MyDrive/new_file.pdf', 'wb') as out:\n", | |
" pdf_writer.write(out)\n", | |
"\n", | |
" print('Deleted pages from PDF.')\n", | |
"\n", | |
"# 2. PDFをOCRでテキストに変換する\n", | |
"def ocr_pdf(pdf_file):\n", | |
" # PDFを画像に変換します\n", | |
" images = convert_from_path(pdf_file)\n", | |
"\n", | |
" # OCRの結果を格納する変数を初期化します\n", | |
" ocr_text = ''\n", | |
"\n", | |
" # 画像ごとにOCRを実行します\n", | |
" for i, image in enumerate(images):\n", | |
" ocr_text += pytesseract.image_to_string(image)\n", | |
"\n", | |
" # OCRの結果をテキストファイルに保存します\n", | |
" with open('ocr_result.txt', 'w') as out:\n", | |
" out.write(ocr_text)\n", | |
"\n", | |
" print('Performed OCR on PDF.')\n", | |
"\n" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"source": [ | |
"# 使用例\n", | |
"delete_pages('/content/drive/MyDrive/ArXiv Me.pdf', [1, 3]) # 2ページ目と4ページ目を削除\n", | |
"ocr_pdf('/content/drive/MyDrive/ArXiv Me.pdf')\n" | |
], | |
"metadata": { | |
"colab": { | |
"base_uri": "https://localhost:8080/" | |
}, | |
"id": "OFh9pwO-0vVJ", | |
"outputId": "3e17161d-a29b-4b36-aa00-1392394399e4" | |
}, | |
"execution_count": null, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"name": "stdout", | |
"text": [ | |
"Deleted pages from PDF.\n" | |
] | |
} | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"source": [], | |
"metadata": { | |
"id": "cf8dL6DN0yG5" | |
}, | |
"execution_count": null, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "code", | |
"source": [], | |
"metadata": { | |
"id": "D3iPTw2G5YIq" | |
}, | |
"execution_count": null, | |
"outputs": [] | |
} | |
] | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment