sudodo · May 19, 2023 07:35
diff --git a/pdf_page_del_ocr.ipynb b/pdf_page_del_ocr.ipynb
 {
  "nbformat": 4,
  "nbformat_minor": 0,
  "metadata": {
    "colab": {
      "provenance": [],
      "mount_file_id": "1GA2E_h_txus-SVRcfyWSYrZ0eGaxo1PE",
      "authorship_tag": "ABX9TyMqVYN9trGM6cTRFb1BCJp2",
      "include_colab_link": true
    },
    "kernelspec": {
      "name": "python3",
      "display_name": "Python 3"
    },
    "language_info": {
      "name": "python"
    }
  },
  "cells": [
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "view-in-github",
        "colab_type": "text"
      },
      "source": [
        "<a href=\"https://colab.research.google.com/gist/sudodo/4b5f0875db177da4c45ba320e5ed10b5/pdf_page_del_ocr.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
      ]
    },
    {
      "cell_type": "markdown",
      "source": [
        "# pdf_page_del_ocr\n",
        "以下の機能を提供します。\n",
        "*   PDFファイルから指定されたページを削除する\n",
        "*   PDFファイルにOCRをかける\n",
        "\n"
      ],
      "metadata": {
        "id": "38_kzcsi7NOD"
      }
    },
    {
      "cell_type": "code",
      "source": [
        "# System packages: pdf2image shells out to poppler's pdftoppm, and\n",
        "# pytesseract is only a wrapper around the tesseract executable.\n",
        "!apt-get -qq install -y poppler-utils tesseract-ocr\n",
        "# Python packages; %pip installs into the environment of the running kernel.\n",
        "%pip install -q PyPDF2 pytesseract pdf2image"
      ],
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "d1MNK3Nq0dp3",
        "outputId": "f6d84064-50f0-47b0-d0a4-e4d4b05ae24f"
      },
      "execution_count": null,
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/\n",
            "Requirement already satisfied: pytesseract in /usr/local/lib/python3.10/dist-packages (0.3.10)\n",
            "Requirement already satisfied: packaging>=21.3 in /usr/local/lib/python3.10/dist-packages (from pytesseract) (23.1)\n",
            "Requirement already satisfied: Pillow>=8.0.0 in /usr/local/lib/python3.10/dist-packages (from pytesseract) (8.4.0)\n",
            "Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/\n",
            "Collecting pdf2image\n",
            "  Downloading pdf2image-1.16.3-py3-none-any.whl (11 kB)\n",
            "Requirement already satisfied: pillow in /usr/local/lib/python3.10/dist-packages (from pdf2image) (8.4.0)\n",
            "Installing collected packages: pdf2image\n",
            "Successfully installed pdf2image-1.16.3\n"
          ]
        }
      ]
    },
    {
      "cell_type": "code",
      "source": [
        "from google.colab import drive\n",
        "# Mount at /content/drive: every path used below lives under /content/drive/MyDrive.\n",
        "drive.mount('/content/drive')"
      ],
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "p4SSooSZ2HlN",
        "outputId": "ec9b5e6c-94bc-4243-a4e9-1d63372ccf59"
      },
      "execution_count": null,
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "Mounted at /content/gdrive\n"
          ]
        }
      ]
    },
    {
      "cell_type": "code",
      "source": [
        "# 必要なライブラリをインポートします\n",
        "import PyPDF2\n",
        "import pytesseract\n",
        "from PIL import Image\n",
        "from pdf2image import convert_from_path\n"
      ],
      "metadata": {
        "id": "t_UjwyZB0UgY"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "A_33kfwa0Phw"
      },
      "outputs": [],
      "source": [
        "\n",
        "# 1. Delete pages from a PDF\n",
        "def delete_pages(pdf_file, pages_to_delete,\n",
        "                 output_file='/content/drive/MyDrive/new_file.pdf'):\n",
        "    \"\"\"Write a copy of ``pdf_file`` that omits the pages in ``pages_to_delete``.\n",
        "\n",
        "    Args:\n",
        "        pdf_file: The PDF to read, passed to PyPDF2.PdfReader. It is not modified.\n",
        "        pages_to_delete: Iterable of 0-based page indices to drop. Indices\n",
        "            that do not exist in the document are ignored.\n",
        "        output_file: Path the new PDF is written to. Defaults to the\n",
        "            location this notebook originally hard-coded.\n",
        "    \"\"\"\n",
        "    # A set gives O(1) membership tests and accepts any iterable of indices.\n",
        "    skip = set(pages_to_delete)\n",
        "\n",
        "    pdf_reader = PyPDF2.PdfReader(pdf_file)\n",
        "    pdf_writer = PyPDF2.PdfWriter()\n",
        "\n",
        "    # Copy every page whose index is not marked for deletion.\n",
        "    for index, page in enumerate(pdf_reader.pages):\n",
        "        if index not in skip:\n",
        "            pdf_writer.add_page(page)\n",
        "\n",
        "    with open(output_file, 'wb') as out:\n",
        "        pdf_writer.write(out)\n",
        "\n",
        "    print('Deleted pages from PDF.')\n",
        "\n",
        "# 2. Convert a PDF to text with OCR\n",
        "def ocr_pdf(pdf_file, output_file='ocr_result.txt', lang=None):\n",
        "    \"\"\"Run OCR over every page of ``pdf_file`` and save the recognised text.\n",
        "\n",
        "    Args:\n",
        "        pdf_file: Path of the PDF, passed to pdf2image's convert_from_path.\n",
        "        output_file: Path of the text file to write. Defaults to\n",
        "            'ocr_result.txt' in the current working directory.\n",
        "        lang: Tesseract language code(s) forwarded to pytesseract, e.g.\n",
        "            'jpn' or 'jpn+eng'. None keeps pytesseract's default; the matching\n",
        "            language data must be installed for any other value.\n",
        "    \"\"\"\n",
        "    # Rasterise each PDF page to an image.\n",
        "    images = convert_from_path(pdf_file)\n",
        "\n",
        "    # OCR page by page and concatenate once instead of growing a string in a loop.\n",
        "    ocr_text = ''.join(\n",
        "        pytesseract.image_to_string(image, lang=lang) for image in images\n",
        "    )\n",
        "\n",
        "    # Write as UTF-8 explicitly so non-ASCII text does not depend on the locale.\n",
        "    with open(output_file, 'w', encoding='utf-8') as out:\n",
        "        out.write(ocr_text)\n",
        "\n",
        "    print('Performed OCR on PDF.')\n",
        "\n"
      ]
    },
    {
      "cell_type": "code",
      "source": [
        "# Usage example\n",
        "source_pdf = '/content/drive/MyDrive/ArXiv Me.pdf'\n",
        "delete_pages(source_pdf, [1, 3])  # 0-based indices: drops the 2nd and 4th pages\n",
        "ocr_pdf(source_pdf)\n"
      ],
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "OFh9pwO-0vVJ",
        "outputId": "3e17161d-a29b-4b36-aa00-1392394399e4"
      },
      "execution_count": null,
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "Deleted pages from PDF.\n"
          ]
        }
      ]
    },
    {
      "cell_type": "code",
      "source": [],
      "metadata": {
        "id": "cf8dL6DN0yG5"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [],
      "metadata": {
        "id": "D3iPTw2G5YIq"
      },
      "execution_count": null,
      "outputs": []
    }
  ]
 }
	{
	"nbformat": 4,
	"nbformat_minor": 0,
	"metadata": {
	"colab": {
	"provenance": [],
	"mount_file_id": "1GA2E_h_txus-SVRcfyWSYrZ0eGaxo1PE",
	"authorship_tag": "ABX9TyMqVYN9trGM6cTRFb1BCJp2",
	"include_colab_link": true
	},
	"kernelspec": {
	"name": "python3",
	"display_name": "Python 3"
	},
	"language_info": {
	"name": "python"
	}
	},
	"cells": [
	{
	"cell_type": "markdown",
	"metadata": {
	"id": "view-in-github",
	"colab_type": "text"
	},
	"source": [
	"<a href=\"https://colab.research.google.com/gist/sudodo/4b5f0875db177da4c45ba320e5ed10b5/pdf_page_del_ocr.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
	]
	},
	{
	"cell_type": "markdown",
	"source": [
	"# pdf_page_del_ocr\n",
	"以下の機能を提供します。\n",
	"* PDFファイルから指定されたページを削除する\n",
	"* PDFファイルにOCRをかける\n",
	"\n"
	],
	"metadata": {
	"id": "38_kzcsi7NOD"
	}
	},
	{
	"cell_type": "code",
	"source": [
	"# System packages: pdf2image shells out to poppler's pdftoppm, and\n",
	"# pytesseract is only a wrapper around the tesseract executable.\n",
	"!apt-get -qq install -y poppler-utils tesseract-ocr\n",
	"# Python packages; %pip installs into the environment of the running kernel.\n",
	"%pip install -q PyPDF2 pytesseract pdf2image"
	],
	"metadata": {
	"colab": {
	"base_uri": "https://localhost:8080/"
	},
	"id": "d1MNK3Nq0dp3",
	"outputId": "f6d84064-50f0-47b0-d0a4-e4d4b05ae24f"
	},
	"execution_count": null,
	"outputs": [
	{
	"output_type": "stream",
	"name": "stdout",
	"text": [
	"Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/\n",
	"Requirement already satisfied: pytesseract in /usr/local/lib/python3.10/dist-packages (0.3.10)\n",
	"Requirement already satisfied: packaging>=21.3 in /usr/local/lib/python3.10/dist-packages (from pytesseract) (23.1)\n",
	"Requirement already satisfied: Pillow>=8.0.0 in /usr/local/lib/python3.10/dist-packages (from pytesseract) (8.4.0)\n",
	"Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/\n",
	"Collecting pdf2image\n",
	" Downloading pdf2image-1.16.3-py3-none-any.whl (11 kB)\n",
	"Requirement already satisfied: pillow in /usr/local/lib/python3.10/dist-packages (from pdf2image) (8.4.0)\n",
	"Installing collected packages: pdf2image\n",
	"Successfully installed pdf2image-1.16.3\n"
	]
	}
	]
	},
	{
	"cell_type": "code",
	"source": [
	"from google.colab import drive\n",
	"# Mount at /content/drive: every path used below lives under /content/drive/MyDrive.\n",
	"drive.mount('/content/drive')"
	],
	"metadata": {
	"colab": {
	"base_uri": "https://localhost:8080/"
	},
	"id": "p4SSooSZ2HlN",
	"outputId": "ec9b5e6c-94bc-4243-a4e9-1d63372ccf59"
	},
	"execution_count": null,
	"outputs": [
	{
	"output_type": "stream",
	"name": "stdout",
	"text": [
	"Mounted at /content/gdrive\n"
	]
	}
	]
	},
	{
	"cell_type": "code",
	"source": [
	"# 必要なライブラリをインポートします\n",
	"import PyPDF2\n",
	"import pytesseract\n",
	"from PIL import Image\n",
	"from pdf2image import convert_from_path\n"
	],
	"metadata": {
	"id": "t_UjwyZB0UgY"
	},
	"execution_count": null,
	"outputs": []
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {
	"id": "A_33kfwa0Phw"
	},
	"outputs": [],
	"source": [
	"\n",
	"# 1. Delete pages from a PDF\n",
	"def delete_pages(pdf_file, pages_to_delete,\n",
	"                 output_file='/content/drive/MyDrive/new_file.pdf'):\n",
	"    \"\"\"Write a copy of ``pdf_file`` that omits the pages in ``pages_to_delete``.\n",
	"\n",
	"    Args:\n",
	"        pdf_file: The PDF to read, passed to PyPDF2.PdfReader. It is not modified.\n",
	"        pages_to_delete: Iterable of 0-based page indices to drop. Indices\n",
	"            that do not exist in the document are ignored.\n",
	"        output_file: Path the new PDF is written to. Defaults to the\n",
	"            location this notebook originally hard-coded.\n",
	"    \"\"\"\n",
	"    # A set gives O(1) membership tests and accepts any iterable of indices.\n",
	"    skip = set(pages_to_delete)\n",
	"\n",
	"    pdf_reader = PyPDF2.PdfReader(pdf_file)\n",
	"    pdf_writer = PyPDF2.PdfWriter()\n",
	"\n",
	"    # Copy every page whose index is not marked for deletion.\n",
	"    for index, page in enumerate(pdf_reader.pages):\n",
	"        if index not in skip:\n",
	"            pdf_writer.add_page(page)\n",
	"\n",
	"    with open(output_file, 'wb') as out:\n",
	"        pdf_writer.write(out)\n",
	"\n",
	"    print('Deleted pages from PDF.')\n",
	"\n",
	"# 2. Convert a PDF to text with OCR\n",
	"def ocr_pdf(pdf_file, output_file='ocr_result.txt', lang=None):\n",
	"    \"\"\"Run OCR over every page of ``pdf_file`` and save the recognised text.\n",
	"\n",
	"    Args:\n",
	"        pdf_file: Path of the PDF, passed to pdf2image's convert_from_path.\n",
	"        output_file: Path of the text file to write. Defaults to\n",
	"            'ocr_result.txt' in the current working directory.\n",
	"        lang: Tesseract language code(s) forwarded to pytesseract, e.g.\n",
	"            'jpn' or 'jpn+eng'. None keeps pytesseract's default; the matching\n",
	"            language data must be installed for any other value.\n",
	"    \"\"\"\n",
	"    # Rasterise each PDF page to an image.\n",
	"    images = convert_from_path(pdf_file)\n",
	"\n",
	"    # OCR page by page and concatenate once instead of growing a string in a loop.\n",
	"    ocr_text = ''.join(\n",
	"        pytesseract.image_to_string(image, lang=lang) for image in images\n",
	"    )\n",
	"\n",
	"    # Write as UTF-8 explicitly so non-ASCII text does not depend on the locale.\n",
	"    with open(output_file, 'w', encoding='utf-8') as out:\n",
	"        out.write(ocr_text)\n",
	"\n",
	"    print('Performed OCR on PDF.')\n",
	"\n"
	]
	},
	{
	"cell_type": "code",
	"source": [
	"# Usage example\n",
	"source_pdf = '/content/drive/MyDrive/ArXiv Me.pdf'\n",
	"delete_pages(source_pdf, [1, 3])  # 0-based indices: drops the 2nd and 4th pages\n",
	"ocr_pdf(source_pdf)\n"
	],
	"metadata": {
	"colab": {
	"base_uri": "https://localhost:8080/"
	},
	"id": "OFh9pwO-0vVJ",
	"outputId": "3e17161d-a29b-4b36-aa00-1392394399e4"
	},
	"execution_count": null,
	"outputs": [
	{
	"output_type": "stream",
	"name": "stdout",
	"text": [
	"Deleted pages from PDF.\n"
	]
	}
	]
	},
	{
	"cell_type": "code",
	"source": [],
	"metadata": {
	"id": "cf8dL6DN0yG5"
	},
	"execution_count": null,
	"outputs": []
	},
	{
	"cell_type": "code",
	"source": [],
	"metadata": {
	"id": "D3iPTw2G5YIq"
	},
	"execution_count": null,
	"outputs": []
	}
	]
	}