chatgpt with the day's news
{
"nbformat": 4,
"nbformat_minor": 0,
"metadata": {
"colab": {
"provenance": [],
"authorship_tag": "ABX9TyMSfa8qcHJ6Nkn5zXwMr0Q0",
"include_colab_link": true
},
"kernelspec": {
"name": "python3",
"display_name": "Python 3"
},
"language_info": {
"name": "python"
}
},
"cells": [
{
"cell_type": "markdown",
"metadata": {
"id": "view-in-github",
"colab_type": "text"
},
"source": [
"<a href=\"https://colab.research.google.com/gist/lookfwd/014e0ad2bc94f74737a48f85218ae1d9/chatgpt-news.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
]
},
{
"cell_type": "code",
"source": [
"import requests"
],
"metadata": {
"id": "4hHRUIGFrRvh"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"html = requests.get(\"https://in.gr\")"
],
"metadata": {
"id": "e_uXcwISrST3"
},
"execution_count": 19,
"outputs": []
},
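{
"cell_type": "markdown",
"metadata": {},
"source": [
"A quick sanity check before parsing (a minimal added sketch using only the standard `requests` API): `raise_for_status()` raises if in.gr answered with an HTTP error, so an error page never reaches BeautifulSoup."
]
},
{
"cell_type": "code",
"source": [
"# abort early if the front page did not load cleanly\n",
"html.raise_for_status()\n",
"print(html.status_code, len(html.content), 'bytes')"
],
"metadata": {},
"execution_count": null,
"outputs": []
},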
{
"cell_type": "code",
"source": [
"# prompt: use BeautifulSoup and extract all the links from the page in html\n",
"\n",
"from bs4 import BeautifulSoup\n",
"\n",
"soup = BeautifulSoup(html.content, 'html.parser')\n",
"links = []\n",
"for link in soup.find_all('a'):\n",
" links.append(link.get('href'))\n"
],
"metadata": {
"id": "cuz-b-9FrWcY"
},
"execution_count": 22,
"outputs": []
},
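{
"cell_type": "markdown",
"metadata": {},
"source": [
"A small added check: count how many raw links the front page yielded before narrowing them down to articles."
]
},
{
"cell_type": "code",
"source": [
"# how many anchors with an href did the front page contain?\n",
"print(len(links), 'links extracted')"
],
"metadata": {},
"execution_count": null,
"outputs": []
},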
{
"cell_type": "code",
"source": [
"selected = []\n",
"for link in links:\n",
" if link.startswith(\"https://www.in.gr/2024/05\"):\n",
" selected.append(link)"
],
"metadata": {
"id": "YYqvm002reM6"
},
"execution_count": 30,
"outputs": []
},
{
"cell_type": "code",
"source": [
"# prompt: make selected unique\n",
"\n",
"selected = list(set(selected))\n"
],
"metadata": {
"id": "gJ61FgIqsK-Z"
},
"execution_count": 32,
"outputs": []
},
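{
"cell_type": "markdown",
"metadata": {},
"source": [
"Before downloading every page it helps to see how many unique article links survived the filter; this added cell just prints a count and a small sample."
]
},
{
"cell_type": "code",
"source": [
"# inspect the deduplicated article URLs before fetching them all\n",
"print(len(selected), 'unique article links')\n",
"print(selected[:5])"
],
"metadata": {},
"execution_count": null,
"outputs": []
},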
{
"cell_type": "code",
"source": [
"# prompt: for each url in selected download the page with requests and then with beautifulsoup extract the main text that is in a div with class 'inner-main-article'. Write the text from the pages to a file named protoselido.txt\n",
"\n",
"with open('protoselido.txt', 'w') as f:\n",
" for url in selected:\n",
" page = requests.get(url)\n",
" soup = BeautifulSoup(page.content, 'html.parser')\n",
" article = soup.find('div', class_='inner-main-article')\n",
" if article:\n",
" text = article.get_text()\n",
" f.write(text + '\\n')\n"
],
"metadata": {
"id": "wtmMNtyKs0BX"
},
"execution_count": 37,
"outputs": []
},
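{
"cell_type": "markdown",
"metadata": {},
"source": [
"Finally, a quick verification that the scrape produced output (an added sketch, standard library only): re-read `protoselido.txt` and print its size plus the opening characters."
]
},
{
"cell_type": "code",
"source": [
"# confirm the article text actually landed in the file\n",
"with open('protoselido.txt', encoding='utf-8') as f:\n",
"    text = f.read()\n",
"print(len(text), 'characters saved')\n",
"print(text[:500])"
],
"metadata": {},
"execution_count": null,
"outputs": []
}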
]
}