chatgpt with the day's news
{
"nbformat": 4,
"nbformat_minor": 0,
"metadata": {
"colab": {
"provenance": [],
"authorship_tag": "ABX9TyMSfa8qcHJ6Nkn5zXwMr0Q0",
"include_colab_link": true
},
"kernelspec": {
"name": "python3",
"display_name": "Python 3"
},
"language_info": {
"name": "python"
}
},
"cells": [
{
"cell_type": "markdown",
"metadata": {
"id": "view-in-github",
"colab_type": "text"
},
"source": [
"<a href=\"https://colab.research.google.com/gist/lookfwd/014e0ad2bc94f74737a48f85218ae1d9/chatgpt-news.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
]
},
{
"cell_type": "code",
"source": [
"import requests"
],
"metadata": {
"id": "4hHRUIGFrRvh"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"html = requests.get(\"https://in.gr\")"
],
"metadata": {
"id": "e_uXcwISrST3"
},
"execution_count": 19,
"outputs": []
},
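{
"cell_type": "markdown",
"metadata": {},
"source": [
"A quick sanity check before parsing (a minimal added sketch using only the standard `requests` API): `raise_for_status()` raises if in.gr answered with an HTTP error, so an error page never reaches BeautifulSoup."
]
},
{
"cell_type": "code",
"source": [
"# abort early if the front page did not load cleanly\n",
"html.raise_for_status()\n",
"print(html.status_code, len(html.content), 'bytes')"
],
"metadata": {},
"execution_count": null,
"outputs": []
},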
{
"cell_type": "code",
"source": [
"# prompt: use BeautifulSoup and extract all the links from the page in html\n",
"\n",
"from bs4 import BeautifulSoup\n",
"\n",
"soup = BeautifulSoup(html.content, 'html.parser')\n",
"links = []\n",
"for link in soup.find_all('a'):\n",
" links.append(link.get('href'))\n"
],
"metadata": {
"id": "cuz-b-9FrWcY"
},
"execution_count": 22,
"outputs": []
},
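{
"cell_type": "markdown",
"metadata": {},
"source": [
"A small added check: count how many raw links the front page yielded before narrowing them down to articles."
]
},
{
"cell_type": "code",
"source": [
"# how many anchors with an href did the front page contain?\n",
"print(len(links), 'links extracted')"
],
"metadata": {},
"execution_count": null,
"outputs": []
},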
{
"cell_type": "code",
"source": [
"selected = []\n",
"for link in links:\n",
" if link.startswith(\"https://www.in.gr/2024/05\"):\n",
" selected.append(link)"
],
"metadata": {
"id": "YYqvm002reM6"
},
"execution_count": 30,
"outputs": []
},
{
"cell_type": "code",
"source": [
"# prompt: make selected unique\n",
"\n",
"selected = list(set(selected))\n"
],
"metadata": {
"id": "gJ61FgIqsK-Z"
},
"execution_count": 32,
"outputs": []
},
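{
"cell_type": "markdown",
"metadata": {},
"source": [
"Before downloading every page it helps to see how many unique article links survived the filter; this added cell just prints a count and a small sample."
]
},
{
"cell_type": "code",
"source": [
"# inspect the deduplicated article URLs before fetching them all\n",
"print(len(selected), 'unique article links')\n",
"print(selected[:5])"
],
"metadata": {},
"execution_count": null,
"outputs": []
},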
{
"cell_type": "code",
"source": [
"# prompt: for each url in selected download the page with requests and then with beautifulsoup extract the main text that is in a div with class 'inner-main-article'. Write the text from the pages to a file named protoselido.txt\n",
"\n",
"with open('protoselido.txt', 'w') as f:\n",
" for url in selected:\n",
" page = requests.get(url)\n",
" soup = BeautifulSoup(page.content, 'html.parser')\n",
" article = soup.find('div', class_='inner-main-article')\n",
" if article:\n",
" text = article.get_text()\n",
" f.write(text + '\\n')\n"
],
"metadata": {
"id": "wtmMNtyKs0BX"
},
"execution_count": 37,
"outputs": []
},
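{
"cell_type": "markdown",
"metadata": {},
"source": [
"Finally, a quick verification that the scrape produced output (an added sketch, standard library only): re-read `protoselido.txt` and print its size plus the opening characters."
]
},
{
"cell_type": "code",
"source": [
"# confirm the article text actually landed in the file\n",
"with open('protoselido.txt', encoding='utf-8') as f:\n",
"    text = f.read()\n",
"print(len(text), 'characters saved')\n",
"print(text[:500])"
],
"metadata": {},
"execution_count": null,
"outputs": []
}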
]
}