chatgpt with the day's news
{
  "nbformat": 4,
  "nbformat_minor": 0,
  "metadata": {
    "colab": {
      "provenance": [],
      "authorship_tag": "ABX9TyMSfa8qcHJ6Nkn5zXwMr0Q0",
      "include_colab_link": true
    },
    "kernelspec": {
      "name": "python3",
      "display_name": "Python 3"
    },
    "language_info": {
      "name": "python"
    }
  },
  "cells": [
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "view-in-github",
        "colab_type": "text"
      },
      "source": [
        "<a href=\"https://colab.research.google.com/gist/lookfwd/014e0ad2bc94f74737a48f85218ae1d9/chatgpt-news.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
      ]
    },
    {
      "cell_type": "code",
      "source": [
        "import requests"
      ],
      "metadata": {
        "id": "4hHRUIGFrRvh"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "html = requests.get(\"https://in.gr\")"
      ],
      "metadata": {
        "id": "e_uXcwISrST3"
      },
      "execution_count": 19,
      "outputs": []
    },
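    {
      "cell_type": "code",
      "source": [
        "# Added sketch (not in the original notebook): check that the download\n",
        "# succeeded before parsing. raise_for_status() raises for 4xx/5xx responses.\n",
        "# The browser-like User-Agent is an assumption, only needed if the site\n",
        "# rejects the default requests User-Agent.\n",
        "html = requests.get(\"https://in.gr\", headers={\"User-Agent\": \"Mozilla/5.0\"})\n",
        "html.raise_for_status()\n",
        "print(html.status_code, len(html.content), \"bytes downloaded\")"
      ],
      "metadata": {},
      "execution_count": null,
      "outputs": []
    },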
    {
      "cell_type": "code",
      "source": [
        "# prompt: use BeautifulSoup and extract all the links from the page in html\n",
        "\n",
        "from bs4 import BeautifulSoup\n",
        "\n",
        "soup = BeautifulSoup(html.content, 'html.parser')\n",
        "links = []\n",
        "for link in soup.find_all('a'):\n",
        "    href = link.get('href')\n",
        "    if href:  # skip <a> tags without an href so later filters don't fail\n",
        "        links.append(href)\n"
      ],
      "metadata": {
        "id": "cuz-b-9FrWcY"
      },
      "execution_count": 22,
      "outputs": []
    },
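    {
      "cell_type": "code",
      "source": [
        "# Added sketch, assuming some homepage links are relative paths rather than\n",
        "# absolute URLs: resolve them against the site root (base URL assumed here)\n",
        "# so the prefix filter in the next cell can also match them.\n",
        "from urllib.parse import urljoin\n",
        "\n",
        "links = [urljoin(\"https://www.in.gr/\", href) for href in links]"
      ],
      "metadata": {},
      "execution_count": null,
      "outputs": []
    },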
    {
      "cell_type": "code",
      "source": [
        "selected = []\n",
        "for link in links:\n",
        "    if link.startswith(\"https://www.in.gr/2024/05\"):\n",
        "        selected.append(link)"
      ],
      "metadata": {
        "id": "YYqvm002reM6"
      },
      "execution_count": 30,
      "outputs": []
    },
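    {
      "cell_type": "code",
      "source": [
        "# Added sketch, an alternative to the cell above: the hard-coded prefix pins\n",
        "# the notebook to May 2024. Assuming in.gr keeps its /YYYY/MM/ URL scheme,\n",
        "# the prefix can be derived from today's date instead.\n",
        "from datetime import date\n",
        "\n",
        "prefix = f\"https://www.in.gr/{date.today():%Y/%m}\"\n",
        "selected = [link for link in links if link.startswith(prefix)]"
      ],
      "metadata": {},
      "execution_count": null,
      "outputs": []
    },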
    {
      "cell_type": "code",
      "source": [
        "# prompt: make selected unique\n",
        "\n",
        "selected = list(set(selected))\n"
      ],
      "metadata": {
        "id": "gJ61FgIqsK-Z"
      },
      "execution_count": 32,
      "outputs": []
    },
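    {
      "cell_type": "code",
      "source": [
        "# Added sketch, an alternative to the cell above: set() removes duplicates\n",
        "# but scrambles the order the links appeared on the page, while\n",
        "# dict.fromkeys() keeps the first-seen order.\n",
        "selected = list(dict.fromkeys(selected))\n",
        "print(len(selected), \"unique article URLs\")"
      ],
      "metadata": {},
      "execution_count": null,
      "outputs": []
    },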
    {
      "cell_type": "code",
      "source": [
        "# prompt: for each url in selected download the page with requests and then with beautifulsoup extract the main text that is in a div with class 'inner-main-article'. Write the text from the pages to a file named protoselido.txt\n",
        "\n",
        "with open('protoselido.txt', 'w') as f:\n",
        "    for url in selected:\n",
        "        page = requests.get(url)\n",
        "        soup = BeautifulSoup(page.content, 'html.parser')\n",
        "        article = soup.find('div', class_='inner-main-article')\n",
        "        if article:\n",
        "            text = article.get_text()\n",
        "            f.write(text + '\\n')\n"
      ],
      "metadata": {
        "id": "wtmMNtyKs0BX"
      },
      "execution_count": 37,
      "outputs": []
    },
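    {
      "cell_type": "code",
      "source": [
        "# Added sketch, a gentler variant of the cell above: it pauses briefly between\n",
        "# requests, writes each article's source URL before its text, and reports how\n",
        "# many articles were saved. The 'inner-main-article' selector is copied from\n",
        "# the cell above and may change if in.gr updates its markup.\n",
        "import time\n",
        "\n",
        "written = 0\n",
        "with open('protoselido.txt', 'w') as f:\n",
        "    for url in selected:\n",
        "        page = requests.get(url)\n",
        "        soup = BeautifulSoup(page.content, 'html.parser')\n",
        "        article = soup.find('div', class_='inner-main-article')\n",
        "        if article:\n",
        "            f.write(url + '\\n')\n",
        "            f.write(article.get_text(separator='\\n', strip=True) + '\\n\\n')\n",
        "            written += 1\n",
        "        time.sleep(1)  # small delay between requests\n",
        "\n",
        "print(written, \"articles saved to protoselido.txt\")"
      ],
      "metadata": {},
      "execution_count": null,
      "outputs": []
    }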
  ]
}