Created
April 2, 2021 19:41
-
-
Save ccortezb/d337bc308e6eb804407231140e8655f4 to your computer and use it in GitHub Desktop.
Ingestando noticias de mi propia web.ipynb
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"nbformat": 4, | |
"nbformat_minor": 0, | |
"metadata": { | |
"colab": { | |
"name": "Ingestando noticias de mi propia web.ipynb", | |
"provenance": [], | |
"authorship_tag": "ABX9TyN4q7Z8nRJTK+8DwKLR/Bl5", | |
"include_colab_link": true | |
}, | |
"kernelspec": { | |
"name": "python3", | |
"display_name": "Python 3" | |
}, | |
"language_info": { | |
"name": "python" | |
} | |
}, | |
"cells": [ | |
{ | |
"cell_type": "markdown", | |
"metadata": { | |
"id": "view-in-github", | |
"colab_type": "text" | |
}, | |
"source": [ | |
"<a href=\"https://colab.research.google.com/gist/ccortezb/d337bc308e6eb804407231140e8655f4/ingestando-noticias-de-mi-propia-web.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "BBWvh-jVtn8J" | |
}, | |
"source": [ | |
"import requests\n", | |
"from bs4 import BeautifulSoup\n", | |
"import pandas as pd\n", | |
"import numpy as np\n", | |
"import matplotlib.pyplot as plt\n", | |
"import seaborn as sns\n", | |
"import os\n", | |
"%matplotlib inline\n", | |
"\n", | |
"cortezcloud_urls = ['https://www.cortez.cloud/categories/fundamentals',\n", | |
" 'https://www.cortez.cloud/categories/serverless',\n", | |
" 'https://www.cortez.cloud/categories/awsugperu']\n", | |
"\n", | |
"def ingestar_noticias(cortezcloud_urls):\n", | |
" cortezcloud_data = []\n", | |
" for url in cortezcloud_urls:\n", | |
" cloud_category = url.split('/')[4]\n", | |
" data = requests.get(url)\n", | |
" soup = BeautifulSoup(data.content, 'html.parser')\n", | |
" \n", | |
" news_blogs = [{'news_headline': headline.find('div', \n", | |
" attrs={\"class\": \"blog-title\"}).string,\n", | |
" 'news_blog': blog.find('div', \n", | |
" attrs={\"class\": \"preview-text\"}).string,\n", | |
" 'cloud_category': cloud_category}\n", | |
" \n", | |
" for headline, blog in \n", | |
" zip(soup.find_all('div', \n", | |
" class_=[\"thumbnail-text\"]),\n", | |
" soup.find_all('div', \n", | |
" class_=[\"thumbnail-text\"]))\n", | |
" ]\n", | |
" cortezcloud_data.extend(news_blogs)\n", | |
" \n", | |
" df = pd.DataFrame(cortezcloud_data)\n", | |
" #df = df[['news_headline', 'news_blog', 'cloud_category']]\n", | |
" return df\n" | |
], | |
"execution_count": null, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"colab": { | |
"base_uri": "https://localhost:8080/", | |
"height": 162 | |
}, | |
"id": "ERIXIl2ewiwn", | |
"outputId": "e6e184cf-1264-4abe-d94f-9ad303cdb51d" | |
}, | |
"source": [ | |
"ultimos_posts = ingestar_noticias(cortezcloud_urls)\n", | |
"ultimos_posts.head(10)" | |
], | |
"execution_count": null, | |
"outputs": [ | |
{ | |
"output_type": "execute_result", | |
"data": { | |
"text/html": [ | |
"<div>\n", | |
"<style scoped>\n", | |
" .dataframe tbody tr th:only-of-type {\n", | |
" vertical-align: middle;\n", | |
" }\n", | |
"\n", | |
" .dataframe tbody tr th {\n", | |
" vertical-align: top;\n", | |
" }\n", | |
"\n", | |
" .dataframe thead th {\n", | |
" text-align: right;\n", | |
" }\n", | |
"</style>\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>news_headline</th>\n", | |
" <th>news_blog</th>\n", | |
" <th>cloud_category</th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>0</th>\n", | |
" <td>2 Trabajando con Amazon DocumentDB desde mi local</td>\n", | |
" <td>Seguimos en las series de Docdb, esta vez, 2da...</td>\n", | |
" <td>fundamentals</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>1</th>\n", | |
" <td>5 cosas que necesitas empezar a aprender para ...</td>\n", | |
" <td>Aún no entiendes bien cómo empezar a migrar tu...</td>\n", | |
" <td>fundamentals</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>2</th>\n", | |
" <td>Primeros pasos con Amazon DocumentDB y AWS CLI</td>\n", | |
" <td>Estas serán unas series donde hablaremos de un...</td>\n", | |
" <td>fundamentals</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>3</th>\n", | |
" <td>Cómo me convierto en un AWS Community Builder ...</td>\n", | |
" <td>Empieza tu travesía para educar AWS en el Perú</td>\n", | |
" <td>awsugperu</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"</div>" | |
], | |
"text/plain": [ | |
" news_headline ... cloud_category\n", | |
"0 2 Trabajando con Amazon DocumentDB desde mi local ... fundamentals\n", | |
"1 5 cosas que necesitas empezar a aprender para ... ... fundamentals\n", | |
"2 Primeros pasos con Amazon DocumentDB y AWS CLI ... fundamentals\n", | |
"3 Cómo me convierto en un AWS Community Builder ... ... awsugperu\n", | |
"\n", | |
"[4 rows x 3 columns]" | |
] | |
}, | |
"metadata": { | |
"tags": [] | |
}, | |
"execution_count": 51 | |
} | |
] | |
} | |
] | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment