Created
May 7, 2017 01:00
-
-
Save Wesitos/840dccc42ccc4eb372f0325560b9b2f3 to your computer and use it in GitHub Desktop.
Ejemplo de scraping usando aiohttp y BeautifulSoup
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [ | |
{ | |
"cell_type": "code", | |
"execution_count": 1, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"import aiohttp as req\n", | |
"import asyncio as aio\n", | |
"from bs4 import BeautifulSoup" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 2, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"# async def get_page(url)\n", | |
"@aio.coroutine\n", | |
"def get_page(url):\n", | |
" # await req.get(url)\n", | |
" res = yield from req.get(url)\n", | |
" return (yield from res.text())\n", | |
"\n", | |
"@aio.coroutine\n", | |
"def main():\n", | |
" text = yield from get_page(\"http://elcomercio.pe/tecnologia?ref=portada_home&ft=menu_nav\")\n", | |
" return text" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"# Así se ejecuta una corrutina" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 3, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"<!doctype html>\n", | |
"<html lang=\"es\">\n", | |
" <head>\n", | |
" <meta meta charset=\"utf-8\">\n", | |
" <meta meta http-equiv=\"X-UA-Compatible\" content=\"IE=edge\">\n", | |
"\t <title>Noticias de Tecnología | Galaxy S8 | Samsung | Apple | Note 7 | Nintendo Switch | iPhone 7 | MWC 2017 | Apps | Android | iOS | Pokemon Go | Cyber Monday | Black Friday | Macbook | Google | Pixel | Nintendo | YouTube | Facebook | WhatsApp | PS4 | Xbox | Doodle | Samsung Galaxy | Instagram | Linkedin | WiFi | Pinterest | Snapchat | Mark Zuckerberg | Redes sociales| Netflix | Dota 2 | El Comercio Peru</title>\n", | |
" <link rel=\"dns-prefetch\" href=\"https://cdn.elcomercio.e3.pe/\">\n", | |
" <link rel=\"dns-prefetch\" href=\"http://cde.3.elcomercio.pe/\">\n", | |
" <link rel=\"dns-prefetch\" href=\"http://code2.adtlgc.com/\"/>\n", | |
" <meta name=\"keywords\" content=\"Tecnología, Facebook, YouTube, Google, Doodle, Apple, Instagram, Linkedln, WiFi, Pinterest, Snapchat, WhatsApp, Drones, Redes, sociales, Netf\n" | |
] | |
} | |
], | |
"source": [ | |
"# Drive the coroutine to completion on the asyncio event loop.\n", | |
"loop = aio.get_event_loop()\n", | |
"text = loop.run_until_complete(main())\n", | |
"# Show only the first 1000 characters of the result.\n", | |
"print(text[:1000])" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"# Podemos hacer una función" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 4, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"def call_coroutine(coro):\n", | |
" loop = aio.get_event_loop()\n", | |
" return loop.run_until_complete(coro())" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"# Analizamos la respuesta" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 5, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"# Parse the downloaded page text using the lxml parser.\n", | |
"soup = BeautifulSoup(text, 'lxml')" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 6, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"# Collect every element whose CSS class is 'ec-ultimas'.\n", | |
"sections = soup.find_all(class_='ec-ultimas')" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 7, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"articles = []\n", | |
"for section in sections:\n", | |
" articles.extend(section.find_all('article'))" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 8, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"def parse_article_abstract(soup):\n", | |
" return {\n", | |
" 'title': soup.select_one('header h2 a').text,\n", | |
" 'abstract': soup.find('p').text,\n", | |
" 'img': soup.select_one('figure img').attrs.get('src')\n", | |
" }" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 9, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"# Parse each article that has a 'header h2' element into a dict.\n", | |
"data = [parse_article_abstract(s) \n", | |
" for s in articles\n", | |
" # One of the <article> elements does not follow the expected structure\n", | |
" if s.select_one('header h2') is not None\n", | |
"]" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"# Automatizamos" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 10, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"@aio.coroutine\n", | |
"def main():\n", | |
" text = yield from get_page(\"http://elcomercio.pe/tecnologia?ref=portada_home&ft=menu_nav\")\n", | |
" articles = [parse_article_abstract(s) for s in soup.select('.ec-ultimas article')\n", | |
" if s.select_one('header h2') is not None\n", | |
" ]\n", | |
" return articles" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 11, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"{'abstract': 'El youtuber Rudy Mancuso escenificó una pelea callejera donde los sonidos de los golpes formaron un ritmo musical peculiar',\n", | |
" 'img': 'http://cde.3.elcomercio.pe/ima/0/1/6/8/1/1681826/160x100.jpg',\n", | |
" 'title': \"Una 'pelea musical' es el viral de la semana en YouTube\"}" | |
] | |
}, | |
"execution_count": 11, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"# Run main() through the helper and display the first parsed article.\n", | |
"data = call_coroutine(main)\n", | |
"data[0]" | |
] | |
} | |
], | |
"metadata": { | |
"kernelspec": { | |
"display_name": "Python 3.4", | |
"language": "python", | |
"name": "python3.4" | |
}, | |
"language_info": { | |
"codemirror_mode": { | |
"name": "ipython", | |
"version": 3 | |
}, | |
"file_extension": ".py", | |
"mimetype": "text/x-python", | |
"name": "python", | |
"nbconvert_exporter": "python", | |
"pygments_lexer": "ipython3", | |
"version": "3.4.3" | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 0 | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment