-
-
Save Kiollpt/5f0ba8f9ef07d8ff588b561c2d7ad2da to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from asyncio import create_task, gather
from html import escape
from string import Template

from aiohttp import ClientSession, web
from bs4 import BeautifulSoup
async def news(request):
    """Serve an HTML page of headlines scraped from the configured sites.

    Each site is fetched concurrently through ``news_fetch``. The resulting
    ``(href, text, kind)`` tuples are rendered as one ``<div>`` per headline,
    de-duplicated by headline text, ordered by that text, and substituted
    into the ``body`` placeholder of the HTML template file.

    Args:
        request: The incoming aiohttp request (not inspected).

    Returns:
        A ``web.Response`` with content type ``text/html``.
    """
    sites = [
        ('http://edition.cnn.com', cnn_articles),
        ('http://www.aljazeera.com', aljazeera_articles),
    ]
    # One task per site so the fetches overlap instead of running serially.
    tasks = [create_task(news_fetch(*site)) for site in sites]
    results = await gather(*tasks)

    # Keyed by headline text, so a repeated headline keeps only its last
    # occurrence. Scraped values are escaped before being placed in markup
    # because they come from remote pages.
    items = {
        text: (
            f'<div class="box {escape(kind)}">'
            f'<span>'
            f'<a href="{escape(href)}">{escape(text)}</a>'
            f'</span>'
            f'</div>'
        )
        for articles in results
        for href, text, kind in articles
    }
    content = ''.join(items[text] for text in sorted(items))

    # NOTE(review): 'idex.html' looks like a typo for 'index.html'; the path
    # is left unchanged here -- confirm the template's real file name.
    with open('idex.html') as template_file:
        page = Template(template_file.read())
    return web.Response(
        body=page.safe_substitute(body=content),
        content_type='text/html',
    )
async def news_fetch(url, postprocess):
    """Fetch *url* through the local rendering proxy and post-process it.

    Args:
        url: Address of the page to render.
        postprocess: Callable invoked as ``postprocess(url, page_text)``
            with the UTF-8 decoded response body.

    Returns:
        Whatever ``postprocess`` returns.
    """
    # NOTE(review): port 8050 and /render.html suggest a Splash instance,
    # which serves plain HTTP by default, so the scheme was changed from
    # https to http -- confirm against the actual deployment.
    proxy_url = 'http://localhost:8050/render.html'
    # Passing the query as params lets aiohttp add the '?' separator and
    # encode the target URL instead of splicing it into the string by hand.
    params = {'url': url, 'timeout': 60, 'wait': 1}
    async with ClientSession() as session:
        async with session.get(proxy_url, params=params) as resp:
            data = await resp.read()
    return postprocess(url, data.decode('utf-8'))
def cnn_articles(url, page_data):
    """Extract headline links from a rendered CNN page.

    Args:
        url: Site base URL; it is prefixed to every matched ``href``.
        page_data: HTML text of the page.

    Returns:
        A list of ``(url + href, link_text, 'cnn')`` tuples.
    """
    soup = BeautifulSoup(page_data, 'lxml')

    def match(tag):
        # The previous predicate returned an empty tuple, which is always
        # falsy, so no tag could ever match. Because results are built as
        # url + href, only site-relative links make sense here.
        # NOTE(review): this accepts every relative link that has text;
        # narrow it (URL suffix, CSS class) if only article headlines are
        # wanted.
        if not (tag.text and tag.has_attr('href')):
            return False
        href = tag['href']
        # '//host/path' is protocol-relative, not site-relative.
        return href.startswith('/') and not href.startswith('//')

    headlines = soup.find_all(match)
    return [(url + hl['href'], hl.text, 'cnn') for hl in headlines]
def aljazeera_articles(url, pag_data):
    """Extract headline links from a rendered Al Jazeera page.

    Args:
        url: Site base URL; it is prefixed to every matched ``href``.
        pag_data: HTML text of the page. The parameter name is kept as
            originally spelled so the signature is unchanged.

    Returns:
        A list of ``(url + href, link_text, 'aljazeera')`` tuples.
    """
    soup = BeautifulSoup(pag_data, 'lxml')

    def match(tag):
        # Because results are built as url + href, only site-relative
        # links make sense here.
        # NOTE(review): the original prefix test was truncated
        # ("tag['href'].start"); '/' is the minimal reconstruction --
        # narrow it if only article links are wanted.
        if not (tag.text and tag.has_attr('href')):
            return False
        href = tag['href']
        # '//host/path' is protocol-relative, not site-relative.
        return href.startswith('/') and not href.startswith('//')

    headlines = soup.find_all(match)
    return [(url + hl['href'], hl.text, 'aljazeera') for hl in headlines]
if __name__ == "__main__":
    # Script entry point: build the application, expose the news handler
    # at GET /news, then serve on port 1237.
    app = web.Application()
    router = app.router
    router.add_get('/news', news)
    web.run_app(app, port=1237)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment