Skip to content

Instantly share code, notes, and snippets.

@Kiollpt
Last active September 18, 2020 09:57
Show Gist options
  • Save Kiollpt/5f0ba8f9ef07d8ff588b561c2d7ad2da to your computer and use it in GitHub Desktop.
# Stdlib imports.
# BUG FIX: gather and create_task were wrongly imported from aiohttp;
# they are asyncio functions.
from asyncio import gather, create_task
from string import Template

# Third-party imports.
from aiohttp import web, ClientSession
from bs4 import BeautifulSoup
async def news(request):
    """Handle GET /news: fetch headlines from all sites concurrently and
    render them into the index.html template.

    :param request: the incoming aiohttp request (unused beyond routing).
    :return: aiohttp ``web.Response`` containing the assembled HTML page.
    """
    sites = [
        ('http://edition.cnn.com', cnn_articles),
        ('http://www.aljazeera.com', aljazeera_articles),
    ]
    # BUG FIX: the closing paren of create_task() was misplaced
    # (create_task(news_fetch(*s) for s in sites)), which handed a
    # generator to create_task and built a one-element list.
    tasks = [create_task(news_fetch(*s)) for s in sites]
    await gather(*tasks)
    # Map headline text -> rendered HTML snippet; dict keys dedupe headlines.
    items = {
        text: (
            # BUG FIX: was "<div class>=\"box {kind}\"" — malformed HTML.
            f'<div class="box {kind}">'
            f'<span>'
            f'<a href="{href}">{text}</a>'
            f'</span>'
            f'</div>'
        )
        for task in tasks for href, text, kind in task.result()
    }
    content = ''.join(items[x] for x in sorted(items))
    # BUG FIX: template filename typo 'idex.html' -> 'index.html'.
    page = Template(open('index.html').read())
    return web.Response(
        body=page.safe_substitute(body=content),
        content_type='text/html',
    )
async def news_fetch(url, postprocess):
    """Fetch *url* through the local Splash render proxy and post-process it.

    :param url: site root to fetch (JavaScript-rendered by Splash).
    :param postprocess: callable ``(url, html_text) -> list`` of article tuples.
    :return: whatever *postprocess* returns for the fetched page.
    """
    # BUG FIX: the query string was missing its '?' separator, and the
    # local Splash proxy serves plain HTTP on port 8050, not HTTPS.
    proxy_url = (
        f'http://localhost:8050/render.html?'
        f'url={url}&timeout=60&wait=1'
    )
    # Session and response are closed by the context managers.
    async with ClientSession() as session:
        async with session.get(proxy_url) as resp:
            data = await resp.read()
    return postprocess(url, data.decode('utf-8'))
def cnn_articles(url, page_data):
    """Extract CNN headline links from a rendered page.

    :param url: site root, prefixed onto each relative href.
    :param page_data: the page HTML as text.
    :return: list of ``(absolute_url, headline_text, 'cnn')`` tuples.
    """
    soup = BeautifulSoup(page_data, 'lxml')

    def match(tag):
        # BUG FIX: the predicate body was empty (returned an empty tuple,
        # so find_all never matched anything). Restored the filter for
        # internal .html article links carrying a CNN headline element.
        return (
            tag.text and tag.has_attr('href')
            and tag['href'].startswith('/')
            and tag['href'].endswith('.html')
            and tag.find(class_='cd__headline-text')
        )

    headlines = soup.find_all(match)
    return [(url + hl['href'], hl.text, 'cnn') for hl in headlines]
def aljazeera_articles(url, page_data):
    """Extract Al Jazeera headline links from a rendered page.

    :param url: site root, prefixed onto each relative href.
    :param page_data: the page HTML as text.  (BUG FIX: the parameter was
        named ``pag_data`` while the body read ``page_data`` -> NameError.)
    :return: list of ``(absolute_url, headline_text, 'aljazeera')`` tuples.
    """
    # BUG FIX: parser name typo 'lmxl' -> 'lxml'.
    soup = BeautifulSoup(page_data, 'lxml')

    def match(tag):
        # BUG FIX: "tag['href'].start" was a truncated attribute access;
        # restored the startswith('/news') article-link filter.
        return (
            tag.text and tag.has_attr('href')
            and tag['href'].startswith('/news')
        )

    headlines = soup.find_all(match)
    # BUG FIX: a misplaced closing paren put the for-clause inside the
    # tuple (SyntaxError); this is the intended list comprehension.
    return [(url + hl['href'], hl.text, 'aljazeera') for hl in headlines]
if __name__ == "__main__":
    # Script entry point: wire up the single /news route and serve
    # forever on port 1237 (run_app blocks until interrupted).
    application = web.Application()
    application.router.add_get('/news', news)
    web.run_app(application, port=1237)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment