Skip to content

Instantly share code, notes, and snippets.

@leekiernan
Last active April 5, 2018 11:25
Show Gist options
  • Save leekiernan/b99f520adf9dd97416ee2ac107db2933 to your computer and use it in GitHub Desktop.
Save leekiernan/b99f520adf9dd97416ee2ac107db2933 to your computer and use it in GitHub Desktop.
from bs4 import BeautifulSoup
import asyncio
import aiohttp
import aiofiles
import csv
import os
import re
# Path of the text file listing the URLs to process, one per line.
FILE_TO_OPEN = './urls2.txt'
def writerow(writer, url, level):
    """Write one ``URL``/``level`` row extracted from a hierarchy element.

    The numeric level is taken from the first ``level_<digits>`` token in
    ``str(level)`` that is followed, later on the same line, by the word
    ``selected``.

    Args:
        writer: A ``csv.DictWriter``-like object whose ``writerow`` accepts a
            dict with the keys ``'URL'`` and ``'level'``.
        url: Value stored in the ``URL`` column.
        level: Any object whose string form contains the markup to search.

    Raises:
        AttributeError: If no selected level is found. The exception type is
            kept as ``AttributeError`` because ``handler`` catches it to fall
            back to another element.
    """
    # Raw string: '\d' inside a plain literal is an invalid escape sequence.
    capture = re.search(r'level_(\d+).*?selected', str(level))
    if capture is None:
        raise AttributeError(
            "no selected hierarchy level found for {0}".format(url)
        )
    writer.writerow({'URL': url, 'level': capture.group(1)})
def get_urls(file):
    """Return every line of the text file at ``file`` as a list.

    Args:
        file: Path of the file to read.

    Returns:
        The lines in file order. Each line keeps its trailing newline, so
        callers are expected to strip it.
    """
    # The context manager guarantees the handle is closed, even on error.
    with open(file, "r") as contents:
        return contents.readlines()
async def fetch(session, url, *, max_retries=None, retry_delay=0.5):
    """Download ``url`` and return the page parsed with ``BeautifulSoup``.

    A failed connection attempt (``ClientConnectorError``) is retried after
    ``retry_delay`` seconds. With the default ``max_retries=None`` the
    request is retried until it succeeds.

    Args:
        session: Object whose ``get(url)`` returns an async context manager
            yielding a response with an awaitable ``read()`` (an
            ``aiohttp.ClientSession`` in this file).
        url: Address to request.
        max_retries: Number of retries allowed after the first attempt, or
            ``None`` for no limit.
        retry_delay: Seconds to wait between attempts.

    Returns:
        The response body decoded as UTF-8 and parsed with ``html.parser``.

    Raises:
        aiohttp.client_exceptions.ClientConnectorError: Re-raised once
            ``max_retries`` retries have failed (never with the default).
        UnicodeDecodeError: If the body is not valid UTF-8.
    """
    retries_used = 0
    # A loop rather than self-recursion, so a long outage cannot keep
    # deepening the chain of awaiting coroutines.
    while True:
        try:
            async with session.get(url) as response:
                body = await response.read()
        except aiohttp.client_exceptions.ClientConnectorError:
            if max_retries is not None and retries_used >= max_retries:
                raise
            retries_used += 1
            await asyncio.sleep(retry_delay)
            continue
        return BeautifulSoup(body.decode('utf-8'), 'html.parser')
async def handler(url):
    """Fetch ``url`` and record its selected hierarchy level in the CSV.

    The last ``ul.hierarchy_level`` element of the page is tried first. If
    no selected level can be read from it, the second-to-last element is
    tried instead. When neither yields a level, or no document was fetched,
    a ``Problem @ <url>`` line is printed and nothing is written.

    Relies on the module-level ``writer`` having been created before the
    coroutine runs.

    Args:
        url: Address of the page to inspect.
    """
    # NOTE(review): a connector is built on every call, so ``limit=30`` caps
    # only this one session and not the number of requests in flight across
    # all handler() calls. Confirm whether a shared session was intended.
    connec = aiohttp.connector.TCPConnector(limit=30)
    async with aiohttp.ClientSession(connector=connec) as session:
        response = await fetch(session, url)

    if not response:
        print("Problem @ {0}".format(url))
        return

    levels = response.find_all("ul", class_="hierarchy_level")
    # Candidates in priority order: last element, then the one before it.
    for candidate in (levels[-1:], levels[-2:-1]):
        try:
            writerow(writer, url, candidate)
        except AttributeError:
            # writerow found no selected level in this candidate.
            continue
        return
    print("Problem @ {0}".format(url))
async def _crawl(targets):
    """Run handler() for every target concurrently and return each outcome.

    Exceptions are returned in place of results rather than raised, so one
    failing URL does not abort the others.
    """
    return await asyncio.gather(
        *(handler(target) for target in targets),
        return_exceptions=True
    )


# newline='' lets the csv module control line endings; without it, extra
# blank rows appear on platforms that translate '\n' on write.
with open('levels.csv', 'w', newline='') as csvfile:
    # Set up csv writer - write header row.
    fieldnames = ['URL', 'level']
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
    writer.writeheader()

    urls = get_urls(FILE_TO_OPEN)
    # Strip line endings and skip blank lines, so no request is attempted
    # for an empty URL.
    targets = [url.rstrip() for url in urls if url.rstrip()]

    # Gathering inside a coroutine on an explicitly created loop avoids
    # passing bare coroutines to asyncio.wait (rejected on newer Python)
    # and also copes with an empty URL list.
    loop = asyncio.new_event_loop()
    try:
        outcomes = loop.run_until_complete(_crawl(targets))
    finally:
        loop.close()

    # Report failures that would otherwise be dropped silently.
    for target, outcome in zip(targets, outcomes):
        if isinstance(outcome, BaseException):
            print("Problem @ {0}: {1!r}".format(target, outcome))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment