import xml.etree

# NOTE: the original aliases for pandas and bs4 were garbled characters
# (apparently mis-decoded emoji) that are not valid Python identifiers, so the
# conventional names are used instead.
import bs4
import pandas as pd
import requests
import requests_cache
# Page to download and parse. The address has to be a string literal; a bare
# URL after a line continuation is not valid Python.
url = 'https://bl.ocks.org/tonyfast/2947b4bb582e193f5b2a7dbf8b009b62'

# Install the cache (named 'signal') before the request is issued so the
# request below goes through it.
requests_cache.install_cache('signal')

response = requests.get(url)

# Parse the raw response bytes using Beautiful Soup's 'xml' feature.
tree = bs4.BeautifulSoup(response.content, 'xml')
def ravel(soup, level=0):
    """Walk a Beautiful Soup tree depth-first, yielding each tag with its depth.

    Args:
        soup: Node to start from; typically a ``bs4.BeautifulSoup`` document
            or a ``bs4.element.Tag``.
        level: Depth assigned to ``soup``; every generation of children is one
            level deeper.

    Yields:
        ``(tag, level)`` tuples, parent before children. Only instances of
        ``bs4.element.Tag`` are yielded; any other node is skipped.
    """
    if isinstance(soup, bs4.element.Tag):
        yield soup, level
    # Nodes that expose no ``children`` attribute fall back to an empty list,
    # which ends the recursion on that branch.
    for child in getattr(soup, 'children', []):
        yield from ravel(child, level + 1)
def tidysoup(soup):
    """Flatten a Beautiful Soup tree into a table with one row per tag.

    Args:
        soup: Root node to flatten; it is traversed with ``ravel``.

    Returns:
        The result of mapping every tag to a ``pandas.Series`` built from its
        attributes plus ``tag`` (the element name) and ``text``. Rows are
        indexed by ``(index, level)``: the tag's position in the traversal
        and its depth.
    """
    # Walk the node that was passed in. The previous version ignored ``soup``
    # and always traversed the module-level ``tree``.
    tags = pd.DataFrame(ravel(soup), columns=['tag', 'level'])
    tags = tags.reset_index().set_index(['index', 'level']).tag
    # 'tag' and 'text' come last in the dict so they win over any element
    # attribute that happens to use the same name.
    return tags.apply(
        lambda node: pd.Series(
            {**node.attrs, 'tag': node.name, 'text': node.text}
        )
    )