load("http.star", "http") | |
load("bsoup.star", "bsoup") | |
baseUrl = "http://www.lawhelp.org" | |
# download: qri's network step. Fetches the first directory page, then walks
# every paginated page, returning the combined list of resource rows.
def download(ctx):
  soup, rows = fetch_page(baseUrl + '/dc/find-legal-help/directory/')
  pages = page_links(soup)
  for page in pages:
    _, new_rows = fetch_page(page)
    rows = rows + new_rows
  return rows

# transform: called by qri after download; ctx.download holds download's
# return value.
def transform(ds, ctx):
  ds.set_structure(structure)
  ds.set_body(ctx.download)

# fetch_page: parses one directory page, extracting a row for each listing.
def fetch_page(url):
  soup = get_soup(url)
  rows = soup.find('ul', {'class': 'listing'}).find_all('li')
  rows = [extract_resource_index(row) for row in rows]
  return (soup, rows)

# page_links: collects absolute URLs for the remaining directory pages.
def page_links(soup):
  els = soup.find('div', {'class': 'pagination'}).contents()
  return [baseUrl + el.attrs()['href'] for el in els
          if el.attrs().get('href', '') != '' and el.attrs().get('class', '') != 'next']

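# The pagination markup this assumes (my reading of the parser above, not
# verified against the live site) is roughly:
#   <div class="pagination">
#     <a href="/dc/find-legal-help/directory?page=2">2</a>
#     ...
#     <a class="next" href="...">Next</a>
#   </div>
# Elements without an href, and the "next" shortcut link, are skipped so no
# page is fetched twice.
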
# schema:
# 0 name
# 1 description
# 2 locality
# 3 street_address
# 4 postal_code
# 5 telephone
# 6 website
# 7 resource_url
# 8 updated
# extract_resource_index: pulls the nine columns above out of a single
# directory listing, fetching the resource's detail page for the description
# and last-updated date.
def extract_resource_index(soup):
  data = [''] * 9
  h3 = soup.find('h3')
  if h3:
    link = h3.find('a')
    # resource_url:
    data[7] = baseUrl + link.attrs()['href'].strip()
    # name:
    data[0] = link.get_text().strip()
    # fetch details from sub page
    details = get_resource_details(data[7])
    data[8] = details['updated'].strip()
    data[1] = details['description'].strip()
  sa = soup.find('span', {'class': 'street-address'})
  if sa:
    # street_address:
    data[3] = sa.get_text().strip()
  loc = soup.find('span', {'class': 'locality'})
  if loc:
    # locality:
    data[2] = loc.get_text().strip()
  pc = soup.find('span', {'class': 'postal-code'})
  if pc:
    # postal_code:
    data[4] = pc.get_text().strip()
  tel = soup.find('div', {'class': 'tel'})
  if tel:
    # telephone:
    data[5] = tel.get_text().strip()
  site = soup.find('div', {'class': 'wrap-all'})
  if site:
    # website:
    data[6] = site.find('a').attrs()['href']
  return data

# get_resource_details: fetches a resource's own page and pulls out its
# description and "Last Review and Update" date.
def get_resource_details(rel):
  soup = get_soup(rel)
  details = {
    'updated': '',
    'description': '',
  }
  profile = soup.find('div', {'id': 'profile-tab'})
  if profile:
    sections = profile.find_all('div', {'class': 'section'})
    if len(sections) > 0:
      details['description'] = "\n".join([p.get_text() for p in sections[0].find_all('p')])
  card = soup.find('div', {'class': 'vcard'})
  if card:
    if len(card.contents()) >= 4:
      details['updated'] = card.contents()[3].get_text().replace('Last Review and Update:', '').strip()
  return details

# get_soup: GETs a URL with a browser User-Agent (presumably to avoid bot
# blocking) and parses the response body into a bsoup document.
def get_soup(url):
  print(url, "\n")
  res = http.get(url, headers={
    "User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:59.0) Gecko/20100101 Firefox/59.0",
  })
  return bsoup.parseHtml(res.body())

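# A more defensive variant of get_soup (a sketch; assumes starlib's http
# response exposes a status_code field, which may vary by qri version):
def get_soup_checked(url):
  res = http.get(url, headers={
    "User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:59.0) Gecko/20100101 Firefox/59.0",
  })
  if res.status_code != 200:
    # surface bad responses instead of silently parsing an error page
    print("unexpected status", res.status_code, "for", url)
  return bsoup.parseHtml(res.body())
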
# structure: the dataset structure written by transform; each body row is an
# array of strings matching the schema columns documented above.
structure = {
  'format': 'json',
  'strict': True,
  'schema': {
    'type': 'array',
    'items': {
      'type': 'array',
      'items': [
        {'title': 'name', 'type': 'string'},
        {'title': 'description', 'type': 'string'},
        {'title': 'locality', 'type': 'string'},
        {'title': 'street_address', 'type': 'string'},
        {'title': 'postal_code', 'type': 'string'},
        {'title': 'telephone', 'type': 'string'},
        {'title': 'website', 'type': 'string'},
        {'title': 'resource_url', 'type': 'string'},
        {'title': 'updated', 'type': 'string'},
      ],
    },
  },
}
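
# Usage (a sketch, assuming the qri CLI circa 2019; the flag syntax and the
# dataset name "me/dc-legal-help-directory" are illustrative):
#   qri save --file=transform.star me/dc-legal-help-directory
# qri runs download(ctx) first (the only step with network access), then
# transform(ds, ctx), which attaches the schema and scraped rows to the
# dataset.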