# Extract data from the anuga.com downloaded pages and create a sophisticated CSV table
from bs4 import BeautifulSoup
import csv
from tqdm import trange
import re

page_first = 1
page_last = 7789
with open('anuga_data.csv', 'w', newline='', encoding='utf-8') as f_out:
    field_names = ['Company Name', 'Address', 'Contact 1', 'Contact 2', 'Email', 'Website',
                   'Category', 'Sub-Category', 'Product Name', 'Sector', 'Channel', 'Target Market']
    writer = csv.DictWriter(f_out, fieldnames=field_names)
    writer.writeheader()
    # Expects pre-downloaded pages named html_pages/0001.html ... html_pages/7789.html
    for i in trange(page_first, page_last + 1):
        with open('html_pages/%04d.html' % i, encoding='utf-8') as f_in:
            soup = BeautifulSoup(f_in, 'lxml')
        row = {}
        row['Company Name'] = soup.find('h1', class_='h1down').get_text().strip()
        soup_contacts = soup.find_all('div', class_='texts grey')
        contacts_len = len(soup_contacts)
        assert contacts_len
        # Collapse runs of blank lines and indentation inside the address block
        row['Address'] = re.sub(r'\n[\s\n]*', '\n', soup_contacts[0].get_text().strip())
        if 1 < contacts_len:
            row['Contact 1'] = soup_contacts[1].get_text().strip()
        if 2 < contacts_len:
            row['Contact 2'] = soup_contacts[2].get_text().strip()
        if 3 < contacts_len:
            row['Email'] = soup_contacts[3].get_text().strip()
        if 4 < contacts_len:
            row['Website'] = soup_contacts[4].get_text().strip()
        # I could probably simplify the following with the stripped_strings generator of bs4.
        soup_div = soup.find('div', class_='searchcontent')
        non_empty_fields = [x2 for x2 in (x1.get_text().strip() for x1 in soup_div.find_all('b')) if x2 != '']
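        # A hedged sketch of that simplification (untested): stripped_strings already
        # skips whitespace-only text, so the empty-string filter would go away:
        # non_empty_fields = [s for b in soup_div.find_all('b') for s in b.stripped_strings]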
        soup = soup.find_all('ul', class_='ultree')
        # Product fields can actually be empty, but their captions are always present.
        assert len(soup) == len(non_empty_fields)
        # soup[0] is the products tree; the remaining trees follow in a fixed order.
        index = 1
        if 'Product sector' in non_empty_fields:
            row['Sector'] = re.sub(r'\n[\s\n]*', ', ', soup[index].get_text().strip())
            index += 1
        if 'Distribution Channel' in non_empty_fields:
            row['Channel'] = re.sub(r'\n[\s\n]*', ', ', soup[index].get_text().strip())
            index += 1
        if 'Target and sales markets' in non_empty_fields:
            row['Target Market'] = re.sub(r'\n[\s\n]*', ', ', soup[index].get_text().strip())
        data = {}
        for soup_category in soup[0].find_all('li', recursive=False):
            # Sometimes there is an empty <li> before the categories (encountered on page 1138)
            try:
                category = next(soup_category.stripped_strings)
            except StopIteration:
                continue
            # Strangely there is a <ul> for every subcategory and product, not just an <li>
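            # Assumed markup shape (reconstructed from the traversal below, hedged):
            # <ul class="ultree">
            #   <li>Category
            #     <ul><li>Sub-category
            #       <ul><li>Product</li></ul>
            #     </li></ul>
            #   </li>
            # </ul>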
            data[category] = {}
            for soup_subcategory in (x.find('li') for x in soup_category.find_all('ul', recursive=False)):
                subcategory = next(soup_subcategory.stripped_strings)
                data[category][subcategory] = []
                for soup_product in (x.find('li') for x in soup_subcategory.find_all('ul', recursive=False)):
                    product = next(soup_product.stripped_strings)
                    data[category][subcategory].append(product)
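        # Flatten the category tree into CSV rows. enumerate(..., start=1) leaves
        # each counter at 0 when its level is empty, so the trailing "if iN == 0"
        # checks still emit the partially filled row; "if iN > 1" resets row so
        # the company details and parent levels appear only on their first row.
        # E.g. (hedged illustration) data = {'Dairy': {'Cheese': ['Gouda', 'Edam']}}
        # yields one full row ending "...Dairy, Cheese, Gouda" and a second row
        # that is blank except for Product Name "Edam".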
        i1 = 0
        for i1, category in enumerate(data, start=1):
            if i1 > 1:
                row = {}
            row['Category'] = category
            i2 = 0
            for i2, subcategory in enumerate(data[category], start=1):
                if i2 > 1:
                    row = {}
                row['Sub-Category'] = subcategory
                i3 = 0
                for i3, product in enumerate(data[category][subcategory], start=1):
                    if i3 > 1:
                        row = {}
                    row['Product Name'] = product
                    writer.writerow(row)
                if i3 == 0:
                    writer.writerow(row)
            if i2 == 0:
                writer.writerow(row)
        if i1 == 0:
            writer.writerow(row)