Created
October 24, 2017 17:44
-
-
Save aflansburg/912bb8d26773c7e42f25b4403b609e7b to your computer and use it in GitHub Desktop.
Scrape an Amazon page for the bullet points
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# This involves some manual work depending on the type and category of an item | |
# See below !! CATEGORY / ITEM SPECIFIC INFO !! | |
import requests | |
from bs4 import BeautifulSoup as BS | |
# !! CATEGORY / ITEM SPECIFIC INFO !!
# This function is the predicate for the list filter: it matches text that unfortunately couldn't be excluded with Beautiful Soup
# (or I just couldn't figure out how)
# Category breadcrumbs and page boilerplate that share the bullet points'
# markup on the product page. These are item/category specific: edit this
# tuple to suit the product being scraped.
_UNWANTED_ITEMS = (
    'Automotive',
    'Replacement Parts',
    'Shocks, Struts & Suspension',
    'Chassis',
    'Body Lift Kits',
    'Enter your model numberto make sure this fits.',
)


def f(x):
    """Filter predicate that picks out the unwanted, non-bullet-point entries.

    Returns ``x`` itself (truthy) when it equals one of the known
    breadcrumb/boilerplate strings, and ``None`` (falsy) otherwise, so it can
    be passed straight to ``filter()``.
    """
    return x if x in _UNWANTED_ITEMS else None
# Product page to scrape.
url = "https://www.amazon.com/dp/B00B2B3ZU8"

# requests never times out by default; set a limit so a stalled connection
# cannot hang the script, and fail loudly on an HTTP error status rather than
# parsing an error page.
page = requests.get(url, timeout=30)
page.raise_for_status()
html_contents = page.text
soup = BS(html_contents, "html.parser")

# Collect the text of every <span class="a-list-item"> element.
data = [span.text for span in soup.find_all('span', {'class': 'a-list-item'})]

# Cut out surrounding spaces, newlines, tabs and the U+203A character.
# A single strip() with the whole character set removes them in whatever
# order they occur; this may still require some tweaking per page.
data = [i.strip(' \n\t\u203a') for i in data]
# Drop entries that were nothing but padding.
data = [i for i in data if i]
# Remove newlines left inside the remaining entries.
data = [i.replace('\n', '') for i in data]

# Discard the category/boilerplate entries recognised by f().
removals = list(filter(f, data))
data = [i for i in data if i not in removals]

for i in data:
    print(i)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment