Skip to content

Instantly share code, notes, and snippets.

@abd1rahmane
Last active March 10, 2020 17:54
Show Gist options
  • Save abd1rahmane/eeba18b9fbe5e4b36693cd417dab98c8 to your computer and use it in GitHub Desktop.
Scraping product information using a keyword and catalog | DISCLAIMER: THIS TOOL IS BEING PROVIDED FOR EDUCATIONAL PURPOSES ONLY
import requests
from bs4 import BeautifulSoup
#
import json
from random import choice
"""
DISCLAIMER. THIS TOOL IS BEING PROVIDED
FOR EDUCATIONAL PURPOSES ONLY
By : @abderrahmane8
"""
class Store(object):
    """Data-scraping model for the website: website.com.

    Wraps the site's AJAX search endpoint and extracts product title,
    link and price from the returned HTML fragment.

    DISCLAIMER: THIS TOOL IS BEING PROVIDED FOR EDUCATIONAL PURPOSES ONLY.
    """

    def __init__(self, catalogId, storeId, langId):
        """Configure the store.

        Args:
            catalogId: site catalog identifier (string, from the search URL).
            storeId: site store identifier (string).
            langId: site language identifier (string, e.g. "-11").
        """
        super(Store, self).__init__()
        self.catalogId = catalogId
        self.storeId = storeId
        self.langId = langId
        self.session = self.session_wizard()
        #
        self.WEBSITE = "website.com"
        self.BASE_LINK = "website.com/shop"
        self.SEARCH_ACTION = "VFAjaxGetFilteredSearchResultsView"
        # Headers mimic a browser's XHR so the endpoint returns JSON.
        self.headers = {
            "authority": self.WEBSITE,
            "accept": "*/*",
            "sec-fetch-dest": "empty",
            "x-requested-with": "XMLHttpRequest",
            "user-agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Ubuntu Chromium/80.0.3987.87 Chrome/80.0.3987.87 Safari/537.36",
            "referer": "https://website.com/shop/SearchDisplay?catalogId=13503&storeId=7005&langId=-11&searchTerm=white2",
            "accept-language": "en-US,en;q=0.9,fr;q=0.8",
        }

    def __repr__(self):
        return "model : {identifer} ,storeId : {storeId}".format(
            identifer=self.__class__.__name__, storeId=self.storeId
        )

    def session_wizard(self):
        """Build a requests.Session with a random desktop/mobile User-Agent
        and a pooled HTTP adapter mounted for both schemes."""
        user_Agents = [
            # Desktop
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36",
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.140 Safari/537.36 Edge/18.17763",
            # Mobile
            "Mozilla/5.0 (iPhone; CPU iPhone OS 12_2 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/12.1 Mobile/15E148 Safari/604.1",
            "Mozilla/5.0 (iPhone; CPU iPhone OS 12_2 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Mobile/15E148",
        ]
        s1 = requests.session()
        s1.headers.update({"User-Agent": choice(user_Agents)})
        adapter = requests.adapters.HTTPAdapter(pool_connections=10, pool_maxsize=10)
        s1.mount("https://", adapter)
        s1.mount("http://", adapter)
        return s1

    def search_by_keyword(self, keyword, beginIndex, filename):
        """Search the store and dump the scraped products to a JSON file.

        Args:
            keyword: search term (surrounding whitespace is stripped).
            beginIndex: pagination offset of the first result to fetch.
            filename: path of the JSON file to write the results to.

        Returns:
            dict with the store identifiers, the keyword and a
            "Products" list of {product_title, product_link, product_price}.

        Raises:
            requests.HTTPError: if the search endpoint returns an error status.
        """
        params = (
            ("catalogId", self.catalogId),
            ("searchSource", "N"),
            ("storeId", self.storeId),
            ("langId", self.langId),
            ("searchTerm", keyword.strip()),
            ("beginIndex", str(beginIndex)),
            ("facet", ""),
            ("requesttype", "ajax"),
        )
        # BUGFIX: go through the pooled session built in session_wizard()
        # (it was created but never used) instead of a bare requests.get.
        response = self.session.get(
            "https://{link}/{action}".format(
                link=self.BASE_LINK, action=self.SEARCH_ACTION
            ),
            headers=self.headers,
            params=params,
        )
        # Fail with a clear HTTP error instead of a confusing JSON decode error.
        response.raise_for_status()
        products_html_data = response.json().get("products")  # UTF-8
        #
        soup = BeautifulSoup(products_html_data, "html.parser")
        catalog_results = soup.find("div", {"id": "catalog-results"})
        PRODUCT_DATA = []
        for item in catalog_results.find_all(
            "div", {"class": "product-block-info info info-js"}
        ):
            # A block missing the name link or price yields None from find();
            # the subsequent attribute access raises AttributeError, and we
            # skip only that product instead of swallowing every exception.
            try:
                product = item.find("a", {"class": "product-block-name-link"})
                product_link = product.get("href")
                product_title = product.get("title")
                price = (
                    (item.find("span", {"class": "product-block-price"}))
                    .text.replace("£", "")
                    .strip()
                )
            except AttributeError:
                continue
            PRODUCT_DATA.append(
                dict(
                    product_title=product_title,
                    product_link=product_link,
                    product_price=price,
                )
            )
        DATA_MODEL = {
            # BUGFIX: previously read bare module globals catalogId/storeId/
            # langId — use the instance's own configuration.
            "catalogId": self.catalogId,
            "storeId": self.storeId,
            "langId": self.langId,
            "keyword": keyword,
            "Products": PRODUCT_DATA,
        }
        # BUGFIX: honor the filename parameter (output was hard-coded to
        # "PRODUCT_DATA.json" and the argument silently ignored).
        with open(filename, "w", encoding="utf-8") as f:
            json.dump(DATA_MODEL, f)
        return DATA_MODEL
# Script entry point: configure the store and run one keyword search.
# Guarded so importing this module does not fire a network request.
if __name__ == "__main__":
    # Store identifiers as they appear in the site's own search URL.
    catalogId = "13503"
    storeId = "7005"
    langId = "-11"
    thenorthface_co_uk = Store(catalogId=catalogId, storeId=storeId, langId=langId)

    # Search the store, starting from the first page of results.
    keyword = "Skirt"
    beginIndex = 0
    raw_result_ = thenorthface_co_uk.search_by_keyword(
        keyword=keyword, beginIndex=beginIndex, filename="MyData.json"
    )
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment