Parser
import os
import re
import multiprocessing

import requests
import lxml  # noqa: F401 - the 'lxml' parser backend must be installed for BeautifulSoup
from bs4 import BeautifulSoup


def transform_cache_url(url):
    # Replace every character that is unsafe in a file name with an underscore.
    return re.sub(r"[^0-9a-zA-Z]", '_', url)


def transform_page_urls(text, url):
    # Strip the absolute host prefix so cached pages keep only relative links.
    return text.replace(url, '')
class Trendsgal:
    base_uri = 'https://www.trendsgal.com'
    base_uri_to_replace = '//www.trendsgal.com'
    user_agent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) ' \
                 'Chrome/68.0.3440.84 Safari/537.36'
    selectors = {
        'menu_items': 'li[gao="bowen"] dl dd a'
    }
    cache_dir = './cached/'

    def __init__(self):
        # Per-instance state (the gist declared these as shared class attributes).
        self.request_headers = {'User-Agent': self.user_agent}
        self.requests_session = requests.Session()
        self.requests_session.headers.update(self.request_headers)
        self.categories = []
        self.domParser = None
        if not os.path.exists(self.cache_dir):
            os.makedirs(self.cache_dir, mode=0o755, exist_ok=True)
        self.run()
    def run(self):
        self.get_categories()

    def _get_page(self, url=None, prefix=None):
        url = self.base_uri + url
        cached = self._get_cached_page(url, prefix)
        if cached['status'] is True:
            return cached['data']
        response = self.requests_session.get(url)
        dom_parser = BeautifulSoup(response.text, 'lxml')
        self._set_cached_page(url, prefix, dom_parser)
        return dom_parser
    def _get_cached_page(self, url, prefix=None):
        if prefix is None:
            prefix = 'cache'
        url = transform_cache_url(url)
        cached_file = prefix + '_%(url)s' % {'url': url}
        status = os.path.isfile(self.cache_dir + cached_file + '.html')
        data = None
        if status is True:
            # Read the cached copy through a context manager so the handle is closed.
            with open(self.cache_dir + cached_file + '.html', 'r', encoding='utf-8') as handle:
                data = BeautifulSoup(handle.read(), 'lxml')
        return {
            'status': status,
            'data': data
        }
    def _set_cached_page(self, url, prefix=None, content=None):
        if prefix is None:
            prefix = 'cache'
        url = transform_cache_url(url)
        cached_file = prefix + '_%(url)s' % {'url': url}
        # Writing str(content) keeps real HTML; the original str(content.encode('utf-8'))
        # would have serialised a bytes repr such as b'<html>...'.
        with open(self.cache_dir + cached_file + '.html', 'w', encoding='utf-8') as handle:
            handle.write(
                transform_page_urls(
                    str(content),
                    self.base_uri_to_replace
                )
            )
    def get_categories(self):
        self.domParser = self._get_page('/')
        menu_items = self.domParser.select(self.selectors['menu_items'])
        category_index = 1
        for menu_item in menu_items:
            category = {
                'index': category_index,
                'title': menu_item.text.replace('\\', ''),
                'link': transform_page_urls(menu_item.attrs['href'], self.base_uri_to_replace),
                'products': []
            }
            self.categories.append(category)
            category_index += 1
        self.proceed_categories()
    def proceed_categories(self):
        workers_jobs = []
        for work in self.categories:
            # The target is a bound method, so `self` must not be repeated in args;
            # only the category index is forwarded to the worker process.
            work_process = multiprocessing.Process(
                target=self.proceed_category_products, args=(work.get('index'),)
            )
            workers_jobs.append(work_process)
            work_process.start()
        for work_process in workers_jobs:
            work_process.join()
    def proceed_category_products(self, index):
        # Category indexes start at 1, while list positions start at 0.
        category = self.categories[index - 1]
        pagination = {
            'set': False,
            'total': 1,
            'current': 1
        }
        while pagination.get('current') <= pagination.get('total'):
            page = ''
            if pagination['current'] > 1:
                page = 'p_%d' % pagination['current']
            # The gist requested '/%r' % page, which yields a quoted path; fetching
            # the category link plus the pagination suffix is assumed to be the intent.
            self._get_page(
                category.get('link') + page,
                'category_%s_products_%s' % (category.get('title'), page)
            )
            pagination['current'] += 1
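
# The gist never instantiates the class, so running the file as-is does nothing.
# A minimal entry point like this is assumed; the __main__ guard also matters for
# multiprocessing on platforms that spawn (rather than fork) worker processes.
if __name__ == '__main__':
    Trendsgal()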