Spider that collects result pages via the Google or Baidu search engines.
from __future__ import print_function
from bs4 import BeautifulSoup as bs
import requests
from itertools import count
import time
from blinker import signal
from lxml.html.clean import Cleaner
from lxml.html import fromstring
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
import concurrent.futures
import traceback
import re
DownLoaderConf = {
    "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/42.0.2311.135 Safari/537.36 Edge/12.246",
    "headless": False,
    "ignore-certificate-errors": True,
    # "proxy-server": "socks5://127.0.0.1:1080",
    "no-sandbox": True,
    "executable_path": "/Users/apple/spider/chromedriver"
}
def get_driver_opts(conf):
    """Translate DownLoaderConf into Chrome options plus the chromedriver path."""
    opts = Options()
    path = None
    for k, v in conf.items():
        if isinstance(v, bool):
            if v:
                opts.add_argument("--{0}".format(k))
        elif k == "executable_path":
            path = v
        else:
            opts.add_argument("{0}={1}".format(k, v))
    assert path is not None, "webdriver path cannot be None"
    return path, opts
class DynamicDownLoader(object):
    """Fetches pages with a real Chrome browser so JavaScript is executed."""
    def __init__(self, browser=None, timeout=10):
        if browser is None:
            path, opts = get_driver_opts(DownLoaderConf)
            self.browser = webdriver.Chrome(executable_path=path, chrome_options=opts)
            self.browser.set_page_load_timeout(timeout)
        else:
            self.browser = browser

    def run(self, req):
        self.browser.get(req)
        return self.browser.page_source

    def close(self):
        self.browser.quit()
class StaticDownLoader(object):
    """Fetches pages with plain HTTP requests (no JavaScript)."""
    def __init__(self):
        self.session = requests.Session()

    def run(self, req):
        # `req` is a URL string, same as for DynamicDownLoader
        response = self.session.get(req)
        return response.text

    def close(self):
        self.session.close()
class Engine(object):
    entry = None
    max_result = 20
    max_requests = 10
    DownloaderClass = DynamicDownLoader
    delay_time = 1
    before_crawl = signal("before_crawl")
    after_crawl = signal("after_crawl")

    def __init__(self, entry=None, DownloaderClass=None, **kwargs):
        if entry is not None:
            self.entry = entry
        elif not getattr(self, "entry", None):
            raise ValueError("%s must have an entry" % type(self).__name__)
        if DownloaderClass is not None:
            self.DownloaderClass = DownloaderClass
        elif not getattr(self, "DownloaderClass", None):
            raise ValueError("%s must have a downloader" % type(self).__name__)
        self.__dict__.update(kwargs)
        self.result = []
        self.counter = count()
        self.request_counter = 0

    def set_downloader(self, downloader=None, **kwargs):
        if downloader is None:
            self.downloader = self.DownloaderClass(**kwargs)
        else:
            self.downloader = downloader

    def crawl(self, key):
        if not hasattr(self, "downloader"):
            raise ValueError("%s must set_downloader before crawl" % type(self).__name__)
        try:
            self.before_crawl.send(self)
            while True:
                if len(self.result) > self.max_result or self.request_counter > self.max_requests:
                    break
                req = self.make_next_request(key)
                try:
                    response = self.downloader.run(req)
                    for item in self.parse(response):
                        print(item)
                        self.result.append(item)
                except Exception:
                    traceback.print_exc()
                self.request_counter += 1
                self.delay()
        except Exception:
            traceback.print_exc()
        finally:
            self.downloader.close()
            self.after_crawl.send(self, key=key)

    def make_next_request(self, key, **kwargs):
        raise NotImplementedError

    def parse(self, response):
        raise NotImplementedError

    def delay(self):
        time.sleep(self.delay_time)
class GGEngine(Engine):
    """Searches Google and yields result URLs extracted from cached-page links."""
    entry = "https://www.google.com/search?gl=us&q="

    def make_next_request(self, key):
        url = self.entry + key
        start = 10
        c = next(self.counter)
        if c == 0:
            return url
        else:
            # Google paginates with &start=10, 20, 30, ...
            return "{}&start={}".format(url, start * c)

    def parse(self, response):
        soup = bs(response, 'lxml')
        for item in soup.find_all("ol"):
            i = item.select_one("li:nth-of-type(1) a")
            if i:
                href = i["href"]
                u = self.resolve_href(href)
                if u:
                    yield u

    @staticmethod
    def resolve_href(url):
        # Pull the target URL out of a "cache:<hash>:<url>+&cd=..." link
        g = re.match(r"(.*?)cache:(.*?):(.*?)\+&cd(.*)", url)
        if g:
            return g.group(3)
class BDEngine(Engine):
    """Searches Baidu; result links redirect through Baidu, so the real URL is
    read from the browser after clicking each one."""
    entry = "https://www.baidu.com/s?wd="

    def make_next_request(self, key):
        url = self.entry + key
        pn = 10
        c = next(self.counter)
        if c == 0:
            return url
        else:
            # Baidu paginates with &pn=10, 20, 30, ...
            return "{}&pn={}".format(url, pn * c)

    def parse(self, response):
        default_window = self.downloader.browser.current_window_handle
        for item in self.downloader.browser.find_elements_by_css_selector("a.c-showurl"):
            try:
                # Open the result in a new tab, record its final URL, then
                # close the tab and return to the results page.
                item.click()
                self.downloader.browser.switch_to_window(self.downloader.browser.window_handles.pop())
                yield self.downloader.browser.current_url
                self.downloader.browser.close()
                self.downloader.browser.switch_to_window(default_window)
                time.sleep(0.5)
            except Exception:
                continue
def static_get_source(url, timeout=10):
    if url.startswith("http"):
        response = requests.get(url, timeout=timeout)
    else:
        response = requests.get("http://{}".format(url), timeout=timeout)
    return response.content


def html2text(data):
    """Strip tags, scripts and styles, returning collapsed plain text."""
    try:
        doc = fromstring(data)
        cleaner = Cleaner(style=True)
        doc = cleaner.clean_html(doc)
        return re.sub('[ \t\n]+', " ", doc.text_content())
    except Exception:
        traceback.print_exc()
        return ''


def dynamic_get_source(url, timeout=10):
    dspider = None
    try:
        dspider = DynamicDownLoader(timeout=timeout)
        return dspider.run(url)
    except Exception:
        return ""
    finally:
        try:
            if dspider is not None:
                dspider.close()
        except Exception:
            pass
def spider(engine, downloader=None, dynamic=True):
    loader = dynamic_get_source if dynamic else static_get_source
    engine.set_downloader(downloader=downloader)

    def after_crawl(sender, key=""):
        import os
        if not os.path.exists("spider_out"):
            os.makedirs("spider_out")
        with concurrent.futures.ThreadPoolExecutor(max_workers=5) as executor:
            tasks = {executor.submit(loader, url, 10): url for url in sender.result}
            counter = 1
            for f in concurrent.futures.as_completed(tasks):
                url = tasks[f]
                try:
                    data = f.result()
                except Exception as exc:
                    print('%r generated an exception: %s' % (url, exc))
                else:
                    try:
                        data = html2text(data)
                        data = data.strip()
                        if data:
                            with open("spider_out/{}.html".format(key + str(counter)), "a+") as g:
                                g.write(data)
                    except Exception:
                        traceback.print_exc()
                        continue
                finally:
                    counter += 1

    # Connect with weak=False so the local handler is not garbage-collected
    # before the signal fires (blinker stores receivers weakly by default).
    engine.after_crawl.connect(after_crawl, weak=False)
    return engine
if __name__ == '__main__':
    import sys
    with open(sys.argv[-1]) as f:
        for line in f:
            s = spider(BDEngine())
            s.crawl(line.strip())
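
Usage notes, as a minimal sketch rather than part of the gist: the __main__ block expects the last command-line argument to be a plain-text file of search keywords, one per line (for example python search_spider.py keywords.txt, where both file names are placeholders), and it writes the extracted page text to spider_out/<keyword><n>.html. The engines can also be driven directly; the snippet below assumes chromedriver is installed at the path configured in DownLoaderConf and uses an illustrative keyword only.

# Hypothetical driver snippet: crawl one keyword with the Google engine
# instead of Baidu, then inspect the URLs that were collected.
engine = spider(GGEngine(max_result=5, max_requests=3))
engine.crawl("python+crawler")
print(engine.result)  # URLs gathered during the crawl; page text lands in spider_out/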