Skip to content

Instantly share code, notes, and snippets.

@towry
Created November 28, 2015 10:56
Show Gist options
  • Save towry/0a7ef16573916967b6e0 to your computer and use it in GitHub Desktop.
Save towry/0a7ef16573916967b6e0 to your computer and use it in GitHub Desktop.
from bs4 import BeautifulSoup
import sys
# Python 2 only: site.py removes sys.setdefaultencoding at interpreter
# startup, so the module is reloaded to get the function back. The process-wide
# default codec for implicit str <-> unicode conversions is then set to UTF-8.
# NOTE(review): this is a global side effect that affects every module in the
# process; `reload` is not a builtin on Python 3.
reload(sys)
sys.setdefaultencoding('utf-8')
# Selects the HTTP client library: when true, the SOCKS-capable `requesocks`
# fork is imported under the name `requests`; otherwise plain `requests` is.
USE_PROXY = True
if USE_PROXY:
    import requesocks as requests
else:
    import requests

# configs
# Proxy URL per scheme; both point at a SOCKS5 endpoint on localhost:1080.
proxies = {
    "http": "socks5://127.0.0.1:1080",
    "https": "socks5://127.0.0.1:1080",
}
# Parser name handed to BeautifulSoup when building the tree.
HTML_PARSER = "html.parser"
# NOTE: get_soup decodes pages as GBK; gb18030 is a superset codec worth
# trying if a page contains characters outside GBK.
def _make_session():
    """Create a new HTTP session.

    The module-level ``proxies`` mapping is attached to the session only when
    ``USE_PROXY`` is enabled, so switching the flag off really produces a
    direct (unproxied) session.

    Returns:
        A new session object created by ``requests.session()``.
    """
    session = requests.session()
    if USE_PROXY:
        session.proxies = proxies
    return session


def _get(session, url):
    """Issue a GET request for *url* through *session*.

    Args:
        session: Session object, e.g. one returned by ``_make_session``.
        url: Address to fetch.

    Returns:
        Whatever ``session.get`` returns (the response object).
    """
    # 35 second timeout so a dead proxy or host cannot hang the caller forever.
    kwargs = {"timeout": 35}
    if USE_PROXY:
        # Passed per request as well, so sessions not built by _make_session
        # are still routed through the proxy.
        kwargs["proxies"] = proxies
    return session.get(url, **kwargs)
def get_soup(url, encoding='gbk'):
    """Download *url* and parse the response body with BeautifulSoup.

    Args:
        url: Address of the page to fetch.
        encoding: Codec used to decode the raw response bytes. Defaults to
            ``'gbk'`` (the previous hard-coded value); ``'gb18030'`` can be
            passed for pages that use characters outside GBK.

    Returns:
        A ``BeautifulSoup`` tree built with ``HTML_PARSER``.

    Errors raised by the underlying ``session.get`` call propagate unchanged.
    """
    session = _make_session()
    response = _get(session, url)
    # 'replace' turns undecodable bytes into U+FFFD instead of aborting the
    # whole page with UnicodeDecodeError because of one stray byte.
    text = response.content.decode(encoding, 'replace')
    # Hand over the decoded text directly: re-encoding to UTF-8 bytes would
    # make BeautifulSoup guess the encoding again, and the page's own declared
    # charset could then mislead it.
    return BeautifulSoup(text, HTML_PARSER)
"""
Usage:
soup = get_soup('http://www.google.com')
links = soup.find_all('a')
"""
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment