@jrainlau
Created January 10, 2018 07:55
Erotic fiction download crawler
from bs4 import BeautifulSoup
import requests
import os


def visit_page(url):
    """Fetch a page and return it as a parsed BeautifulSoup tree."""
    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.108 Safari/537.36'
    }
    r = requests.get(url, headers=headers)
    r.encoding = 'utf-8'
    soup = BeautifulSoup(r.text, 'lxml')
    return soup


def get_article_urls(page):
    """Collect the title and absolute URL of every article linked on a list page."""
    articles = page.select('body > div.p_list > div.wrapper1200 > div > section.main.fl > div.con > div.bd > ul > li > p > a')
    collect = []
    for article in articles:
        data = {
            'title': article.get_text(),
            'url': 'http://laotiese.org' + article.get('href')
        }
        collect.append(data)
    return collect


def get_article(articles_urls):
    """Download each article's body text and save it as ./articles/<title>.txt."""
    os.makedirs('./articles', exist_ok=True)  # the open() below fails if the directory is missing
    for index, article in enumerate(articles_urls):
        article_page = visit_page(article['url'])
        # Replace path separators so the title is safe to use as a filename
        title = article['title'].replace('/', '_')
        content = article_page.select('#content_news')[0].get_text()
        with open('./articles/' + title + '.txt', 'w', encoding='utf-8') as f:
            print('Downloading article ' + str(index + 1) + '/' + str(len(articles_urls)) + '...')
            f.write(content)


PAGE = visit_page('http://laotiese.org/txtsex/index.html')
ARTICLE_URLS = get_article_urls(PAGE)
get_article(ARTICLE_URLS)
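
# --- Optional hardening (a sketch, not part of the original gist) ---
# The script above has no error handling: a single timeout or connection
# reset aborts the whole run, and requests are fired back-to-back with no
# pause. Below is a minimal retrying, rate-limited fetch that could replace
# the bare requests.get call inside visit_page; the retry count, delay, and
# timeout values are arbitrary choices, not from the gist.
import time


def fetch_with_retry(url, headers, retries=3, delay=2, timeout=10):
    """Fetch a URL, retrying on network errors with a pause between attempts."""
    for attempt in range(1, retries + 1):
        try:
            r = requests.get(url, headers=headers, timeout=timeout)
            r.raise_for_status()  # treat HTTP 4xx/5xx responses as failures too
            return r
        except requests.RequestException as exc:
            print('Attempt {}/{} failed for {}: {}'.format(attempt, retries, url, exc))
            if attempt == retries:
                raise
            time.sleep(delay)  # doubles as crude rate limiting between retries

# Usage: r = fetch_with_retry(url, headers), then parse r.text exactly as
# visit_page does. This also gives each request a timeout, which requests
# does not apply by default.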