Created October 30, 2022 07:15
-
-
Save prnake/91a2d5ab06cd1c8f685b2cfa1ae114a8 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from lxml import etree | |
import requests | |
import html2text | |
from bs4 import BeautifulSoup | |
import codecs | |
def request_get(url, timeout=3):
    """Fetch *url* with a desktop-browser User-Agent and return the response.

    Args:
        url: Address to request.
        timeout: Seconds passed to ``requests.get`` as its ``timeout``
            argument. Defaults to 3, the value that used to be hard-coded.

    Returns:
        The ``requests`` response object. The status code is not checked
        here; callers inspect ``response.status_code`` themselves.

    Raises:
        Whatever ``requests.get`` raises (for example on a timeout or a
        connection failure); nothing is caught here.
    """
    headers = {
        # The header value must not repeat the header name: the previous
        # value started with a literal "User-Agent: " prefix, so the server
        # received "User-Agent: User-Agent: Mozilla/5.0 ...".
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2272.118 Safari/537.36'}
    # A Session object used to be created here but was never used; the
    # request has always gone through the module-level requests.get().
    response = requests.get(url, headers=headers, timeout=timeout)
    return response
def _hexo_categories(soup):
    """Return the Hexo ``categories:`` block for the page, or '' if absent."""
    try:
        category = (
            soup.find(attrs={'class': 'tags-box space'})
            .find(attrs={'class': 'tag-link'})
            .get_text()
        )
    except AttributeError:
        # One of the find() calls returned None: the article has no category.
        return ''
    if category == '':
        return ''
    # Strip the tab characters that come with the extracted text.
    return 'categories:\n' + '- ' + category.replace('\t', '') + '\n'


def _hexo_tags(soup):
    """Return the Hexo ``tags:`` block for the page, or '' if absent."""
    try:
        raw = soup.find(attrs={'class': 'tags-box artic-tag-box'}).get_text()
    except AttributeError:
        # find() returned None: the article has no tag box.
        return ''
    lines = raw.split('\n')
    # The tag names are read from the third line of the box's text; bail out
    # instead of raising IndexError when the text has fewer lines than that.
    if len(lines) < 3:
        return ''
    names = [name for name in lines[2].replace('\t', ' ').split(' ') if name != '']
    if not names:
        return ''
    return 'tags:\n' + ''.join('- ' + name + '\n' for name in names)


def _hexo_front_matter(soup, title):
    """Return a Hexo front-matter header (title, categories, tags)."""
    return (
        '---\n'
        + 'title: ' + title + '\n'
        + _hexo_categories(soup)
        + _hexo_tags(soup)
        + '---\n'
    )


def CrawlingItemBlog(base_url, id, with_front_matter=False):
    """Download one blog article and save it as Markdown under ``./mds/``.

    The page at ``base_url + 'article/details/' + id`` is fetched with
    ``request_get``. The element with id ``content_views`` is converted to
    Markdown with html2text and written to ``./mds/<title>.md``, where the
    title is the text of the element with class ``title-article``.

    Args:
        base_url: URL prefix of the blog; ``'article/details/'`` and *id*
            are appended to it.
        id: Article identifier as a string (it is concatenated to the URL).
        with_front_matter: When true, a Hexo front-matter header built from
            the page's title, category and tags is written before the
            Markdown body. Defaults to False, which writes the body only.

    Returns:
        False when the response status is not 200 or when the page has no
        content or title element; True otherwise. True is returned even if
        writing the file failed; in that case the title is printed.
    """
    url = base_url + 'article/details/' + id
    # Send the request and take the response.
    item_html = request_get(url)
    if item_html.status_code != 200:
        return False

    # Parse the returned HTML. The parser is named explicitly so the result
    # does not depend on which parsers happen to be installed; lxml is what
    # BeautifulSoup prefers by default when it is available.
    soup = BeautifulSoup(item_html.text, 'lxml')
    content = soup.find(id="content_views")
    title_article = soup.find(attrs={'class': 'title-article'})
    if content is None or title_article is None:
        # Not an article page in the expected layout; report failure
        # instead of crashing on a None attribute access.
        return False

    # The title doubles as the name of the output file.
    file_name = title_article.get_text()

    # Convert the HTML to Markdown.
    text_maker = html2text.HTML2Text()
    text_maker.bypass_tables = False
    text = text_maker.handle(content.prettify())

    if with_front_matter:
        text = _hexo_front_matter(soup, file_name) + text

    # Some titles are not valid file names, so creating the file can fail.
    try:
        # The with-block closes the file even if the write raises.
        with codecs.open('./mds/' + file_name + '.md', 'w', encoding='utf-8') as f:
            f.write(text)
    except (OSError, ValueError):
        # Best effort: report the title that could not be saved and go on.
        print(file_name)
    return True
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment.