https://tmr.js.org/p/5b094510/
req:
https://quiet-wave-55460.herokuapp.com/title?url=https://tmr.js.org
resp:
{
"data": ["xxxx","xxxx"]
}
https://tmr.js.org/p/5b094510/
req:
https://quiet-wave-55460.herokuapp.com/title?url=https://tmr.js.org
resp:
{
"data": ["xxxx","xxxx"]
}
import requests | |
from bs4 import BeautifulSoup | |
from collections import Counter | |
import jieba | |
import jieba.analyse | |
def get_titles(url):
    """Fetch ``url`` and return the most likely title(s) of the page.

    Candidate titles are gathered from the ``title``, ``h1`` and ``h2`` tags
    and from elements carrying a few well-known title CSS classes (at most
    two non-empty texts per tag or class). When more than one distinct
    candidate exists, the top three keywords of all candidates are extracted
    with jieba and every candidate is scored by how many of those keywords
    it contains; the best one or two candidates are returned.

    Args:
        url: Address of the page to fetch.

    Returns:
        A list with zero, one or two title strings. The list is empty when
        the request fails, the response status is not OK, or no candidate
        contains any of the extracted keywords. Exceptions are printed and
        not propagated.
    """
    headers = {
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.111 Safari/537.36',
        'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
        'sec-fetch-site': 'none',
        'sec-fetch-mode': 'navigate',
        'sec-fetch-dest': 'document',
        'accept-language': 'zh-CN,zh;q=0.9'
    }
    result = []
    try:
        response = requests.request("GET", url, headers=headers, timeout=5)
        if response.ok:
            # Let requests guess the charset from the body so that pages
            # without a charset header still decode correctly.
            response.encoding = response.apparent_encoding
            soup = BeautifulSoup(response.text, 'html.parser')

            # Collect candidate titles, first by tag name, then by CSS class.
            titles = []
            for tag in ("title", "h1", "h2"):
                titles.extend(_collect_texts(soup.find_all(tag)))
            for css_class in ("title", "content-title",
                              "DailyHeader-title", "question-title"):
                titles.extend(
                    _collect_texts(soup.find_all(attrs={"class": css_class})))
            print("all titles: ", titles)

            # De-duplicate while keeping document order. A plain set() would
            # make the tie-breaking in most_common() below depend on string
            # hash randomization, i.e. differ between interpreter runs.
            titles = list(dict.fromkeys(titles))
            if len(titles) <= 1:
                return titles

            # Pick the best-suited titles: extract the top 3 keywords ...
            top_tag = jieba.analyse.extract_tags("\n".join(titles), topK=3)
            print("top_tag: ", top_tag)

            # ... and count how many of them each candidate contains.
            # Candidates without any hit never enter the counter.
            counter = Counter()
            for title in titles:
                for keyword in top_tag:
                    if keyword in title:
                        counter[title] += 1
            print("counter: ", counter)

            # Keep the two best candidates.
            top_title = counter.most_common(2)
            if len(top_title) == 2:
                (first, first_hits), (second, second_hits) = top_title
                related = (first.startswith(second) or first.endswith(second)
                           or second.startswith(first)
                           or second.endswith(first))
                if first_hits == second_hits and related:
                    # Same score and one is a prefix/suffix of the other:
                    # keep only the shorter one.
                    result.append(min(top_title, key=lambda x: len(x[0]))[0])
                else:
                    result.extend(title for title, _ in top_title)
            elif len(top_title) == 1:
                result.append(top_title[0][0])
    except Exception as e:
        # Best effort: report the problem and fall through to return
        # whatever has been collected so far.
        print(e)
    return result


def _collect_texts(items, limit=2):
    """Return the first ``limit`` non-empty cleaned texts of ``items``.

    Each item's ``.text`` is passed through ``clear_text`` (defined outside
    this block); results that are falsy after cleaning are skipped.
    """
    texts = []
    for item in items:
        text = clear_text(item.text)
        if text:
            texts.append(text)
    return texts[:limit]
beautifulsoup4==4.9.3 | |
bs4==0.0.1 | |
requests==2.25.1 | |
jieba==0.42.1 |