ttttmr · December 30, 2020 13:53
diff --git a/doc.md b/doc.md
diff --git a/get_titles.py b/get_titles.py
 import requests
 from bs4 import BeautifulSoup
 from collections import Counter
 import jieba
 import jieba.analyse

 def get_titles(url):
     """Fetch ``url`` and return the text of its most likely page title(s).

     Candidate titles are gathered from the ``<title>``, ``<h1>`` and ``<h2>``
     tags and from elements carrying one of a few title-like CSS classes (at
     most two non-empty texts per selector). When more than one distinct
     candidate remains, the top three keywords of all candidates are extracted
     with jieba and the candidates containing the most keywords win.

     Args:
         url: Address of the page to download.

     Returns:
         A list of title strings. With zero or one distinct candidate that
         list is returned as-is; otherwise it holds the one or two
         best-scoring candidates, or is empty when no keyword occurs in any
         candidate. A non-OK response yields an empty list. Any exception is
         printed rather than raised, and the (normally empty) result list is
         returned.
     """
     headers = {
         'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.111 Safari/537.36',
         'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
         'sec-fetch-site': 'none',
         'sec-fetch-mode': 'navigate',
         'sec-fetch-dest': 'document',
         'accept-language': 'zh-CN,zh;q=0.9'
     }
     result = []
     try:
         response = requests.request("GET", url, headers=headers, timeout=5)
         if not response.ok:
             return result

         # Decode the body with the encoding detected from its content
         # instead of the one announced by the HTTP headers.
         response.encoding = response.apparent_encoding
         soup = BeautifulSoup(response.text, 'html.parser')

         # One find_all() keyword set per source of candidates: tags first,
         # then title-like CSS classes.
         selectors = [{"name": tag} for tag in ("title", "h1", "h2")]
         selectors += [
             {"attrs": {"class": css_class}}
             for css_class in ("title", "content-title",
                               "DailyHeader-title", "question-title")
         ]

         # Collect candidates: clean every match, drop empty texts and keep
         # the first two per selector.
         titles = []
         for selector in selectors:
             cleaned = [clear_text(item.text)
                        for item in soup.find_all(**selector)]
             titles.extend([text for text in cleaned if text][:2])
         print("all titles: ", titles)

         # De-duplicate while keeping document order. A set would make the
         # tie-breaking of most_common() below depend on hash ordering and
         # therefore vary between runs.
         titles = list(dict.fromkeys(titles))
         if len(titles) <= 1:
             return titles

         # Pick the best candidates: extract the top 3 keywords of all
         # candidates together ...
         top_tag = jieba.analyse.extract_tags("\n".join(titles), topK=3)
         print("top_tag: ", top_tag)

         # ... and count how many of them each candidate contains.
         counter = Counter()
         for title in titles:
             for keyword in top_tag:
                 if keyword in title:
                     counter[title] += 1
         print("counter: ", counter)

         # Keep the two best-scoring candidates; the counter may hold fewer.
         ranked = counter.most_common(2)
         if len(ranked) == 2:
             (first, first_hits), (second, second_hits) = ranked
             affix_related = (
                 first.startswith(second) or first.endswith(second)
                 or second.startswith(first) or second.endswith(first)
             )
             if first_hits == second_hits and affix_related:
                 # Same score and one is a prefix/suffix of the other:
                 # keep only the shorter one.
                 result.append(min((first, second), key=len))
             else:
                 result.extend((first, second))
         elif len(ranked) == 1:
             result.append(ranked[0][0])
     except Exception as e:
         # Best effort: report the problem and fall through to the result.
         print(e)
     return result
diff --git a/requirements.txt b/requirements.txt
 beautifulsoup4==4.9.3
 bs4==0.0.1
 requests==2.25.1
 jieba==0.42.1
	import requests
	from bs4 import BeautifulSoup
	from collections import Counter
	import jieba
	import jieba.analyse

	def get_titles(url):
		"""Fetch ``url`` and return the text of its most likely page title(s).

		Candidate titles are gathered from the ``<title>``, ``<h1>`` and ``<h2>``
		tags and from elements carrying one of a few title-like CSS classes (at
		most two non-empty texts per selector). When more than one distinct
		candidate remains, the top three keywords of all candidates are extracted
		with jieba and the candidates containing the most keywords win.

		Args:
			url: Address of the page to download.

		Returns:
			A list of title strings. With zero or one distinct candidate that
			list is returned as-is; otherwise it holds the one or two
			best-scoring candidates, or is empty when no keyword occurs in any
			candidate. A non-OK response yields an empty list. Any exception is
			printed rather than raised, and the (normally empty) result list is
			returned.
		"""
		headers = {
			'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.111 Safari/537.36',
			'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
			'sec-fetch-site': 'none',
			'sec-fetch-mode': 'navigate',
			'sec-fetch-dest': 'document',
			'accept-language': 'zh-CN,zh;q=0.9'
		}
		result = []
		try:
			response = requests.request("GET", url, headers=headers, timeout=5)
			if not response.ok:
				return result

			# Decode the body with the encoding detected from its content
			# instead of the one announced by the HTTP headers.
			response.encoding = response.apparent_encoding
			soup = BeautifulSoup(response.text, 'html.parser')

			# One find_all() keyword set per source of candidates: tags first,
			# then title-like CSS classes.
			selectors = [{"name": tag} for tag in ("title", "h1", "h2")]
			selectors += [
				{"attrs": {"class": css_class}}
				for css_class in ("title", "content-title",
					"DailyHeader-title", "question-title")
			]

			# Collect candidates: clean every match, drop empty texts and keep
			# the first two per selector.
			titles = []
			for selector in selectors:
				cleaned = [clear_text(item.text)
					for item in soup.find_all(**selector)]
				titles.extend([text for text in cleaned if text][:2])
			print("all titles: ", titles)

			# De-duplicate while keeping document order. A set would make the
			# tie-breaking of most_common() below depend on hash ordering and
			# therefore vary between runs.
			titles = list(dict.fromkeys(titles))
			if len(titles) <= 1:
				return titles

			# Pick the best candidates: extract the top 3 keywords of all
			# candidates together ...
			top_tag = jieba.analyse.extract_tags("\n".join(titles), topK=3)
			print("top_tag: ", top_tag)

			# ... and count how many of them each candidate contains.
			counter = Counter()
			for title in titles:
				for keyword in top_tag:
					if keyword in title:
						counter[title] += 1
			print("counter: ", counter)

			# Keep the two best-scoring candidates; the counter may hold fewer.
			ranked = counter.most_common(2)
			if len(ranked) == 2:
				(first, first_hits), (second, second_hits) = ranked
				affix_related = (
					first.startswith(second) or first.endswith(second)
					or second.startswith(first) or second.endswith(first)
				)
				if first_hits == second_hits and affix_related:
					# Same score and one is a prefix/suffix of the other:
					# keep only the shorter one.
					result.append(min((first, second), key=len))
				else:
					result.extend((first, second))
			elif len(ranked) == 1:
				result.append(ranked[0][0])
		except Exception as e:
			# Best effort: report the problem and fall through to the result.
			print(e)
		return result
	beautifulsoup4==4.9.3
	bs4==0.0.1
	requests==2.25.1
	jieba==0.42.1