Last active
April 6, 2025 01:36
-
-
Save yfdyh000/a66674c8417bb9e388a9646889026339 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from datetime import datetime | |
import re | |
import pywikibot | |
import mwparserfromhell | |
from pywikibot.exceptions import LockedPageError, PageSaveRelatedError | |
def writeLine(file_path, content):
    """Append ``content`` followed by a newline to the file at ``file_path``.

    The file is opened in append mode with UTF-8 encoding, so existing
    content is kept.
    """
    with open(file_path, mode='a', encoding='utf-8') as log_file:
        line = content + '\n'
        log_file.write(line)
# Wiki to operate on and the archive template whose transclusions are of interest.
site = pywikibot.Site("zh", "wikipedia")
template_name = 'Template:Wayback'
template_page = pywikibot.Page(site, template_name)

# Page-source switches, checked in this order: `debug` (hard-coded titles),
# then `use_file` (titles read from pages.txt); with both off, the pages
# transcluding the template are queried from the wiki.
debug = True
use_file = True

if debug or use_file:
    if debug:
        # Hard-coded titles of the pages to process.
        page_titles = ["香港特別行政區基本法第二十三條"]
    else:
        # Titles come from a local file, one per line, blank lines skipped.
        # Source query: https://quarry.wmcloud.org/query/86429
        with open('pages.txt', 'r', encoding='utf-8') as f:
            page_titles = [title for title in map(str.strip, f) if title]
    page_obj = [pywikibot.Page(site, title) for title in page_titles]
else:
    # Ask the wiki for main-namespace pages that transclude the template.
    # Original author's note: content=True makes the page objects fetch their
    # text immediately, which is faster for batch retrieval.
    reference_query = {
        'with_template_inclusion': True,
        'only_template_inclusion': True,
        'namespaces': 0,
        'total': 50,
        'content': True,
    }
    page_obj = list(template_page.getReferences(**reference_query))

# Batch-load the selected pages.
pages = site.preloadpages(page_obj, content=True)

# Start this run's section in each log file with a timestamp line.
for _log_name in ("cleanupWa_404.log", "cleanupWa_green.log", "cleanupWa_changed.log"):
    writeLine(_log_name, datetime.now().strftime('%Y-%m-%d %H:%M:%S'))

# Template-name matchers: citation templates and web-archive templates.
cite_pattern = re.compile(r"^[Cc]ite", re.IGNORECASE)
Webarchive_pattern = re.compile(r"^(Wayback|Webarchive)", re.IGNORECASE)
def urlTohttps(s):  # Ignore protocol differences: http and https are treated as equal.
    """Return ``s`` with every ``http://`` or ``https://`` rewritten to ``https://``.

    Used to compare URLs without regard to the http/https distinction. The
    scheme is matched case-insensitively (``HTTP://`` and ``Https://`` are
    normalized as well), because URL schemes are case-insensitive. Every
    occurrence is rewritten, including a URL embedded inside an archive URL.
    """
    return re.sub(r'https?://', 'https://', s, flags=re.IGNORECASE)
# Main pass: for every page, find <ref> tags that contain both a citation
# template with an archive-url and a web-archive template pointing at the same
# archived URL, remove the redundant archive template, and save the page after
# manual confirmation.
for page in pages:
    pageTitle = page.title()
    print(f'* 页面标题: [[{pageTitle}]]')
    if not page.exists():
        print('页面不存在!')
        writeLine("cleanupWa_404.log", pageTitle)
        continue

    # Parse the page text with mwparserfromhell and collect every <ref> tag.
    original_text = page.text
    wikicode = mwparserfromhell.parse(original_text)
    refs = wikicode.filter_tags(recursive=True, matches=lambda node: node.tag == 'ref')

    changedNum = 0
    for ref in refs:
        # Search the tag's own contents so the templates found are nodes of
        # `wikicode` itself and can be removed individually.
        ref_code = ref.contents
        cite_template = ref_code.filter_templates(
            matches=lambda t: bool(cite_pattern.match(str(t.name).strip())))
        wayback_template = ref_code.filter_templates(
            matches=lambda t: bool(Webarchive_pattern.match(str(t.name).strip())))
        if not (cite_template and wayback_template):
            continue

        # Only the first citation and the first archive template are compared.
        cite = cite_template[0]
        wayback = wayback_template[0]
        cite_url = str(cite.get("archive-url").value).strip() if cite.has("archive-url") else None
        wayback_url = str(wayback.get("url").value).strip() if wayback.has("url") else None
        # Without both URLs there is nothing to compare; this also avoids
        # calling string methods on None when the archive template has no url.
        if not cite_url or not wayback_url:
            continue

        # Drop the archive-service prefix so only the embedded URL remains.
        cite_url = re.sub(r'.+(https?://)', r'\1', cite_url)
        if wayback_url.count("://") == 2:
            # for {{Webarchive|url=https://archive.today/20210926104007/https://www.zaobao.com.sg/realtime/china/story20210926-1197510 |date=2021-09-26 }}
            wayback_url = re.sub(r'.+(https?://)', r'\1', wayback_url)

        if urlTohttps(cite_url) == urlTohttps(wayback_url):
            print(f'{pageTitle} 发现重复参数, archived URL: {cite_url}')
            # Remove exactly this template node. Passing the whole result list
            # to remove() falls back to a text search, which can also hit
            # identical templates in other refs.
            ref.contents.remove(wayback)
            # Trim the whitespace left behind where the template used to be.
            ref.contents = str(ref.contents).strip()
            changedNum += 1

    if not changedNum:
        writeLine("cleanupWa_green.log", pageTitle)
        continue

    # The edits live in `wikicode`; page.text is still the unmodified text.
    new_text = str(wikicode)
    pywikibot.showDiff(original_text, new_text)
    # Manual confirmation before saving.
    confirm = input("是否确认提交更改?(y/n): ")
    if confirm.strip().lower() == 'y':
        try:
            # NOTE(review): with asynchronous=True the save presumably runs in
            # a background thread, so these handlers may never fire - confirm
            # against the pywikibot documentation.
            page.put(new_text,
                     summary=f"机器人:清理来源中多余的{changedNum}个网页存档模板",
                     asynchronous=True,
                     minor=True,
                     bot=True,
                     show_diff=True
                     )
        except LockedPageError:
            pywikibot.info(f'{pageTitle} Page not saved: page is locked')
        except PageSaveRelatedError as error:
            pywikibot.info(f'{pageTitle} Page not saved: {error.args}')
    else:
        print(f"{pageTitle} 更改已取消。")
    # Every page with detected duplicates is logged, whether or not it was saved.
    writeLine("cleanupWa_changed.log", pageTitle)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment