Skip to content

Instantly share code, notes, and snippets.

@yfdyh000
Last active April 6, 2025 01:36
Show Gist options
  • Save yfdyh000/a66674c8417bb9e388a9646889026339 to your computer and use it in GitHub Desktop.
Save yfdyh000/a66674c8417bb9e388a9646889026339 to your computer and use it in GitHub Desktop.
from datetime import datetime
import re
import pywikibot
import mwparserfromhell
from pywikibot.exceptions import LockedPageError, PageSaveRelatedError
def writeLine(file_path: str, content: str) -> None:
    """Append ``content`` to ``file_path`` as a single line.

    The file is opened in append mode with UTF-8 encoding, so existing
    content is kept; one newline is written after ``content``.

    Args:
        file_path: Path of the file to append to.
        content: Text of the line, without a trailing newline.
    """
    with open(file_path, 'a', encoding='utf-8') as file:
        file.write(content + '\n')
# Target wiki and the archive template whose transclusions can be scanned.
site = pywikibot.Site("zh", "wikipedia")
template_name = 'Template:Wayback'  # template to look for
template_page = pywikibot.Page(site, template_name)

# Page-source switches: ``debug`` is checked first, then ``use_file``; when
# both are False the pages transcluding ``template_page`` are queried instead.
debug = True
use_file = True  # set to True to read the page list from a file

if debug:
    # Hard-coded page titles for a quick test run.
    page_titles = ["香港特別行政區基本法第二十三條"]  # titles of the pages to process
    page_obj = [pywikibot.Page(site, title) for title in page_titles]
elif use_file:
    # https://quarry.wmcloud.org/query/86429
    # Read the page titles from a local file, skipping blank lines.
    with open('pages.txt', 'r', encoding='utf-8') as f:
        page_titles = [line.strip() for line in f if line.strip()]
    page_obj = [pywikibot.Page(site, title) for title in page_titles]
else:
    # Fetch the main-namespace pages that transclude the template.
    page_obj = list(template_page.getReferences(
        with_template_inclusion=True,
        only_template_inclusion=True,
        namespaces=0,
        total=50,
        content=True  # load page content with the query; bulk retrieval is faster
    ))

# Preload the content of all selected pages in bulk before processing them.
pages = site.preloadpages(
    page_obj,
    content=True
)
# Start every log file of this run with a timestamp line.
for log_name in ("cleanupWa_404.log", "cleanupWa_green.log", "cleanupWa_changed.log"):
    writeLine(log_name, datetime.now().strftime('%Y-%m-%d %H:%M:%S'))
# Template-name matchers: both are anchored at the start of the name and
# compiled case-insensitively.
cite_pattern = re.compile(r"^[Cc]ite", flags=re.I)
Webarchive_pattern = re.compile(r"^(Wayback|Webarchive)", flags=re.I)
def urlTohttps(s: str) -> str:
    """Return ``s`` with every ``http://`` rewritten to ``https://``.

    Used so that two URLs can be compared without regard to the
    http/https difference. Every occurrence is replaced, not only a leading
    one, and the match is case-sensitive.

    Args:
        s: URL text to normalise.

    Returns:
        The text with each ``http://`` replaced by ``https://``.
    """
    return s.replace("http://", "https://")
# For every page, look inside each <ref> for a citation template carrying an
# ``archive-url`` together with a Wayback/Webarchive template whose ``url``
# points at the same target. The archive template is then redundant and is
# removed; the edit is saved only after manual confirmation.
for page in pages:
    pageTitle = page.title()
    print(f'* 页面标题: [[{pageTitle}]]')
    if not page.exists():
        print('页面不存在!')
        writeLine("cleanupWa_404.log", pageTitle)
        continue

    # Parse the page text with mwparserfromhell and collect all <ref> tags.
    wikicode = mwparserfromhell.parse(page.text)
    refs = [node for node in wikicode.filter_tags(recursive=True)
            if node.tag == 'ref']

    changedNum = 0
    for ref in refs:
        # Search the tag's own contents so that the templates found are nodes
        # of the page tree and can be removed from it directly.
        ref_code = ref.contents
        cite_templates = ref_code.filter_templates(
            matches=lambda t: bool(cite_pattern.match(str(t.name))))
        archive_templates = ref_code.filter_templates(
            matches=lambda t: bool(Webarchive_pattern.match(str(t.name))))
        if not (cite_templates and archive_templates):
            continue

        # Only the first citation and the first archive template are compared.
        cite_tpl = cite_templates[0]
        archive_tpl = archive_templates[0]
        cite_url = (cite_tpl.get("archive-url").value.strip()
                    if cite_tpl.has("archive-url") else None)
        wayback_url = (archive_tpl.get("url").value.strip()
                       if archive_tpl.has("url") else None)
        # A missing or empty parameter on either side means nothing to compare.
        if not cite_url or not wayback_url:
            continue

        # Reduce an archive URL to the original URL embedded in it (everything
        # before the last scheme is dropped); a plain URL is left unchanged.
        cite_url = re.sub(r'.+(https?://)', r'\1', cite_url)
        if wayback_url.count("://") == 2:
            # for {{Webarchive|url=https://archive.today/20210926104007/https://www.zaobao.com.sg/realtime/china/story20210926-1197510 |date=2021-09-26 }}
            wayback_url = re.sub(r'.+(https?://)', r'\1', wayback_url)

        if urlTohttps(cite_url) == urlTohttps(wayback_url):
            print(f'{pageTitle} 发现重复参数, archived URL: {cite_url}')
            # Remove exactly the template that was compared, not a list of matches.
            ref_code.remove(archive_tpl)
            # Drop the whitespace left behind at the edges of the reference.
            ref.contents = ref.contents.strip()
            changedNum += 1

    if changedNum:
        # Serialise the edited tree once; this is the text that is shown and saved.
        new_text = str(wikicode)
        pywikibot.showDiff(page.text, new_text)
        # Ask for manual confirmation before saving.
        confirm = input("是否确认提交更改?(y/n): ")
        if confirm.lower() == 'y':
            try:
                # NOTE(review): with asynchronous=True the save is presumably
                # queued, so the handlers below may never fire - confirm.
                page.put(new_text,
                         summary=f"机器人:清理来源中多余的{changedNum}个网页存档模板",
                         asynchronous=True,
                         minor=True,
                         bot=True,
                         show_diff=True
                         )
            except LockedPageError:
                pywikibot.info(f'{pageTitle} Page not saved: page is locked')
            except PageSaveRelatedError as error:
                pywikibot.info(f'{pageTitle} Page not saved: {error.args}')
        else:
            print(f"{pageTitle} 更改已取消。")
        # Pages with detected duplicates are logged whether or not the edit was confirmed.
        writeLine("cleanupWa_changed.log", pageTitle)
    else:
        writeLine("cleanupWa_green.log", pageTitle)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment