yfdyh000 · April 6, 2025 01:36
diff --git a/cleanupWa.py b/cleanupWa.py
 from datetime import datetime
 import re
 import pywikibot
 import mwparserfromhell
 from pywikibot.exceptions import LockedPageError, PageSaveRelatedError

 def writeLine(file_path, content):
    """Append ``content`` followed by a newline to the file at ``file_path``.

    The file is opened in append mode with UTF-8 encoding, so earlier
    lines are kept.
    """
    with open(file_path, mode='a', encoding='utf-8') as log_file:
        line = content + '\n'
        log_file.write(line)

 # Work against the Chinese-language Wikipedia.
 site = pywikibot.Site("zh", "wikipedia")

 # Template whose transclusions are searched when no explicit page list is used.
 template_name = 'Template:Wayback'
 template_page = pywikibot.Page(site, template_name)

 debug = True
 use_file = True  # set to True to read the page list from a file

 if debug or use_file:
    if debug:
        # Hard-coded page titles to process.
        page_titles = ["香港特別行政區基本法第二十三條"]
    else:
        # https://quarry.wmcloud.org/query/86429
        # Read page titles from a local file, skipping blank lines.
        with open('pages.txt', 'r', encoding='utf-8') as f:
            page_titles = [title for title in map(str.strip, f) if title]
    page_obj = [pywikibot.Page(site, title) for title in page_titles]
 else:
    # Fetch the pages that transclude the template.
    page_obj = [*template_page.getReferences(
        with_template_inclusion=True,
        only_template_inclusion=True,
        namespaces=0,
        total=50,
        content=True,  # load page content right away; bulk retrieval is faster
    )]

 pages = site.preloadpages(page_obj, content=True)

 # Write a timestamp line to each of the three log files for this run.
 for log_name in ("cleanupWa_404.log", "cleanupWa_green.log", "cleanupWa_changed.log"):
    writeLine(log_name, datetime.now().strftime('%Y-%m-%d %H:%M:%S'))

 # Template-name matchers used by the page loop.
 cite_pattern = re.compile(r"^[Cc]ite", re.IGNORECASE)
 Webarchive_pattern = re.compile(r"^(Wayback|Webarchive)", re.IGNORECASE)

 def urlTohttps(s):
    """Return ``s`` with every ``http://`` rewritten to ``https://``.

    Lets two URLs that differ only in scheme compare as equal.
    """
    pieces = s.split("http://")
    return "https://".join(pieces)

 # Main pass: for every page, look inside each <ref> for a cite template whose
 # "archive-url" points at the same target as a sibling Wayback/Webarchive
 # template's "url", remove the redundant archive template, and save the page
 # after manual confirmation.
 for page in pages:
    pageTitle = page.title()
    print(f'* 页面标题: [[{pageTitle}]]')
    if not page.exists():
        print('页面不存在！')
        writeLine("cleanupWa_404.log", pageTitle)
        continue

    # Parse the page text with mwparserfromhell and collect all <ref> tags.
    wikicode = mwparserfromhell.parse(page.text)
    refs = wikicode.filter_tags(recursive=True, matches=lambda tag: tag.tag == 'ref')

    changedNum = 0

    for ref in refs:
        # Work directly on the tag's own contents so that removing a node
        # below changes the page-level tree as well.
        ref_code = ref.contents

        # Find the templates inside this <ref>. Names are stripped first so
        # that "{{ Wayback |...}}" is recognised too.
        cite_templates = ref_code.filter_templates(
            matches=lambda t: bool(cite_pattern.match(str(t.name).strip())))
        wayback_templates = ref_code.filter_templates(
            matches=lambda t: bool(Webarchive_pattern.match(str(t.name).strip())))
        if not (cite_templates and wayback_templates):
            continue

        cite_tpl = cite_templates[0]
        wayback_tpl = wayback_templates[0]
        # Both parameters are required for a comparison; skipping here also
        # avoids calling string methods on a missing (None) URL.
        if not (cite_tpl.has("archive-url") and wayback_tpl.has("url")):
            continue

        cite_url = str(cite_tpl.get("archive-url").value).strip()
        wayback_url = str(wayback_tpl.get("url").value).strip()

        # Reduce an archive URL to the last URL embedded in it.
        cite_url = re.sub(r'.+(https?://)', r'\1', cite_url)
        if wayback_url.count("://") == 2:
            # e.g. {{Webarchive|url=https://archive.today/20210926104007/https://www.zaobao.com.sg/realtime/china/story20210926-1197510 |date=2021-09-26 }}
            wayback_url = re.sub(r'.+(https?://)', r'\1', wayback_url)

        if cite_url and wayback_url and urlTohttps(cite_url) == urlTohttps(wayback_url):
            print(f'{pageTitle} 发现重复参数, archived URL: {cite_url}')
            # Remove exactly the template that was compared (by node
            # identity) rather than passing the whole result list.
            ref_code.remove(wayback_tpl)
            ref.contents = str(ref.contents).strip()
            changedNum += 1

    if not changedNum:
        writeLine("cleanupWa_green.log", pageTitle)
        continue

    # Serialise the edited tree once; this is the text that gets saved.
    new_text = str(wikicode)
    pywikibot.showDiff(page.text, new_text)

    # Manual confirmation.
    confirm = input("是否确认提交更改？(y/n): ")
    if confirm.strip().lower() == 'y':
        try:
            # NOTE(review): with asynchronous=True the save is queued, so the
            # handlers below may never fire for save errors - confirm.
            page.put(new_text,
                     summary=f"机器人：清理来源中多余的{changedNum}个网页存档模板",
                     asynchronous=True,
                     minor=True,
                     bot=True,
                     show_diff=True
                     )
        except LockedPageError:
            pywikibot.info(f'{pageTitle} Page not saved: page is locked')
        except PageSaveRelatedError as error:
            pywikibot.info(f'{pageTitle} Page not saved: {error.args}')
    else:
        print(f"{pageTitle} 更改已取消。")
    writeLine("cleanupWa_changed.log", pageTitle)
	from datetime import datetime
	import re
	import pywikibot
	import mwparserfromhell
	from pywikibot.exceptions import LockedPageError, PageSaveRelatedError

	def writeLine(file_path, content):
	    """Append ``content`` followed by a newline to the file at ``file_path``.

	    The file is opened in append mode with UTF-8 encoding, so earlier
	    lines are kept.
	    """
	    with open(file_path, 'a', encoding='utf-8') as file:
	        file.write(content + '\n')

	# Work against the Chinese-language Wikipedia.
	site = pywikibot.Site("zh", "wikipedia")

	# Template whose transclusions are searched when no explicit page list is used.
	template_name = 'Template:Wayback'
	template_page = pywikibot.Page(site, template_name)

	debug = True
	use_file = True  # set to True to read the page list from a file

	if debug:
	    # Hard-coded page titles to process.
	    page_titles = ["香港特別行政區基本法第二十三條"]
	    page_obj = [pywikibot.Page(site, title) for title in page_titles]
	elif use_file:
	    # https://quarry.wmcloud.org/query/86429
	    # Read page titles from a local file, skipping blank lines.
	    with open('pages.txt', 'r', encoding='utf-8') as f:
	        page_titles = [line.strip() for line in f if line.strip()]
	    page_obj = [pywikibot.Page(site, title) for title in page_titles]
	else:
	    # Fetch the pages that transclude the template.
	    page_obj = list(template_page.getReferences(
	        with_template_inclusion=True,
	        only_template_inclusion=True,
	        namespaces=0,
	        total=50,
	        content=True  # load page content right away; bulk retrieval is faster
	    ))

	pages = site.preloadpages(
	    page_obj,
	    content=True
	)

	# Write a timestamp line to each of the three log files for this run.
	writeLine("cleanupWa_404.log", datetime.now().strftime('%Y-%m-%d %H:%M:%S'))
	writeLine("cleanupWa_green.log", datetime.now().strftime('%Y-%m-%d %H:%M:%S'))
	writeLine("cleanupWa_changed.log", datetime.now().strftime('%Y-%m-%d %H:%M:%S'))

	# Template-name matchers used by the page loop. The second pattern is an
	# alternation of the two archive template names; an escaped "\|" would
	# only match a literal pipe character.
	cite_pattern = re.compile(r"^[Cc]ite", re.IGNORECASE)
	Webarchive_pattern = re.compile(r"^(Wayback|Webarchive)", re.IGNORECASE)

	def urlTohttps(s):
	    """Return ``s`` with every ``http://`` rewritten to ``https://``.

	    Lets two URLs that differ only in scheme compare as equal.
	    """
	    return s.replace("http://", "https://")

	# Main pass: for every page, look inside each <ref> for a cite template whose
	# "archive-url" points at the same target as a sibling Wayback/Webarchive
	# template's "url", remove the redundant archive template, and save the page
	# after manual confirmation.
	for page in pages:
	    pageTitle = page.title()
	    print(f'* 页面标题: [[{pageTitle}]]')
	    if not page.exists():
	        print('页面不存在！')
	        writeLine("cleanupWa_404.log", pageTitle)
	        continue

	    # Parse the page text with mwparserfromhell and collect all <ref> tags.
	    wikicode = mwparserfromhell.parse(page.text)
	    refs = wikicode.filter_tags(recursive=True, matches=lambda tag: tag.tag == 'ref')

	    changedNum = 0

	    for ref in refs:
	        # Work directly on the tag's own contents so that removing a node
	        # below changes the page-level tree as well.
	        ref_code = ref.contents

	        # Find the templates inside this <ref>. Names are stripped first so
	        # that "{{ Wayback |...}}" is recognised too.
	        cite_templates = ref_code.filter_templates(
	            matches=lambda t: bool(cite_pattern.match(str(t.name).strip())))
	        wayback_templates = ref_code.filter_templates(
	            matches=lambda t: bool(Webarchive_pattern.match(str(t.name).strip())))
	        if not (cite_templates and wayback_templates):
	            continue

	        cite_tpl = cite_templates[0]
	        wayback_tpl = wayback_templates[0]
	        # Both parameters are required for a comparison; skipping here also
	        # avoids calling string methods on a missing (None) URL.
	        if not (cite_tpl.has("archive-url") and wayback_tpl.has("url")):
	            continue

	        cite_url = str(cite_tpl.get("archive-url").value).strip()
	        wayback_url = str(wayback_tpl.get("url").value).strip()

	        # Reduce an archive URL to the last URL embedded in it.
	        cite_url = re.sub(r'.+(https?://)', r'\1', cite_url)
	        if wayback_url.count("://") == 2:
	            # e.g. {{Webarchive|url=https://archive.today/20210926104007/https://www.zaobao.com.sg/realtime/china/story20210926-1197510 |date=2021-09-26 }}
	            wayback_url = re.sub(r'.+(https?://)', r'\1', wayback_url)

	        if cite_url and wayback_url and urlTohttps(cite_url) == urlTohttps(wayback_url):
	            print(f'{pageTitle} 发现重复参数, archived URL: {cite_url}')
	            # Remove exactly the template that was compared (by node
	            # identity) rather than passing the whole result list.
	            ref_code.remove(wayback_tpl)
	            ref.contents = str(ref.contents).strip()
	            changedNum += 1

	    if not changedNum:
	        writeLine("cleanupWa_green.log", pageTitle)
	        continue

	    # Serialise the edited tree once; this is the text that gets saved.
	    new_text = str(wikicode)
	    pywikibot.showDiff(page.text, new_text)

	    # Manual confirmation.
	    confirm = input("是否确认提交更改？(y/n): ")
	    if confirm.strip().lower() == 'y':
	        try:
	            # NOTE(review): with asynchronous=True the save is queued, so the
	            # handlers below may never fire for save errors - confirm.
	            page.put(new_text,
	                     summary=f"机器人：清理来源中多余的{changedNum}个网页存档模板",
	                     asynchronous=True,
	                     minor=True,
	                     bot=True,
	                     show_diff=True
	                     )
	        except LockedPageError:
	            pywikibot.info(f'{pageTitle} Page not saved: page is locked')
	        except PageSaveRelatedError as error:
	            pywikibot.info(f'{pageTitle} Page not saved: {error.args}')
	    else:
	        print(f"{pageTitle} 更改已取消。")
	    writeLine("cleanupWa_changed.log", pageTitle)