Skip to content

Instantly share code, notes, and snippets.

@k1pfel
Last active October 23, 2024 01:55
Show Gist options
  • Save k1pfel/44a4ff327f6186c27de737fd260588b0 to your computer and use it in GitHub Desktop.
Save k1pfel/44a4ff327f6186c27de737fd260588b0 to your computer and use it in GitHub Desktop.
import os
import sys
from multiprocessing.pool import ThreadPool as Pool
import mwclient
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
from conf import botpassword_self as bps
# Number of worker threads used for the per-user global-lock lookups.
pool_size = 32
# Host name of the wiki that is queried.
url = "zh.wikipedia.org"
# Candidate user names; filled from maint/list.txt when run as a script.
ll = []
# Names whose user record carries a "blockid" key (see filter_block_and_bot).
blocked = []
# Names whose groups include "bot" (see filter_block_and_bot).
bot = []
# Names reported as globally locked (see filter_lock).
locked = []
# NOTE: the site connection and the login below run at import time.
site = mwclient.Site(url)
# presumably bps is a (username, password) pair — verify against the conf module
site.login(bps[0], bps[1])
def filter_block_and_bot(_site, _list) -> None:
    """Record which of the given users are bots or blocked.

    Queries the ``users`` list API in batches of 50 names. Every returned
    user whose ``groups`` contain ``"bot"`` is appended to the module-level
    ``bot`` list, and every returned user whose record has a ``"blockid"``
    key is appended to the module-level ``blocked`` list.

    Args:
        _site: Object exposing an ``api(action, **params)`` method.
        _list: Sequence of user names to look up.
    """
    batch_size = 50
    for start in range(0, len(_list), batch_size):
        names = "|".join(_list[start:start + batch_size])
        response = _site.api(
            "query", list="users", usprop="groups|blockinfo", ususers=names
        )
        for record in response["query"]["users"]:
            if "groups" in record and "bot" in record["groups"]:
                bot.append(record["name"])
            if "blockid" in record:
                blocked.append(record["name"])
def filter_lock(_site, _user) -> None:
    """Append ``_user`` to the module-level ``locked`` list if it is locked.

    Asks the ``globalallusers`` list API for a single entry starting at
    ``_user`` and checks that entry for a ``"locked"`` key.

    Args:
        _site: Object exposing an ``api(action, **params)`` method.
        _user: User name to check; it must be spelled exactly as the API
            reports it, otherwise the entry is treated as "not found".
    """
    res = _site.api(
        "query",
        list="globalallusers",
        aguprop="lockinfo",
        agufrom=_user,
        agulimit="1",
    )
    entries = res["query"]["globalallusers"]
    # agufrom is only the starting point of the enumeration: when _user has
    # no entry of its own, the API returns the next name in order (or
    # nothing at all). Never attribute another account's lock to _user.
    if not entries or entries[0].get("name") != _user:
        return
    if "locked" in entries[0]:
        locked.append(_user)
if __name__ == "__main__":
    # Script flow: read candidate names, drop blocked users and bots, drop
    # globally locked users, then write the remaining names and a summary.

    # Skip blank lines so that no empty user name is sent to the API.
    with open("maint/list.txt", "r", encoding="utf-8") as f:
        ll.extend(name for name in (line.strip() for line in f) if name)

    filter_block_and_bot(site, ll)
    # Set-based filtering removes every occurrence of an excluded name
    # (list.remove only drops the first one) and avoids quadratic scans.
    excluded = set(blocked) | set(bot)
    ll[:] = [user for user in ll if user not in excluded]

    # One API request per remaining user, fanned out over a thread pool.
    pool = Pool(pool_size)
    pending = [(user, pool.apply_async(filter_lock, (site, user))) for user in ll]
    pool.close()
    pool.join()
    # apply_async hides worker exceptions until the result is fetched, so
    # report failures instead of silently treating the user as not locked.
    for user, result in pending:
        try:
            result.get()
        except Exception as exc:
            print(f"Lock check failed for {user}: {exc!r}", file=sys.stderr)

    locked_names = set(locked)
    ll[:] = [user for user in ll if user not in locked_names]

    # save processed list to file
    with open("maint/filtered-list.txt", "w", encoding="utf-8") as f:
        f.writelines(user + "\n" for user in ll)
    print("Blocked users:", blocked, "Number:", len(blocked))
    print("Bot accounts:", bot, "Number:", len(bot))
    print("Locked users:", locked, "Number:", len(locked))
/*
Lists the user names that satisfy at least one of the following rules:
- Registered at least 120 days before the nomination AND at least 500 edits made earlier than 120 days before the nomination AND at least 1 edit within the 90 days before the nomination (not counting any user page or user talk page)
- Registered at least 120 days before the nomination AND at least 3000 edits
- Registered at least 120 days before the nomination AND at least 1500 article edits
*/
use zhwiki_p;
-- Reference time as a 14-digit YYYYMMDDHHMMSS string (the format parsed below).
SET @VOTE_TIME = '20241001000000';
-- Reference time minus 90 days, in the same 14-digit format.
SET @PRIOR_90_VOTE_TIME =
DATE_FORMAT(DATE_SUB(STR_TO_DATE(@VOTE_TIME, '%Y%m%d%H%i%s'), INTERVAL 90 DAY), '%Y%m%d%H%i%s');
-- Reference time minus 120 days, in the same 14-digit format.
SET @PRIOR_120_VOTE_TIME =
DATE_FORMAT(DATE_SUB(STR_TO_DATE(@VOTE_TIME, '%Y%m%d%H%i%s'), INTERVAL 120 DAY), '%Y%m%d%H%i%s');
-- all_revision combines rows of revision_userindex and archive_userindex under
-- one column set. Note that ar_id (not a revision id column) is what gets
-- aliased to rev_id for the archive rows.
WITH all_revision AS (SELECT rev_id, rev_actor, rev_page, rev_timestamp
FROM revision_userindex
UNION
SELECT ar_id AS rev_id,
ar_actor AS rev_actor,
ar_page_id AS rev_page,
ar_timestamp AS rev_timestamp
FROM archive_userindex)
-- part1: registration earlier than the 120-day cutoff (or NULL registration)
-- and at least 3000 rows in all_revision before @VOTE_TIME.
SELECT part1.actor_name AS user_name
FROM (SELECT actor_user, actor_name, COUNT(*) AS edits
FROM actor
JOIN user ON actor_user = user.user_id
JOIN all_revision ON actor_id = all_revision.rev_actor
WHERE actor_user IS NOT NULL
AND (user.user_registration < @PRIOR_120_VOTE_TIME
OR user.user_registration IS NULL)
AND all_revision.rev_timestamp < @VOTE_TIME
GROUP BY actor_id
HAVING edits >= 3000) AS part1
UNION
-- part2: same registration rule and at least 1500 rows before @VOTE_TIME on
-- pages with page_namespace = 0.
-- NOTE(review): the inner join to page only keeps rows whose rev_page still
-- matches a page_id — confirm that archive rows are meant to be counted here.
SELECT part2.actor_name AS user_name
FROM (SELECT actor_id, actor_user, actor_name, COUNT(*) AS ns0_edits
FROM actor
JOIN user ON actor_user = user.user_id
JOIN all_revision ON actor_id = all_revision.rev_actor
JOIN page ON page_id = all_revision.rev_page
WHERE actor_user IS NOT NULL
AND (user.user_registration < @PRIOR_120_VOTE_TIME
OR user.user_registration IS NULL)
AND all_revision.rev_timestamp < @VOTE_TIME
AND page_namespace = 0
GROUP BY actor_id
HAVING ns0_edits >= 1500) AS part2
UNION
-- part3: the inner subquery (user_w_edit_in_90) selects users meeting the
-- registration rule who have at least one row between the 90-day cutoff and
-- @VOTE_TIME on a page outside namespaces 2 and 3; the outer query then
-- requires at least 500 rows earlier than the 120-day cutoff.
SELECT part3.actor_name AS user_name
FROM (SELECT actor_id, actor_user, actor_name, COUNT(*) AS edits
FROM (SELECT actor_id, actor_user, actor_name
FROM actor
JOIN user ON actor_user = user.user_id
JOIN all_revision ON actor_id = all_revision.rev_actor
JOIN page ON page_id = all_revision.rev_page
WHERE actor_user IS NOT NULL
AND (user.user_registration < @PRIOR_120_VOTE_TIME
OR user.user_registration IS NULL)
AND all_revision.rev_timestamp >= @PRIOR_90_VOTE_TIME
-- NOTE(review): inclusive upper bound here, while the other parts use
-- a strict '<' against @VOTE_TIME — confirm which is intended.
AND all_revision.rev_timestamp <= @VOTE_TIME
AND page_namespace != 2
AND page_namespace != 3
GROUP BY actor_id) AS user_w_edit_in_90
JOIN all_revision ON user_w_edit_in_90.actor_id = all_revision.rev_actor
WHERE user_w_edit_in_90.actor_user IS NOT NULL
AND all_revision.rev_timestamp < @PRIOR_120_VOTE_TIME
GROUP BY user_w_edit_in_90.actor_id
HAVING edits >= 500) AS part3
-- The UNIONs de-duplicate names that satisfy more than one rule.
ORDER BY user_name;
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment