Skip to content

Instantly share code, notes, and snippets.

Show Gist options
  • Save ShashkovS/5dc3fd46f0d2ff7ba411e0a0a239ba92 to your computer and use it in GitHub Desktop.
Save ShashkovS/5dc3fd46f0d2ff7ba411e0a0a239ba92 to your computer and use it in GitHub Desktop.
import re
import os
from Levenshtein import distance
def fmt_text(text):
    """Normalize text so that only meaningful differences remain.

    The text is lower-cased, "ё" is folded into "е", every character that
    is neither a Russian lowercase letter nor whitespace is dropped (this
    also removes HTML markup, digits and punctuation), and each run of
    whitespace is collapsed into a single space.

    Leading and trailing whitespace is collapsed but not stripped.
    """
    # "ё" (U+0451) lies outside the а-я range, so without this fold it would
    # be deleted by the filter below and "ещё" / "еще" would compare as
    # different words.
    text = text.lower().replace('ё', 'е')
    # Keep only Russian letters and whitespace.
    text = re.sub(r'[^а-я\s]', '', text)
    # Collapse whitespace runs into one space.
    text = re.sub(r'\s+', ' ', text)
    return text
def rtv_problem_dict(html_path):
    """Collect the normalized text of every problem in a folder of HTML files.

    Every file directly inside *html_path* whose name ends with ``.html``
    (case-insensitive) is read as UTF-8 and scanned for problem headings of
    the form ``"problem_num">Задача N.M.`` (optionally with ``<b>`` right
    before ``Задача``).  The HTML between the end of one heading and the end
    of the next heading (or the end of the file for the last one) is passed
    through ``fmt_text`` and stored under the key ``"N.M"``.

    If the same problem number occurs more than once, the last occurrence
    wins.

    Side effects: prints each processed file name and changes the current
    working directory to *html_path*.

    Returns:
        dict mapping problem number strings to normalized problem text.
    """
    # Resolve the folder before changing directory.  Previously a relative
    # path was chdir'ed into and then listed again, i.e. looked up inside
    # itself, which fails.
    html_path = os.path.abspath(html_path)
    # The working directory is still switched, exactly as before, because
    # code outside this function may rely on it.
    os.chdir(html_path)
    heading_patterns = (
        r'(?<="problem_num"><b>Задача )(\d+\.\d+)\.',
        r'(?<="problem_num">Задача )(\d+\.\d+)\.',
    )
    all_problems = {}
    for filename in os.listdir(html_path):
        if not filename.lower().endswith('.html'):
            continue
        print(filename)
        with open(os.path.join(html_path, filename), 'r', encoding='utf-8') as f:
            html = f.read()
        # (offset just past the heading, problem number "N.M"), in file order.
        problem_nums = sorted(
            (match.end(0), match.group(1))
            for pattern in heading_patterns
            for match in re.finditer(pattern, html)
        )
        # Sentinel so the last problem extends to the end of the file.
        problem_nums.append((len(html), ''))
        for i, (prb_pos, prb_num) in enumerate(problem_nums[:-1]):
            next_pos = problem_nums[i + 1][0]
            all_problems[prb_num] = fmt_text(html[prb_pos:next_pos])
    return all_problems
old_problems = rtv_problem_dict('Y:\\2011\\')
new_problems = rtv_problem_dict('Y:\\')

# Compare every old problem with every new one.
# Keys look like "N.M"; sort them numerically on both parts so that "1.2"
# comes before "1.10" (a plain string tie-break would reverse them).  The
# key itself is the final tie-break for spellings such as "1.5" / "1.05".
old_keys = sorted(old_problems, key=lambda x: ([int(part) for part in x.split('.')], x))
new_keys = sorted(new_problems, key=lambda x: ([int(part) for part in x.split('.')], x))

# cross[i][j] is the Levenshtein distance between old problem i and new
# problem j divided by the mean length of the two texts: 0 means identical,
# larger values mean less similar.
cross = [[0] * len(new_keys) for _ in range(len(old_keys))]
for i, old_key in enumerate(old_keys):
    print(old_key, i)
    old_text = old_problems[old_key]
    for j, new_key in enumerate(new_keys):
        new_text = new_problems[new_key]
        mean_len = (len(old_text) + len(new_text)) / 2
        # Both texts can normalize to '' (e.g. a problem made only of
        # formulas); the distance is then 0, so divide by 1 instead of
        # raising ZeroDivisionError.
        cross[i][j] = distance(old_text, new_text) / (mean_len or 1)
# def cnt(brd):
# return sum(cross[i][j] < brd for i in range(len(old_keys)) for j in range(len(new_keys)))
# Build the borrowing map: for every old problem, a string of tags naming
# the new problems it resembles.  Thresholds are checked from strictest to
# loosest and only the first one that the score falls under is used.
match_levels = (
    (0.01, '[взято в {}]'),
    (0.3, '[вероятно взято в {}]'),
    (0.5, '[может быть взято в {}]'),
)
taken = {}
for old_key, scores in zip(old_keys, cross):
    tags = []
    for new_key, score in zip(new_keys, scores):
        for limit, template in match_levels:
            if score < limit:
                tags.append(template.format(new_key))
                break
    # Old problems without any match keep an empty tag string.
    taken[old_key] = ''.join(tags)
# Insert the borrowing tags right after each problem heading in the old
# (2011) HTML files, rewriting every file in place.
html_path = 'Y:\\2011\\'
heading_patterns = (
    r'(?<="problem_num"><b>Задача )(\d+\.\d+)\.',
    r'(?<="problem_num">Задача )(\d+\.\d+)\.',
)
for filename in os.listdir(html_path):
    if not filename.lower().endswith('.html'):
        continue
    print(filename)
    # Open by full path.  The names come from html_path, but the working
    # directory at this point is whatever the last rtv_problem_dict() call
    # left behind, so a bare file name would read and overwrite a file in
    # the wrong folder (or fail to open at all).
    file_path = os.path.join(html_path, filename)
    with open(file_path, 'r', encoding='utf-8') as f:
        html = f.read()
    # (offset just past the heading, problem number "N.M"), in file order.
    problem_nums = sorted(
        (match.end(0), match.group(1))
        for pattern in heading_patterns
        for match in re.finditer(pattern, html)
    )
    res_data = []
    last_pos = 0
    for prb_pos, prb_num in problem_nums:
        tag = taken.get(prb_num, '')
        print(tag)
        # Copy everything up to and including the heading, then the tag.
        res_data.append(html[last_pos:prb_pos])
        res_data.append(tag)
        last_pos = prb_pos
    res_data.append(html[last_pos:])
    new_html = ''.join(res_data)
    with open(file_path, 'w', encoding='utf-8') as f:
        f.write(new_html)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment