Last active
December 23, 2016 01:03
-
-
Save dokenzy/9226268b0d70f10865b61cc024ac3d19 to your computer and use it in GitHub Desktop.
HTML 구조 비교
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
""" | |
date: 2016. 12. 22
author: Dokenzy
Compare the structure of HTML files.
""" | |
import re
import uuid
from difflib import HtmlDiff

from bs4 import BeautifulSoup, Comment, Doctype
def structure(html):
    """Strip comments and text from an HTML file, keeping only its tags.

    The file is parsed with BeautifulSoup's ``html.parser``. Comment nodes
    are removed, and every string node that contains at least one word
    character is replaced with an empty string. Strings without any word
    character (plain whitespace, newlines) are left alone, so the line
    layout of the markup is preserved. Doctype declarations are kept.

    :param html: path to the HTML file (read as UTF-8)
    :type html: str
    :return: the HTML markup with comments and text removed
    :rtype: str
    """
    with open(html, encoding='utf-8') as f:
        soup = BeautifulSoup(f.read(), 'html.parser')

    # Remove comments explicitly so that ones without any word character
    # (e.g. ``<!-- -->``) are dropped too, not only those matched below.
    for comment in soup.find_all(string=lambda s: isinstance(s, Comment)):
        comment.extract()

    # Raw string: '\w' inside a plain literal is an invalid escape sequence.
    for node in soup.find_all(string=re.compile(r'\w+')):
        # The parser exposes ``<!DOCTYPE ...>`` as a Doctype string node;
        # keep it whatever its spelling or position. Iterating the result
        # directly also copes with documents that contain no text at all.
        if isinstance(node, Doctype):
            continue
        node.replace_with('')  # blank out the text
    return str(soup)
def html_diff(a, b):
    """Diff the tag structure of two HTML files and save an HTML report.

    Both files are reduced with ``structure``, split on newlines and
    compared with ``difflib.HtmlDiff``. The side-by-side report is written
    as UTF-8 to ``diff-<uuid4>.html`` (a relative path).

    :param a: path to the first HTML file
    :type a: str
    :param b: path to the second HTML file
    :type b: str
    :return: name of the report file that was written
    :rtype: str
    """
    a_lines = structure(a).split('\n')
    b_lines = structure(b).split('\n')

    # Render the report before opening the output file, so a failure while
    # diffing does not leave an empty file behind.
    report = HtmlDiff().make_file(fromlines=a_lines, tolines=b_lines)

    diff_name = 'diff-{}.html'.format(uuid.uuid4())
    with open(diff_name, 'w', encoding='utf-8') as f:
        f.write(report)

    # The name is random, so return it; otherwise the caller has no way to
    # know which file was just created.
    return diff_name
if __name__ == '__main__':
    # Script entry point: compare the two files given by the relative
    # paths below.
    html_diff(a='a.html', b='b.html')
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment