Last active
July 14, 2016 01:41
-
-
Save quxiaowei/564cb2874b0d339052be50d93122839e to your computer and use it in GitHub Desktop.
删除 html 中指定标签
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Strip every tag from an HTML fragment except an allow-list of tag names.
# Two equivalent implementations are shown: a character scanner and a
# find()-based one; the script prints the result of each.
# by quxiaowei
# @ 20160702


def _tag_name(tag_body):
    """Return the first whitespace-delimited token of *tag_body*.

    *tag_body* is the text between ``<`` and ``>``. A closing tag keeps its
    leading slash (``/p``). An empty or all-whitespace body yields ``''``.
    """
    tokens = tag_body.split(None, 1)
    return tokens[0] if tokens else ''


def keep_tags_scan(html, allowed, *, echo_tags=False):
    """Filter tags by walking *html* one character at a time.

    A tag runs from ``<`` to the next ``>``. Tags whose name is in *allowed*
    are copied verbatim (attributes included); every other tag is removed.
    Text outside tags is copied unchanged, including a stray ``>``.
    A final ``<`` that is never closed is discarded together with whatever
    follows it.

    Args:
        html: The markup to filter (str).
        allowed: Container of tag names to keep, e.g. ``{'p', '/p'}``.
        echo_tags: When true, print the name of every tag encountered.

    Returns:
        The filtered markup as a new string.
    """
    pieces = []
    seg_start = 0        # start of the current text run, or of the tag body
    inside_tag = False
    for pos, ch in enumerate(html):
        if ch == '<' and not inside_tag:
            pieces.append(html[seg_start:pos])
            seg_start = pos + 1
            inside_tag = True
        elif ch == '>' and inside_tag:
            name = _tag_name(html[seg_start:pos])
            if echo_tags:
                print(name)
            if name in allowed:
                # seg_start - 1 is the opening '<' of this tag.
                pieces.append(html[seg_start - 1:pos + 1])
            seg_start = pos + 1
            inside_tag = False
    # Keep the text that follows the last tag; an unterminated tag is dropped.
    if not inside_tag:
        pieces.append(html[seg_start:])
    return ''.join(pieces)


def keep_tags_index(html, allowed):
    """Filter tags by jumping between ``<`` and ``>`` with ``str.find``.

    Produces the same result as :func:`keep_tags_scan` for the same input.

    Args:
        html: The markup to filter (str).
        allowed: Container of tag names to keep, e.g. ``{'p', '/p'}``.

    Returns:
        The filtered markup as a new string.
    """
    pieces = []
    cursor = 0           # first character not yet consumed
    while True:
        lt = html.find('<', cursor)
        if lt == -1:
            # No more tags: the remainder is plain text.
            pieces.append(html[cursor:])
            break
        pieces.append(html[cursor:lt])
        gt = html.find('>', lt + 1)
        if gt == -1:
            # Unterminated tag at the end of the input: discard it.
            break
        if _tag_name(html[lt + 1:gt]) in allowed:
            pieces.append(html[lt:gt + 1])
        cursor = gt + 1
    return ''.join(pieces)


# Sample input; note that it ends in an unterminated <font ...> tag.
text = """
<p class="p"><b><span style="font-family: 'Times New Roman': font-size: 12pt;">一、岗位及人数</span></b><span style="font-family: 'Times New Roman'; font-size: 12pt;"><o:p></o:p></span></p><p class="p"><span style="font-family: 'Times New Roman'; font-size: 12pt;">办公室工作人员,<font face="Times New Roman">1</font><font face="微软雅黑">名;教师,</font><font face="Times New Roman">4</font><font face="微软雅黑">名。</font></span><font class="
"""

# Names of the tags to KEEP; closing tags are listed with their slash.
tags = {'p', '/p', 'span', '/span'}

# First approach: character scanner (also echoes each tag name it meets).
text_o = keep_tags_scan(text, tags, echo_tags=True)
print(text_o)

# Second approach: find()-based; yields the same filtered text.
text_o = keep_tags_index(text, tags)
print(text_o)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment