Skip to content

Instantly share code, notes, and snippets.

@quxiaowei
Last active July 14, 2016 01:41
Show Gist options
  • Save quxiaowei/564cb2874b0d339052be50d93122839e to your computer and use it in GitHub Desktop.
Save quxiaowei/564cb2874b0d339052be50d93122839e to your computer and use it in GitHub Desktop.
删除 html 中指定标签
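# Filter the tags of an HTML fragment: tags listed in `tags` are kept,
# every other tag is removed, and the text between tags is preserved.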
# by quxiaowei
# @ 20160702
# Sample input: an HTML fragment pasted from a word processor.  It ends in a
# truncated tag (`<font class="`) on purpose-or-not; the filters below drop it.
text = """
<p class="p"><b><span style="font-family: 'Times New Roman': font-size: 12pt;">一、岗位及人数</span></b><span style="font-family: 'Times New Roman'; font-size: 12pt;"><o:p></o:p></span></p><p class="p"><span style="font-family: 'Times New Roman'; font-size: 12pt;">办公室工作人员,<font face="Times New Roman">1</font><font face="微软雅黑">名;教师,</font><font face="Times New Roman">4</font><font face="微软雅黑">名。</font></span><font class="
"""

# Whitelist of tag names that survive filtering.  Closing tags are listed
# separately with their leading slash because names are compared verbatim.
tags = {'p', '/p', 'span', '/span'}


def tag_name(inner):
    """Return the tag name from the text found between '<' and '>'.

    The name is the first whitespace-delimited token of *inner*, so a closing
    tag keeps its leading slash ('/p') and attributes are ignored.  An empty
    or blank tag body ('<>', '< >') yields ''.
    """
    tokens = inner.split(None, 1)
    return tokens[0] if tokens else ''


def keep_tags_scan(html, keep):
    """Filter tags with a single character-by-character scan.

    Args:
        html: the markup to filter.
        keep: container of tag names (as returned by ``tag_name``) to keep.

    Returns:
        *html* with every tag whose name is not in *keep* removed.  Text
        outside tags is always preserved, including text before the first
        tag and after the last one.  A '>' that does not close a tag is
        ordinary text.  A trailing '<...' that is never closed is dropped.
    """
    pieces = []
    start = 0        # start of the pending text run, or of the tag body
    in_tag = False   # True between a '<' and its matching '>'
    for pos, char in enumerate(html):
        if char == '<' and not in_tag:
            pieces.append(html[start:pos])
            start = pos + 1
            in_tag = True
        elif char == '>' and in_tag:
            if tag_name(html[start:pos]) in keep:
                # start - 1 is the opening '<'; emit the tag unchanged.
                pieces.append(html[start - 1:pos + 1])
            start = pos + 1
            in_tag = False
    if not in_tag:
        # Without this the text after the last '>' would be lost.
        pieces.append(html[start:])
    return ''.join(pieces)


def keep_tags_index(html, keep):
    """Filter tags by jumping between '<' and '>' with ``str.find``.

    Same contract as ``keep_tags_scan``; the two are independent
    implementations of one filter and must return identical results.
    """
    pieces = []
    cursor = 0  # index of the first character not yet consumed
    while True:
        lt = html.find('<', cursor)
        if lt == -1:
            # No more tags: the rest is plain text.
            pieces.append(html[cursor:])
            break
        pieces.append(html[cursor:lt])
        gt = html.find('>', lt + 1)
        if gt == -1:
            # Unterminated tag at end of input: drop it rather than emit a
            # broken fragment (the preceding text was appended just above).
            break
        if tag_name(html[lt + 1:gt]) in keep:
            pieces.append(html[lt:gt + 1])
        cursor = gt + 1
    return ''.join(pieces)


# Run both implementations on the sample and show each result.
text_o = keep_tags_scan(text, tags)
print(text_o)

text_o = keep_tags_index(text, tags)
print(text_o)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment