Skip to content

Instantly share code, notes, and snippets.

@adyliu
Created December 28, 2012 11:48
Show Gist options
  • Save adyliu/4397068 to your computer and use it in GitHub Desktop.
Save adyliu/4397068 to your computer and use it in GitHub Desktop.
substring html code without html breaking
adylab:script adyliu$ python3 htmlsubstring.py
<div class="test">测试<span>谁知道&amp;<br>是<br/>什么<br />东东<i>也许吧</i></span><div>我靠,我就不信不行</div><h2>标题党</h2></div>
<div class="test">测试<span>谁知道&amp;<br>是<br/>什么<br />东东<i>也许吧</i></span></div>
<div class="test">测试<span>谁知道&amp;<br>是<br/>什么<br />东东<i>也许吧</i></span></div>
<div class="test">测试<span>谁知道&amp;<br>是<br/>什么<br />东东<i>也许吧</i></span><div>我靠,</div></div>
<div class="test">测试<span>谁知道&amp;<br>是<br/>什么<br />东东<i>也许吧</i></span><div>我靠,</div></div>
<div class="test">测试<span>谁知道&amp;<br>是<br/>什么<br />东东<i>也许吧</i></span><div>我靠,我就不</div></div>
<div class="test">测试<span>谁知道&amp;<br>是<br/>什么<br />东东<i>也许吧</i></span><div>我靠,我就不</div></div>
<div class="test">测试<span>谁知道&amp;<br>是<br/>什么<br />东东<i>也许吧</i></span><div>我靠,我就不信不行</div></div>
<div class="test">测试<span>谁知道&amp;<br>是<br/>什么<br />东东<i>也许吧</i></span><div>我靠,我就不信不行</div></div>
#!/usr/bin/env python3
#-*- coding: utf-8 -*-
#Truncate an HTML string to a given number of text characters without leaving tags unclosed
#author: Ady Liu(http://github.com/adyliu)
#date: 2012-12-28
# Void elements: tags that never take a closing tag, so the truncation
# functions must neither track them as open nor emit a closer for them.
# The original three entries come first; the rest complete the set of void
# elements defined by the HTML standard ('param' is obsolete but still
# appears, unclosed, in legacy markup).
notags = (
    'img', 'br', 'hr',
    'area', 'base', 'col', 'embed', 'input', 'link', 'meta',
    'param', 'source', 'track', 'wbr',
)
class Tag():
    """Record of one opening tag met while scanning an HTML string.

    Instances are chained backwards through ``pre`` so that substring()
    can walk from the most recent tag towards earlier ones when it looks
    for the opener that a closing tag belongs to.
    """

    def __init__(self, label, pre=None):
        """Create a record for the tag named *label*.

        label -- the tag name as written in the markup.
        pre   -- the Tag that was current when this one was opened, or
                 None when there is none.
        """
        self.pre = pre
        self.label = label
        # Flipped to True by substring() once the matching closing tag
        # has been seen.
        self.end = False

    def __str__(self):
        # Formatting (rather than '+' concatenation) keeps this usable for
        # debugging even when label is not a str.
        return '{0}:{1}'.format(self.label, self.end)

    def __repr__(self):
        return self.__str__()
def substring2(html, size):
    """Cut *html* after *size* text characters and close the open tags.

    Only characters outside ``<...>`` count towards *size*; every such
    character counts, so an entity like ``&amp;`` counts as five. Open
    tags are kept on a plain stack of names; tags listed in ``notags``
    and tags written as ``<name/>`` are never pushed.

    html -- the markup to truncate.
    size -- maximum number of text characters to keep.

    Returns *html* unchanged when its total length does not exceed
    *size*, an empty string when *size* is not positive, and otherwise
    the truncated markup followed by a closing tag for every element
    still open at the cut, innermost first.
    """
    if len(html) <= size:
        return html
    if size <= 0:
        return ''

    open_tags = []       # names of elements opened and not yet closed
    tag = ''             # text collected since the last '<'
    in_tag = False       # must start False: input may begin with text
    count = 0            # text characters seen so far
    end = len(html)      # slice bound; stays len(html) if size is never hit
    for pos, c in enumerate(html):
        if c == '<':
            in_tag = True
        elif c == '>' and in_tag:
            in_tag = False
            parts = tag.split()
            tag = ''
            if not parts:
                # '<>' or '< >': nothing that names a tag.
                continue
            name = parts[0]
            if name[0] == '/':
                name = name.replace('/', '')
                if name and name not in notags:
                    # Close the nearest matching opener together with
                    # anything opened after it; a closer with no matching
                    # opener is ignored instead of popping an unrelated tag.
                    for idx in range(len(open_tags) - 1, -1, -1):
                        if open_tags[idx] == name:
                            del open_tags[idx:]
                            break
            elif name[-1] != '/' and name not in notags:
                open_tags.append(name)
        elif in_tag:
            tag += c
        else:
            # Plain text, including a '>' that is not closing any tag.
            count += 1
            if count >= size:
                end = pos + 1
                break

    closers = ''.join('</{0}>'.format(name) for name in reversed(open_tags))
    return html[:end] + closers
def substring(html, size):
    """Cut *html* after *size* text characters and close the open tags.

    Only characters outside ``<...>`` count towards *size*; every such
    character counts, so an entity like ``&amp;`` counts as five. Each
    opening tag that is not in ``notags`` is recorded as a Tag linked to
    the previously current one; a closing tag marks the nearest earlier
    Tag with the same name that has not ended yet.

    html -- the markup to truncate.
    size -- maximum number of text characters to keep.

    Returns *html* unchanged when its total length does not exceed
    *size*, an empty string when *size* is not positive, and otherwise
    the truncated markup followed by a closing tag for every recorded
    Tag that never ended, latest opened first.
    """
    if len(html) <= size:
        return html
    if size <= 0:
        return ''

    tags = []            # every Tag created, in document order
    cur = None           # head of the backward chain searched by closers
    tag = ''             # text collected since the last '<'
    in_tag = False       # must start False: input may begin with text
    count = 0            # text characters seen so far
    end = len(html)      # slice bound; stays len(html) if size is never hit
    for pos, c in enumerate(html):
        if c == '<':
            in_tag = True
        elif c == '>' and in_tag:
            in_tag = False
            parts = tag.split()
            tag = ''
            if not parts:
                # '<>' or '< >': nothing that names a tag.
                continue
            name = parts[0]
            if name[-1] == '/':
                # '<br/>' style: drop the self-closing slash.
                name = name[:-1]
            closing = name[:1] == '/'
            if closing:
                name = name[1:]
            if not name or name in notags:
                continue
            if closing:
                # Search with a scratch pointer so that a closer with no
                # matching opener leaves the chain intact for later closers.
                node = cur
                while node is not None:
                    if node.label == name and not node.end:
                        node.end = True
                        cur = node
                        break
                    node = node.pre
            else:
                cur = Tag(name, cur)
                tags.append(cur)
        elif in_tag:
            tag += c
        else:
            # Plain text, including a '>' that is not closing any tag.
            count += 1
            if count >= size:
                end = pos + 1
                break

    closers = ''.join(
        '</{0}>'.format(t.label) for t in reversed(tags) if not t.end
    )
    return html[:end] + closers
if __name__ == '__main__':
    # Demo run: for each limit from 18 to 29, show the untouched sample
    # followed by the output of both truncation functions and a blank line.
    sample = '<div class="test">测试<span>谁知道&amp;<br>是<br/>什么<br />东东<i>也许吧</i></span><div>我靠,我就不信不行</div><h2>标题党</h2></div>'
    for limit in range(18, 30):
        print(sample)
        print(substring(sample, limit))
        print(substring2(sample, limit))
        print()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment