Created
December 28, 2012 11:48
-
-
Save adyliu/4397068 to your computer and use it in GitHub Desktop.
Truncate HTML to a given number of text characters without breaking the markup
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
adylab:script adyliu$ python3 htmlsubstring.py | |
<div class="test">测试<span>谁知道&<br>是<br/>什么<br />东东<i>也许吧</i></span><div>我靠,我就不信不行</div><h2>标题党</h2></div> | |
<div class="test">测试<span>谁知道&<br>是<br/>什么<br />东东<i>也许吧</i></span></div> | |
<div class="test">测试<span>谁知道&<br>是<br/>什么<br />东东<i>也许吧</i></span></div> | |
<div class="test">测试<span>谁知道&<br>是<br/>什么<br />东东<i>也许吧</i></span><div>我靠,</div></div> | |
<div class="test">测试<span>谁知道&<br>是<br/>什么<br />东东<i>也许吧</i></span><div>我靠,</div></div> | |
<div class="test">测试<span>谁知道&<br>是<br/>什么<br />东东<i>也许吧</i></span><div>我靠,我就不</div></div> | |
<div class="test">测试<span>谁知道&<br>是<br/>什么<br />东东<i>也许吧</i></span><div>我靠,我就不</div></div> | |
<div class="test">测试<span>谁知道&<br>是<br/>什么<br />东东<i>也许吧</i></span><div>我靠,我就不信不行</div></div> | |
<div class="test">测试<span>谁知道&<br>是<br/>什么<br />东东<i>也许吧</i></span><div>我靠,我就不信不行</div></div> |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3 | |
#-*- coding: utf-8 -*- | |
# Truncate HTML to a given number of text characters without breaking the markup.
#author: Ady Liu(http://github.com/adyliu) | |
#date: 2012-12-28 | |
# Void HTML elements: they never take a closing tag, so code that tracks open
# tags must not treat them as containers that need a matching "</name>".
# Names are lowercase; membership tests against this tuple are case-sensitive.
notags = (
    'area', 'base', 'br', 'col', 'embed', 'hr', 'img',
    'input', 'link', 'meta', 'source', 'track', 'wbr',
)
class Tag:
    """One opening tag seen while scanning HTML.

    Attributes:
        label: the tag name as it appeared in the markup.
        pre: the Tag that was current when this one was created, or None.
        end: False until a matching closing tag has been seen.
    """

    def __init__(self, label, pre=None):
        self.label = label
        self.pre = pre
        self.end = False

    def __str__(self):
        # Rendered as "<label>:<end>", e.g. "div:False".
        return ':'.join((self.label, str(self.end)))

    def __repr__(self):
        # Same text as str() so lists of tags print readably when debugging.
        return str(self)
def substring2(html, size):
    """Truncate *html* after *size* text characters and close any open tags.

    Characters between ``<`` and ``>`` are copied to the output but do not
    count towards *size*; every other character counts as one. Open tags are
    tracked on a simple stack, and whatever is still open when the limit is
    reached is closed in reverse order.

    This is a character scanner, not an HTML parser: comments, entities and
    ``>`` inside attribute values get no special treatment.

    Args:
        html: the markup to truncate.
        size: maximum number of text characters to keep.

    Returns:
        *html* unchanged when ``len(html) <= size``; otherwise the truncated
        markup followed by closing tags for every element left open.
    """
    if len(html) <= size:
        return html
    pieces = []    # output characters, joined once at the end
    tag = ''       # text collected since the last '<'
    count = 0      # text characters emitted so far
    intag = False  # True while between '<' and '>'
    tags = []      # stack of currently open tag names
    for c in html:
        pieces.append(c)
        if c == '<':
            intag = True
        elif c == '>' and intag:
            intag = False
            words = tag.split()
            tag = ''
            if not words:
                # "<>" or "< >" carries no tag name; nothing to track.
                continue
            name = words[0]
            if name[0] == '/':
                name = name.replace('/', '')
                # Guard the pop: an unmatched closing tag must not crash.
                if name not in notags and tags:
                    tags.pop()
            elif name[-1] != '/' and name not in notags:
                # A trailing '/' marks a self-closing tag such as "<br/>".
                tags.append(name)
        elif intag:
            tag += c
        else:
            # Plain text, including a stray '>' outside any tag.
            count += 1
            if count >= size:
                break
    pieces.extend('</{0}>'.format(name) for name in reversed(tags))
    return ''.join(pieces)
def substring(html, size):
    """Truncate *html* after *size* text characters and close any open tags.

    Characters between ``<`` and ``>`` are copied to the output but do not
    count towards *size*; every other character counts as one. Each opening
    tag not listed in ``notags`` is recorded as a ``Tag`` linked to the tag
    that was current before it. A closing tag marks the nearest still-open
    ``Tag`` with the same name as ended. When the limit is reached, every
    tag not yet ended is closed, most recently opened first.

    This is a character scanner, not an HTML parser: comments, entities and
    ``>`` inside attribute values get no special treatment.

    Args:
        html: the markup to truncate.
        size: maximum number of text characters to keep.

    Returns:
        *html* unchanged when ``len(html) <= size``; otherwise the truncated
        markup followed by closing tags for every element left open.
    """
    if len(html) <= size:
        return html
    pieces = []    # output characters, joined once at the end
    count = 0      # text characters emitted so far
    tags = []      # every tracked opening tag, in document order
    tag = ''       # text collected since the last '<'
    cur = None     # most recently opened or closed Tag; head of the chain
    intag = False  # True while between '<' and '>'
    for c in html:
        pieces.append(c)
        if c == '<':
            intag = True
        elif c == '>' and intag:
            intag = False
            words = tag.split()
            tag = ''
            # "<>" or "< >" carries no tag name.
            name = words[0] if words else ''
            if name.endswith('/'):
                # "<br/>" style: drop the trailing slash before the lookup.
                name = name[:-1]
            if name.startswith('/'):
                name = name[1:]
                if name and name not in notags:
                    # Search on a separate cursor so that an unmatched
                    # closing tag leaves the chain of open tags intact.
                    node = cur
                    while node is not None:
                        if node.label == name and not node.end:
                            node.end = True
                            cur = node
                            break
                        node = node.pre
            elif name and name not in notags:
                opened = Tag(name, cur)
                tags.append(opened)
                cur = opened
        elif intag:
            tag += c
        else:
            # Plain text, including a stray '>' outside any tag.
            count += 1
            if count >= size:
                break
    for t in reversed(tags):
        if not t.end:
            pieces.append('</{0}>'.format(t.label))
    return ''.join(pieces)
if __name__ == '__main__':
    # Demo: print the sample markup and the output of both truncation
    # functions for every limit from 18 to 29, with a blank line between runs.
    sample = '<div class="test">测试<span>谁知道&<br>是<br/>什么<br />东东<i>也许吧</i></span><div>我靠,我就不信不行</div><h2>标题党</h2></div>'
    for limit in range(18, 30):
        print(sample)
        print(substring(sample, limit))
        print(substring2(sample, limit))
        print()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment