Created
September 10, 2012 09:51
-
-
Save rambuvn/3690013 to your computer and use it in GitHub Desktop.
Strip some html tag without strip_tags
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/**
 * Strip HTML markup from a string, keeping only <b> and <a> tags.
 *
 * Elements whose content is never shown to a reader (head, style, script,
 * object, embed, applet, noframes, noscript, noembed) are removed together
 * with their content and replaced by a single space. All remaining tags
 * except <b> and <a> are then removed by strip_tags().
 *
 * The patterns are first applied in UTF-8 mode. preg_replace() returns null
 * when the subject is not valid UTF-8 (or on any other PCRE error), which
 * previously turned the whole result into an empty string. In that case the
 * same patterns are retried byte-wise, and if that fails too the input is
 * handed to strip_tags() unchanged.
 *
 * @param string $text HTML to clean.
 * @return string The text with markup removed (except <b> and <a>).
 */
function strip_html_tags( $text )
{
    // Invisible content: the whole element is replaced by one space.
    $invisible = array(
        '@<head[^>]*?>.*?</head>@si',
        '@<style[^>]*?>.*?</style>@si',
        '@<script[^>]*?.*?</script>@si',
        '@<object[^>]*?.*?</object>@si',
        '@<embed[^>]*?.*?</embed>@si',
        '@<applet[^>]*?.*?</applet>@si',
        '@<noframes[^>]*?.*?</noframes>@si',
        '@<noscript[^>]*?.*?</noscript>@si',
        '@<noembed[^>]*?.*?</noembed>@si',
    );

    // Block-level tag openers, substituted with '$0' (the match itself).
    // NOTE(review): the original comment here read "Add line breaks before
    // and after blocks", but '$0' alone re-inserts the match unchanged, so
    // no line break is added. Behavior is kept as-is; confirm whether
    // "\n\$0" was intended before changing it.
    $blocks = array(
        '@</?((address)|(blockquote)|(center)|(del))@i',
        '@</?((div)|(h[1-9])|(ins)|(isindex)|(p)|(pre))@i',
        '@</?((dir)|(dl)|(dt)|(dd)|(li)|(menu)|(ol)|(ul))@i',
        '@</?((table)|(th)|(td)|(caption))@i',
        '@</?((form)|(button)|(fieldset)|(legend)|(input))@i',
        '@</?((label)|(select)|(optgroup)|(option)|(textarea))@i',
        '@</?((frameset)|(frame)|(iframe))@i',
    );

    $patterns = array_merge( $invisible, $blocks );

    // Build the replacement list from the pattern counts so the two arrays
    // can never drift out of step.
    $replacements = array_merge(
        array_fill( 0, count( $invisible ), ' ' ),
        array_fill( 0, count( $blocks ), '$0' )
    );

    // First attempt: UTF-8 aware matching (the 'u' modifier).
    $unicode_patterns = array();
    foreach ( $patterns as $pattern ) {
        $unicode_patterns[] = $pattern . 'u';
    }
    $stripped = preg_replace( $unicode_patterns, $replacements, $text );

    // Second attempt: byte-wise matching, which tolerates invalid UTF-8.
    if ( $stripped === null ) {
        $stripped = preg_replace( $patterns, $replacements, $text );
    }

    // Last resort: keep the input rather than silently returning ''.
    if ( $stripped === null ) {
        $stripped = $text;
    }

    // The second argument lists the tags to keep, here <b> and <a>.
    return strip_tags( $stripped, '<b><a>' );
}
/**
 * Replace every <iframe>...</iframe> element and every <br> tag with a space.
 *
 * The patterns are first applied in UTF-8 mode. preg_replace() returns null
 * when the subject is not valid UTF-8 (or on any other PCRE error), which
 * previously made this function return null. In that case the same patterns
 * are retried byte-wise, and if that fails too the input is returned
 * unchanged.
 *
 * @param string $text HTML to clean.
 * @return string The text with iframes and line-break tags replaced by spaces.
 */
function strip_spectags( $text )
{
    $patterns = array(
        // Whole iframe element, including its content.
        '@<iframe[^>]*?>.*?</iframe>@si',
        // Anything from "<br" up to the next ">".
        '@<br.*?>@si',
    );
    $replacements = array( ' ', ' ' );

    // First attempt: UTF-8 aware matching (the 'u' modifier).
    $unicode_patterns = array();
    foreach ( $patterns as $pattern ) {
        $unicode_patterns[] = $pattern . 'u';
    }
    $cleaned = preg_replace( $unicode_patterns, $replacements, $text );

    // Second attempt: byte-wise matching, which tolerates invalid UTF-8.
    if ( $cleaned === null ) {
        $cleaned = preg_replace( $patterns, $replacements, $text );
    }

    // Last resort: hand the input back rather than returning null.
    if ( $cleaned === null ) {
        $cleaned = $text;
    }

    return $cleaned;
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
If you're still around, thanks for the code. It works in most cases I've found. Where it doesn't work is when the HTML is somehow malformed, in which case it seems to produce an empty string. In that case I just resort to a much slower DOM method. I also added '@<a[^>]*?.*?a>@siu', to strip out all hrefs and the link text. The first time you hit a page with 178,000 a hrefs, it'll become apparent why.