Skip to content

Instantly share code, notes, and snippets.

@icyleaf
Created July 5, 2010 10:41
Show Gist options
  • Save icyleaf/464226 to your computer and use it in GitHub Desktop.
Save icyleaf/464226 to your computer and use it in GitHub Desktop.
Parse HTML entries for allowed tags
<?php
/**
 * Parse HTML entries for allowed tags
 *
 * Whitelist-based HTML filter: paired tags listed in $allowed_tags are kept
 * (with only their whitelisted attributes), paired tags that are not listed
 * are removed together with their content, and the content of $literals
 * tags is HTML-escaped instead of being parsed.
 *
 * NOTE(review): only markup that forms a matched <tag>...</tag> pair is
 * inspected. Anything else (for example a lone opening tag without a closing
 * tag) is not touched by the filter passes, so this class should not be
 * relied on as the only XSS defence.
 *
 * @author icyleaf <[email protected]>
 * @version 0.1
 */
class Text extends Kohana_Text {

    // Tags whose content is kept as literal text (HTML-escaped, never parsed).
    // NOTE(review): 'pre' is listed here but not in $allowed_tags, so <pre>
    // blocks are escaped by the first pass and then removed by the second.
    public static $literals = array('code', 'pre');

    // Allowed tags => attributes that may be kept on each of them
    public static $allowed_tags = array(
        'a'          => array('href', 'title'),
        'img'        => array('src', 'align', 'alt'),
        'font'       => array('family', 'color', 'size'),
        'b'          => array(),
        'i'          => array(),
        'u'          => array(),
        's'          => array(),
        'strong'     => array(),
        'code'       => array(),
        'blockquote' => array(),
        'li'         => array(),
        'ul'         => array(),
        'ol'         => array(),
    );

    // URL schemes accepted in href attributes (compared in lower case)
    public static $allowed_protocols = array(
        'http', 'https', 'ftp', 'mailto', 'feed', 'irc', 'thunder', 'bittorrent'
    );

    // Matches one paired tag: 1 = tag name, 2 = raw attribute string,
    // 3 = inner content, 4 = junk inside the closing tag.
    private static $_paired_tag_reg = '/<(\S*?)((?>[^>A-Za-z\d][^>]*)|)>(.+?)<\/\1((?>[^>A-Za-z\d][^>]*)|)>/si';

    /**
     * Allows html to be used in strings, but breaks out target related activities.
     *
     * Runs two passes over the input: the first escapes the content of the
     * literal tags, the second filters every paired tag against the
     * whitelist. Newlines in the result are converted with nl2br().
     *
     * @param   string  $str  raw user supplied markup
     * @return  string        filtered markup
     */
    public static function purify($str)
    {
        // Undo the two entities that would otherwise hide quotes/ampersands
        // from the attribute parser.
        $str = str_replace(array('&quot;', '&amp;'), array('"', '&'), $str);

        // Pass 1: literals. Skipped when no literal tags are configured,
        // because an empty alternation would match an empty tag name.
        if ( ! empty(self::$literals))
        {
            $quoted = array();
            foreach (self::$literals as $tag)
            {
                // Tag names are configuration, but quote them so that they
                // can never alter the pattern.
                $quoted[] = preg_quote($tag, '/');
            }

            $literals_reg = '/<('.implode('|', $quoted).')((?>[^>A-Za-z\d][^>]*)|)>(.+?)<\/\1((?>[^>A-Za-z\d][^>]*)|)>/si';
            $str = preg_replace_callback($literals_reg, array(__CLASS__, '_filter_block_tags'), $str);
        }

        // Pass 2: check tag name and attributes of every paired tag.
        $str = preg_replace_callback(self::$_paired_tag_reg, array(__CLASS__, '_filter_allow_tags'), $str);

        return nl2br($str);
    }

    /**
     * Filter block (literal) tags: keep the tag, escape what is inside it.
     *
     * The attribute string is passed through unchanged here; it is checked
     * against the whitelist by the second pass of purify().
     *
     * @param   array   $matches  match set of the literals pattern
     * @return  string
     */
    private static function _filter_block_tags($matches)
    {
        return '<'.$matches[1].$matches[2].'>'.htmlspecialchars($matches[3]).'</'.$matches[1].'>';
    }

    /**
     * Filter one paired tag against the whitelist.
     *
     * Returns an empty string (dropping the tag and its content) when the
     * tag is not allowed; otherwise rebuilds the tag with only its allowed
     * attributes and recursively filtered content.
     *
     * @param   array   $matches  match set of the paired-tag pattern
     * @return  string
     */
    private static function _filter_allow_tags($matches)
    {
        // match node name
        if ( ! is_array($matches) OR empty($matches[1]))
        {
            return '';
        }

        $node = strtolower($matches[1]);

        // check allowed tags
        if ( ! isset(self::$allowed_tags[$node]))
        {
            return '';
        }

        // filter attributes: 1 = name, 5 = quoted value, 6 = unquoted value
        $attribute_reg = '/(?<=[\s"\'`\/])((?>[\w\-]+))(?>[^A-Za-z\d\'"=]*)=(?>\s*)(((["\'`])(.*?)\4)|((?>[^\s]+)))(?>\s*)/si';
        $inside = '';

        if (preg_match_all($attribute_reg, $matches[2], $attributes, PREG_SET_ORDER))
        {
            foreach ($attributes as $attribute)
            {
                $inside .= self::_filter_tag_attributes($node, $attribute);
            }
        }

        // replace inside of tag (nested tags get the same treatment)
        $content = preg_replace_callback(self::$_paired_tag_reg, array(__CLASS__, '_filter_allow_tags'), $matches[3]);

        return '<'.$node.$inside.'>'.$content.'</'.$node.'>';
    }

    /**
     * Filter tag attributes
     *
     * Drops attributes that are not whitelisted for the tag, runs href
     * values through the protocol filter and escapes the value before it is
     * written back inside double quotes.
     *
     * @param   string  $node     lower-cased tag name, must be a key of $allowed_tags
     * @param   array   $matches  one match set of the attribute pattern
     * @return  string            ' name="value"' or an empty string
     */
    private static function _filter_tag_attributes($node, $matches)
    {
        $attribute = strtolower($matches[1]);

        if ( ! in_array($attribute, self::$allowed_tags[$node], TRUE))
        {
            return '';
        }

        // Group 6 only exists when the value was unquoted; otherwise the
        // quoted value is in group 5. Must be read before any filtering.
        $value = isset($matches[6]) ? $matches[6] : $matches[5];

        // do some actions for related tags
        if ($attribute === 'href')
        {
            $value = self::_filter_protocol($value);
        }

        // The value is re-emitted inside double quotes, so quotes coming from
        // a single-quoted or unquoted source must not be able to close it.
        // Existing entities are left alone (double_encode = FALSE).
        return ' '.$attribute.'="'.htmlspecialchars($value, ENT_QUOTES | ENT_SUBSTITUTE, 'UTF-8', FALSE).'"';
    }

    /**
     * Filter protocol of url
     *
     * In-page anchors ("#...") are returned as they are. Every other URL
     * must carry a scheme listed in $allowed_protocols, otherwise it is
     * replaced by 'javascript:void(0);'.
     *
     * @param   string  $url
     * @return  string
     */
    private static function _filter_protocol($url)
    {
        $url = trim($url);

        // Guard against the empty string before looking at the first byte.
        if ($url !== '' AND $url[0] === '#')
        {
            return $url;
        }

        // parse_url() returns FALSE for seriously malformed input.
        $url_parts = parse_url($url);
        $scheme = (is_array($url_parts) AND isset($url_parts['scheme']))
            ? strtolower($url_parts['scheme'])
            : NULL;

        if ($scheme === NULL OR ! in_array($scheme, self::$allowed_protocols, TRUE))
        {
            $url = 'javascript:void(0);';
        }

        return $url;
    }
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment