Regular expressions

Catch all html/xml tags (even with namespaces)

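The regex captures only two things: the tag's fully qualified name (namespace prefix included), together with any leading `!` or `/`, and the optional trailing `/` of a self-closing tag. Attributes are matched but not captured.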

// Matches one HTML/XML tag, optionally namespaced (e.g. <ns:name attr="v"/>).
//   group 1: the qualified tag name, including any leading "/" or "!"
//   group 2: "/" when the tag ends with "/>", otherwise ""
// Every attribute must be preceded by whitespace, and an unquoted value may
// neither start with a quote nor contain ">". Both rules keep the pattern
// unambiguous, so it cannot backtrack exponentially on malformed input.
/<((?:\/|\!)?(?:[a-zA-Z0-9_-]+:)?[a-zA-Z0-9_-]+)(?:\s+(?:(?:[a-zA-Z0-9_-]+:)?[a-zA-Z0-9_-]+)(?:\s*=\s*(?:"[^"]*"|'[^']*'|[^\s"'>][^\s>]*))?)*\s*(\/?)>/g

Parsing XML/HTML with one regex

// Matches one HTML/XML tag, optionally namespaced (e.g. <ns:name attr="v"/>).
//   group 1: the qualified tag name, including any leading "/" or "!"
//   group 2: "/" when the tag ends with "/>", otherwise ""
// Every attribute must be preceded by whitespace, and an unquoted value may
// neither start with a quote nor contain ">". Both rules keep the pattern
// unambiguous, so it cannot backtrack exponentially on malformed input
// (the previous `\s*` separator allowed a run of name characters to be split
// into attributes in exponentially many ways).
// The regex carries the `g` flag, so `exec`/`test` callers must manage
// `lastIndex` themselves.
var tag_re=/<((?:\/|\!)?(?:[a-zA-Z0-9_-]+:)?[a-zA-Z0-9_-]+)(?:\s+(?:(?:[a-zA-Z0-9_-]+:)?[a-zA-Z0-9_-]+)(?:\s*=\s*(?:"[^"]*"|'[^']*'|[^\s"'>][^\s>]*))?)*\s*(\/?)>/g

/**
 * Scans `src` for tags with the file-level `tag_re` and logs, in document
 * order, every tag and every non-blank run of text between, before and after
 * the tags.
 *
 * Text is trimmed before being logged as "Normal text: ..."; each tag is
 * logged as "Tag: " followed by the JSON of its descriptor
 * ({name, closing, special, selfclosing, index, length, full}).
 *
 * @param {string} src - Markup to scan.
 * @returns {undefined} Output goes to console.log only.
 */
var parse = function(src) {
    var lastIndex = 0,
        // Logs the trimmed text in src[from, to) unless it is empty or blank.
        emitText = function(from, to) {
            if (to > from) {
                var text = src.slice(from, to).trim()
                if (text) console.log("Normal text: " + text)
            }
        },
        emitTag = function(tag) {
            // Flush whatever text sits between the previous tag and this one.
            emitText(lastIndex, tag.index)
            lastIndex = tag.index + tag.length
            console.log("Tag: " + JSON.stringify(tag))
            // TODO: just needs to build a tree with the above, and parse the individual tags
            // with another (simpler) regex
        }

    // replace() is used purely as an iterator over the matches: the callback
    // returns each match unchanged and the resulting string is discarded.
    src.replace(tag_re, function(match, tag, selfclosing, index) {
        var closing = (tag[0] === '/'),
            special = (tag[0] === '!')
        tag = (closing || special) ? tag.substring(1) : tag
        emitTag({
            name: tag,
            closing: closing,
            special: special,
            // "<!...>" declarations never have a closing counterpart.
            selfclosing: special || !!selfclosing,
            index: index,
            length: match.length,
            full: match
        })
        return match
    })

    // Text after the last tag (or the whole input when it contains no tag)
    // would otherwise be dropped.
    emitText(lastIndex, src.length)
}

jcayzac/gist:2265478

Catch all html/xml tags (even with namespaces)

Parsing XML/HTML with one regex