mohamed-ali · January 4, 2014 13:34
diff --git a/lexer.py b/lexer.py
 import ply.lex as lex 

 # Names of the token types produced by the rules below.
 tokens = (
     'LANGLE',       # <
     'LANGLESLASH',  # </
     'RANGLE',       # >
     'EQUAL',        # =
     'STRING',       # "hello"
     'WORD',         # Welcome!
 )

 # Extra lexer states. The comment rules call begin('htmlcomment') and use the
 # t_htmlcomment_* naming, so that state has to be declared here. It is
 # exclusive: while inside a comment only the t_htmlcomment_* rules apply.
 states = (
     ('htmlcomment', 'exclusive'),
 )

 # Characters skipped between tokens (just the space character).
 t_ignore = ' '

 # "<!--" opens an HTML comment. Nothing is returned, so no token is emitted;
 # the lexer is only switched into the 'htmlcomment' state.
 def t_htmlcomment(token):
     r"<!--"
     lexer = token.lexer
     lexer.begin("htmlcomment")
    
    
 # "-->" closes an HTML comment: bump the line counter by any newlines in the
 # matched text, then hand control back to the INITIAL state. No token is
 # returned.
 def t_htmlcomment_end(token):
     r"-->"
     lexer = token.lexer
     lexer.lineno += token.value.count("\n")
     lexer.begin("INITIAL")

 # Error handler for the comment state: every character that is not part of
 # "-->" lands here and is discarded one at a time.
 def t_htmlcomment_error(token):
     # PLY hands an error rule the unconsumed remainder of the input in
     # token.value, so its first character is the one about to be skipped.
     # Count it when it is a newline; otherwise line numbers drift after a
     # multi-line comment.
     if token.value[:1] == "\n":
         token.lexer.lineno += 1
     token.lexer.skip(1)


 # A newline yields no token; it only advances the lexer's line counter.
 def t_newline(token):
     r"\n"
     lexer = token.lexer
     lexer.lineno += 1

 # "</" -- the start of a closing tag. This rule is defined before t_LANGLE;
 # PLY tries function rules in definition order, so "</" wins over "<".
 def t_LANGLESLASH(token):
     r"</"
     return token

 # "<" -- the start of an opening tag; the token is passed through unchanged.
 def t_LANGLE(token):
     r"<"
     return token

 # ">" -- the end of a tag; the token is passed through unchanged.
 def t_RANGLE(token):
     r">"
     return token

 # "=" -- a single equals sign; the token is passed through unchanged.
 def t_EQUAL(token):
     r"="
     return token

 # A double-quoted run containing no double quote. The token's value is
 # stored without the surrounding quote characters.
 def t_STRING(token):
     r'"[^"]*"'
     quoted = token.value
     token.value = quoted[1:-1]
     return token

 # A word is any run of characters other than space, "<", ">" and newline.
 # The character class does not exclude "=" or '"', so those may appear
 # inside a WORD.
 def t_WORD(token):
     r"[^ <>\n]+"
     return token


 # Quick manual check: tokenize a small page and print every token.
 webpage = "this is <b>my</b> webpage!"
 htmllexer = lex.lex()
 htmllexer.input(webpage)

 while True:
     tok = htmllexer.token()
     if not tok:
         # A falsy result from token() ends the loop.
         break
     # print(...) with one argument behaves the same on Python 2 and 3,
     # unlike the Python 2-only `print tok` statement.
     print(tok)
	import ply.lex as lex

	# Names of the token types produced by the rules below.
	tokens = (
	    'LANGLE',       # <
	    'LANGLESLASH',  # </
	    'RANGLE',       # >
	    'EQUAL',        # =
	    'STRING',       # "hello"
	    'WORD',         # Welcome!
	)

	# Extra lexer states. The comment rules call begin('htmlcomment') and use the
	# t_htmlcomment_* naming, so that state has to be declared here. It is
	# exclusive: while inside a comment only the t_htmlcomment_* rules apply.
	states = (
	    ('htmlcomment', 'exclusive'),
	)

	# Characters skipped between tokens (just the space character).
	t_ignore = ' '

	# "<!--" opens an HTML comment. Nothing is returned, so no token is emitted;
	# the lexer is only switched into the 'htmlcomment' state.
	def t_htmlcomment(token):
	    r'<!--'
	    token.lexer.begin('htmlcomment')


	# "-->" closes an HTML comment: bump the line counter by any newlines in the
	# matched text, then hand control back to the INITIAL state. No token is
	# returned.
	def t_htmlcomment_end(token):
	    r'-->'
	    token.lexer.lineno += token.value.count('\n')
	    token.lexer.begin('INITIAL')

	# Error handler for the comment state: every character that is not part of
	# "-->" lands here and is discarded one at a time.
	def t_htmlcomment_error(token):
	    # PLY hands an error rule the unconsumed remainder of the input in
	    # token.value, so its first character is the one about to be skipped.
	    # Count it when it is a newline; otherwise line numbers drift after a
	    # multi-line comment.
	    if token.value[:1] == '\n':
	        token.lexer.lineno += 1
	    token.lexer.skip(1)


	# A newline yields no token; it only advances the lexer's line counter.
	def t_newline(token):
	    r'\n'
	    token.lexer.lineno += 1

	# "</" -- the start of a closing tag. This rule is defined before t_LANGLE;
	# PLY tries function rules in definition order, so "</" wins over "<".
	def t_LANGLESLASH(token):
	    r'</'
	    return token

	# "<" -- the start of an opening tag; the token is passed through unchanged.
	def t_LANGLE(token):
	    r'<'
	    return token

	# ">" -- the end of a tag; the token is passed through unchanged.
	def t_RANGLE(token):
	    r'>'
	    return token

	# "=" -- a single equals sign; the token is passed through unchanged.
	def t_EQUAL(token):
	    r'='
	    return token

	# A double-quoted run containing no double quote. The token's value is
	# stored without the surrounding quote characters.
	def t_STRING(token):
	    r'"[^"]*"'
	    token.value = token.value[1:-1]
	    return token

	# A word is any run of characters other than space, "<", ">" and newline.
	# The character class does not exclude "=" or '"', so those may appear
	# inside a WORD.
	def t_WORD(token):
	    r'[^ <>\n]+'
	    return token


	# Quick manual check: tokenize a small page and print every token.
	webpage = "this is <b>my</b> webpage!"
	htmllexer = lex.lex()
	htmllexer.input(webpage)

	while True:
	    tok = htmllexer.token()
	    if not tok:
	        # A falsy result from token() ends the loop.
	        break
	    # print(...) with one argument behaves the same on Python 2 and 3,
	    # unlike the Python 2-only `print tok` statement.
	    print(tok)
No results found