Skip to content

Instantly share code, notes, and snippets.

@clausecker
Created May 25, 2015 16:01
Show Gist options
  • Select an option

  • Save clausecker/1d66f419f44a1d3154af to your computer and use it in GitHub Desktop.

Select an option

Save clausecker/1d66f419f44a1d3154af to your computer and use it in GitHub Desktop.
lexing html with regices
/* http://stackoverflow.com/a/1732454/417501 */
%{
#include <ctype.h>
#include <string.h>
/* presumably declares yylval and the TOKEN_* / MYEOF codes used below -- verify */
#include "urlscanner.h"
%}
/* yylineno is unportable and not part of POSIX! */
%option yylineno
/*
 * Start conditions tracking the position inside an a-element:
 *   PREHREF   - inside the opening tag, no href attribute seen yet
 *   POSTHREF  - inside the opening tag, after the href value
 *   HREFVAL   - expecting the quoted value of the href attribute
 *   ATTRIBVAL - expecting the quoted value of any other attribute
 *   LTEXT     - link text between the opening tag and the closing tag
 */
%s PREHREF
%s POSTHREF
%s HREFVAL
%s ATTRIBVAL
%s LTEXT
/* S: a single whitespace character, i.e. a blank or a newline */
S [[:blank:]]|\n
%%
<INITIAL><a/{S} BEGIN PREHREF; /* opening a-tag; the whitespace after it is trailing context and stays in the input */
<PREHREF>{S}+[^[:blank:]"'>/=]+{S}*={S}* { /* attribute name up to and including the '=' */
	char *attr = yytext;

	/*
	 * Skip the leading whitespace.  {S} matches newlines as well as
	 * blanks, so isspace() is required here: isblank() stops at a
	 * newline and an href on a continuation line would then be taken
	 * for some other attribute.  The match always contains '=', so the
	 * loop cannot run past the end of yytext.  The cast avoids
	 * undefined behaviour for negative char values.
	 */
	while (isspace((unsigned char)*attr)) {
		attr++;
	}

	/* is this the href attribute (name followed by whitespace or '=')? */
	if (strncmp(attr, "href", 4) == 0
	    && (isspace((unsigned char)attr[4]) || attr[4] == '=')) {
		BEGIN HREFVAL;
	} else {
		BEGIN ATTRIBVAL;
	}
}
<PREHREF>{S}*>[^<]*"</a"{S}*> BEGIN INITIAL; /* skip a-tags without href, together with their text and closing tag */
<POSTHREF>({S}|[^"<]|["][^"]*["])*> BEGIN LTEXT; /* skip the attributes after href up to the end of the opening tag */
<LTEXT>[^<]+ { /* a non-empty run of link text ([^<] already covers newlines) */
	/*
	 * The pattern must not match the empty string: with '*' the scanner
	 * would return zero-length TOKEN_TEXT tokens forever as soon as the
	 * link text contains a '<' that does not start the closing tag.
	 * NOTE(review): yylval points into yytext; presumably the caller
	 * copies it before the next yylex() call -- verify.
	 */
	yylval = yytext;
	return TOKEN_TEXT;
}
<LTEXT>"</a"{S}*> BEGIN INITIAL; /* the closing tag ends the link */
<LTEXT>"<" { /* any other '<' (e.g. a nested tag) is passed on as text; the longer closing-tag match wins when it applies */
	yylval = yytext;
	return TOKEN_TEXT;
}
<HREFVAL>["][^"]*["] { /* quoted value of the href attribute */
	/*
	 * The match starts and ends with a double quote and has none in
	 * between, so the closing quote is the last matched character.
	 * Using yyleng instead of strchr() avoids a NULL dereference when
	 * the value contains an embedded NUL byte.
	 */
	yytext[yyleng - 1] = '\0';
	yylval = yytext + 1; /* step over the opening quote */
	BEGIN POSTHREF;
	return TOKEN_URL;
}
<ATTRIBVAL>["][^"]*["] BEGIN PREHREF; /* skip the quoted value of any other attribute */
<INITIAL>.|\n ; /* discard every character outside of a-tags */
<<EOF>> return MYEOF; /* breaks with convention: yylex() normally returns 0 at end of input */
%%
/*
 * Called by the scanner at end of input; returning 1 reports that no
 * further input follows.  Defined here because the program is not
 * linked with -ll.
 */
int yywrap(void)
{
	return 1;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment