Skip to content

Instantly share code, notes, and snippets.

@clausecker
Last active October 2, 2019 23:10
Show Gist options
  • Save clausecker/01cca42e2c9695a4ff0006ef7dfd11dd to your computer and use it in GitHub Desktop.
Save clausecker/01cca42e2c9695a4ff0006ef7dfd11dd to your computer and use it in GitHub Desktop.
Lexer that detects links in html documents
/* http://stackoverflow.com/a/1732454/417501 */
%{
/* C prologue: declarations shared by the rule actions and by main(). */
#include <stdio.h>
#include <stdlib.h>
#include <ctype.h>
#include <string.h>
/* value the <<EOF>> rule returns; main() stops scanning when it sees it */
#define MYEOF EOF
/* token codes returned by yylex() for a link target and for link text */
#define TOKEN_URL 257
#define TOKEN_TEXT 258
/*
 * Semantic value type: a plain C string.
 * NOTE(review): when YYSTYPE is already defined, yystype is never
 * declared here, yet the extern declaration of yylval below still uses it.
 */
#ifndef YYSTYPE
#define YYSTYPE yystype
typedef char* yystype;
#endif
extern int yylex();
extern FILE* yyin;
extern yystype yylval;
extern int yylineno;
%}
/* yylineno is not portable and not part of POSIX! */
%option yylineno
/* Start conditions, switched with BEGIN in the rules below. */
/* PREHREF: inside an <a tag, looking at attributes, no href seen yet */
%s PREHREF
/* POSTHREF: href value already returned, skipping the rest of the tag */
%s POSTHREF
/* HREFVAL: just read "href =", expecting its double-quoted value */
%s HREFVAL
/* ATTRIBVAL: just read some other "name =", expecting its double-quoted value */
%s ATTRIBVAL
/* LTEXT: past the opening tag, collecting link text until the closing tag */
%s LTEXT
/* S: a single whitespace character, either a blank or a newline */
S [[:blank:]]|\n
%%
<INITIAL><a/{S} BEGIN PREHREF; /* opening tag: "<a" with whitespace after it (trailing context, the whitespace is not consumed) */
<PREHREF>{S}+[^[:blank:]"'>/=]+{S}*={S}* { /* attribute name up to and including '=' */
    /*
     * Decide whether the attribute just matched is "href".
     * {S} matches '\n' as well as blanks, so newlines are skipped too;
     * skipping with isblank() alone misses an href that starts on a new line.
     */
    const char *name = yytext;
    while (*name == '\n' || isblank((unsigned char)*name))
        name++;
    if (strncmp(name, "href", 4) == 0) {
        /* the name must end right after "href": only {S}* may precede '=' */
        const char *rest = name + 4;
        while (*rest == '\n' || isblank((unsigned char)*rest))
            rest++;
        if (*rest == '=') {
            BEGIN HREFVAL;
        } else {
            BEGIN ATTRIBVAL;
        }
    } else {
        BEGIN ATTRIBVAL;
    }
}
<PREHREF>{S}*>[^<]*"</a"{S}*> BEGIN INITIAL; /* tag closed without an href: skip its '>', the text and the closing tag in one match */
<POSTHREF>({S}|[^"<>]|["][^"]*["])*> BEGIN LTEXT; /* skip attributes following href; '>' is excluded from the class so the match stops at the tag's own '>' and cannot swallow link text */
<LTEXT>(\n|[^<])* {
/* link text: everything up to the next '<'; yylval points straight at yytext, no copy is made */
yylval = yytext;
return TOKEN_TEXT;
}
<LTEXT>"</a"{S}*> BEGIN INITIAL; /* closing tag ends the link; go back to looking for the next "<a" */
<HREFVAL>["][^"]*["] { /* href attribute value */
    /*
     * The match is a double-quoted string, so yyleng >= 2 and the last
     * matched character is the closing quote.  Overwrite it through yyleng
     * rather than searching with strchr(): [^"]* can match a NUL byte,
     * which would make strchr() return NULL.
     */
    yytext[yyleng - 1] = '\0';
    yylval = yytext + 1; /* step over the opening quote */
    BEGIN POSTHREF;
    return TOKEN_URL;
}
<ATTRIBVAL>["][^"]*["] BEGIN PREHREF; /* double-quoted value of a non-href attribute: discard it and look at the next attribute */
<INITIAL>.|\n ; /* outside of links: discard every character */
<<EOF>> return MYEOF; /* violates the conventions! MYEOF is EOF, and main() tests for exactly this value */
%%
/*
 * Local yywrap() so the scanner links without the lex library (-ll).
 * Always returns 1.
 */
int yywrap(void)
{
    const int no_more_input = 1;

    return no_more_input;
}
/* semantic value of the most recent token: assigned in the rule actions, printed by main() */
yystype yylval;
/*
 * Scan one HTML document and print every link URL and link text found.
 *
 * Input is the file named by argv[1] when exactly one argument is given,
 * standard input otherwise.  Each token is printed together with the
 * current value of yylineno.  Returns 0 once yylex() reports MYEOF;
 * exits with EXIT_FAILURE when the named file cannot be opened.
 */
int main(int argc, char* argv[]) {
    int token;

    if (argc != 2) {
        yyin = stdin;
    } else {
        yyin = fopen(argv[1], "r");
        if (yyin == NULL) {
            fprintf(stderr,
                "Error: could not open file %s for reading.\n",
                argv[1]);
            /* exit(-1) has no portable meaning; EXIT_FAILURE does */
            exit(EXIT_FAILURE);
        }
    }

    while ((token = yylex()) != MYEOF) {
        /* yylval is printed right away, before the next yylex() call */
        if (token == TOKEN_URL) {
            printf("Line: %3d\tURL: %s\n", yylineno, yylval);
        } else if (token == TOKEN_TEXT) {
            printf("Line: %3d\tTEXT: %s\n", yylineno, yylval);
        } else {
            printf("Error\n");
        }
    }

    /* close only the file opened above, never stdin */
    if (yyin != stdin) {
        fclose(yyin);
    }
    return 0;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment