Last active
October 2, 2019 23:10
-
-
Save clausecker/01cca42e2c9695a4ff0006ef7dfd11dd to your computer and use it in GitHub Desktop.
Lexer that detects links in HTML documents
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| /* http://stackoverflow.com/a/1732454/417501 */ | |
%{
/* C prologue: copied verbatim into the generated scanner. */
#include <stdio.h>
#include <stdlib.h>
#include <ctype.h>
#include <string.h>

/* Value yylex() returns when the input is exhausted. */
#define MYEOF EOF

/* Token codes returned by yylex(). */
#define TOKEN_URL 257
#define TOKEN_TEXT 258

/* Semantic value type: a C string, unless YYSTYPE was already defined. */
#ifndef YYSTYPE
#define YYSTYPE yystype
typedef char *yystype;
#endif

/* Declared with full prototypes; an empty parameter list is obsolescent. */
extern int yylex(void);
extern FILE *yyin;
extern yystype yylval;
extern int yylineno;
%}
/* %option yylineno is an extension: POSIX lex does not provide it. */
%option yylineno

/*
 * Start conditions used by the rules:
 *   PREHREF   - inside an <a ...> tag, href not seen yet
 *   POSTHREF  - inside the tag, after the href value
 *   HREFVAL   - expecting the value of the href attribute
 *   ATTRIBVAL - expecting the value of any other attribute
 *   LTEXT     - between the end of the opening tag and </a>
 */
%s PREHREF POSTHREF HREFVAL ATTRIBVAL LTEXT

/* S: a single blank or newline character */
S	[[:blank:]\n]
| %% | |
| <INITIAL><a/{S} BEGIN PREHREF; /* opening tag */ | |
| <PREHREF>{S}+[^[:blank:]"'>/=]+{S}*={S}* { /* attribute */ | |
| char *attr = yytext; | |
| /* skip leading blanks */ | |
| while (isblank(*attr)) attr++; | |
| /* href attribute present? */ | |
| if (strncmp(attr, "href", 4) == 0 | |
| && (isblank(attr[4]) || attr[4] == '=')) | |
| BEGIN HREFVAL; | |
| else | |
| BEGIN ATTRIBVAL; | |
| } | |
| <PREHREF>{S}*>[^<]*"</a"{S}*> BEGIN INITIAL; /* skip a tag without href attribute */ | |
| <POSTHREF>({S}|[^"<]|["][^"]*["])*> BEGIN LTEXT; /* skip attributes following href */ | |
| <LTEXT>(\n|[^<])* { | |
| yylval = yytext; | |
| return TOKEN_TEXT; | |
| } | |
| <LTEXT>"</a"{S}*> BEGIN INITIAL; | |
| <HREFVAL>["][^"]*["] { /* href attribute value */ | |
| yylval = yytext + 1; /* delete quotes */ | |
| *strchr(yylval, '"') = '\0'; | |
| BEGIN POSTHREF; | |
| return TOKEN_URL; | |
| } | |
| <ATTRIBVAL>["][^"]*["] BEGIN PREHREF; /* other attributes */ | |
| <INITIAL>.|\n ; | |
| <<EOF>> return MYEOF; /* violates the conventions! */ | |
| %% | |
/*
 * Defining yywrap() here means the scanner links without -ll.
 * Always returns 1.
 */
extern int yywrap(void)
{
	const int no_more_input = 1;

	return no_more_input;
}
| yystype yylval; | |
| int main(int argc, char* argv[]) { | |
| int token; | |
| if (argc != 2) yyin = stdin; | |
| else { | |
| yyin = fopen(argv[1], "r"); | |
| if (yyin == 0) { | |
| fprintf(stderr, | |
| "Error: could not open file %s for reading.\n", | |
| argv[1]); | |
| exit(-1); | |
| } | |
| } | |
| while ((token = yylex()) != MYEOF) { | |
| if (token == TOKEN_URL) { | |
| printf("Line: %3d\tURL: %s\n", yylineno, yylval); | |
| } else if (token == TOKEN_TEXT) { | |
| printf("Line: %3d\tTEXT: %s\n", yylineno, yylval); | |
| } else { | |
| printf("Error\n"); | |
| } | |
| } | |
| return 0; | |
| } |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment