%{
/*
 * Lexical scanner for an HTML-like markup language.
 *
 * The scanner hands token codes (declared in parse.tab.h) to the parser and
 * passes the text of each token through yylval.text.  It works in two modes,
 * selected by inTag: inside a tag (between a tag opener and the closing
 * greater-than sign) it produces tag names, identifiers, strings and
 * comments; outside a tag it produces plain text, character entities and
 * newlines.  Most rules use REJECT to fall through to a later rule when they
 * are matched in the wrong mode, so the ORDER of the rules is significant.
 */
#include <string.h>

#include "parse.h"
#include "parse.tab.h"

#pragma alloca

extern YYSTYPE yylval;

/* Non-zero while scanning between a tag opener and its closing '>'. */
int inTag;
/* Non-zero when runs of whitespace outside tags are consumed silently. */
int ignoreWhitespace;
/* Value of ignoreWhitespace saved when a tag or comment was opened. */
int oldWhitespaceState;
/* Current input line; incremented for every newline the scanner consumes. */
int currentLine;
/* Not referenced anywhere in this file. */
const char* currentText;

int lookup_token(const char* tokenString);

/*
 * Advance currentLine by the number of newline characters found in the
 * first `length` characters of `text`.
 */
static void count_newlines(const char* text, size_t length)
{
    size_t offset;

    for (offset = 0; offset < length; offset++) {
        if (text[offset] == '\n') {
            currentLine++;
        }
    }
}

/*
 * Copy `text` into yylval.text, keeping at most YYTEXT_SIZE - 1 characters
 * and always writing a terminating NUL.
 *
 * NOTE(review): assumes yylval.text is a char array of at least YYTEXT_SIZE
 * bytes, which is what the length checks in the original rules imply;
 * confirm against the YYSTYPE declaration.
 */
static void set_token_text(const char* text)
{
    size_t length = strlen(text);

    if (length >= (size_t)YYTEXT_SIZE) {
        length = (size_t)YYTEXT_SIZE - 1;
    }
    memcpy(yylval.text, text, length);
    yylval.text[length] = '\0';
}

/*
 * Named patterns defined below:
 *   STRING        a double-quoted string (may span lines)
 *   ID            a run of characters other than blank, quote, tab,
 *                 newline, angle brackets, equals sign and exclamation mark
 *   COMMENT       an exclamation mark followed by everything up to, but
 *                 not including, the next greater-than sign
 *   CHARACTER     an ampersand-introduced entity terminated by a semicolon
 *   STARTTAG      a less-than sign, optional slash, optional blanks, an ID
 *   STARTCOMMENT  a less-than sign, optional blanks, an exclamation mark
 */
%}

STRING          (\"[^\"]*\")
ID              ([^ \"\t\n><=!]+)
COMMENT         ("!"[^>]*)
CHARACTER       ("&"[^; \n]*;)
STARTTAG        ("<"[/]?[ \t]*{ID})
STARTCOMMENT    ("<"[ \t]*"!")

%%

">"             {
                    /* Closes the current tag or comment.  Outside a tag
                       this character is ordinary text. */
                    if (inTag == 0) {
                        REJECT;
                    }
                    inTag = 0;
                    /* Restore the whitespace mode saved when the tag was
                       opened.  This must happen before the return. */
                    ignoreWhitespace = oldWhitespaceState;
                    return CLOSE_TAG;
                }

"="             {
                    /* Attribute assignment, only meaningful inside a tag. */
                    if (inTag == 0) {
                        REJECT;
                    }
                    return EQUALS;
                }

{COMMENT}       {
                    /* Body of a comment, reached after STARTCOMMENT pushed
                       the exclamation mark back onto the input. */
                    if (inTag == 0) {
                        REJECT;
                    }
                    if (strlen(yytext) >= YYTEXT_SIZE) {
                        /* Reported, then truncated by set_token_text. */
                        fprintf(stderr, "FATAL ERROR: COMMENT TOO LONG, line %i\n", currentLine);
                    }
                    count_newlines(yytext, yyleng);
                    set_token_text(yytext);
                    return COMMENT_TEXT;
                }

{ID}            {
                    /* Attribute name or unquoted value inside a tag. */
                    if (inTag == 0) {
                        REJECT;
                    }
                    set_token_text(yytext);
                    return TAG_ID;
                }

{STARTCOMMENT}  {
                    /* Enter tag mode and push the exclamation mark back so
                       that the COMMENT rule matches the comment body next. */
                    inTag = 1;
                    /* Save the whitespace mode so the closing rule restores
                       the value in effect before this comment. */
                    oldWhitespaceState = ignoreWhitespace;
                    unput(yytext[yyleng - 1]);
                    return START_COMMENT;
                }

{STARTTAG}      {
                    /* Tag opener.  The tag name is the matched text without
                       the leading less-than sign and without blanks. */
                    int source, dest, tag;
                    char* tagName;

                    tagName = (char*)alloca(yyleng + 1);
                    for (source = 1, dest = 0; source < yyleng; source++) {
                        if (yytext[source] != ' ' && yytext[source] != '\t') {
                            tagName[dest++] = yytext[source];
                        }
                    }
                    tagName[dest] = '\0';

                    tag = lookup_token(tagName);
                    if (tag == START_UNKNOWN) {
                        /* Unknown opening tags are passed through as text,
                           so hand the parser the text that was matched. */
                        set_token_text(yytext);
                        return PLAIN_TEXT;
                    }

                    inTag = 1;
                    oldWhitespaceState = ignoreWhitespace;
                    ignoreWhitespace = 1;
                    set_token_text(tagName);
                    return tag;
                }

{STRING}        {
                    /* Quoted attribute value, quotes included. */
                    if (inTag == 0) {
                        REJECT;
                    }
                    count_newlines(yytext, yyleng);
                    set_token_text(yytext);
                    return TAG_TEXT;
                }

{CHARACTER}     {
                    /* Character entity in running text. */
                    if (inTag != 0) {
                        REJECT;
                    }
                    set_token_text(yytext);
                    return CHARACTER_ENTITY;
                }

[ \t\n]*        {
                    /* Whitespace is swallowed inside tags and whenever
                       ignoreWhitespace is set.  Otherwise later rules turn
                       it into text and NEWLINE tokens. */
                    if (ignoreWhitespace == 0 && inTag == 0) {
                        REJECT;
                    }
                    count_newlines(yytext, yyleng);
                }

[^<&\n]*        {
                    /* Running text up to the next tag, entity or newline. */
                    if (inTag != 0) {
                        REJECT;
                    }
                    if (strlen(yytext) >= YYTEXT_SIZE) {
                        /* Keep what fits and push the rest back so it is
                           scanned as further PLAIN_TEXT tokens. */
                        yyless(YYTEXT_SIZE - 1);
                    }
                    set_token_text(yytext);
                    return PLAIN_TEXT;
                }

[\n]            {
                    currentLine++;
                    return NEWLINE;
                }

"&"[^; <\n]*    {
                    /* An ampersand that does not start a complete entity is
                       treated as plain text. */
                    if (inTag != 0) {
                        REJECT;
                    }
                    set_token_text(yytext);
                    return PLAIN_TEXT;
                }

"<"             {
                    /* A less-than sign that does not open a tag. */
                    set_token_text(yytext);
                    return PLAIN_TEXT;
                }

.               {
                    /* Nothing else matched: report the character and drop it. */
                    int badchar = yytext[0];
                    fprintf(stderr, "+++ UNHANDLED %c! (%i)\n", yytext[0], badchar);
                }

%%

/*
 * Prepare the scanner for its first input file: reset the scanner state,
 * clear currentTitle (declared outside this file) and read from `infile`.
 */
void lex_init(FILE* infile)
{
    inTag = 0;
    ignoreWhitespace = 1;
    oldWhitespaceState = 1;     /* matches the initial ignoreWhitespace */
    currentLine = 1;
    currentTitle[0] = '\0';
    yyin = infile;
}

/*
 * Reset the scanner state and restart scanning from `infile`.
 */
void lex_restart(FILE* infile)
{
    inTag = 0;
    ignoreWhitespace = 1;
    oldWhitespaceState = 1;     /* matches the initial ignoreWhitespace */
    currentLine = 1;
    yyrestart(infile);
}

/*
 * Tag names (without the leading '<') and the token code each one maps to.
 * Lookups are case-insensitive, so the capitalisation used here is
 * irrelevant.
 */
struct token_table {
    const char* string;
    const int token;
} tokens[] = {
    {"html", START_HTML},               {"/html", END_HTML},
    {"head", START_HEAD},               {"/head", END_HEAD},
    {"body", START_BODY},               {"/body", END_BODY},
    {"title", START_TITLE},             {"/title", END_TITLE},
    {"isindex", ISINDEX},
    {"h1", START_H1},                   {"/h1", END_H1},
    {"H2", START_H2},                   {"/H2", END_H2},
    {"H3", START_H3},                   {"/H3", END_H3},
    {"H4", START_H4},                   {"/H4", END_H4},
    {"H5", START_H5},                   {"/H5", END_H5},
    {"H6", START_H6},                   {"/H6", END_H6},
    {"A", START_A},                     {"/A", END_A},
    {"I", START_I},                     {"/I", END_I},
    {"B", START_B},                     {"/B", END_B},
    {"U", START_U},                     {"/U", END_U},
    {"S", START_S},                     {"/S", END_S},
    {"SUP", START_SUP},                 {"/SUP", END_SUP},
    {"SUB", START_SUB},                 {"/SUB", END_SUB},
    {"TT", START_TT},                   {"/TT", END_TT},
    {"EM", START_EM},                   {"/EM", END_EM},
    {"STRONG", START_STRONG},           {"/STRONG", END_STRONG},
    {"PRE", START_PRE},                 {"/PRE", END_PRE},
    {"LIT", START_LIT},                 {"/LIT", END_LIT},
    {"QUOTE", START_QUOTE},             {"/QUOTE", END_QUOTE},
    {"ABSTRACT", START_ABSTRACT},       {"/ABSTRACT", END_ABSTRACT},
    {"BYLINE", START_BYLINE},           {"/BYLINE", END_BYLINE},
    {"NOTE", START_NOTE},               {"/NOTE", END_NOTE},
    {"ADDRESS", START_ADDRESS},         {"/ADDRESS", END_ADDRESS},
    {"BLOCKQUOTE", START_BLOCKQUOTE},   {"/BLOCKQUOTE", END_BLOCKQUOTE},
    {"CITE", START_CITE},               {"/CITE", END_CITE},
    {"OL", START_OL},                   {"/OL", END_OL},
    {"UL", START_UL},                   {"/UL", END_UL},
    {"LI", START_LI},                   {"/LI", END_LI},
    {"MENU", START_MENU},               {"/MENU", END_MENU},
    {"DIR", START_DIR},                 {"/DIR", END_DIR},
    {"DL", START_DL},                   {"/DL", END_DL},
    {"DT", START_DT},                   {"/DT", END_DT},
    {"DD", START_DD},                   {"/DD", END_DD},
    {"FONT", START_FONT},               {"/FONT", END_FONT},
    {"TABLE", START_TABLE},             {"/TABLE", END_TABLE},
    {"TH", START_TH},                   {"/TH", END_TH},
    {"TD", START_TD},                   {"/TD", END_TD},
    {"TR", START_TR},                   {"/TR", END_TR},
    {"TB", START_TB},                   {"/TB", END_TB},
    {"CAPTION", START_CAPTION},         {"/CAPTION", END_CAPTION},
    {"HR", HR},
    {"BR", BR},
    {"P", P},
    {"TAB", TAB},
    {"IMG", IMG},
    {"IMAGE", IMAGE}
};

/*
 * Map a tag name to its token code.
 *
 * tokenString: the tag name without the leading '<'; closing tags keep
 *              their leading '/'.  Compared case-insensitively.
 *
 * Returns the matching token code from the table above.  For a name that is
 * not in the table, returns END_UNKNOWN when it starts with '/' and
 * START_UNKNOWN otherwise.
 */
int lookup_token(const char* tokenString)
{
    const size_t tokenCount = sizeof(tokens) / sizeof(tokens[0]);
    size_t index;

    for (index = 0; index < tokenCount; index++) {
        if (strcasecmp(tokenString, tokens[index].string) == 0) {
            return tokens[index].token;
        }
    }

    return (tokenString[0] == '/') ? END_UNKNOWN : START_UNKNOWN;
}