正则语言引擎:一个简单LEX和YACC结合运用的实例
本文先描述了 熟悉LEX和YACC工具lex工具简介
[第一部分:定义段]
%% [第二部分:词法规则段] %%
[第三部分:辅助函数段]
第一部分Part1.1
%{
#include <ctype.h>
char tokenString[MAXTOKENLEN+1];
%}
Part1.2
digit [0-9]
number {digit}+
letter [a-zA-Z]
identifier {letter}+
newline n
whitespace [ t]+
%s COMMENT BAD
第二部分
"if" {return IF;}
"then" {return THEN;}
"else" {return ELSE;}
第三部分
LEX中的变量
一个简易的词法分析器保留字有 int float main return while do if else for
符号有 < <= = <> > >= + - * / ( ) { } ;
双引号内容当作 书写 %{
#include <stdio.h>
#include <stdlib.h>
#define LT 1
#define LE 2
#define GT 3
#define GE 4
#define EQ 5
#define NE 6
#define WHILE 18
#define DO 19
#define ID 20
#define NUMBER 21
#define RELOP 22
#define NEWLINE 23
#define IF 24
#define ELSE 25
#define FOR 26
#define PLUS 27
#define MINUS 28
#define TIMES 29
#define OVER 30
#define LPAREN 31
#define RPAREN 32
#define SEMI 33
#define OP 34
#define LBRACE 35
#define RBRACE 36
#define INT 37
#define MAIN 38
#define RETURN 39
#define FLOAT 40
#define MSG 41
#define ERRORCHAR 42
int yylval;
int installID ();
int installNum ();
%}
delim [ t n]
ws {delim}+
letter [A-Za-z_]
digit [0-9]
id {letter}({letter}|{digit})*
number {digit}+(.{digit}+)?(E[+-]?{digit}+)?
%%
{ws} {;}
int {return (INT);}
float {return (FLOAT);}
main {return (MAIN);}
return {return (RETURN);}
while {return (WHILE);}
do {return (DO);}
if {return (IF);}
else {return (ELSE);}
for {return (FOR);}
{id} {yylval = installID (); return (ID);}
{number} {yylval = installNum (); return (NUMBER);}
"<" {yylval = LT; return (RELOP);}
"<=" {yylval = LE; return (RELOP);}
"=" {yylval = EQ; return (RELOP);}
"<>" {yylval = NE; return (RELOP);}
">" {yylval = GT; return (RELOP);}
">=" {yylval = GE; return (RELOP);}
"+" {yylval = PLUS; return (OP);}
"-" {yylval = MINUS; return (OP);}
"*" {yylval = TIMES; return (OP);}
"/" {yylval = OVER; return (OP);}
"(" {return LPAREN;}
")" {return RPAREN;}
";" {return SEMI;}
"{" {return LBRACE;}
"}" {return RBRACE;}
""" {
char c;
int i = 0;
do{
yytext[i++]=c;
c = input();
} while (c != '"');
yytext[i]=' ';
return MSG;
}
. {yylval = ERRORCHAR; return ERRORCHAR;}
%%
int installID () {
return ID;
}
int installNum () {
return NUMBER;
}
int yywrap (){
return 1;
}
void writeout(int c){
switch(c){
case ERRORCHAR: fprintf(yyout,"(ERRORCHAR,"%s") ",yytext);break;
case RELOP: fprintf(yyout,"(RELOP,yytext);break;
case WHILE: fprintf(yyout,"(WHILE,yytext);break;
case DO: fprintf(yyout,"(DO,yytext);break;
case NUMBER: fprintf(yyout,"(NUM,yytext);break;
case ID: fprintf(yyout,"(ID,yytext);break;
case NEWLINE: fprintf(yyout,"n");break;
case IF: fprintf(yyout,"(IF,yytext);break;
case ELSE: fprintf(yyout,"(ELSE,yytext);break;
case FOR: fprintf(yyout,"(FOR,yytext);break;
case PLUS:fprintf(yyout,"(PLUS,yytext);break;
case MINUS:fprintf(yyout,"(MINUS,yytext);break;
case OVER:fprintf(yyout,"(OVER,yytext);break;
case LPAREN:fprintf(yyout,"(LPAREN,yytext);break;
case RPAREN:fprintf(yyout,"(RPAREN,yytext);break;
case SEMI:fprintf(yyout,"(SEMI,yytext);break;
case LBRACE:fprintf(yyout,"(LBRACE,yytext);break;
case RBRACE:fprintf(yyout,"(RBRACE,yytext);break;
case INT:fprintf(yyout,"(INT,yytext);break;
case MAIN:fprintf(yyout,"(MAIN,yytext);break;
case RETURN:fprintf(yyout,"(RETURN,yytext);break;
case FLOAT:fprintf(yyout,"(FLOAT,yytext);break;
case MSG:fprintf(yyout,"(MSG,yytext);break;
case OP:{
switch(yylval){
case PLUS: fprintf(yyout,yytext);break;
case MINUS: fprintf(yyout,yytext);break;
case TIMES: fprintf(yyout,"(TIMES,yytext);break;
case OVER: fprintf(yyout,yytext);break;
}
};break;
default:fprintf(yyout,"(我不知道发生了什么,但是你大概是写错了. "%s") ",yytext);break;
}
return;
}
int main (int argc,char* argv[]){
int c,j=0;
if (argc>=2){
if ((yyin = fopen(argv[1],"r")) == NULL){
printf("Can't open file %sn",argv[1]);
return 1;
}
if (argc>=3){
yyout=fopen(argv[2],"w");
}
}
else{
printf("Usage:./scanner [source file] [dst file]n");
exit(0);
}
while ((c = yylex())!=0){
writeout(c);
writeout(NEWLINE);
}
if(argc>=2){
fclose(yyin);
if (argc>=3) fclose(yyout);
}
return 0;
}
学习YACC工具Yacc 代表
引用网上的一个比喻
创建有四个步骤
YACC 语法规则Yacc 语法规则具有以下一般格式: result: components { /* action to be taken in C */ };
在这个例子中,result 是规则描述的非终端符号。Components 是根据规则放在一起的不同的终端和非终端符号。 如果匹配特定序列的话 Components 后面可以跟随要执行的动作。 param : NAME EQ NAME { printf("tName:%stValue(name):%sn",$1,$3);}
| NAME EQ VALUE{ printf("tName:%stValue(value):%sn",$3);}
;
如果上例中序列 NAME EQ NAME 被匹配,将执行相应的 { } 括号中的动作。 这里另一个有用的就是 lexer 通过 Yacc 的变量 yylval 返回这些值。标记 NAME 的 Lex 代码是这样的: char [A-Za-z]
name {char}+
%%
{name} { yylval = strdup(yytext);
return NAME; }
文件解析例子的规则段是这样的: 文件解析的语法file : record file
| record
;
record: NAME EQ AGE { printf("%s is now %s years old!!!",$3);};
%%
一般来说,Yacc 最好提供 yyerror(char msg) 函数的代码。 当解析器遇到错误时调用 yyerror(char msg)。错误消息作为参数来传递。 一个简单的 yyerror( char* ) 可能是这样的: int yyerror(char* msg){
printf("Error: %s encountered at line number:%dn",msg,yylineno);
}
用LEX和YACC工具分析正则表达式正则语言的词法定义
写出其词法分析器为 %{
#include <stdio.h>
#include <string.h>
#define LETTER 1
#define D_QUOTATION 2
#define BACKSLASH 3
#define L_PARENTHESES 4
#define R_PARENTHESES 5
#define L_SQUARE_BRACK 6
#define R_SQUARE_BRACK 7
#define L_CURLY_BRACES 8
#define R_CURLY_BRACES 9
#define STAR 10
#define PLUS 11
#define HORIZONTAL_LINE 12
#define DOT 13
#define QUESTION_MARK 14
#define VERTICAL_LINE 15
#define CARET 16
#define EQUAL 17
#define EMPTY 18
#define BLANK 19
%}
allset [!-~]
blank [trn]+
%%
""" {return (D_QUOTATION);}
"" {return (BACKSLASH);}
"(" {return (L_PARENTHESES);}
")" {return (R_PARENTHESES);}
"[" {return (L_SQUARE_BRACK);}
"]" {return (R_SQUARE_BRACK);}
"{" {return (L_CURLY_BRACES);}
"}" {return (R_CURLY_BRACES);}
"*" {return (STAR);}
"+" {return (PLUS);}
"-" {return (HORIZONTAL_LINE);}
"." {return (DOT);}
"?" {return (QUESTION_MARK);}
"|" {return (VERTICAL_LINE);}
"^" {return (CARET);}
"=" {return (EQUAL);}
"ε" {return (EMPTY);}
{blank} {return (BLANK);}
{allset} {return (LETTER);}
%%
int yywrap(){
return 1;
}
int main (int argc,j=0;
if (argc>=2){
if ((yyin = fopen(argv[1],"r")) == NULL){
printf("Can't open file %sn",argv[1]);
return 1;
}
}
else{
printf("Usage:./scanner [source file] [dst file]n");
exit(0);
}
while ((c = yylex())!=0){
printf("%dn",c);
}
fclose(yyin);
return 0;
}
简单测试 正则表达式的语法规则可以用 binary = (0|1)+
real = {integer}.{nature}
正则表达式的上下文无关文法为 # List表示多个正则表达式的列表
List → List Named_RE | Named_RE
# Name_RE表示命名或未命名的正则表达式
Named_RE → ID = RE | RE
# RE 表示正则表达式
RE → RE “|” No_Select | No_Select
# No_Select表示不包含选择运算的正则表达式
No_Select → No_Select No_Join | No_Join
# No_Join表示不包含选择或者连接运算的正则表达式
No_Join → Base | Base * | Base + | Base ?
# Base表示最基本的正则表达式
Base → ( RE ) | LETTER | [No_Select] | "No_select " | . | { ID }
# ID表示标识符
ID → ID <letter> | ID <number> | <letter>
根据这些规则写的 %token D_QUOTATION BACKSLASH L_PARENTHESES R_PARENTHESES L_SQUARE_BRACK
R_SQUARE_BRACK L_CURLY_BRACES R_CURLY_BRACES STAR PLUS HORIZONTAL_LINE
DOT QUESTION_MARK VERTICAL_LINE CARET EQUAL EMPTY RE_LETTER LETTER
NUMBER BLANK
%%
List : Named_RE {printf("Finish one linennn");} List
| Named_RE {printf("Finish prog! End!nn");}
;
Named_RE : ID EQUAL {printf("Find an IDnn");} RE BLANK
| RE {printf("Find an REn");} BLANK
;
RE : RE VERTICAL_LINE {printf("Finish VERTICAL_LINEn");} No_Select
| { printf("%d No_select startn",debug_i); debug_i++;}
No_Select
{ printf("Finish %d No_Selectn",debug_i);debug_i--;}
;
No_Select : No_Join {printf("Find next joinn");} No_Join
| No_Join {printf("End of No_Selectn");}
;
No_Join : Base
| Base STAR
| Base PLUS
| Base QUESTION_MARK
;
Base : L_PARENTHESES RE R_PARENTHESES
| L_SQUARE_BRACK RE R_SQUARE_BRACK
| D_QUOTATION No_Select D_QUOTATION
| L_CURLY_BRACES ID {printf("Confirm an IDn");} R_CURLY_BRACES
| DOT
| LETTER
| NUMBER
| RE_LETTER
;
ID : ID LETTER
| ID NUMBER
| LETTER
;
%%
尝试出现很多
文件的前一部分为归约规则,如下: 0 $accept: List $end
1 @1: /* empty */
2 List: Named_RE @1 List
3 | Named_RE
...
中间部分为每个状态的意义与 state 20
13 No_Select: No_Join . @6 No_Join
14 | No_Join .
D_QUOTATION reduce using rule 12 (@6)
D_QUOTATION [reduce using rule 14 (No_Select)]
R_PARENTHESES reduce using rule 14 (No_Select)
R_SQUARE_BRACK reduce using rule 14 (No_Select)
VERTICAL_LINE reduce using rule 14 (No_Select)
BLANK reduce using rule 14 (No_Select)
$default reduce using rule 12 (@6)
@6 go to state 32
可以看到,在状态20中,遇到 为了解决这个问题,这里简单处理,舍弃其中的一条规则。我的语言只 但是对于引号的判断还是有问题,修改引号部分如下: Base : L_PARENTHESES RE R_PARENTHESES | L_SQUARE_BRACK SQUARE_PART R_SQUARE_BRACK | D_QUOTATION QUOTA_PART D_QUOTATION | L_CURLY_BRACES ID R_CURLY_BRACES | DOT | LETTER | NUMBER | RE_LETTER ; QUOTA_PART : QUOTA_PART LETTER | LETTER ; 解决
最终的 %token D_QUOTATION BACKSLASH L_PARENTHESES R_PARENTHESES L_SQUARE_BRACK
R_SQUARE_BRACK L_CURLY_BRACES R_CURLY_BRACES STAR PLUS HORIZONTAL_LINE
DOT QUESTION_MARK VERTICAL_LINE CARET EQUAL EMPTY RE_LETTER LETTER
NUMBER BLANK
%%
List : Named_RE {printf("======Finish one line======nnn");} List
| Named_RE {printf("======Finish prog! End!======n");}
;
Named_RE : ID EQUAL {printf("==Find an ID==nn");} RE {printf("==Find a RE==nn");} BLANK
;
RE : RE VERTICAL_LINE {printf("Finish VERTICAL_LINEn");} No_Select
| { printf("%d No_select startn",debug_i); debug_i--;}
;
No_Select : No_Select {printf("Find next joinn");} No_Join
| No_Join {printf("End of No_Selectn");}
;
No_Join : Base
| Base STAR
| Base PLUS
| Base QUESTION_MARK
;
Base : L_PARENTHESES RE R_PARENTHESES
| L_SQUARE_BRACK SQUARE_PART R_SQUARE_BRACK
| D_QUOTATION QUOTA_PART {printf("QUOTATION ENDn");}
| L_CURLY_BRACES ID {printf("Confirm an IDnn");} R_CURLY_BRACES
| DOT
| LETTER
| NUMBER
| RE_LETTER
;
QUOTA_PART : LETTER QUOTA_PART
| LETTER D_QUOTATION
;
SQUARE_PART : LETTER HORIZONTAL_LINE LETTER
| NUMBER HORIZONTAL_LINE NUMBER
;
ID : ID LETTER
| ID NUMBER
| LETTER
;
%%
输入测试文件 letter = [A-Z]
number = [0-9]
id = ({letter}_)+({number}|{letter})*
测试结果 TOKEN:LETTER
TOKEN:LETTER
TOKEN:LETTER
TOKEN:LETTER
TOKEN:LETTER
TOKEN:LETTER
TOKEN:EQUAL
==Find an ID==
0 No_select start
TOKEN:L_SQUARE_BRACK
TOKEN:LETTER
TOKEN:HORIZONTAL_LINE
TOKEN:LETTER
TOKEN:R_SQUARE_BRACK
TOKEN:BLANK
End of No_Select
Finish 1 No_Select
==Find a RE==
TOKEN:LETTER
======Finish one line======
...# 后面略去
======Finish prog! End!======
生成中间语言(C语言)考虑直接使用控制流的执行难度,这里用了正则语言的一个子集,去除了所有 由于YACC中生成的头文件中默认为自己定义的 Makefile如下: CC = gcc
test:parse example
./parse example
gcc out.c -o test
parse:lex.yy.c y.tab.c
gcc lex.yy.c y.tab.c -o parse
y.tab.c:RE.y
yacc -d RE.y
cp y.tab.h{,.tmp}
sed 's/typedef int YYSTYPE/typedef char* YYSTYPE/' y.tab.h.tmp > y.tab.h
rm y.tab.h.tmp
lex.yy.c:RE.l
flex RE.l
clean:
-rm out.c
-rm test
-rm parse
-rm lex.yy.c
-rm y.tab.c
%%
List : Named_RE List
| Named_RE
;
Named_RE : ID { printf("== Find an ID ==> %s ==nn",$1); gen_id($1); }
EQUAL RE {printf("== Find a RE== nn"); gen_id_trail();} BLANK
;
RE : { printf("%d No_select startn",debug_i); debug_i--;}
;
No_Select : No_Select {printf("Find next joinn");} No_Join {base_i++;}
| No_Join {base_i++;printf("End of No_Selectn");}
;
No_Join : Base
;
Base : L_PARENTHESES RE R_PARENTHESES
| L_SQUARE_BRACK SQUARE_PART R_SQUARE_BRACK
| D_QUOTATION QUOTA_PART {printf("= QUOTATION %s =n",$2);gen_quote($2);} D_QUOTATION
| L_CURLY_BRACES ID {printf("= Call an ID %s =n",$2);gen_call($2);} R_CURLY_BRACES
| DOT {gen_dot();}
| LETTER {gen_next($1);}
| NUMBER {gen_next($1);}
| RE_LETTER {gen_next($1);}
;
QUOTA_PART : QUOTA_PART LETTER {strcat($1,$2);}
| LETTER
| QUOTA_PART NUMBER {strcat($1,$2);}
| NUMBER
;
SQUARE_PART : LETTER HORIZONTAL_LINE LETTER {gen_square($1,$3);}
| NUMBER HORIZONTAL_LINE NUMBER {gen_square($1,$3);}
;
ID : ID LETTER {strcat($1,$2);}
| ID NUMBER {strcat($1,$2);}
| LETTER
;
%%
int main(int argc,char* argv[]){
if(argc == 2){
yyin = fopen(argv[1],"r");
if(yyin == NULL){
printf("Cannot open file!n");
exit(0);
}
outfile = fopen("out.c","w");
gen_head();
yyparse();
gen_main();
fclose(outfile);
}
else{
printf("Usage:./test [filename]n");
}
return 0;
}
int yyerror(char *msg){
printf("Error encountered: %s n",msg);
return 0;
}
其中用于生成 void gen_main(){
fprintf(outfile,"int main(int argc,char* argv[]){n");
fprintf(outfile,"tcur_pos = 0;n");
fprintf(outfile,"tif(argc == 2){n");
fprintf(outfile,"ttstrcpy(in_str,argv[1]);n");
fprintf(outfile,"tttarget();n");
fprintf(outfile,"ttprintf("ACCEPTn");n");
fprintf(outfile,"t}n");
fprintf(outfile,"treturn 0;n");
fprintf(outfile,"}n");
}
由于大部分类似,这里就不贴过多代码了。 测试文件: letter = [A-Z]
num = [0-9]
target = "test"{letter} {num}.
生成C语言代码 #include <stdio.h>
#include <stdlib.h>
#include <string.h>
char in_str[1000];
int cur_pos;
void letter(){
if(in_str[cur_pos]>=65&&in_str[cur_pos]<=90){
cur_pos++;
}
else{
printf("ERROR DOTn");exit(0);
}
}
void num(){
if(in_str[cur_pos]>=48&&in_str[cur_pos]<=57){
cur_pos++;
}
else{
printf("ERROR DOTn");exit(0);
}
}
void target(){
if(in_str[cur_pos]=='t'){
cur_pos++;
}
else{
printf("ERROR tn");exit(0);
}
if(in_str[cur_pos]=='e'){
cur_pos++;
}
else{
printf("ERROR en");exit(0);
}
if(in_str[cur_pos]=='s'){
cur_pos++;
}
else{
printf("ERROR sn");exit(0);
}
if(in_str[cur_pos]=='t'){
cur_pos++;
}
else{
printf("ERROR tn");exit(0);
}
letter();
num();
if(in_str[cur_pos]>=33&&in_str[cur_pos]<=126){
cur_pos++;
}
else{
printf("ERROR DOTn");exit(0);
}
}
int main(int argc,char* argv[]){
cur_pos = 0;
if(argc == 2){
strcpy(in_str,argv[1]);
target();
printf("ACCEPTn");
}
return 0;
}
测试: ./test testA9E ACCEPT (编辑:李大同) 【声明】本站内容均来自网络,其相关言论仅代表作者个人观点,不代表本站立场。若无意侵犯到您的权利,请及时与联系站长删除相关内容! |