/* Wiki page: User:Antigng/AF/AFTokenizer (subpage of User:Antigng/AF) */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include "AFTokenizer.h"
#include "AFParser.h"
#include "mem.h"
#include "struct.h"
const unsigned char hex_decoding[]=
{
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1, 2, 3, 4, 5, 6, 7, 8, 9,10, 0, 0, 0, 0, 0, 0,
0,11,12,13,14,15,16, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0,11,12,13,14,15,16, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
};
#define isValidForHex(ch) (hex_decoding[(unsigned char)(ch)])
const unsigned char idvalid[]=
{
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0,
0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1,
0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
};
#define isValidForID(ch) (idvalid[(unsigned char)(ch)])
const unsigned char spacevalid[]=
{
0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
};
#define isSpace(ch) (spacevalid[(unsigned char)(ch)])
/*
 * forwardRead(s,c,i,o): pre-increment the index i, fetch s[i], add the offset
 * o and store the result in c. The expression evaluates to the stored value.
 *
 * WARNING: the arguments must stay un-parenthesised in the expansion. Callers
 * in this file pass "(unsigned char)source" as s; that only works because it
 * expands to "(unsigned char)source[++i]", i.e. a cast applied to the fetched
 * character. Writing "(s)[++(i)]" would cast the pointer instead.
 *
 * With o == (-'0') and an unsigned char destination, '0'..'9' become 0..9 and
 * every other character wraps to a value >= 10, so "c < base" is a digit test.
 */
#define forwardRead(s,c,i,o) (c=s[++i]+o)
/*
 * Report a lexical error on stderr.
 *
 * count  - character index printed as the error position.
 * symbol - offending character: 0 is shown as EOL, printable ASCII (32..126)
 *          is shown quoted, anything else is shown as a quoted \u escape with
 *          its hexadecimal value.
 * reason - human-readable explanation appended to the message.
 */
static void throwError(int count,unsigned char symbol,const char *reason)
{
    char shown[16];

    if(symbol==0)
    {
        strcpy(shown,"EOL");
    }
    else if((symbol<32)||(symbol>126))
    {
        /* Not printable: show the byte value instead of the raw character. */
        sprintf(shown,"'\\u%02x'",symbol);
    }
    else
    {
        sprintf(shown,"'%c'",symbol);
    }

    fprintf(stderr,"Lexer: Error at char number %d: %s - %s.\n",count,shown,reason);
}
/*
 * Output buffer that receives the NUL-terminated text of every identifier and
 * string literal (written by idLexer and stringLexer). It is (re)allocated by
 * AFTokenizerReset.
 */
static char *text=NULL;
/* Index of the next free byte in text; reset to 0 by AFTokenizerReset. */
static int top=0;
/*
 * Scan a numeric literal that begins at source[0].
 *
 * source - points at the first character of the literal.
 * start  - entry state, chosen by the caller from the first character:
 *            0: the literal starts with '.'
 *            1: the literal starts with '0'
 *            2: the literal starts with '1'..'9'
 *          any other value fails immediately.
 * tok_p  - on success receives type T_INT (value.i) or T_FLOAT (value.f).
 *
 * Returns the number of characters that make up the literal (the index of the
 * first character after it), or -1 when the literal is malformed: a '.' that
 * is not followed by a digit, or a literal that is not followed by end of
 * input, whitespace or one of the operator/separator characters listed in
 * _DFAWrapup.
 *
 * Accepted forms: decimal integers, decimals with a fractional part,
 * "0b" binary, "0o" octal and "0x" hexadecimal integers.
 *
 * NOTE(review): ivalue is accumulated without any overflow check.
 * NOTE(review): "0b", "0o" and "0x" with no digits after the prefix are
 * accepted and yield 0 - confirm this is intended.
 */
static int numLexer(const char *source,int start,struct _AFToken *tok_p)
{
/* ch: most recently read character, usually already reduced by '0'. */
unsigned char ch;
/* count: index in source of the most recently read character. */
int count=0;
/* type: 0 while the literal is an integer, 1 once a fraction was read. */
int type=0;
int ivalue;
double fvalue,mul;
switch(start)
{
case 0:
ivalue=0;
goto _DFAState_2;
break;
case 1:
goto _DFAState_4;
break;
case 2:
ivalue=source[0]-'0';
/* Falls out of the switch straight into _DFAState_1. */
break;
default:
goto _DFAFailure;
}
/* State 1: decimal integer digits; a '.' falls through to state 2. */
_DFAState_1:
forwardRead((unsigned char)source,ch,count,(-'0'));
if(ch<10)
{
ivalue=ivalue*10+ch;
goto _DFAState_1;
}
else if(ch!=(unsigned char)('.'-'0'))
{
goto _DFAWrapup;
}
/* State 2: the first digit after '.' is mandatory. */
_DFAState_2:
forwardRead((unsigned char)source,ch,count,(-'0'));
if(ch<10)
{
type=1;
fvalue=(double)ivalue+(double)ch*0.1;
mul=0.1;
}
else
{
goto _DFAFailure;
}
/* State 3: remaining fractional digits, each scaled by a shrinking mul. */
_DFAState_3:
forwardRead((unsigned char)source,ch,count,(-'0'));
if(ch<10)
{
mul=mul*0.1;
fvalue+=mul*(double)ch;
goto _DFAState_3;
}
else
{
goto _DFAWrapup;
}
/* State 4: character right after a leading '0' selects the number format. */
_DFAState_4:
forwardRead(source,ch,count,0);
switch(ch)
{
case '.':
ivalue=0;
goto _DFAState_2;
break;
case '0':
case '1':
case '2':
case '3':
case '4':
case '5':
case '6':
case '7':
case '8':
case '9':
/* A leading zero followed by digits is still read as decimal. */
ivalue=ch-'0';
goto _DFAState_1;
break;
case 'b':
ivalue=0;
goto _DFAState_7;
break;
case 'o':
ivalue=0;
/* Falls out of the switch straight into _DFAState_5 (octal). */
break;
case 'x':
ivalue=0;
goto _DFAState_6;
break;
default:
/* A lone "0". */
ivalue=0;
goto _DFAWrapup;
}
/* State 5: octal digits after "0o". */
_DFAState_5:
forwardRead((unsigned char)source,ch,count,(-'0'));
if(ch<8)
{
ivalue=ivalue*8+ch;
goto _DFAState_5;
}
else
{
goto _DFAWrapup;
}
/* State 6: hex digits after "0x"; the table yields digit value + 1. */
_DFAState_6:
forwardRead(source,ch,count,0);
if((ch=isValidForHex(ch))>0)
{
ivalue=ivalue*16+ch-1;
goto _DFAState_6;
}
else
{
goto _DFAWrapup;
}
/* State 7: binary digits after "0b". */
_DFAState_7:
forwardRead((unsigned char)source,ch,count,(-'0'));
if(ch<2)
{
ivalue=ivalue*2+ch;
goto _DFAState_7;
}
else
{
goto _DFAWrapup;
}
/*
 * Wrap-up: the literal is only accepted when the character that stopped the
 * scan is end of input, whitespace or an operator/separator. Anything else
 * (a letter, a second '.', a quote, ...) leaves the switch without returning
 * and falls through to _DFAFailure.
 */
_DFAWrapup:
switch(source[count])
{
case 0:
case '\t':
case '\n':
case '\v':
case '\f':
case '\r':
case ' ':
case '!':
case '%':
case '&':
case '(':
case ')':
case '*':
case '+':
case ',':
case '-':
case '/':
case ':':
case ';':
case '<':
case '=':
case '>':
case '?':
case '[':
case ']':
case '^':
case '|':
if(type)
{
tok_p->type=T_FLOAT;
tok_p->value.f=fvalue;
}
else
{
tok_p->type=T_INT;
tok_p->value.i=ivalue;
}
return count;
break;
}
_DFAFailure:
return -1;
}
/*
 * Copy the identifier starting at source[0] into the text buffer.
 *
 * The first character is accepted unconditionally (the caller has already
 * classified it); following characters are taken while isValidForID() holds.
 * The copy is NUL-terminated in text and top is advanced past the terminator.
 *
 * source - points at the first character of the identifier.
 * hash_p - receives the hash built by feeding every copied character to
 *          str_update_hash.
 *
 * Returns the number of identifier characters (terminator not counted).
 */
static int idLexer(const char *source,unsigned int *hash_p)
{
    unsigned int acc=0;
    int len=0;
    char cur=source[0];

    for(;;)
    {
        str_update_hash(acc,cur);
        text[top]=cur;
        top++;
        len++;
        cur=source[len];
        if(!isValidForID(cur))
        {
            break;
        }
    }

    text[top]=0;
    top++;
    *hash_p=acc;
    return len;
}
/*
 * Copy the body of a quoted string into the text buffer, decoding escapes.
 *
 * source   - points at the first character AFTER the opening quote.
 * start_ch - the quote character that opened the string; the first unescaped
 *            occurrence of it ends the string.
 *
 * Returns the index in source of the closing quote, or -1 when the input ends
 * before the string is closed. On success the decoded text is NUL-terminated
 * in text and top is advanced past the terminator.
 *
 * Escapes:
 *   \\  \'  \"     the character itself
 *   \n  \r  \t     line feed, carriage return, tab
 *   \xHH           the byte with hexadecimal value HH (exactly two digits)
 *   anything else  kept verbatim as backslash + character
 *
 * An incomplete hex escape ("\x" not followed by two hex digits) is kept as
 * the two literal characters "\x"; the characters after it are then scanned
 * normally. In particular a closing quote or another backslash right after an
 * incomplete "\x" or "\xH" keeps its usual meaning instead of being copied
 * into the string.
 */
static int stringLexer(const char *source,char start_ch)
{
    int count=-1;
    char ch;
    unsigned char hi,lo;

    for(;;)
    {
        forwardRead(source,ch,count,0);
        if(!ch)
        {
            return -1;
        }
        if(ch!='\\')
        {
            if(ch==start_ch)
            {
                text[top++]=0;
                return count;
            }
            text[top++]=ch;
            continue;
        }

        /* Escape sequence: look at the character after the backslash. */
        forwardRead(source,ch,count,0);
        switch(ch)
        {
        case 0:
            return -1;
        case '\\':
        case '\'':
        case '\"':
            text[top++]=ch;
            break;
        case 'n':
            text[top++]='\n';
            break;
        case 'r':
            text[top++]='\r';
            break;
        case 't':
            text[top++]='\t';
            break;
        case 'x':
            /*
             * Peek at the next two characters without consuming them. The
             * second one is only inspected when the first is a hex digit,
             * which also guarantees it is not past the terminating NUL.
             * Table values are digit value + 1; 0 means "not a hex digit".
             */
            hi=isValidForHex(source[count+1]);
            lo=hi?isValidForHex(source[count+2]):0;
            if(hi&&lo)
            {
                text[top++]=(char)((hi-1)*16+(lo-1));
                count+=2;
            }
            else
            {
                text[top++]='\\';
                text[top++]='x';
            }
            break;
        default:
            text[top++]='\\';
            text[top++]=ch;
            break;
        }
    }
}
/*
 * Skip a run of whitespace.
 *
 * source - points at a character the caller already knows is whitespace;
 *          scanning therefore starts at source[1].
 * x      - receives the first non-whitespace character found.
 *
 * Returns the offset of that character from source.
 */
static int skipOverSpaces(const char *source,char *x)
{
    const char *cursor=source+1;

    while(isSpace(*cursor))
    {
        cursor++;
    }
    *x=*cursor;
    return (int)(cursor-source);
}
/*
 * Find the end of a block comment.
 *
 * source - text to search for the closing delimiter; the caller passes a
 *          pointer to the '*' of the opening delimiter.
 *
 * Returns the offset of the first character after the first occurrence of the
 * closing delimiter (star, slash), or -1 when the input ends without one.
 *
 * The search is a plain substring search, so a closing delimiter preceded by
 * any number of additional '*' characters is found correctly.
 */
static int skipOverComments(const char *source)
{
    const char *close=strstr(source,"*/");

    if(close==NULL)
    {
        return -1;
    }
    return (int)(close-source)+2;
}
/* Keyword lookup table; created and filled by AFTokenizerIni. */
static struct hashlist *AFKeywords=NULL;
/*
 * Addressable copies of the keyword enumerators. AFTokenizerIni stores
 * pointers into this array (AFKeywordType+K_xxx) as the hash values, and the
 * tokenizer reads the keyword kind back by dereferencing the stored pointer.
 * NOTE(review): this assumes the K_* enumerators are numbered 0..11 in exactly
 * this order, so that AFKeywordType[K_xxx]==K_xxx - confirm against the enum
 * declaration.
 */
static enum _AFKeywordType AFKeywordType[]=
{
K_like,
K_in,
K_contains,
K_rlike,
K_irlike,
K_if,
K_then,
K_else,
K_end,
K_true,
K_false,
K_null
};
/*
 * State shared by AFGetNextToken and its helper macros; all three are set by
 * AFTokenizerReset.
 */
/* Source text being tokenized. */
static const char *AFSource=NULL;
/* Index in AFSource of the current character. */
static int AFCount=0;
/* Cached copy of the current character; 0 means end of input. */
static char AFCur=0;
/*
 * Lex the identifier at the current position and classify it.
 *
 * Must be expanded inside a function that has a "struct _AFToken *tok_p" in
 * scope. The identifier text is copied into the text buffer by idLexer and
 * then looked up in AFKeywords: a hit produces T_KEYWORD with value.key, a
 * miss produces T_ID with value.s (begin, offset = length, hash).
 *
 * AFCount is left on the LAST character of the identifier; the caller is
 * expected to step past it.
 */
#define getIdWithKeywords() \
{\
enum _AFKeywordType *key;\
unsigned int hash;\
int basecount=top;\
int len=idLexer(AFSource+AFCount,&hash);\
AFCount+=len-1;\
if(str_hashquery_withhash(hash,AFKeywords,text+basecount,(void **)&key))\
{\
tok_p->type=T_KEYWORD;\
tok_p->value.key=*key;\
}\
else\
{\
tok_p->type=T_ID;\
tok_p->value.s.begin=text+basecount;\
tok_p->value.s.offset=len;\
tok_p->value.s.hash=hash;\
}\
}
/*
 * Lex the identifier at the current position without a keyword lookup; the
 * result is always T_ID with value.s (begin, offset = length, hash).
 *
 * Must be expanded inside a function that has a "struct _AFToken *tok_p" in
 * scope. AFCount is left on the LAST character of the identifier; the caller
 * is expected to step past it.
 */
#define getIdWithoutKeywords() \
{\
unsigned int hash;\
int basecount=top;\
int len=idLexer(AFSource+AFCount,&hash);\
AFCount+=len-1;\
tok_p->type=T_ID;\
tok_p->value.s.begin=text+basecount;\
tok_p->value.s.offset=len;\
tok_p->value.s.hash=hash;\
}
/*
 * Fill *tok_p as an operator token carrying OP. Requires tok_p in scope; does
 * not touch the read position.
 */
#define setTokenWithOP(OP) \
{\
tok_p->type=T_OP;\
tok_p->value.op=OP;\
}
/*
 * Lex a quoted string whose opening quote is the current character (AFCur).
 *
 * Must be expanded inside a function that has a "struct _AFToken *tok_p" in
 * scope and returns int. On success the token is T_STRING with value.s.begin
 * pointing at the decoded text and value.s.offset holding its length (the
 * terminating NUL is not counted); AFCount is left on the closing quote. On
 * failure the error is reported and the ENCLOSING FUNCTION returns 1.
 */
#define setTokenWithString() \
{\
AFCount++;\
{\
int basetop=top;\
int len=stringLexer(AFSource+AFCount,AFCur);\
if(len>=0)\
{\
AFCount+=len;\
tok_p->type=T_STRING;\
tok_p->value.s.begin=text+basetop;\
tok_p->value.s.offset=top-basetop-1;\
}\
else\
{\
throwError(AFCount,AFCur,"unclosed string");\
return 1;\
}\
}\
}
/*
 * Lex a numeric literal at the current position; start is the numLexer entry
 * state matching the first character.
 *
 * Must be expanded inside a function that has a "struct _AFToken *tok_p" in
 * scope and returns int. On success numLexer has filled *tok_p and AFCount is
 * left on the LAST character of the literal. On failure the error is reported
 * and the ENCLOSING FUNCTION returns 1.
 */
#define setTokenWithNum(start) \
{\
int len=numLexer(AFSource+AFCount,start,tok_p);\
if(len>0)\
{\
AFCount+=len-1;\
}\
else\
{\
throwError(AFCount,AFCur,"invalid character in number");\
return 1;\
}\
}
/*
 * Read the next token from the source installed by AFTokenizerReset.
 *
 * tok_p - receives the token. At end of input its type is set to T_NONE.
 *
 * Returns 0 on success (including end of input) and 1 on a lexical error,
 * which is reported through throwError before returning.
 *
 * Position convention used by the cases below:
 *   - a case that ends with "break" leaves AFCount on the LAST character of
 *     the token; the forwardRead after the switch then steps past it;
 *   - a case that had to look one character beyond the token to decide what
 *     it is has already advanced, so it does "return 0" and skips that
 *     trailing forwardRead.
 * Whitespace and comments produce no token; they jump back to the start.
 */
int AFGetNextToken(struct _AFToken *tok_p)
{
_Tokenizer_Start:
switch(AFCur)
{
/* EOF here */
case 0:
tok_p->type=T_NONE;
return 0;
break;
/* spaces here */
case '\t':
case '\n':
case '\v':
case '\f':
case '\r':
case ' ':
AFCount+=skipOverSpaces(AFSource+AFCount,&AFCur);
goto _Tokenizer_Start;
break;
/* separators & operators here */
/* '!' alone, "!=" or "!==" */
case '!':
forwardRead(AFSource,AFCur,AFCount,0);
if(AFCur=='=')
{
forwardRead(AFSource,AFCur,AFCount,0);
if(AFCur=='=')
{
setTokenWithOP(O_SINEQ);
}
else
{
setTokenWithOP(O_INEQ);
return 0;
}
}
else
{
setTokenWithOP(O_NOT);
return 0;
}
break;
case '%':
setTokenWithOP(O_REM);
break;
case '&':
setTokenWithOP(O_AND);
break;
case '(':
tok_p->type=T_BRA;
break;
case ')':
tok_p->type=T_KET;
break;
/* '*' alone or "**" */
case '*':
forwardRead(AFSource,AFCur,AFCount,0);
if(AFCur=='*')
{
setTokenWithOP(O_EXP);
}
else
{
setTokenWithOP(O_MUL);
return 0;
}
break;
case '+':
setTokenWithOP(O_PLUS);
break;
case ',':
tok_p->type=T_COMMA;
break;
case '-':
setTokenWithOP(O_MINUS);
break;
/* '/' alone is division; a following '*' opens a block comment. */
case '/':
forwardRead(AFSource,AFCur,AFCount,0);
if(AFCur=='*')
{
int len;
/* The search for the closing delimiter starts at the opening '*'. */
len=skipOverComments(AFSource+AFCount);
if(len>=0)
{
AFCount+=len;
AFCur=AFSource[AFCount];
goto _Tokenizer_Start;
}
else
{
throwError(AFCount,AFCur,"unclosed comments");
return 1;
}
}
else
{
setTokenWithOP(O_DIV);
}
return 0;
break;
/* ':' alone or ":=" */
case ':':
forwardRead(AFSource,AFCur,AFCount,0);
if(AFCur=='=')
{
setTokenWithOP(O_SET);
}
else
{
setTokenWithOP(O_TER_S);
return 0;
}
break;
case ';':
tok_p->type=T_STATEMENT_SEPARATOR;
break;
/* '<' alone or "<=" */
case '<':
forwardRead(AFSource,AFCur,AFCount,0);
if(AFCur=='=')
{
setTokenWithOP(O_LE);
}
else
{
setTokenWithOP(O_L);
return 0;
}
break;
/* '=' and "==" both yield O_EQ; "===" yields O_SEQ. */
case '=':
forwardRead(AFSource,AFCur,AFCount,0);
if(AFCur=='=')
{
forwardRead(AFSource,AFCur,AFCount,0);
if(AFCur=='=')
{
setTokenWithOP(O_SEQ);
}
else
{
setTokenWithOP(O_EQ);
return 0;
}
}
else
{
setTokenWithOP(O_EQ);
return 0;
}
break;
/* '>' alone or ">=" */
case '>':
forwardRead(AFSource,AFCur,AFCount,0);
if(AFCur=='=')
{
setTokenWithOP(O_GE);
}
else
{
setTokenWithOP(O_G);
return 0;
}
break;
case '?':
setTokenWithOP(O_TER_Q);
break;
case '[':
tok_p->type=T_SQUARE_BRA;
break;
case ']':
tok_p->type=T_SQUARE_KET;
break;
case '^':
setTokenWithOP(O_XOR);
break;
case '|':
setTokenWithOP(O_OR);
break;
/* strings here */
case '\"':
case '\'':
setTokenWithString();
break;
/* numbers here */
/* The argument selects the numLexer entry state for this first character. */
case '.':
setTokenWithNum(0);
break;
case '0':
setTokenWithNum(1);
break;
case '1':
case '2':
case '3':
case '4':
case '5':
case '6':
case '7':
case '8':
case '9':
setTokenWithNum(2);
break;
/* ids & tokens here */
/*
 * None of the keywords registered in AFTokenizerIni starts with an upper-case
 * letter, '_', 'a' or 'b', so identifiers with these first characters skip
 * the keyword lookup.
 */
case 'A':
case 'B':
case 'C':
case 'D':
case 'E':
case 'F':
case 'G':
case 'H':
case 'I':
case 'J':
case 'K':
case 'L':
case 'M':
case 'N':
case 'O':
case 'P':
case 'Q':
case 'R':
case 'S':
case 'T':
case 'U':
case 'V':
case 'W':
case 'X':
case 'Y':
case 'Z':
case '_':
case 'a':
case 'b':
getIdWithoutKeywords();
break;
/* Any other lower-case first letter may begin a keyword. */
case 'c':
case 'd':
case 'e':
case 'f':
case 'g':
case 'h':
case 'i':
case 'j':
case 'k':
case 'l':
case 'm':
case 'n':
case 'o':
case 'p':
case 'q':
case 'r':
case 's':
case 't':
case 'u':
case 'v':
case 'w':
case 'x':
case 'y':
case 'z':
getIdWithKeywords();
break;
default:
throwError(AFCount,AFCur,"invalid character");
return 1;
break;
}
/* Step past the last character of the token just produced. */
forwardRead(AFSource,AFCur,AFCount,0);
return 0;
}
/*
 * One-time set-up of the tokenizer: create the keyword table and register
 * every keyword spelling. Several spellings may map to the same keyword kind
 * ("matches" to K_like, "regex" to K_rlike).
 */
void AFTokenizerIni()
{
/* Register one spelling; the stored value points into AFKeywordType. */
#define AF_REGISTER_KEYWORD(spelling,kind) \
    str_hashadd(AFKeywords,spelling,AFKeywordType+kind)

    AFKeywords=hashini();

    AF_REGISTER_KEYWORD("like",K_like);
    AF_REGISTER_KEYWORD("matches",K_like);
    AF_REGISTER_KEYWORD("in",K_in);
    AF_REGISTER_KEYWORD("contains",K_contains);
    AF_REGISTER_KEYWORD("rlike",K_rlike);
    AF_REGISTER_KEYWORD("regex",K_rlike);
    AF_REGISTER_KEYWORD("irlike",K_irlike);
    AF_REGISTER_KEYWORD("if",K_if);
    AF_REGISTER_KEYWORD("then",K_then);
    AF_REGISTER_KEYWORD("else",K_else);
    AF_REGISTER_KEYWORD("end",K_end);
    AF_REGISTER_KEYWORD("true",K_true);
    AF_REGISTER_KEYWORD("false",K_false);
    AF_REGISTER_KEYWORD("null",K_null);

#undef AF_REGISTER_KEYWORD
}
/*
 * Point the tokenizer at a new source text and rewind all state.
 *
 * source     - NUL-terminated text to tokenize; it is not copied, so it must
 *              stay valid while tokens are being read.
 * source_len - length of source in characters. Passing the length with or
 *              without the terminating NUL is safe.
 *
 * The text buffer is kept between calls and only reallocated when the new
 * source needs more room than any previous one.
 */
void AFTokenizerReset(const char *source,unsigned int source_len)
{
    static unsigned int text_limit=0;
    /*
     * Decoded text is never longer than the source, but every identifier and
     * string is stored with its own NUL, and the terminator of a token that
     * ends the input has no separator character in the source to account for
     * it. One byte beyond source_len is therefore required when source_len
     * does not include the source's own terminator.
     */
    unsigned int needed=source_len+1;

    if(text_limit<needed)
    {
        s_free(text);
        text=(char *)s_malloc(sizeof(char)*needed);
        text_limit=needed;
    }
    top=0;

    /* Reset the pointer to the source code. */
    AFSource=source;
    AFCount=0;
    AFCur=*source;
}