The code is below.
The rest of the lexer isn't finished yet, so please only look at the numeric-literal parsing part. I'm mainly interested in feedback on readability and performance.
package compile.craft;
import lombok.extern.slf4j.Slf4j;
import static compile.craft.CharUtils.*;
@Slf4j
public class Lexer {
boolean fetchedEOF = false;
private final String source;
private int pos = 0;
private char ch;
private int line = 0;
private int col = 0;
public Lexer(String source) {
this.source = source;
}
/**
 * TODO: 1. maintain the line / col counters
 */
public Token nextToken() {
if (fetchedEOF) {
return null;
}
while (true) {
do {
advance();
} while (isBlank(ch));
if (ch == EOF) {
fetchedEOF = true;
return null;
}
// TODO: handle // line comments
if (isIdentifierStart(ch)) {
return scanIdentifier();
}
if (ch == '.' || isDigit(ch)) {
return scanNumber();
}
break;
}
return null;
}
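/*
 * Note on scanNumber below: a leading '0' branches on the next char (x/X hex or hex-float,
 * b/B binary, '_'/0-7 octal or a 0-prefixed float, 8/9 float, otherwise a plain "0" with an
 * optional '.', exponent or suffix); a leading '.' or 1-9 takes the decimal/float path.
 * Suffixes f/F/d/D select a float kind, l/L an integer kind.
 */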
private Token scanNumber() {
int start = pos - 1;
TokenKind tokenKind = null;
if (ch == '0') {
advance();
// HEX_LITERAL: '0' [xX] [0-9a-fA-F] ([0-9a-fA-F_]* [0-9a-fA-F])? [lL]?;
// HEX_FLOAT_LITERAL: '0' [xX] (HexDigits '.'? | HexDigits? '.' HexDigits) [pP] [+-]? Digits [fFdD]?;
// HexDigits: HexDigit ((HexDigit | '_')* HexDigit)?;
// Digits: [0-9] ([0-9_]* [0-9])?;
if (ch == 'x' || ch == 'X') {
if (peek() == '.') {
advance();
scanHexFraction(false);
String lexeme = source.substring(start, pos);
return new Token(TokenKind.HEX_FLOAT_LITERAL, lexeme);
} else {
scanHex();
if (ch == '.') {
scanHexFraction(true);
tokenKind = TokenKind.HEX_FLOAT_LITERAL;
} else if (ch == 'p' || ch == 'P') {
scanExp();
tokenKind = TokenKind.HEX_FLOAT_LITERAL;
} else if (ch == 'f' || ch == 'F' || ch == 'd' || ch == 'D') {
tokenKind = TokenKind.HEX_FLOAT_LITERAL;
} else if (ch == 'l' || ch == 'L') {
tokenKind = TokenKind.HEX_LITERAL;
} else {
retreat();
tokenKind = TokenKind.HEX_LITERAL;
}
String lexeme = source.substring(start, pos);
return new Token(tokenKind, lexeme);
}
} else if (ch == 'b' || ch == 'B') {
// BINARY_LITERAL: '0' [bB] [01] ([01_]* [01])? [lL]?;
scanBit();
if ((ch != 'l') && (ch != 'L')) {
retreat();
}
String lexeme = source.substring(start, pos);
return new Token(TokenKind.BINARY_LITERAL, lexeme);
} else if (ch == '_' || isOct(ch)) {
// OCT_LITERAL: '0' '_'* [0-7] ([0-7_]* [0-7])? [lL]?;
// FLOAT_LITERAL: (Digits '.' Digits? | '.' Digits) ExponentPart? [fFdD]?;
// FLOAT_LITERAL: Digits (ExponentPart [fFdD]? | [fFdD]);
scanOct();
if (isDigit(ch)) {
scanDigit();
if (ch == '.') {
scanFraction();
} else if (ch == 'e' || ch == 'E') {
scanExp();
} else if ((ch != 'f') && (ch != 'F') && (ch != 'd') && (ch != 'D')) {
error("invalid oct literal");
}
tokenKind = TokenKind.FLOAT_LITERAL;
} else if (ch == '.') {
scanFraction();
tokenKind = TokenKind.FLOAT_LITERAL;
} else if (ch == 'e' || ch == 'E') {
scanExp();
tokenKind = TokenKind.FLOAT_LITERAL;
} else if (ch == 'd' || ch == 'D' || ch == 'f' || ch == 'F') {
tokenKind = TokenKind.FLOAT_LITERAL;
} else if (ch == 'l' || ch == 'L') {
tokenKind = TokenKind.OCT_LITERAL;
} else {
retreat();
tokenKind = TokenKind.OCT_LITERAL;
}
String lexeme = source.substring(start, pos);
return new Token(tokenKind, lexeme);
} else {
// FLOAT_LITERAL: (Digits '.' Digits? | '.' Digits) ExponentPart? [fFdD]?;
// FLOAT_LITERAL: Digits (ExponentPart [fFdD]? | [fFdD]);
// DECIMAL_LITERAL: ('0' | [1-9] (Digits? | '_'+ Digits)) [lL]?;
// ExponentPart: [eE] [+-]? Digits;
if (isDigit(ch)) {
do {
advance();
} while (isDigit(ch));
if (ch == '.') {
scanFraction();
tokenKind = TokenKind.FLOAT_LITERAL;
} else if (ch == 'e' || ch == 'E') {
scanExp();
tokenKind = TokenKind.FLOAT_LITERAL;
} else if (ch == 'f' || ch == 'F' || ch == 'd' || ch == 'D') {
tokenKind = TokenKind.FLOAT_LITERAL;
} else {
error("invalid float literal");
}
} else if (ch == '.') {
scanFraction();
tokenKind = TokenKind.FLOAT_LITERAL;
} else if (ch == 'e' || ch == 'E') {
scanExp();
tokenKind = TokenKind.FLOAT_LITERAL;
} else if (ch == 'l' || ch == 'L') {
tokenKind = TokenKind.DECIMAL_LITERAL;
} else if (ch == 'f' || ch == 'F' || ch == 'd' || ch == 'D') {
tokenKind = TokenKind.FLOAT_LITERAL;
} else {
retreat();
tokenKind = TokenKind.DECIMAL_LITERAL;
}
String lexeme = source.substring(start, pos);
return new Token(tokenKind, lexeme);
}
} else {
// FLOAT_LITERAL: (Digits '.' Digits? | '.' Digits) ExponentPart? [fFdD]?;
// FLOAT_LITERAL: Digits (ExponentPart [fFdD]? | [fFdD]);
// DECIMAL_LITERAL: ('0' | [1-9] (Digits? | '_'+ Digits)) [lL]?;
// ExponentPart: [eE] [+-]? Digits;
if (ch == '.') {
scanDigit();
if (ch == 'e' || ch == 'E') {
scanExp();
} else if (ch != 'f' && ch != 'F' && ch != 'd' && ch != 'D') {
// push the lookahead char back unless it is a float suffix
retreat();
}
tokenKind = TokenKind.FLOAT_LITERAL;
} else {
scanDigit(true);
if (ch == '.') {
scanFraction();
tokenKind = TokenKind.FLOAT_LITERAL;
} else if (ch == 'e' || ch == 'E') {
scanExp();
tokenKind = TokenKind.FLOAT_LITERAL;
} else if (ch == 'f' || ch == 'F' || ch == 'd' || ch == 'D') {
tokenKind = TokenKind.FLOAT_LITERAL;
} else if (ch == 'l' || ch == 'L') {
tokenKind = TokenKind.DECIMAL_LITERAL;
} else {
retreat();
tokenKind = TokenKind.DECIMAL_LITERAL;
}
}
String lexeme = source.substring(start, pos);
return new Token(tokenKind, lexeme);
}
}
private Token scanIdentifier() {
int start = pos - 1;
do {
advance();
} while (isIdentifierChar(ch));
// push back the char that ended the identifier before slicing the lexeme
retreat();
String lexeme = source.substring(start, pos);
if (Token.isKeyword(lexeme)) {
TokenKind kind = Token.kind(lexeme);
return new Token(kind, kind.literal);
} else {
return new Token(TokenKind.IDENTIFIER, lexeme);
}
}
private void scanHex() {
advance();
if (!isHex(ch)) {
error("invalid hexadecimal literal");
}
do {
if (ch == '_') {
do {
advance();
} while(ch == '_');
if (!isHex(ch)) {
error("invalid hexadecimal literal");
}
}
do {
advance();
} while (isHex(ch));
} while (ch == '_');
}
private void scanBit() {
advance();
if (!isBit(ch)) {
error("invalid binary literal");
}
do {
if (ch == '_') {
do {
advance();
} while(ch == '_');
if (!isBit(ch)) {
error("invalid binary literal");
}
}
do {
advance();
} while (isBit(ch));
} while (ch == '_');
}
private void scanOct() {
do {
if (ch == '_') {
do {
advance();
} while(ch == '_');
if (!isDigit(ch)) {
error("invalid octal literal");
}
if (!isOct(ch)) {
return;
}
}
do {
advance();
} while (isOct(ch));
} while (ch == '_');
}
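// Note: scanDigit(hasDigit) below treats hasDigit == true as "the caller already consumed at
// least one digit", so hitting a non-digit immediately is fine and it just returns; with
// hasDigit == false the next char must be a digit or the literal is rejected.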
private void scanDigit(boolean hasDigit) {
advance();
if (hasDigit) {
if ((ch != '_') && !isDigit(ch)) {
return;
}
} else if (!isDigit(ch)) {
error("invalid decimal literal");
}
do {
if (ch == '_') {
do {
advance();
} while(ch == '_');
if (!isDigit(ch)) {
error("invalid decimal literal");
}
}
do {
advance();
} while (isDigit(ch));
} while (ch == '_');
}
private void scanDigit() {
scanDigit(false);
}
// FLOAT_LITERAL: (Digits '.' Digits? | '.' Digits) ExponentPart? [fFdD]?;
private void scanFraction() {
if (isDigit(peek())) {
scanDigit();
} else {
advance();
}
if (ch == 'e' || ch == 'E') {
char c = peek();
if (c == '+' || c == '-') {
advance();
}
scanDigit();
}
if ((ch != 'f') && (ch != 'F') && (ch != 'd') && (ch != 'D')) {
retreat();
}
}
// HEX_FLOAT_LITERAL: '0' [xX] (HexDigits '.'? | HexDigits? '.' HexDigits) [pP] [+-]? Digits [fFdD]?;
private void scanHexFraction(boolean hasDigit) {
if (hasDigit) {
if (isHex(peek())) {
scanHex();
} else {
advance();
}
} else {
scanHex();
}
if ((ch != 'p') && (ch != 'P')) {
error("invalid hexadecimal literal");
}
char c = peek();
if (c == '+' || c == '-') {
advance();
}
scanDigit();
if ((ch != 'f') && (ch != 'F') && (ch != 'd') && (ch != 'D')) {
retreat();
}
}
private void scanExp() {
char c = peek();
if (c == '+' || c == '-') {
advance();
}
scanDigit();
if ((ch != 'f') && (ch != 'F') && (ch != 'd') && (ch != 'D')) {
retreat();
}
}
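// Cursor invariant for the helpers below: advance() leaves ch == source.charAt(pos - 1),
// peek() reads source.charAt(pos) without consuming it, and retreat() steps pos back one
// so the next advance() re-reads that char (it is a no-op once ch is EOF).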
private void advance() {
if (pos >= source.length()) {
ch = CharUtils.EOF;
return;
}
ch = source.charAt(pos++);
}
private void retreat() {
if (ch != EOF) {
if (--pos < 0) {
error("tokenizer exceed beginning of source");
}
}
}
private char peek() {
if (pos >= source.length()) {
return EOF;
}
return source.charAt(pos);
}
private void error(String msg) {
log.error("lexer error: {}", msg);
throw new RuntimeException(msg);
}
}
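For reference, a minimal driver for the part above (my own sketch; it assumes `Token` exposes `kind` and `lexeme` the way the Spock test further down reads them, that `CharUtils.isBlank` treats spaces as whitespace, and that the class sits in the same `compile.craft` package):

```java
package compile.craft;

// Sketch of how nextToken() is meant to be called: it skips blanks and returns null at EOF.
public class LexerDemo {
    public static void main(String[] args) {
        Lexer lexer = new Lexer("0x1.8p3 1_000_000 3.14f");
        Token token;
        while ((token = lexer.nextToken()) != null) {
            System.out.println(token.kind + " -> " + token.lexeme);
        }
    }
}
```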
1
angryPHP 2023-03-30 19:48:57 +08:00
Are there any test cases?
2
lucaslee 2023-03-30 21:56:35 +08:00
First impression: a whole screen of e/d/f/l character checks, it made my head spin XD
3
kwh 2023-03-30 22:03:06 +08:00
ChatGPT: let me give it a try
4
neptuno 2023-03-30 22:08:42 +08:00 via iPhone
That's really long
5
NeoZephyr OP @angryPHP
Yes, here you go:
```java
def "scan number without exception"() {
    when:
    Lexer lexer = new Lexer(source)
    Token token = lexer.nextToken()

    then:
    kind == token.kind.name()
    lexeme == token.lexeme

    where:
    source | kind | lexeme
    // BINARY_LITERAL: '0' [bB] [01] ([01_]* [01])? [lL]?;
    "0b0" | "BINARY_LITERAL" | "0b0"
    "0b1" | "BINARY_LITERAL" | "0b1"
    "0b000" | "BINARY_LITERAL" | "0b000"
    "0b010" | "BINARY_LITERAL" | "0b010"
    "0B0L" | "BINARY_LITERAL" | "0B0L"
    "0b10___10l" | "BINARY_LITERAL" | "0b10___10l"
    "0b000___0L" | "BINARY_LITERAL" | "0b000___0L"
    "0b001___0L" | "BINARY_LITERAL" | "0b001___0L"
    // OCT_LITERAL: '0' '_'* [0-7] ([0-7_]* [0-7])? [lL]?;
    "0000" | "OCT_LITERAL" | "0000"
    "0__007_2" | "OCT_LITERAL" | "0__007_2"
    "0__7_0L" | "OCT_LITERAL" | "0__7_0L"
    "07_0L" | "OCT_LITERAL" | "07_0L"
    "0000l" | "OCT_LITERAL" | "0000l"
    "0007l" | "OCT_LITERAL" | "0007l"
    "0_00_77__0L" | "OCT_LITERAL" | "0_00_77__0L"
    // HEX_LITERAL: '0' [xX] [0-9a-fA-F] ([0-9a-fA-F_]* [0-9a-fA-F])? [lL]?;
    "0x0" | "HEX_LITERAL" | "0x0"
    "0x000l" | "HEX_LITERAL" | "0x000l"
    "0x0eeeL" | "HEX_LITERAL" | "0x0eeeL"
    "0x000eee" | "HEX_LITERAL" | "0x000eee"
    "0x1_E2E3e5" | "HEX_LITERAL" | "0x1_E2E3e5"
    "0xe___ee_eL" | "HEX_LITERAL" | "0xe___ee_eL"
    // DECIMAL_LITERAL: ('0' | [1-9] (Digits? | '_'+ Digits)) [lL]?;
    "0L" | "DECIMAL_LITERAL" | "0L"
    "0" | "DECIMAL_LITERAL" | "0"
    "9l" | "DECIMAL_LITERAL" | "9l"
    "99___9L" | "DECIMAL_LITERAL" | "99___9L"
    "9___99___9L" | "DECIMAL_LITERAL" | "9___99___9L"
    "9999" | "DECIMAL_LITERAL" | "9999"
    "1_000_000" | "DECIMAL_LITERAL" | "1_000_000"
    // HEX_FLOAT_LITERAL: '0' [xX] (HexDigits '.'? | HexDigits? '.' HexDigits) [pP] [+-]? Digits [fFdD]?;
    "0x.fp012" | "HEX_FLOAT_LITERAL" | "0x.fp012"
    "0x.f__f_fp-0012d" | "HEX_FLOAT_LITERAL" | "0x.f__f_fp-0012d"
    "0xf_ffP12d" | "HEX_FLOAT_LITERAL" | "0xf_ffP12d"
    "0xf_f.P-12d" | "HEX_FLOAT_LITERAL" | "0xf_f.P-12d"
    "0xf_f.f_fP+12" | "HEX_FLOAT_LITERAL" | "0xf_f.f_fP+12"
    "0x0.0000000fp-11" | "HEX_FLOAT_LITERAL" | "0x0.0000000fp-11"
    "0xf_ff.P12d" | "HEX_FLOAT_LITERAL" | "0xf_ff.P12d"
    "0X0P0f" | "HEX_FLOAT_LITERAL" | "0X0P0f"
    "0X0P0" | "HEX_FLOAT_LITERAL" | "0X0P0"
    "0X0_0__123P0f" | "HEX_FLOAT_LITERAL" | "0X0_0__123P0f"
    "0XeP0f" | "HEX_FLOAT_LITERAL" | "0XeP0f"
    "0X000.P0f" | "HEX_FLOAT_LITERAL" | "0X000.P0f"
    "0X00e.P0f" | "HEX_FLOAT_LITERAL" | "0X00e.P0f"
    "0X0e__0.0P0f" | "HEX_FLOAT_LITERAL" | "0X0e__0.0P0f"
    "0X0e__0.0__0P0f" | "HEX_FLOAT_LITERAL" | "0X0e__0.0__0P0f"
    "0X0e__0.0__e0P-0__0f" | "HEX_FLOAT_LITERAL" | "0X0e__0.0__e0P-0__0f"
    "0X0e__0.0__e0P+0_1__0f" | "HEX_FLOAT_LITERAL" | "0X0e__0.0__e0P+0_1__0f"
    "0X0.0__e0P+0_1__0f" | "HEX_FLOAT_LITERAL" | "0X0.0__e0P+0_1__0f"
    "0X0.00P0f" | "HEX_FLOAT_LITERAL" | "0X0.00P0f"
    "0X0.0eP0f" | "HEX_FLOAT_LITERAL" | "0X0.0eP0f"
    "0X0.e__00P0f" | "HEX_FLOAT_LITERAL" | "0X0.e__00P0f"
    "0X0.e__00__0P0f" | "HEX_FLOAT_LITERAL" | "0X0.e__00__0P0f"
    "0X0.e__00__e0P-0__0f" | "HEX_FLOAT_LITERAL" | "0X0.e__00__e0P-0__0f"
    "0X0e.0__00__e0P+0_1__0f" | "HEX_FLOAT_LITERAL" | "0X0e.0__00__e0P+0_1__0f"
    "0X.0__00__e0P-0_1__0F" | "HEX_FLOAT_LITERAL" | "0X.0__00__e0P-0_1__0F"
    "0X.0__00__e0P-0_1__0" | "HEX_FLOAT_LITERAL" | "0X.0__00__e0P-0_1__0"
    // FLOAT_LITERAL: (Digits '.' Digits? | '.' Digits) ExponentPart? [fFdD]?;
    // FLOAT_LITERAL: Digits (ExponentPart [fFdD]? | [fFdD]);
    // ExponentPart: [eE] [+-]? Digits
    "0f" | "FLOAT_LITERAL" | "0f"
    "00f" | "FLOAT_LITERAL" | "00f"
    "0__0_0f" | "FLOAT_LITERAL" | "0__0_0f"
    "0001f" | "FLOAT_LITERAL" | "0001f"
    "0e0f" | "FLOAT_LITERAL" | "0e0f"
    "0e0" | "FLOAT_LITERAL" | "0e0"
    "1e1" | "FLOAT_LITERAL" | "1e1"
    "0_0e-0_0f" | "FLOAT_LITERAL" | "0_0e-0_0f"
    "0_0e0_120f" | "FLOAT_LITERAL" | "0_0e0_120f"
    "0001e0009" | "FLOAT_LITERAL" | "0001e0009"
    "000012345e1" | "FLOAT_LITERAL" | "000012345e1"
    "0_00___9900d" | "FLOAT_LITERAL" | "0_00___9900d"
    ".0" | "FLOAT_LITERAL" | ".0"
    ".0e0" | "FLOAT_LITERAL" | ".0e0"
    ".0_000" | "FLOAT_LITERAL" | ".0_000"
    ".0___0990" | "FLOAT_LITERAL" | ".0___0990"
    ".000e0__0__0" | "FLOAT_LITERAL" | ".000e0__0__0"
    ".000e0__0__0f" | "FLOAT_LITERAL" | ".000e0__0__0f"
    ".000e-0__9__0f" | "FLOAT_LITERAL" | ".000e-0__9__0f"
    ".9e-0__1_0f" | "FLOAT_LITERAL" | ".9e-0__1_0f"
    "0__00." | "FLOAT_LITERAL" | "0__00."
    "0__090." | "FLOAT_LITERAL" | "0__090."
    "99__9." | "FLOAT_LITERAL" | "99__9."
    "000.000" | "FLOAT_LITERAL" | "000.000"
    "0__10.090" | "FLOAT_LITERAL" | "0__10.090"
    "000__1_0.090" | "FLOAT_LITERAL" | "000__1_0.090"
    "000__1_0.090e12" | "FLOAT_LITERAL" | "000__1_0.090e12"
    "0__10.090e1__00" | "FLOAT_LITERAL" | "0__10.090e1__00"
    "3.1415926" | "FLOAT_LITERAL" | "3.1415926"
    "0.030" | "FLOAT_LITERAL" | "0.030"
    "0.6" | "FLOAT_LITERAL" | "0.6"
    ".0f" | "FLOAT_LITERAL" | ".0f"
    ".0_000f" | "FLOAT_LITERAL" | ".0_000f"
    ".0___0990f" | "FLOAT_LITERAL" | ".0___0990f"
    ".000e0__0__0f" | "FLOAT_LITERAL" | ".000e0__0__0f"
    "0__00.f" | "FLOAT_LITERAL" | "0__00.f"
    "0__090.f" | "FLOAT_LITERAL" | "0__090.f"
    "99__9.f" | "FLOAT_LITERAL" | "99__9.f"
    "000.000f" | "FLOAT_LITERAL" | "000.000f"
    "0__10.090f" | "FLOAT_LITERAL" | "0__10.090f"
    "000__1_0.090f" | "FLOAT_LITERAL" | "000__1_0.090f"
    "000__1_0.090e12f" | "FLOAT_LITERAL" | "000__1_0.090e12f"
    "0__10.090e1__00f" | "FLOAT_LITERAL" | "0__10.090e1__00f"
    "3.1415926f" | "FLOAT_LITERAL" | "3.1415926f"
    "0.030f" | "FLOAT_LITERAL" | "0.030f"
    "0.6f" | "FLOAT_LITERAL" | "0.6f"
}
```
8
lucaslee 2023-03-30 22:50:48 +08:00
Only just noticed the comments. So all that code is basically hand-translating a regex?
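On the point in #8 above: the ANTLR-style rules are already written out in the comments, so one alternative (purely my own sketch, none of these names come from the original code) is to compile those rules as `java.util.regex` patterns and take the longest match at the current offset. It reads much closer to the grammar, at the cost of going through the regex engine on every number:

```java
import java.util.regex.Matcher;
import java.util.regex.Pattern;

// Illustrative sketch; only a few of the literal rules are shown (binary, octal and the
// second FLOAT_LITERAL form would be added the same way).
final class RegexNumberScanner {
    private static final String HEX_DIGITS = "[0-9a-fA-F](?:[0-9a-fA-F_]*[0-9a-fA-F])?";
    private static final String DIGITS = "[0-9](?:[0-9_]*[0-9])?";

    private static final Pattern[] RULES = {
        // HEX_FLOAT_LITERAL
        Pattern.compile("0[xX](?:" + HEX_DIGITS + "\\.?|(?:" + HEX_DIGITS + ")?\\." + HEX_DIGITS
                + ")[pP][+-]?" + DIGITS + "[fFdD]?"),
        // HEX_LITERAL
        Pattern.compile("0[xX]" + HEX_DIGITS + "[lL]?"),
        // FLOAT_LITERAL (first form)
        Pattern.compile("(?:" + DIGITS + "\\.(?:" + DIGITS + ")?|\\." + DIGITS
                + ")(?:[eE][+-]?" + DIGITS + ")?[fFdD]?"),
        // DECIMAL_LITERAL
        Pattern.compile("(?:0|[1-9](?:_*" + DIGITS + ")?)[lL]?"),
    };

    /** Returns the longest literal starting at offset, or null if no rule matches there. */
    static String scan(String source, int offset) {
        String longest = null;
        for (Pattern rule : RULES) {
            Matcher m = rule.matcher(source).region(offset, source.length());
            if (m.lookingAt() && (longest == null || m.group().length() > longest.length())) {
                longest = m.group();
            }
        }
        return longest;
    }
}
```

The hand-rolled loop in the OP avoids allocating a Matcher per token, which is presumably the performance side of the question; the regex version mainly buys readability and a one-to-one mapping to the grammar comments.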
9
Leviathann 2023-03-30 23:02:58 +08:00
I can't stand code with side effects flying all over the place like this.
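Regarding #9: all of the scanXxx helpers communicate through the mutable ch/pos fields, which is what makes the control flow hard to follow. A sketch of the other style (illustrative names only, not code from the post): each helper takes an index and returns the index just past what it consumed, so the cursor is passed explicitly instead of living in hidden state.

```java
// Illustrative only. The helper returns the index just past the consumed digits, so the
// caller can compare it with the start index instead of reasoning about a shared cursor.
final class Scan {
    /** Consumes decimal digits and '_' separators from i; trailing underscores are excluded. */
    static int digits(String src, int i) {
        int end = i;                        // index just past the last accepted digit
        while (i < src.length()) {
            char c = src.charAt(i);
            if (c >= '0' && c <= '9') {
                end = ++i;                  // a digit always extends the literal
            } else if (c == '_') {
                i++;                        // underscores count only if a digit follows
            } else {
                break;
            }
        }
        return end;
    }
}
```

A caller would check `end == start` to see whether anything matched and slice the lexeme with `src.substring(start, end)`, so no retreat() dance is needed.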
10
popvlovs 2023-03-31 13:56:52 +08:00
This makes my eyes blur o_o ....
I've only ever used ANTLR and never considered hand-rolling a lexer myself. Is this a practice exercise?