From 7cfceeca2d5965baf6dcd45f07ebeaedbe48856f Mon Sep 17 00:00:00 2001 From: Andrew Kelley Date: Tue, 3 Nov 2015 22:31:27 -0700 Subject: [PATCH] parser generator beginnings --- CMakeLists.txt | 24 ++ README.md | 19 +- src/buffer.hpp | 8 +- src/grammar.txt | 53 ++++ src/main.cpp | 787 +++------------------------------------------- src/parser.hpp | 87 +++++ src/parsergen.cpp | 340 ++++++++++++++++++++ src/tokenizer.cpp | 328 +++++++++++++++++++ src/tokenizer.hpp | 73 +++++ src/util.hpp | 8 + 10 files changed, 972 insertions(+), 755 deletions(-) create mode 100644 src/grammar.txt create mode 100644 src/parser.hpp create mode 100644 src/parsergen.cpp create mode 100644 src/tokenizer.cpp create mode 100644 src/tokenizer.hpp diff --git a/CMakeLists.txt b/CMakeLists.txt index 356ff6d3cb..7c8deec0d7 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -20,10 +20,21 @@ include_directories( ${CMAKE_BINARY_DIR} ) +set(GRAMMAR_TXT "${CMAKE_BINARY_DIR}/simple.txt") +set(PARSER_CPP "${CMAKE_BINARY_DIR}/parser.cpp") + set(ZIG_SOURCES "${CMAKE_SOURCE_DIR}/src/main.cpp" "${CMAKE_SOURCE_DIR}/src/util.cpp" "${CMAKE_SOURCE_DIR}/src/buffer.cpp" + "${CMAKE_SOURCE_DIR}/src/tokenizer.cpp" + ${PARSER_CPP} +) + +set(PARSERGEN_SOURCES + "${CMAKE_SOURCE_DIR}/src/parsergen.cpp" + "${CMAKE_SOURCE_DIR}/src/util.cpp" + "${CMAKE_SOURCE_DIR}/src/buffer.cpp" ) set(CONFIGURE_OUT_FILE "${CMAKE_BINARY_DIR}/config.h") @@ -49,3 +60,16 @@ target_link_libraries(zig LINK_PUBLIC ${LLVM_LIBRARIES} ) install(TARGETS zig DESTINATION bin) + +add_executable(parsergen ${PARSERGEN_SOURCES}) +set_target_properties(parsergen PROPERTIES + LINKER_LANGUAGE C + COMPILE_FLAGS ${EXE_CFLAGS}) + + +add_custom_command( + OUTPUT ${PARSER_CPP} + COMMAND parsergen ARGS ${GRAMMAR_TXT} ${PARSER_CPP} + DEPENDS ${GRAMMAR_TXT} ${PARSERGEN_SOURCES} + WORKING_DIRECTORY ${CMAKE_SOURCE_DIR} +) diff --git a/README.md b/README.md index 7151c428e4..4054a94516 100644 --- a/README.md +++ b/README.md @@ -11,7 +11,7 @@ readable, safe, optimal, 
and concise code to solve any computing problem. * Creating a C library should be a primary use case. Should be easy to export an auto-generated .h file. * Generics such as containers. - * Do not depend on libc. + * Do not depend on libc unless explicitly imported. * First class error code support. * Include documentation generator. * Eliminate the need for make, cmake, etc. @@ -25,6 +25,7 @@ readable, safe, optimal, and concise code to solve any computing problem. - Whitespace at the end of line is a compile error. * Resilient to parsing errors to make IDE integration work well. * Source code is UTF-8. + * Shebang line OK so language can be used for "scripting" as well. ## Roadmap @@ -35,3 +36,19 @@ readable, safe, optimal, and concise code to solve any computing problem. * Unit tests. * Simple .so library * How should the Widget use case be solved? In Genesis I'm using C++ and inheritance. + +## Grammar + +``` +Root : FnDecl* +FnDecl : TokenFn TokenSymbol TokenLParen list(ParamDecl, TokenComma, 0) TokenRParen (TokenArrow Type)? Block +ParamDecl : TokenSymbol TokenColon Type +Type : TokenSymbol | PointerType +PointerType : TokenStar (TokenConst | TokenMut) Type +Block : TokenLBrace Statement* Expression? 
TokenRBrace +Statement : ExpressionStatement | ReturnStatement +ExpressionStatement : Expression TokenSemicolon +ReturnStatement : TokenReturn Expression TokenSemicolon +Expression : TokenNumber | TokenString | FnCall +FnCall : TokenSymbol TokenLParen list(Expression, TokenComma, 0) TokenRParen +``` diff --git a/src/buffer.hpp b/src/buffer.hpp index 463f3baca8..47975f5fe9 100644 --- a/src/buffer.hpp +++ b/src/buffer.hpp @@ -49,19 +49,19 @@ static inline void buf_deinit(Buf *buf) { buf->list.deinit(); } -static inline void buf_init_from_mem(Buf *buf, char *ptr, int len) { +static inline void buf_init_from_mem(Buf *buf, const char *ptr, int len) { buf->list.resize(len + 1); memcpy(buf_ptr(buf), ptr, len); buf->list.at(buf_len(buf)) = 0; } -static inline Buf *buf_create_from_mem(char *ptr, int len) { +static inline Buf *buf_create_from_mem(const char *ptr, int len) { Buf *buf = allocate(1); buf_init_from_mem(buf, ptr, len); return buf; } -static inline Buf *buf_create_from_str(char *str) { +static inline Buf *buf_create_from_str(const char *str) { return buf_create_from_mem(str, strlen(str)); } @@ -138,7 +138,7 @@ static inline Buf *buf_dirname(Buf *buf) { return buf_slice(buf, 0, i); } } - zig_panic("TODO buf_dirname no slash"); + return buf_create_from_mem("", 0); } diff --git a/src/grammar.txt b/src/grammar.txt new file mode 100644 index 0000000000..0305fa7741 --- /dev/null +++ b/src/grammar.txt @@ -0,0 +1,53 @@ +Root : many(FnDecl) token(EOF) { + $$ = ast_create_root($1); +} + +FnDecl : token(Fn) token(Symbol) token(LParen) list(ParamDecl, token(Comma)) token(RParen) option(token(Arrow) Type) Block { + $$ = ast_create_fn_decl($2, $4, $6, $7); +} + +ParamDecl : token(Symbol) token(Colon) Type { + $$ = ast_create_param_decl($1, $2); +} + +Type : token(Symbol) { + $$ = ast_create_symbol_type($1); +} | PointerType { + $$ = $1; +} + +PointerType : token(Star) token(Const) Type { + $$ = ast_create_pointer_type($2, $3); +} | token(Star) token(Mut) Type { + $$ = 
ast_create_pointer_type($2, $3); +} + +Block : token(LBrace) many(Statement) option(Expression) token(RBrace) { + $$ = ast_create_block($2, $3); +} + +Statement : ExpressionStatement { + $$ = $1; +} | ReturnStatement { + $$ = $1; +} + +ExpressionStatement : Expression token(Semicolon) { + $$ = ast_create_expression_statement($1); +} + +ReturnStatement : token(Return) Expression token(Semicolon) { + $$ = ast_create_return_statement($2); +} + +Expression : token(Number) { + $$ = ast_create_number($1); +} | token(String) { + $$ = ast_create_string($1); +} | FnCall { + $$ = $1; +} + +FnCall : token(Symbol) token(LParen) list(Expression, token(Comma)) token(RParen) { + $$ = ast_create_fn_call($1, $3); +} diff --git a/src/main.cpp b/src/main.cpp index 597638b87b..c313c7918a 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -9,6 +9,8 @@ #include "util.hpp" #include "list.hpp" #include "buffer.hpp" +#include "parser.hpp" +#include "tokenizer.hpp" #include #include @@ -48,154 +50,9 @@ static Buf *fetch_file(FILE *f) { return buf; } -static inline bool mem_eql_str(const char *mem, size_t mem_len, const char *str) { - size_t str_len = strlen(str); - if (str_len != mem_len) - return false; - return memcmp(mem, str, mem_len) == 0; -} - - -#define WHITESPACE \ - ' ': \ - case '\t': \ - case '\n': \ - case '\f': \ - case '\r': \ - case 0xb - -#define DIGIT \ - '0': \ - case '1': \ - case '2': \ - case '3': \ - case '4': \ - case '5': \ - case '6': \ - case '7': \ - case '8': \ - case '9' - -#define ALPHA \ - 'a': \ - case 'b': \ - case 'c': \ - case 'd': \ - case 'e': \ - case 'f': \ - case 'g': \ - case 'h': \ - case 'i': \ - case 'j': \ - case 'k': \ - case 'l': \ - case 'm': \ - case 'n': \ - case 'o': \ - case 'p': \ - case 'q': \ - case 'r': \ - case 's': \ - case 't': \ - case 'u': \ - case 'v': \ - case 'w': \ - case 'x': \ - case 'y': \ - case 'z': \ - case 'A': \ - case 'B': \ - case 'C': \ - case 'D': \ - case 'E': \ - case 'F': \ - case 'G': \ - case 'H': \ - case 'I': \ 
- case 'J': \ - case 'K': \ - case 'L': \ - case 'M': \ - case 'N': \ - case 'O': \ - case 'P': \ - case 'Q': \ - case 'R': \ - case 'S': \ - case 'T': \ - case 'U': \ - case 'V': \ - case 'W': \ - case 'X': \ - case 'Y': \ - case 'Z' - -#define SYMBOL_CHAR \ - ALPHA: \ - case DIGIT: \ - case '_' - - -enum TokenId { - TokenIdSymbol, - TokenIdKeywordFn, - TokenIdKeywordReturn, - TokenIdKeywordMut, - TokenIdKeywordConst, - TokenIdLParen, - TokenIdRParen, - TokenIdComma, - TokenIdStar, - TokenIdLBrace, - TokenIdRBrace, - TokenIdStringLiteral, - TokenIdSemicolon, - TokenIdNumberLiteral, - TokenIdPlus, - TokenIdColon, - TokenIdArrow, - TokenIdDash, -}; - -struct Token { - TokenId id; - int start_pos; - int end_pos; - int start_line; - int start_column; -}; - -enum TokenizeState { - TokenizeStateStart, - TokenizeStateSymbol, - TokenizeStateNumber, - TokenizeStateString, - TokenizeStateSawDash, -}; - -struct Tokenize { - Buf *buf; - int pos; - TokenizeState state; - ZigList *tokens; - int line; - int column; - Token *cur_tok; - Buf *cur_dir_path; - ZigList *include_paths; -}; - -__attribute__ ((format (printf, 2, 3))) -static void tokenize_error(Tokenize *t, const char *format, ...) { - int line; - int column; - if (t->cur_tok) { - line = t->cur_tok->start_line + 1; - column = t->cur_tok->start_column + 1; - } else { - line = t->line + 1; - column = t->column + 1; - } +void ast_error(Token *token, const char *format, ...) { + int line = token->start_line + 1; + int column = token->start_column + 1; va_list ap; va_start(ap, format); @@ -206,349 +63,35 @@ static void tokenize_error(Tokenize *t, const char *format, ...) 
{ exit(EXIT_FAILURE); } -static void begin_token(Tokenize *t, TokenId id) { - assert(!t->cur_tok); - t->tokens->add_one(); - Token *token = &t->tokens->last(); - token->start_line = t->line; - token->start_column = t->column; - token->id = id; - token->start_pos = t->pos; - t->cur_tok = token; -} - -static void end_token(Tokenize *t) { - assert(t->cur_tok); - t->cur_tok->end_pos = t->pos + 1; - - char *token_mem = buf_ptr(t->buf) + t->cur_tok->start_pos; - int token_len = t->cur_tok->end_pos - t->cur_tok->start_pos; - - if (mem_eql_str(token_mem, token_len, "fn")) { - t->cur_tok->id = TokenIdKeywordFn; - } else if (mem_eql_str(token_mem, token_len, "return")) { - t->cur_tok->id = TokenIdKeywordReturn; - } else if (mem_eql_str(token_mem, token_len, "mut")) { - t->cur_tok->id = TokenIdKeywordMut; - } else if (mem_eql_str(token_mem, token_len, "const")) { - t->cur_tok->id = TokenIdKeywordConst; - } - - t->cur_tok = nullptr; -} - -static ZigList *tokenize(Buf *buf, ZigList *include_paths, Buf *cur_dir_path) { - Tokenize t = {0}; - t.tokens = allocate>(1); - t.buf = buf; - t.cur_dir_path = cur_dir_path; - t.include_paths = include_paths; - for (t.pos = 0; t.pos < buf_len(t.buf); t.pos += 1) { - uint8_t c = buf_ptr(t.buf)[t.pos]; - switch (t.state) { - case TokenizeStateStart: - switch (c) { - case WHITESPACE: - break; - case ALPHA: - t.state = TokenizeStateSymbol; - begin_token(&t, TokenIdSymbol); - break; - case DIGIT: - t.state = TokenizeStateNumber; - begin_token(&t, TokenIdNumberLiteral); - break; - case '"': - begin_token(&t, TokenIdStringLiteral); - t.state = TokenizeStateString; - break; - case '(': - begin_token(&t, TokenIdLParen); - end_token(&t); - break; - case ')': - begin_token(&t, TokenIdRParen); - end_token(&t); - break; - case ',': - begin_token(&t, TokenIdComma); - end_token(&t); - break; - case '*': - begin_token(&t, TokenIdStar); - end_token(&t); - break; - case '{': - begin_token(&t, TokenIdLBrace); - end_token(&t); - break; - case '}': - 
begin_token(&t, TokenIdRBrace); - end_token(&t); - break; - case ';': - begin_token(&t, TokenIdSemicolon); - end_token(&t); - break; - case ':': - begin_token(&t, TokenIdColon); - end_token(&t); - break; - case '+': - begin_token(&t, TokenIdPlus); - end_token(&t); - break; - case '-': - begin_token(&t, TokenIdDash); - t.state = TokenizeStateSawDash; - break; - default: - tokenize_error(&t, "invalid character: '%c'", c); - } - break; - case TokenizeStateSymbol: - switch (c) { - case SYMBOL_CHAR: - break; - default: - t.pos -= 1; - end_token(&t); - t.state = TokenizeStateStart; - continue; - } - break; - case TokenizeStateString: - switch (c) { - case '"': - end_token(&t); - t.state = TokenizeStateStart; - break; - default: - break; - } - break; - case TokenizeStateNumber: - switch (c) { - case DIGIT: - break; - default: - t.pos -= 1; - end_token(&t); - t.state = TokenizeStateStart; - continue; - } - break; - case TokenizeStateSawDash: - switch (c) { - case '>': - t.cur_tok->id = TokenIdArrow; - end_token(&t); - t.state = TokenizeStateStart; - break; - default: - end_token(&t); - t.state = TokenizeStateStart; - break; - } - break; - } - if (c == '\n') { - t.line += 1; - t.column = 0; - } else { - t.column += 1; - } - } - // EOF - switch (t.state) { - case TokenizeStateStart: - break; - case TokenizeStateSymbol: - end_token(&t); - break; - case TokenizeStateString: - tokenize_error(&t, "unterminated string"); - break; - case TokenizeStateNumber: - end_token(&t); - break; - case TokenizeStateSawDash: - end_token(&t); - break; - } - assert(!t.cur_tok); - return t.tokens; -} - -static const char * token_name(Token *token) { - switch (token->id) { - case TokenIdSymbol: return "Symbol"; - case TokenIdKeywordFn: return "Fn"; - case TokenIdKeywordConst: return "Const"; - case TokenIdKeywordMut: return "Mut"; - case TokenIdKeywordReturn: return "Return"; - case TokenIdLParen: return "LParen"; - case TokenIdRParen: return "RParen"; - case TokenIdComma: return "Comma"; - case 
TokenIdStar: return "Star"; - case TokenIdLBrace: return "LBrace"; - case TokenIdRBrace: return "RBrace"; - case TokenIdStringLiteral: return "StringLiteral"; - case TokenIdSemicolon: return "Semicolon"; - case TokenIdNumberLiteral: return "NumberLiteral"; - case TokenIdPlus: return "Plus"; - case TokenIdColon: return "Colon"; - case TokenIdArrow: return "Arrow"; - case TokenIdDash: return "Dash"; - } - return "(invalid token)"; -} - -static void print_tokens(Buf *buf, ZigList *tokens) { - for (int i = 0; i < tokens->length; i += 1) { - Token *token = &tokens->at(i); - printf("%s ", token_name(token)); - fwrite(buf_ptr(buf) + token->start_pos, 1, token->end_pos - token->start_pos, stdout); - printf("\n"); - } -} - -struct AstNode; - -enum NodeType { - NodeTypeRoot, - NodeTypeFnDecl, - NodeTypeParam, - NodeTypeType, - NodeTypeBlock, -}; - -struct AstNodeFnDecl { - Buf name; - ZigList params; - AstNode *return_type; - AstNode *body; -}; - -struct AstNodeRoot { - ZigList fn_decls; -}; - -enum AstNodeTypeType { - AstNodeTypeTypePrimitive, - AstNodeTypeTypePointer, -}; - -enum AstPrimitiveType { - AstPrimitiveTypeVoid, - AstPrimitiveTypeU8, - AstPrimitiveTypeI8, - AstPrimitiveTypeU16, - AstPrimitiveTypeI16, - AstPrimitiveTypeU32, - AstPrimitiveTypeI32, - AstPrimitiveTypeU64, - AstPrimitiveTypeI64, - AstPrimitiveTypeUSize, - AstPrimitiveTypeISize, - AstPrimitiveTypeF32, - AstPrimitiveTypeF64, -}; - - -struct AstNodeType { - AstNodeTypeType type; - AstPrimitiveType primitive_type; - AstNode *pointer_type; - bool is_const; -}; - -struct AstNodeParam { - Buf name; - AstNode *type; -}; - -enum AstState { - AstStateStart, - AstStateFn, - AstStateFnLParen, - AstStateFnParamName, - AstStateParamColon, - AstStateType, - AstStateTypeEnd, - AstStateFnParamComma, - AstStateFnDeclArrow, - AstStateFnDeclBlock, - AstStatePointerType, - AstStateBlock, -}; - -struct AstNode { - enum NodeType type; - AstNode *parent; - AstState prev_state; - union { - AstNodeRoot root; - AstNodeFnDecl 
fn_decl; - AstNodeType type; - AstNodeParam param; - } data; -}; - -struct BuildAst { - Buf *buf; - AstNode *root; - AstState state; - int line; - int column; - AstNode *cur_node; -}; - -__attribute__ ((format (printf, 2, 3))) -static void ast_error(BuildAst *b, const char *format, ...) { - int line = b->line + 1; - int column = b->column + 1; - - va_list ap; - va_start(ap, format); - fprintf(stderr, "Error: Line %d, column %d: ", line, column); - vfprintf(stderr, format, ap); - fprintf(stderr, "\n"); - va_end(ap); - exit(EXIT_FAILURE); -} - -static AstNode *ast_create_node(BuildAst *b, NodeType type) { - AstNode *child = allocate(1); - child->prev_state = b->state; - child->parent = b->cur_node; - child->type = type; - return child; -} - -static void ast_make_node_current(BuildAst *b, AstNode *node) { - b->cur_node = node; -} - -static void ast_up_stack(BuildAst *b) { - assert(b->cur_node->parent); - b->state = b->cur_node->prev_state; - b->cur_node = b->cur_node->parent; -} - - static const char *node_type_str(NodeType node_type) { switch (node_type) { - case NodeTypeRoot: return "Root"; - case NodeTypeFnDecl: return "FnDecl"; - case NodeTypeParam: return "Param"; - case NodeTypeType: return "Type"; - case NodeTypeBlock: return "Block"; + case NodeTypeRoot: + return "Root"; + case NodeTypeFnDecl: + return "FnDecl"; + case NodeTypeParamDecl: + return "ParamDecl"; + case NodeTypeType: + return "Type"; + case NodeTypePointerType: + return "PointerType"; + case NodeTypeBlock: + return "Block"; + case NodeTypeStatement: + return "Statement"; + case NodeTypeExpressionStatement: + return "ExpressionStatement"; + case NodeTypeReturnStatement: + return "ReturnStatement"; + case NodeTypeExpression: + return "Expression"; + case NodeTypeFnCall: + return "FnCall"; } zig_panic("unreachable"); } -static void print_ast(AstNode *node, int indent) { +static void ast_print(AstNode *node, int indent) { for (int i = 0; i < indent; i += 1) { fprintf(stderr, " "); } @@ -558,7 +101,7 
@@ static void print_ast(AstNode *node, int indent) { fprintf(stderr, "%s\n", node_type_str(node->type)); for (int i = 0; i < node->data.root.fn_decls.length; i += 1) { AstNode *child = node->data.root.fn_decls.at(i); - print_ast(child, indent + 2); + ast_print(child, indent + 2); } break; case NodeTypeFnDecl: @@ -568,278 +111,21 @@ static void print_ast(AstNode *node, int indent) { for (int i = 0; i < node->data.fn_decl.params.length; i += 1) { AstNode *child = node->data.fn_decl.params.at(i); - print_ast(child, indent + 2); + ast_print(child, indent + 2); } - print_ast(node->data.fn_decl.return_type, indent + 2); + ast_print(node->data.fn_decl.return_type, indent + 2); - print_ast(node->data.fn_decl.body, indent + 2); + ast_print(node->data.fn_decl.body, indent + 2); break; } - case NodeTypeParam: - { - Buf *name_buf = &node->data.param.name; - fprintf(stderr, "%s '%s'\n", node_type_str(node->type), buf_ptr(name_buf)); - - print_ast(node->data.param.type, indent + 2); - break; - } - case NodeTypeType: - fprintf(stderr, "%s\n", node_type_str(node->type)); - break; - case NodeTypeBlock: + default: fprintf(stderr, "%s\n", node_type_str(node->type)); break; } } -static void ast_end_fn_param_list(BuildAst *b) { - b->state = AstStateFnDeclArrow; -} - -static AstNode *build_ast(Buf *buf, ZigList *tokens) { - BuildAst b = {0}; - b.state = AstStateStart; - b.buf = buf; - b.root = allocate(1); - b.root->type = NodeTypeRoot; - b.cur_node = b.root; - - for (int i = 0; i < tokens->length; i += 1) { - Token *token = &tokens->at(i); - char *token_mem = buf_ptr(buf) + token->start_pos; - int token_len = token->end_pos - token->start_pos; - b.line = token->start_line; - b.column = token->start_column; - switch (b.state) { - case AstStateStart: - assert(b.cur_node->type == NodeTypeRoot); - if (token->id == TokenIdKeywordFn) { - AstNode *child = ast_create_node(&b, NodeTypeFnDecl); - b.cur_node->data.root.fn_decls.append(child); - ast_make_node_current(&b, child); - b.state = 
AstStateFn; - } else { - Buf msg = {0}; - buf_appendf(&msg, "unexpected %s: '", token_name(token)); - buf_append_mem(&msg, token_mem, token_len); - buf_append_str(&msg, "'"); - ast_error(&b, "%s", buf_ptr(&msg)); - break; - } - break; - case AstStateFn: - if (token->id != TokenIdSymbol) - ast_error(&b, "expected symbol"); - buf_init_from_mem(&b.cur_node->data.fn_decl.name, token_mem, token_len); - b.state = AstStateFnLParen; - break; - case AstStateFnLParen: - if (token->id != TokenIdLParen) - ast_error(&b, "expected '('"); - b.state = AstStateFnParamName; - break; - case AstStateFnParamName: - switch (token->id) { - case TokenIdSymbol: - { - b.state = AstStateFnParamComma; - AstNode *child = ast_create_node(&b, NodeTypeParam); - buf_init_from_mem(&child->data.param.name, token_mem, token_len); - b.cur_node->data.fn_decl.params.append(child); - ast_make_node_current(&b, child); - b.state = AstStateParamColon; - break; - } - case TokenIdRParen: - ast_end_fn_param_list(&b); - break; - default: - ast_error(&b, "expected parameter name"); - break; - } - break; - case AstStateParamColon: - { - if (token->id != TokenIdColon) - ast_error(&b, "expected ':'"); - assert(b.cur_node->type == NodeTypeParam); - b.state = AstStateTypeEnd; - AstNode *child = ast_create_node(&b, NodeTypeType); - b.cur_node->data.param.type = child; - ast_make_node_current(&b, child); - b.state = AstStateType; - break; - } - case AstStateType: - switch (token->id) { - case TokenIdSymbol: - assert(b.cur_node->type == NodeTypeType); - if (mem_eql_str(token_mem, token_len, "u8")) { - b.cur_node->data.type.type = AstNodeTypeTypePrimitive; - b.cur_node->data.type.primitive_type = AstPrimitiveTypeU8; - ast_up_stack(&b); - } else if (mem_eql_str(token_mem, token_len, "i8")) { - b.cur_node->data.type.type = AstNodeTypeTypePrimitive; - b.cur_node->data.type.primitive_type = AstPrimitiveTypeI8; - ast_up_stack(&b); - } else if (mem_eql_str(token_mem, token_len, "u16")) { - b.cur_node->data.type.type = 
AstNodeTypeTypePrimitive; - b.cur_node->data.type.primitive_type = AstPrimitiveTypeU16; - ast_up_stack(&b); - } else if (mem_eql_str(token_mem, token_len, "i16")) { - b.cur_node->data.type.type = AstNodeTypeTypePrimitive; - b.cur_node->data.type.primitive_type = AstPrimitiveTypeI16; - ast_up_stack(&b); - } else if (mem_eql_str(token_mem, token_len, "u32")) { - b.cur_node->data.type.type = AstNodeTypeTypePrimitive; - b.cur_node->data.type.primitive_type = AstPrimitiveTypeU32; - ast_up_stack(&b); - } else if (mem_eql_str(token_mem, token_len, "i32")) { - b.cur_node->data.type.type = AstNodeTypeTypePrimitive; - b.cur_node->data.type.primitive_type = AstPrimitiveTypeI32; - ast_up_stack(&b); - } else if (mem_eql_str(token_mem, token_len, "u64")) { - b.cur_node->data.type.type = AstNodeTypeTypePrimitive; - b.cur_node->data.type.primitive_type = AstPrimitiveTypeU64; - ast_up_stack(&b); - } else if (mem_eql_str(token_mem, token_len, "i64")) { - b.cur_node->data.type.type = AstNodeTypeTypePrimitive; - b.cur_node->data.type.primitive_type = AstPrimitiveTypeI64; - ast_up_stack(&b); - } else if (mem_eql_str(token_mem, token_len, "usize")) { - b.cur_node->data.type.type = AstNodeTypeTypePrimitive; - b.cur_node->data.type.primitive_type = AstPrimitiveTypeUSize; - ast_up_stack(&b); - } else if (mem_eql_str(token_mem, token_len, "isize")) { - b.cur_node->data.type.type = AstNodeTypeTypePrimitive; - b.cur_node->data.type.primitive_type = AstPrimitiveTypeISize; - ast_up_stack(&b); - } else if (mem_eql_str(token_mem, token_len, "f32")) { - b.cur_node->data.type.type = AstNodeTypeTypePrimitive; - b.cur_node->data.type.primitive_type = AstPrimitiveTypeF32; - ast_up_stack(&b); - } else if (mem_eql_str(token_mem, token_len, "f64")) { - b.cur_node->data.type.type = AstNodeTypeTypePrimitive; - b.cur_node->data.type.primitive_type = AstPrimitiveTypeF64; - ast_up_stack(&b); - } else { - Buf msg = {0}; - buf_append_str(&msg, "invalid primitive type: '"); - buf_append_mem(&msg, token_mem, 
token_len); - buf_append_str(&msg, "'"); - ast_error(&b, "%s", buf_ptr(&msg)); - } - break; - case TokenIdStar: - b.cur_node->data.type.type = AstNodeTypeTypePointer; - b.state = AstStatePointerType; - break; - default: - ast_error(&b, "expected type name"); - break; - } - break; - case AstStatePointerType: - { - if (token->id == TokenIdKeywordMut) { - b.cur_node->data.type.is_const = false; - } else if (token->id == TokenIdKeywordConst) { - b.cur_node->data.type.is_const = true; - } else { - ast_error(&b, "expected 'mut' or 'const'"); - } - b.state = AstStateTypeEnd; - AstNode *child = ast_create_node(&b, NodeTypeType); - b.cur_node->data.type.pointer_type = child; - ast_make_node_current(&b, child); - b.state = AstStateType; - break; - } - case AstStateTypeEnd: - ast_up_stack(&b); - i -= 1; - continue; - case AstStateFnParamComma: - switch (token->id) { - case TokenIdComma: - b.state = AstStateFnParamName; - break; - case TokenIdRParen: - ast_end_fn_param_list(&b); - break; - default: - ast_error(&b, "expected ',' or ')'"); - break; - - } - break; - case AstStateFnDeclArrow: - switch (token->id) { - case TokenIdArrow: - { - assert(b.cur_node->type == NodeTypeFnDecl); - b.state = AstStateFnDeclBlock; - AstNode *child = ast_create_node(&b, NodeTypeType); - b.cur_node->data.fn_decl.return_type = child; - ast_make_node_current(&b, child); - b.state = AstStateType; - break; - } - case TokenIdLBrace: - { - AstNode *node = ast_create_node(&b, NodeTypeType); - node->data.type.type = AstNodeTypeTypePrimitive; - node->data.type.primitive_type = AstPrimitiveTypeVoid; - b.cur_node->data.fn_decl.return_type = node; - - b.state = AstStateTypeEnd; - AstNode *child = ast_create_node(&b, NodeTypeBlock); - b.cur_node->data.fn_decl.body = child; - ast_make_node_current(&b, child); - b.state = AstStateBlock; - break; - } - default: - ast_error(&b, "expected '->' or '}'"); - break; - } - break; - case AstStateFnDeclBlock: - { - if (token->id != TokenIdLBrace) - ast_error(&b, 
"expected '{'"); - - b.state = AstStateTypeEnd; - AstNode *child = ast_create_node(&b, NodeTypeBlock); - b.cur_node->data.fn_decl.body = child; - ast_make_node_current(&b, child); - b.state = AstStateBlock; - break; - } - case AstStateBlock: - switch (token->id) { - case TokenIdSymbol: - zig_panic("TODO symbol"); - break; - default: - { - Buf msg = {0}; - buf_appendf(&msg, "unexpected %s: '", token_name(token)); - buf_append_mem(&msg, token_mem, token_len); - buf_append_str(&msg, "'"); - ast_error(&b, "%s", buf_ptr(&msg)); - break; - } - } - break; - } - } - - return b.root; -} - char cur_dir[1024]; int main(int argc, char **argv) { @@ -896,14 +182,15 @@ int main(int argc, char **argv) { fprintf(stderr, "----------------\n"); fprintf(stderr, "%s\n", buf_ptr(in_data)); - ZigList *tokens = tokenize(in_data, &include_paths, cur_dir_path); + ZigList *tokens = tokenize(in_data, cur_dir_path); fprintf(stderr, "\nTokens:\n"); fprintf(stderr, "---------\n"); print_tokens(in_data, tokens); - AstNode *root = build_ast(in_data, tokens); - print_ast(root, 0); + AstNode *root = ast_parse(in_data, tokens); + assert(root); + ast_print(root, 0); return EXIT_SUCCESS; diff --git a/src/parser.hpp b/src/parser.hpp new file mode 100644 index 0000000000..0f15228d01 --- /dev/null +++ b/src/parser.hpp @@ -0,0 +1,87 @@ +#ifndef ZIG_PARSER_HPP +#define ZIG_PARSER_HPP + +#include "list.hpp" +#include "buffer.hpp" +#include "tokenizer.hpp" + +struct AstNode; + +enum NodeType { + NodeTypeRoot, + NodeTypeFnDecl, + NodeTypeParamDecl, + NodeTypeType, + NodeTypePointerType, + NodeTypeBlock, + NodeTypeStatement, + NodeTypeExpressionStatement, + NodeTypeReturnStatement, + NodeTypeExpression, + NodeTypeFnCall, +}; + +struct AstNodeRoot { + ZigList fn_decls; +}; + +struct AstNodeFnDecl { + Buf name; + ZigList params; + AstNode *return_type; + AstNode *body; +}; + +struct AstNodeParamDecl { + Buf name; + AstNode *type; +}; + +enum AstNodeTypeType { + AstNodeTypeTypePrimitive, + AstNodeTypeTypePointer, 
+}; + +struct AstNodeType { + AstNodeTypeType type; + AstNode *child; +}; + +struct AstNodePointerType { + AstNode *const_or_mut; + AstNode *type; +}; + +struct AstNodeBlock { + ZigList expressions; +}; + +struct AstNodeExpression { + AstNode *child; +}; + +struct AstNodeFnCall { + Buf name; + ZigList params; +}; + +struct AstNode { + enum NodeType type; + AstNode *parent; + union { + AstNodeRoot root; + AstNodeFnDecl fn_decl; + AstNodeType type; + AstNodeParamDecl param_decl; + AstNodeBlock block; + AstNodeExpression expression; + AstNodeFnCall fn_call; + } data; +}; + +__attribute__ ((format (printf, 2, 3))) +void ast_error(Token *token, const char *format, ...); + +AstNode * ast_parse(Buf *buf, ZigList *tokens); + +#endif diff --git a/src/parsergen.cpp b/src/parsergen.cpp new file mode 100644 index 0000000000..486ee702d4 --- /dev/null +++ b/src/parsergen.cpp @@ -0,0 +1,340 @@ +/* + * Copyright (c) 2015 Andrew Kelley + * + * This file is part of zig, which is MIT licensed. + * See http://opensource.org/licenses/MIT + */ + +#include "util.hpp" +#include "buffer.hpp" +#include "list.hpp" + +#include +#include +#include +#include +#include +#include + +static Buf *fetch_file(FILE *f) { + int fd = fileno(f); + struct stat st; + if (fstat(fd, &st)) + zig_panic("unable to stat file: %s", strerror(errno)); + off_t big_size = st.st_size; + if (big_size > INT_MAX) + zig_panic("file too big"); + int size = (int)big_size; + + Buf *buf = buf_alloc_fixed(size); + size_t amt_read = fread(buf_ptr(buf), 1, buf_len(buf), f); + if (amt_read != (size_t)buf_len(buf)) + zig_panic("error reading: %s", strerror(errno)); + + return buf; +} + +static int usage(const char *arg0) { + fprintf(stderr, "Usage: %s in-grammar.txt out-parser.c\n", arg0); + return 1; +} + +struct Token { + Buf name; + int id; +}; + +struct RuleNode; + +struct RuleTuple { + ZigList children; +}; + +struct RuleMany { + RuleNode *child; +}; + +struct RuleOption { + ZigList child; +}; + +struct RuleOr { + ZigList 
children; +}; + +struct RuleToken { + Token *token; +}; + +struct RuleBlock { + Buf *body; +}; + +struct RuleList { + RuleNode *rule; + RuleToken *separator; +}; + +struct RuleSubRule { + RuleNode *child; +}; + +enum RuleNodeType { + RuleNodeTypeTuple, + RuleNodeTypeMany, + RuleNodeTypeList, + RuleNodeTypeOption, + RuleNodeTypeOr, + RuleNodeTypeToken, + RuleNodeTypeSubRule, +}; + +struct RuleNode { + RuleNodeType type; + union { + RuleTuple tuple; + RuleMany many; + RuleList list; + RuleOption option; + RuleOr _or; + RuleToken token; + RuleSubRule sub_rule; + }; +}; + +enum ParserStateType { + ParserStateTypeError, + ParserStateTypeOk, +}; + +struct ParserStateError { + Buf *msg; +}; + +struct ParserState { + ParserStateType type; + // One for each token ID. + ParserState **transition; + int index; + union { + ParserStateError error; + }; +}; + +struct Gen { + ParserState *cur_state; + ZigList transition_table; + ZigList tokens; + RuleNode *root; +}; + +static ParserState *create_state(Gen *g, ParserStateType type) { + ParserState *state = allocate(1); + state->type = type; + state->index = g->transition_table.length; + state->transition = allocate(g->tokens.length); + g->transition_table.append(state); + return state; +} + +static void fill_state_with_transition(Gen *g, ParserState *source, ParserState *dest) { + for (int i = 0; i < g->tokens.length; i += 1) { + source->transition[i] = dest; + } +} + +static void gen(Gen *g, RuleNode *node) { + switch (node->type) { + case RuleNodeTypeToken: + { + ParserState *ok_state = create_state(g, ParserStateTypeOk); + ParserState *err_state = create_state(g, ParserStateTypeError); + + err_state->error.msg = buf_sprintf("expected token '%s'", buf_ptr(&node->token.token->name)); + + fill_state_with_transition(g, g->cur_state, err_state); + g->cur_state->transition[node->token.token->id] = ok_state; + g->cur_state = ok_state; + } + break; + case RuleNodeTypeTuple: + { + for (int i = 0; i < node->tuple.children.length; i += 1) 
{ + RuleNode *child = node->tuple.children.at(i); + gen(g, child); + } + } + break; + case RuleNodeTypeMany: + zig_panic("TODO"); + break; + case RuleNodeTypeList: + zig_panic("TODO"); + break; + case RuleNodeTypeOption: + zig_panic("TODO"); + break; + case RuleNodeTypeOr: + zig_panic("TODO"); + break; + case RuleNodeTypeSubRule: + zig_panic("TODO"); + break; + } +} + +static Token *find_token_by_name(Gen *g, Buf *name) { + for (int i = 0; i < g->tokens.length; i += 1) { + Token *token = g->tokens.at(i); + if (buf_eql_buf(name, &token->name)) + return token; + } + return nullptr; +} + +static Token *find_or_create_token(Gen *g, Buf *name) { + Token *token = find_token_by_name(g, name); + if (!token) { + token = allocate(1); + token->id = g->tokens.length; + buf_init_from_mem(&token->name, buf_ptr(name), buf_len(name)); + g->tokens.append(token); + } + return token; +} + +int main(int argc, char **argv) { + const char *in_filename = argv[1]; + const char *out_filename = argv[2]; + + if (!in_filename || !out_filename) + return usage(argv[0]); + + FILE *in_f; + if (strcmp(in_filename, "-") == 0) { + in_f = stdin; + } else { + in_f = fopen(in_filename, "rb"); + } + + FILE *out_f; + if (strcmp(out_filename, "-") == 0) { + out_f = stdout; + } else { + out_f = fopen(out_filename, "wb"); + } + + if (!in_f || !out_f) + zig_panic("unable to open file(s)"); + + Buf *in_buf = fetch_file(in_f); + + ZigList rules = {0}; + Gen g = {0}; + + //zig_panic("TODO initialize rules"); + { + Token *star_token = find_or_create_token(&g, buf_create_from_str("Star")); + Token *lparen_token = find_or_create_token(&g, buf_create_from_str("LParen")); + Token *eof_token = find_or_create_token(&g, buf_create_from_str("Eof")); + + RuleNode *root = allocate(1); + root->type = RuleNodeTypeTuple; + + RuleNode *star_node = allocate(1); + star_node->type = RuleNodeTypeToken; + star_node->token.token = star_token; + root->tuple.children.append(star_node); + + RuleNode *lparen_node = allocate(1); + 
lparen_node->type = RuleNodeTypeToken; + lparen_node->token.token = lparen_token; + root->tuple.children.append(lparen_node); + + RuleNode *eof_node = allocate<RuleNode>(1); + eof_node->type = RuleNodeTypeToken; + eof_node->token.token = eof_token; + root->tuple.children.append(eof_node); + + rules.append(root); + } + + g.root = rules.at(0); + + g.cur_state = create_state(&g, ParserStateTypeOk); + gen(&g, g.root); + + + + (void)in_buf; + + fprintf(out_f, "/* This file is auto-generated by parsergen.cpp */\n"); + fprintf(out_f, "#include \"src/parser.hpp\"\n"); + fprintf(out_f, "#include <stdio.h>\n"); + + fprintf(out_f, "\n"); + fprintf(out_f, "/*\n"); + fprintf(out_f, "enum TokenId {\n"); + for (int i = 0; i < g.tokens.length; i += 1) { + Token *token = g.tokens.at(i); + fprintf(out_f, " TokenId%s = %d,\n", buf_ptr(&token->name), token->id); + } + fprintf(out_f, "};\n"); + fprintf(out_f, "*/\n"); + for (int i = 0; i < g.tokens.length; i += 1) { + Token *token = g.tokens.at(i); + fprintf(out_f, "static_assert(TokenId%s == %d, \"wrong token id\");\n", + buf_ptr(&token->name), token->id); + } + + fprintf(out_f, "AstNode * ast_parse(Buf *buf, ZigList<Token> *tokens) {\n"); + + fprintf(out_f, " static const int transition[%d][%d] = {\n", g.transition_table.length, g.tokens.length); + for (int state_index = 0; state_index < g.transition_table.length; state_index += 1) { + ParserState *state = g.transition_table.at(state_index); + fprintf(out_f, " {\n"); + + for (int token_id = 0; token_id < g.tokens.length; token_id += 1) { + ParserState *dest = state->transition[token_id]; + fprintf(out_f, " %d,\n", dest ? 
dest->index : -1); + } + + fprintf(out_f, " },\n"); + } + fprintf(out_f, " };\n"); + + + fprintf(out_f, " int state = 0;\n"); + fprintf(out_f, " AstNode *root = nullptr;\n"); + + fprintf(out_f, " for (int i = 0; i < tokens->length; i += 1) {\n"); + fprintf(out_f, " Token *token = &tokens->at(i);\n"); + fprintf(out_f, " switch (state) {\n"); + + for (int i = 0; i < g.transition_table.length; i += 1) { + ParserState *state = g.transition_table.at(i); + fprintf(out_f, " case %d:\n", i); + fprintf(out_f, " fprintf(stderr, \"state = %%d\\n\", state);\n"); + switch (state->type) { + case ParserStateTypeError: + fprintf(out_f, " ast_error(token, \"%s\");\n", buf_ptr(state->error.msg)); + break; + case ParserStateTypeOk: + fprintf(out_f, " assert(transition[%d][token->id] >= 0);\n", state->index); + fprintf(out_f, " assert(transition[%d][token->id] < %d);\n", + state->index, g.transition_table.length); + fprintf(out_f, " state = transition[%d][token->id];\n", state->index); + break; + } + fprintf(out_f, " break;\n"); + } + fprintf(out_f, " default:\n"); + fprintf(out_f, " zig_panic(\"unreachable\");\n"); + + fprintf(out_f, " }\n"); + fprintf(out_f, " }\n"); + fprintf(out_f, " return root;\n"); + fprintf(out_f, "}\n"); + + +} diff --git a/src/tokenizer.cpp b/src/tokenizer.cpp new file mode 100644 index 0000000000..9a54776cc5 --- /dev/null +++ b/src/tokenizer.cpp @@ -0,0 +1,328 @@ +#include "tokenizer.hpp" +#include "util.hpp" + +#include <stdarg.h> +#include <stdio.h> +#include <stdlib.h> + +#define WHITESPACE \ + ' ': \ + case '\t': \ + case '\n': \ + case '\f': \ + case '\r': \ + case 0xb + +#define DIGIT \ + '0': \ + case '1': \ + case '2': \ + case '3': \ + case '4': \ + case '5': \ + case '6': \ + case '7': \ + case '8': \ + case '9' + +#define ALPHA \ + 'a': \ + case 'b': \ + case 'c': \ + case 'd': \ + case 'e': \ + case 'f': \ + case 'g': \ + case 'h': \ + case 'i': \ + case 'j': \ + case 'k': \ + case 'l': \ + case 'm': \ + case 'n': \ + case 'o': \ + case 'p': \ + case 'q': \ + case 'r': \ + 
case 's': \ + case 't': \ + case 'u': \ + case 'v': \ + case 'w': \ + case 'x': \ + case 'y': \ + case 'z': \ + case 'A': \ + case 'B': \ + case 'C': \ + case 'D': \ + case 'E': \ + case 'F': \ + case 'G': \ + case 'H': \ + case 'I': \ + case 'J': \ + case 'K': \ + case 'L': \ + case 'M': \ + case 'N': \ + case 'O': \ + case 'P': \ + case 'Q': \ + case 'R': \ + case 'S': \ + case 'T': \ + case 'U': \ + case 'V': \ + case 'W': \ + case 'X': \ + case 'Y': \ + case 'Z' + +#define SYMBOL_CHAR \ + ALPHA: \ + case DIGIT: \ + case '_' + + +struct Tokenize { + Buf *buf; + int pos; + TokenizeState state; + ZigList<Token> *tokens; + int line; + int column; + Token *cur_tok; + Buf *cur_dir_path; +}; + +__attribute__ ((format (printf, 2, 3))) +static void tokenize_error(Tokenize *t, const char *format, ...) { + int line; + int column; + if (t->cur_tok) { + line = t->cur_tok->start_line + 1; + column = t->cur_tok->start_column + 1; + } else { + line = t->line + 1; + column = t->column + 1; + } + + va_list ap; + va_start(ap, format); + fprintf(stderr, "Error: Line %d, column %d: ", line, column); + vfprintf(stderr, format, ap); + fprintf(stderr, "\n"); + va_end(ap); + exit(EXIT_FAILURE); +} + +static void begin_token(Tokenize *t, TokenId id) { + assert(!t->cur_tok); + t->tokens->add_one(); + Token *token = &t->tokens->last(); + token->start_line = t->line; + token->start_column = t->column; + token->id = id; + token->start_pos = t->pos; + t->cur_tok = token; +} + +static void end_token(Tokenize *t) { + assert(t->cur_tok); + t->cur_tok->end_pos = t->pos + 1; + + char *token_mem = buf_ptr(t->buf) + t->cur_tok->start_pos; + int token_len = t->cur_tok->end_pos - t->cur_tok->start_pos; + + if (mem_eql_str(token_mem, token_len, "fn")) { + t->cur_tok->id = TokenIdKeywordFn; + } else if (mem_eql_str(token_mem, token_len, "return")) { + t->cur_tok->id = TokenIdKeywordReturn; + } else if (mem_eql_str(token_mem, token_len, "mut")) { + t->cur_tok->id = TokenIdKeywordMut; + } else if 
(mem_eql_str(token_mem, token_len, "const")) { + t->cur_tok->id = TokenIdKeywordConst; + } + + t->cur_tok = nullptr; +} + +ZigList<Token> *tokenize(Buf *buf, Buf *cur_dir_path) { + Tokenize t = {0}; + t.tokens = allocate<ZigList<Token>>(1); + t.buf = buf; + t.cur_dir_path = cur_dir_path; + for (t.pos = 0; t.pos < buf_len(t.buf); t.pos += 1) { + uint8_t c = buf_ptr(t.buf)[t.pos]; + switch (t.state) { + case TokenizeStateStart: + switch (c) { + case WHITESPACE: + break; + case ALPHA: + t.state = TokenizeStateSymbol; + begin_token(&t, TokenIdSymbol); + break; + case DIGIT: + t.state = TokenizeStateNumber; + begin_token(&t, TokenIdNumberLiteral); + break; + case '"': + begin_token(&t, TokenIdStringLiteral); + t.state = TokenizeStateString; + break; + case '(': + begin_token(&t, TokenIdLParen); + end_token(&t); + break; + case ')': + begin_token(&t, TokenIdRParen); + end_token(&t); + break; + case ',': + begin_token(&t, TokenIdComma); + end_token(&t); + break; + case '*': + begin_token(&t, TokenIdStar); + end_token(&t); + break; + case '{': + begin_token(&t, TokenIdLBrace); + end_token(&t); + break; + case '}': + begin_token(&t, TokenIdRBrace); + end_token(&t); + break; + case ';': + begin_token(&t, TokenIdSemicolon); + end_token(&t); + break; + case ':': + begin_token(&t, TokenIdColon); + end_token(&t); + break; + case '+': + begin_token(&t, TokenIdPlus); + end_token(&t); + break; + case '-': + begin_token(&t, TokenIdDash); + t.state = TokenizeStateSawDash; + break; + default: + tokenize_error(&t, "invalid character: '%c'", c); + } + break; + case TokenizeStateSymbol: + switch (c) { + case SYMBOL_CHAR: + break; + default: + t.pos -= 1; + end_token(&t); + t.state = TokenizeStateStart; + continue; + } + break; + case TokenizeStateString: + switch (c) { + case '"': + end_token(&t); + t.state = TokenizeStateStart; + break; + default: + break; + } + break; + case TokenizeStateNumber: + switch (c) { + case DIGIT: + break; + default: + t.pos -= 1; + end_token(&t); + t.state = TokenizeStateStart; + 
continue; + } + break; + case TokenizeStateSawDash: + switch (c) { + case '>': + t.cur_tok->id = TokenIdArrow; + end_token(&t); + t.state = TokenizeStateStart; + break; + default: + end_token(&t); + t.state = TokenizeStateStart; + break; + } + break; + } + if (c == '\n') { + t.line += 1; + t.column = 0; + } else { + t.column += 1; + } + } + // EOF + switch (t.state) { + case TokenizeStateStart: + break; + case TokenizeStateSymbol: + end_token(&t); + break; + case TokenizeStateString: + tokenize_error(&t, "unterminated string"); + break; + case TokenizeStateNumber: + end_token(&t); + break; + case TokenizeStateSawDash: + end_token(&t); + break; + } + begin_token(&t, TokenIdEof); + end_token(&t); + assert(!t.cur_tok); + return t.tokens; +} + +static const char * token_name(Token *token) { + switch (token->id) { + case TokenIdEof: return "EOF"; + case TokenIdSymbol: return "Symbol"; + case TokenIdKeywordFn: return "Fn"; + case TokenIdKeywordConst: return "Const"; + case TokenIdKeywordMut: return "Mut"; + case TokenIdKeywordReturn: return "Return"; + case TokenIdLParen: return "LParen"; + case TokenIdRParen: return "RParen"; + case TokenIdComma: return "Comma"; + case TokenIdStar: return "Star"; + case TokenIdLBrace: return "LBrace"; + case TokenIdRBrace: return "RBrace"; + case TokenIdStringLiteral: return "StringLiteral"; + case TokenIdSemicolon: return "Semicolon"; + case TokenIdNumberLiteral: return "NumberLiteral"; + case TokenIdPlus: return "Plus"; + case TokenIdColon: return "Colon"; + case TokenIdArrow: return "Arrow"; + case TokenIdDash: return "Dash"; + } + return "(invalid token)"; +} + +void print_tokens(Buf *buf, ZigList<Token> *tokens) { + for (int i = 0; i < tokens->length; i += 1) { + Token *token = &tokens->at(i); + printf("%s ", token_name(token)); + fwrite(buf_ptr(buf) + token->start_pos, 1, token->end_pos - token->start_pos, stdout); + printf("\n"); + } +} + diff --git a/src/tokenizer.hpp b/src/tokenizer.hpp new file mode 100644 index 
0000000000..4904bb0382 --- /dev/null +++ b/src/tokenizer.hpp @@ -0,0 +1,73 @@ +#ifndef ZIG_TOKENIZER_HPP +#define ZIG_TOKENIZER_HPP + +#include "buffer.hpp" + +/* +enum TokenId { + TokenIdEof, + TokenIdSymbol, + TokenIdKeywordFn, + TokenIdKeywordReturn, + TokenIdKeywordMut, + TokenIdKeywordConst, + TokenIdLParen, + TokenIdRParen, + TokenIdComma, + TokenIdStar, + TokenIdLBrace, + TokenIdRBrace, + TokenIdStringLiteral, + TokenIdSemicolon, + TokenIdNumberLiteral, + TokenIdPlus, + TokenIdColon, + TokenIdArrow, + TokenIdDash, +}; +*/ + +// TODO: debug delete this +enum TokenId { + TokenIdStar = 0, + TokenIdLParen = 1, + TokenIdEof = 2, + TokenIdSymbol, + TokenIdKeywordFn, + TokenIdKeywordReturn, + TokenIdKeywordMut, + TokenIdKeywordConst, + TokenIdRParen, + TokenIdComma, + TokenIdLBrace, + TokenIdRBrace, + TokenIdStringLiteral, + TokenIdSemicolon, + TokenIdNumberLiteral, + TokenIdPlus, + TokenIdColon, + TokenIdArrow, + TokenIdDash, +}; + +struct Token { + TokenId id; + int start_pos; + int end_pos; + int start_line; + int start_column; +}; + +enum TokenizeState { + TokenizeStateStart, + TokenizeStateSymbol, + TokenizeStateNumber, + TokenizeStateString, + TokenizeStateSawDash, +}; + +ZigList<Token> *tokenize(Buf *buf, Buf *cur_dir_path); + +void print_tokens(Buf *buf, ZigList<Token> *tokens); + +#endif diff --git a/src/util.hpp b/src/util.hpp index 0cb731b95d..427b1c3294 100644 --- a/src/util.hpp +++ b/src/util.hpp @@ -60,4 +60,12 @@ template <typename T> static inline T clamp(T min_value, T value, T max_value) { return max(min(value, max_value), min_value); } + +static inline bool mem_eql_str(const char *mem, size_t mem_len, const char *str) { + size_t str_len = strlen(str); + if (str_len != mem_len) + return false; + return memcmp(mem, str, mem_len) == 0; +} + #endif