tokenizing hello.zig

This commit is contained in:
Andrew Kelley 2015-11-01 22:21:33 -07:00
parent 5f48463bdd
commit 34f8d80eac
6 changed files with 182 additions and 178 deletions

View file

@ -1,5 +1,32 @@
# zig lang
C upgrade.
An experiment in writing a low-level programming language with the intent to
replace C. Zig intends to be a small language, yet powerful enough to write
readable, safe, optimal, and concise code to solve any computing problem.
Start with C.
## Goals
* Ability to run arbitrary code at compile time and generate code.
* Completely compatible with C libraries with no wrapper necessary.
* Creating a C library should be a primary use case. Should be easy to export
an auto-generated .h file.
* Generics such as containers.
* Do not depend on libc.
* First class error code support.
* Include documentation generator.
* Eliminate the need for make, cmake, etc.
* Friendly toward package maintainers.
* Eliminate the need for C headers (when using zig internally).
* Ability to declare dependencies as Git URLs with commit locking (can
provide a tag or SHA-1).
* Rust-style enums.
* Opinionated when it makes life easier.
- Tab character in source code is a compile error.
- Whitespace at the end of line is a compile error.
* Resilient to parsing errors to make IDE integration work well.
* Source code is UTF-8.
## Roadmap
* Hello, world.
* How should the Widget use case be solved? In Genesis I'm using C++ and inheritance.

View file

@ -23,3 +23,24 @@ Buf *buf_sprintf(const char *format, ...) {
return buf;
}
// Append printf-style formatted text to the end of buf.
// Formats twice: the first vsnprintf pass measures the required length,
// the second writes directly into the grown buffer.
void buf_appendf(Buf *buf, const char *format, ...) {
    va_list args;
    va_list args_copy;
    va_start(args, format);
    va_copy(args_copy, args);

    // First pass: how many characters (excluding NUL) will be produced?
    int needed = vsnprintf(nullptr, 0, format, args);
    assert(needed >= 0);

    // Grow the buffer to fit the old contents plus new text and its NUL.
    size_t grow_by = needed + 1;
    int old_len = buf_len(buf);
    buf_resize(buf, old_len + grow_by);

    // Second pass: format in place at the old end of the buffer.
    int written = vsnprintf(buf_ptr(buf) + old_len, grow_by, format, args_copy);
    assert(written == needed);

    va_end(args_copy);
    va_end(args);
}

View file

@ -93,6 +93,9 @@ static inline void buf_append_char(Buf *buf, uint8_t c) {
buf_append_mem(buf, (const char *)&c, 1);
}
void buf_appendf(Buf *buf, const char *format, ...)
__attribute__ ((format (printf, 2, 3)));
static inline bool buf_eql_mem(Buf *buf, const char *mem, int mem_len) {
if (buf_len(buf) != mem_len)
return false;

View file

@ -129,7 +129,6 @@ static Buf *fetch_file(FILE *f) {
enum TokenId {
TokenIdDirective,
TokenIdSymbol,
TokenIdLParen,
TokenIdRParen,
@ -141,6 +140,9 @@ enum TokenId {
TokenIdSemicolon,
TokenIdNumberLiteral,
TokenIdPlus,
TokenIdColon,
TokenIdArrow,
TokenIdDash,
};
struct Token {
@ -153,14 +155,10 @@ struct Token {
enum TokenizeState {
TokenizeStateStart,
TokenizeStateDirective,
TokenizeStateDirectiveName,
TokenizeStateIncludeQuote,
TokenizeStateDirectiveEnd,
TokenizeStateInclude,
TokenizeStateSymbol,
TokenizeStateString,
TokenizeStateNumber,
TokenizeStateString,
TokenizeStateSawDash,
};
struct Tokenize {
@ -171,11 +169,7 @@ struct Tokenize {
int line;
int column;
Token *cur_tok;
Buf *directive_name;
Buf *cur_dir_path;
uint8_t unquote_char;
int quote_start_pos;
Buf *include_path;
ZigList<char *> *include_paths;
};
@ -217,68 +211,6 @@ static void end_token(Tokenize *t) {
t->cur_tok = nullptr;
}
// Rewind the scan cursor by count characters so the main tokenize loop
// revisits them (used when a token ends on a terminator character).
static void put_back(Tokenize *t, int count) {
    t->pos = t->pos - count;
}
// Switch the tokenizer into directive mode (a '#' was just seen) and open a
// TokenIdDirective token.  directive_name will accumulate the word after '#'.
static void begin_directive(Tokenize *t) {
t->state = TokenizeStateDirective;
begin_token(t, TokenIdDirective);
// nested directives are not possible; the previous one must have cleared this
assert(!t->directive_name);
t->directive_name = buf_alloc();
}
// Try to open dir_path/file_path; on success, splice the file's contents into
// the source buffer at the current scan position (so the tokenizer reads the
// included text next) and return true.  Returns false if the file does not
// open, letting the caller try the next search directory.
static bool find_and_include_file(Tokenize *t, char *dir_path, char *file_path) {
    Buf *full_path = buf_sprintf("%s/%s", dir_path, file_path);
    FILE *f = fopen(buf_ptr(full_path), "rb");
    if (!f)
        return false;
    Buf *contents = fetch_file(f);
    // Fix: the handle opened above was never closed, leaking a FILE* per
    // include.  NOTE(review): assumes fetch_file only reads from f and does
    // not close it itself -- verify against fetch_file's definition.
    fclose(f);
    buf_splice_buf(t->buf, t->pos, t->pos, contents);
    return true;
}
// Resolve and expand an include directive.  Quoted includes ("...") search
// the including file's own directory first; all includes then fall back to
// the configured include search paths, in order.  Failure is fatal.
static void render_include(Tokenize *t, Buf *target_path, char unquote_char) {
    if (unquote_char == '"' &&
        find_and_include_file(t, buf_ptr(t->cur_dir_path), buf_ptr(target_path)))
    {
        return;
    }
    for (int i = 0; i < t->include_paths->length; i += 1) {
        char *search_dir = t->include_paths->at(i);
        if (find_and_include_file(t, search_dir, buf_ptr(target_path)))
            return;
    }
    tokenize_error(t, "include path \"%s\" not found", buf_ptr(target_path));
}
// Close the directive token.  If an #include path was collected while
// scanning the directive, expand it now, then return to the start state.
static void end_directive(Tokenize *t) {
    end_token(t);
    Buf *pending_include = t->include_path;
    if (pending_include) {
        t->include_path = nullptr;
        render_include(t, pending_include, t->unquote_char);
    }
    t->state = TokenizeStateStart;
}
// Validate the directive name just scanned.  Only "include" is supported;
// anything else is a fatal tokenize error.
static void end_directive_name(Tokenize *t) {
    if (!buf_eql_str(t->directive_name, "include")) {
        tokenize_error(t, "invalid directive name: \"%s\"", buf_ptr(t->directive_name));
        return;
    }
    t->directive_name = nullptr;
    t->state = TokenizeStateInclude;
}
// Finish the current symbol/number token.  The character that terminated it
// is pushed back so the start state re-examines it on the next iteration.
static void end_symbol(Tokenize *t) {
// exclude the terminator from the token before recording end position
put_back(t, 1);
end_token(t);
t->state = TokenizeStateStart;
}
static ZigList<Token> *tokenize(Buf *buf, ZigList<char *> *include_paths, Buf *cur_dir_path) {
Tokenize t = {0};
t.tokens = allocate<ZigList<Token>>(1);
@ -300,9 +232,6 @@ static ZigList<Token> *tokenize(Buf *buf, ZigList<char *> *include_paths, Buf *c
t.state = TokenizeStateNumber;
begin_token(&t, TokenIdNumberLiteral);
break;
case '#':
begin_directive(&t);
break;
case '"':
begin_token(&t, TokenIdStringLiteral);
t.state = TokenizeStateString;
@ -335,88 +264,31 @@ static ZigList<Token> *tokenize(Buf *buf, ZigList<char *> *include_paths, Buf *c
begin_token(&t, TokenIdSemicolon);
end_token(&t);
break;
case ':':
begin_token(&t, TokenIdColon);
end_token(&t);
break;
case '+':
begin_token(&t, TokenIdPlus);
end_token(&t);
break;
case '-':
begin_token(&t, TokenIdDash);
t.state = TokenizeStateSawDash;
break;
default:
tokenize_error(&t, "invalid character: '%c'", c);
}
break;
case TokenizeStateDirective:
switch (c) {
case '\n':
end_directive_name(&t);
end_directive(&t);
break;
case ' ':
case '\t':
case '\f':
case '\r':
case 0xb:
break;
case SYMBOL_CHAR:
t.state = TokenizeStateDirectiveName;
buf_append_char(t.directive_name, c);
break;
default:
tokenize_error(&t, "invalid directive character: '%c'", c);
break;
}
break;
case TokenizeStateDirectiveName:
switch (c) {
case WHITESPACE:
end_directive_name(&t);
break;
case SYMBOL_CHAR:
buf_append_char(t.directive_name, c);
break;
default:
tokenize_error(&t, "invalid directive name character: '%c'", c);
break;
}
break;
case TokenizeStateInclude:
switch (c) {
case WHITESPACE:
break;
case '<':
case '"':
t.state = TokenizeStateIncludeQuote;
t.quote_start_pos = t.pos;
t.unquote_char = (c == '<') ? '>' : '"';
break;
}
break;
case TokenizeStateIncludeQuote:
if (c == t.unquote_char) {
t.include_path = buf_slice(t.buf, t.quote_start_pos + 1, t.pos);
t.state = TokenizeStateDirectiveEnd;
}
break;
case TokenizeStateDirectiveEnd:
switch (c) {
case '\n':
end_directive(&t);
break;
case ' ':
case '\t':
case '\f':
case '\r':
case 0xb:
break;
default:
tokenize_error(&t, "expected whitespace or newline: '%c'", c);
}
break;
case TokenizeStateSymbol:
switch (c) {
case SYMBOL_CHAR:
break;
default:
end_symbol(&t);
break;
t.pos -= 1;
end_token(&t);
t.state = TokenizeStateStart;
continue;
}
break;
case TokenizeStateString:
@ -434,7 +306,22 @@ static ZigList<Token> *tokenize(Buf *buf, ZigList<char *> *include_paths, Buf *c
case DIGIT:
break;
default:
end_symbol(&t);
t.pos -= 1;
end_token(&t);
t.state = TokenizeStateStart;
continue;
}
break;
case TokenizeStateSawDash:
switch (c) {
case '>':
t.cur_tok->id = TokenIdArrow;
end_token(&t);
t.state = TokenizeStateStart;
break;
default:
end_token(&t);
t.state = TokenizeStateStart;
break;
}
break;
@ -450,30 +337,17 @@ static ZigList<Token> *tokenize(Buf *buf, ZigList<char *> *include_paths, Buf *c
switch (t.state) {
case TokenizeStateStart:
break;
case TokenizeStateDirective:
end_directive(&t);
break;
case TokenizeStateDirectiveName:
end_directive_name(&t);
end_directive(&t);
break;
case TokenizeStateInclude:
tokenize_error(&t, "missing include path");
break;
case TokenizeStateSymbol:
end_symbol(&t);
end_token(&t);
break;
case TokenizeStateString:
tokenize_error(&t, "unterminated string");
break;
case TokenizeStateNumber:
end_symbol(&t);
end_token(&t);
break;
case TokenizeStateIncludeQuote:
tokenize_error(&t, "unterminated include path");
break;
case TokenizeStateDirectiveEnd:
end_directive(&t);
case TokenizeStateSawDash:
end_token(&t);
break;
}
assert(!t.cur_tok);
@ -482,7 +356,6 @@ static ZigList<Token> *tokenize(Buf *buf, ZigList<char *> *include_paths, Buf *c
static const char * token_name(Token *token) {
switch (token->id) {
case TokenIdDirective: return "Directive";
case TokenIdSymbol: return "Symbol";
case TokenIdLParen: return "LParen";
case TokenIdRParen: return "RParen";
@ -494,6 +367,9 @@ static const char * token_name(Token *token) {
case TokenIdSemicolon: return "Semicolon";
case TokenIdNumberLiteral: return "NumberLiteral";
case TokenIdPlus: return "Plus";
case TokenIdColon: return "Colon";
case TokenIdArrow: return "Arrow";
case TokenIdDash: return "Dash";
}
return "(invalid token)";
}
@ -507,6 +383,83 @@ static void print_tokens(Buf *buf, ZigList<Token> *tokens) {
}
}
// Kind tag for AST nodes; only the root node type exists so far.
enum NodeType {
NodeTypeRoot,
};
// A node in the abstract syntax tree produced by build_ast.
struct AstNode {
enum NodeType type;
// child nodes in source order
ZigList<AstNode *> children;
};
// State machine states for build_ast; only the initial state exists so far.
enum AstState {
AstStateStart,
};
// Mutable state threaded through build_ast.
struct BuildAst {
Buf *buf;       // full source text; token offsets index into it
AstNode *root;  // root of the tree being built
AstState state; // current parse state
int line;       // 0-based line of the token being processed (for error reports)
int column;     // 0-based column of the token being processed
};
__attribute__ ((format (printf, 2, 3)))
// Report a fatal parse error at the current token's position and exit.
// Positions are stored 0-based in BuildAst but printed 1-based for humans.
static void ast_error(BuildAst *b, const char *format, ...) {
    va_list args;
    va_start(args, format);
    fprintf(stderr, "Error: Line %d, column %d: ", b->line + 1, b->column + 1);
    vfprintf(stderr, format, args);
    fprintf(stderr, "\n");
    va_end(args);
    exit(EXIT_FAILURE);
}
// True when the mem_len bytes at mem exactly spell the NUL-terminated str
// (same length and same contents; mem need not be NUL-terminated).
static inline bool mem_eql_str(const char *mem, size_t mem_len, const char *str) {
    return strlen(str) == mem_len && memcmp(mem, str, mem_len) == 0;
}
// Build the AST for the given token stream.  Currently only recognizes the
// start of a "fn" declaration (which panics as unimplemented); any other
// leading token is a fatal parse error.
static AstNode *build_ast(Buf *buf, ZigList<Token> *tokens) {
    BuildAst b = {0};
    b.buf = buf;
    b.root = allocate<AstNode>(1);
    b.root->type = NodeTypeRoot;

    for (int i = 0; i < tokens->length; i += 1) {
        Token *token = &tokens->at(i);
        const char *token_str = buf_ptr(buf) + token->start_pos;
        int token_len = token->end_pos - token->start_pos;
        // Track position so ast_error can point at this token.
        b.line = token->start_line;
        b.column = token->start_column;

        switch (b.state) {
            case AstStateStart:
                if (!mem_eql_str(token_str, token_len, "fn")) {
                    // Build the message manually to embed the (non
                    // NUL-terminated) token text; ast_error exits.
                    Buf msg = {0};
                    buf_append_str(&msg, "unexpected symbol: '");
                    buf_append_mem(&msg, token_str, token_len);
                    buf_append_str(&msg, "'");
                    ast_error(&b, "%s", buf_ptr(&msg));
                }
                zig_panic("TODO fn");
                break;
        }
    }
    return b.root;
}
// Dump the AST for debugging.  Not implemented yet; panics unconditionally.
static void print_ast(AstNode *node) {
zig_panic("TODO");
}
char cur_dir[1024];
int main(int argc, char **argv) {
@ -559,18 +512,18 @@ int main(int argc, char **argv) {
Buf *in_data = fetch_file(in_f);
fprintf(stderr, "Original source:\n%s\n", buf_ptr(in_data));
fprintf(stderr, "Original source:\n");
fprintf(stderr, "----------------\n");
fprintf(stderr, "%s\n", buf_ptr(in_data));
ZigList<Token> *tokens = tokenize(in_data, &include_paths, cur_dir_path);
fprintf(stderr, "\nTokens:\n");
fprintf(stderr, "---------\n");
print_tokens(in_data, tokens);
/*
Buf *preprocessed_source = preprocess(in_data, tokens, &include_paths, cur_dir_path);
fprintf(stderr, "\nPreprocessed source:\n%s\n", buf_ptr(preprocessed_source));
*/
AstNode *root = build_ast(in_data, tokens);
print_ast(root);
return EXIT_SUCCESS;

View file

@ -1,3 +1,3 @@
int add(int a, int b) {
return a + b;
pub fn add(a: int, b: int) -> int {
a + b
}

View file

@ -1,6 +1,6 @@
#include <stdio.h>
#include "add.h"
int main(int argc, char **argv) {
fprintf(stderr, "hello: %d", add(1, 2));
fn main(argc: int, argv: *mut char) -> int {
puts("Hello, world!\n");
return 0;
}