-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathtokenize.c
148 lines (126 loc) · 3.76 KB
/
tokenize.c
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
#include "jcc.h"
// Start of the source text most recently passed to tokenize(). verror_at()
// prints it and uses it to compute how far an error location is from the start.
static char *current_input;
// Prints a printf-style message followed by a newline to stderr, then
// terminates the process with exit status 1. Never returns.
void error(char *fmt, ...) {
  va_list args;

  va_start(args, fmt);
  vfprintf(stderr, fmt, args);
  fputc('\n', stderr);
  exit(1);
}
// Prints the whole input held in current_input, then on the next line a '^'
// marker indented by the offset of 'loc' from the start of that input,
// followed by the printf-style message. Exits with status 1; never returns.
// 'loc' must point into the buffer referenced by current_input.
static void verror_at(char *loc, char *fmt, va_list ap) {
  int pos = (int)(loc - current_input);

  fprintf(stderr, "%s\n", current_input);
  // Padding an empty string to width 'pos' yields exactly 'pos' spaces.
  // Padding a one-space string would still emit one space when pos == 0 and
  // push the caret one column too far to the right.
  fprintf(stderr, "%*s", pos, "");
  fprintf(stderr, "^ ");
  vfprintf(stderr, fmt, ap);
  fprintf(stderr, "\n");
  exit(1);
}
// Reports a printf-style error message at input position 'loc'.
// Delegates to verror_at(), which exits, so this function never returns.
void error_at(char *loc, char *fmt, ...) {
  va_list args;

  va_start(args, fmt);
  verror_at(loc, fmt, args);
}
// Reports a printf-style error message at the input position where 'tok'
// begins (tok->loc). Delegates to verror_at(), which exits, so this function
// never returns.
void error_tok(Token *tok, char *fmt, ...) {
  va_list args;

  va_start(args, fmt);
  verror_at(tok->loc, fmt, args);
}
// Returns true if the text of 'tok' (tok->len bytes starting at tok->loc) is
// exactly the NUL-terminated string 'op'. This only compares; it does not
// advance past or otherwise consume the token.
bool equal(Token *tok, char *op) {
  // Check the lengths first so memcmp never reads beyond the terminator of
  // 'op' when the token is longer than 'op' (e.g. "return" against "if").
  size_t op_len = strlen(op);
  return (size_t)tok->len == op_len && memcmp(tok->loc, op, op_len) == 0;
}
// Checks that 'tok' spells exactly 's' and returns the token that follows it.
// On a mismatch the error is reported at 'tok' through error_tok().
Token *skip(Token *tok, char *s) {
  bool matched = equal(tok, s);

  if (!matched) {
    error_tok(tok, "expected '%s'", s);
  }
  return tok->next;
}
// Allocates a zero-initialized token of the given kind that covers the input
// range [start, end). Prints a message and exits with status 1 if the
// allocation fails.
static Token *new_token(TokenKind kind, char *start, char *end) {
  Token *tok = calloc(1, sizeof *tok);

  if (!tok) {
    // Without this check the field stores below would dereference NULL.
    fprintf(stderr, "out of memory\n");
    exit(1);
  }
  tok->kind = kind;
  tok->loc = start;
  tok->len = end - start;
  return tok;
}
// https://en.cppreference.com/w/c/language/operator_precedence
// Returns true if the string 'p' begins with the string 'q'.
static bool startswith(char *p, char *q) {
  // Walk both strings together; any mismatch before 'q' ends (including 'p'
  // ending first) means 'q' is not a prefix of 'p'.
  for (; *q != '\0'; p++, q++) {
    if (*p != *q) {
      return false;
    }
  }
  return true;
}
// Returns true if 'c' is valid as the first character of an identifier:
// a letter in 'a'..'z' or 'A'..'Z', or an underscore.
static bool is_ident1(char c) {
  if (c == '_') {
    return true;
  }
  bool lower = c >= 'a' && c <= 'z';
  bool upper = c >= 'A' && c <= 'Z';
  return lower || upper;
}
// Returns true if 'c' is valid as a non-first character of an identifier:
// anything is_ident1() accepts, or a digit in '0'..'9'.
static bool is_ident2(char c) {
  bool digit = c >= '0' && c <= '9';
  return digit || is_ident1(c);
}
// Returns the length in bytes of the punctuator that starts at 'p', or 0 if
// 'p' does not start with one. The two-character comparison operators are
// tried first so that, for example, "<=" is not reported as a lone "<".
static int read_punct(char *p) {
  if (startswith(p, "==") || startswith(p, "!=") ||
      startswith(p, "<=") || startswith(p, ">=")) {
    return 2;
  }
  // <ctype.h> functions require a value representable as unsigned char (or
  // EOF); a plain char holding a byte >= 0x80 may be negative, so convert
  // before classifying.
  return ispunct((unsigned char)*p) ? 1 : 0;
}
// Returns true if the text of 'tok' is one of the reserved words in the
// table below.
static bool is_keyword(Token *tok) {
  static char *keywords[] = {"return", "if", "else", "for", "while"};
  char **end = keywords + sizeof(keywords) / sizeof(*keywords);

  for (char **word = keywords; word != end; word++) {
    if (equal(tok, *word)) {
      return true;
    }
  }
  return false;
}
// Walks the token list starting at 'tok' and sets the kind of every token
// that is_keyword() accepts to TK_KEYWORD.
static void convert_keyword(Token *tok) {
  Token *node = tok;

  while (node) {
    if (is_keyword(node)) {
      node->kind = TK_KEYWORD;
    }
    node = node->next;
  }
}
// Tokenizes the NUL-terminated string 'p' and returns the first token of a
// singly linked list that always ends with a TK_EOF token. Tokens record
// pointers into 'p' rather than copies of it, and 'p' is remembered in
// current_input for error reporting. A character that starts no known token
// is reported through error_at(), which exits.
Token *tokenize(char *p) {
  current_input = p;

  // Dummy list head so the loop can always append through cur->next.
  // {0} is the standard zero initializer; the empty form {} is not valid C
  // before C23.
  Token head = {0};
  Token *cur = &head;

  while (*p) {
    // <ctype.h> functions require a value representable as unsigned char (or
    // EOF); a plain char holding a byte >= 0x80 may be negative.
    unsigned char c = (unsigned char)*p;

    // Skip whitespace characters
    if (isspace(c)) {
      p++;
      continue;
    }

    // Numeric literal
    if (isdigit(c)) {
      // Append the token first, then let strtol() both parse the value and
      // advance 'p' past the digits; the length is filled in afterwards.
      char *start = p;
      cur = cur->next = new_token(TK_NUM, p, p);
      cur->val = strtol(start, &p, 10);
      cur->len = p - start;
      continue;
    }

    // Identifier (keywords are re-tagged by convert_keyword() below)
    if (is_ident1(*p)) {
      char *start = p;
      do {
        p++;
      } while (is_ident2(*p));
      cur = cur->next = new_token(TK_IDENT, start, p);
      continue;
    }

    // Punctuators
    int punct_len = read_punct(p);
    if (punct_len) {
      cur = cur->next = new_token(TK_PUNCT, p, p + punct_len);
      p += cur->len;
      continue;
    }

    // Error: nothing above recognized this character
    error_at(p, "unexpected charater: %c", *p);
  }

  cur = cur->next = new_token(TK_EOF, p, p);
  convert_keyword(head.next);
  return head.next;
}