Kestrel Interface
Loading...
Searching...
No Matches
kest_tokenizer.c
Go to the documentation of this file.
1#include <stdlib.h>
2#include <stdio.h>
3#include <string.h>
4
5#include "kest_int.h"
6
7#ifndef PRINTLINES_ALLOWED
8#define PRINTLINES_ALLOWED 0
9#endif
10
11static const char *FNAME = "kest_tokenizer.c";
12
14
/* True when c is an ASCII letter (A-Z or a-z). */
int char_is_letter(char c)
{
	if (c >= 'A' && c <= 'Z')
		return 1;

	return (c >= 'a' && c <= 'z');
}
19
/* True when c is an ASCII decimal digit. */
int char_is_number(char c)
{
	return (c >= '0') && (c <= '9');
}
24
/* True when c is an ASCII letter or decimal digit.
 * NOTE(review): the signature line was dropped by the listing
 * extraction; restored from the file's declaration list
 * (`int char_is_alphanumeric(char c)`). */
int char_is_alphanumeric(char c)
{
	return (char_is_letter(c) || char_is_number(c));
}
29
/* True when c is any opening or closing bracket: ( ) [ ] { }. */
int char_is_bracket(char c)
{
	switch (c)
	{
	case '(': case ')':
	case '[': case ']':
	case '{': case '}':
		return 1;
	default:
		return 0;
	}
}
35
/*
 * Return 1 when c occurs in the NUL-terminated string str, 0 otherwise
 * (or when str is NULL).  Like the original hand-rolled scan, the NUL
 * terminator itself counts as "in the string", so c == 0 yields 1 for
 * any non-NULL str — token_is_number() relies on the scan stopping at
 * the first embedded NUL, which strchr() also does.
 */
int char_is_in_string(char c, const char *str)
{
	if (!str)
		return 0;

	return (strchr(str, c) != NULL);
}
51
52
/* True when str is exactly the one-character string consisting of c. */
int token_is_char(char *str, char c)
{
	if (!str)
		return 0;

	if (str[0] != c)
		return 0;

	return (str[1] == 0);
}
57
/* True when str is exactly the single-character newline token "\n".
 * (Equivalent to token_is_char(str, '\n'), inlined here.) */
int token_is_newline(char *str)
{
	return (str != NULL && str[0] == '\n' && str[1] == 0);
}
62
/*
 * Return 1 when token consists entirely of decimal digits.
 *
 * Fix: the original accepted the empty string "" (the digit loop never
 * ran, so it fell through to `return 1`); NULL and "" are now both
 * rejected, matching what "is an integer literal" actually means.
 */
int token_is_int(char *token)
{
	if (!token || token[0] == 0)
		return 0;

	for (int pos = 0; token[pos]; pos++)
	{
		if (token[pos] < '0' || token[pos] > '9')
			return 0;
	}

	return 1;
}
79
/*
 * Validate token as a numeric literal: decimal ("12", "3.5", ".5"),
 * binary ("0b101") or hexadecimal ("0x1aF").  Returns 1 when valid,
 * 0 otherwise (including NULL).
 *
 * Implementation trick: allowed_chars is a table scanned by
 * char_is_in_string(), whose scan stops at the first NUL.  The embedded
 * NULs at slots 11-13 therefore act as gates: overwriting them with
 * non-NUL characters extends the allowed set ('b'/'x' prefixes, then
 * the hex letters at slots 14+), while overwriting slot 0 ('.') or
 * slot 3 forbids a second decimal point or restricts binary digits.
 */
int token_is_number(char *token)
{
	if (!token)
		return 0;

	int len = strlen(token);

	/* slots 0-10: ".0123456789" | 11-13: NUL gates | 14-25: hex letters */
	char allowed_chars[27] = ".0123456789\0\0\0abcdefABCDEF\0";

	if (!char_is_in_string(token[0], allowed_chars))
		return 0;

	if (token[0] == '0')
	{
		/* A leading zero may introduce a binary or hex literal. */
		allowed_chars[11] = 'b';
		allowed_chars[12] = 'x';
	}
	else if (token[0] == '.')
	{
		/* Leading '.' consumes the one permitted decimal point. */
		allowed_chars[0] = '0';

		/* "." alone is not a number. */
		if (len == 1)
			return 0;
	}

	for (int i = 1; i < len; i++)
	{
		if (!char_is_in_string(token[i], allowed_chars))
			return 0;

		/* First '.' seen: no further '.' allowed. */
		if (token[i] == '.')
			allowed_chars[0] = '0';

		if (i == 1)
		{
			if (token[i] == 'x' || token[i] == 'b')
			{
				/* Prefix consumed: 'b'/'x' themselves no longer allowed. */
				allowed_chars[11] = '0';
				allowed_chars[12] = '0';

				/* "0b" / "0x" with no digits after it is invalid. */
				if (len < 3)
					return 0;

				if (token[i] == 'b')
					allowed_chars[3] = 0;	/* binary: truncate table after '1' */
				else
					allowed_chars[13] = '0';	/* hex: open the gate to a-f/A-F */
			}
		}
	}

	return 1;
}
133
/*
 * Return 1 when token is a dictionary-entry separator: ",", "\n" or ";".
 * NOTE(review): the signature line was dropped by the listing
 * extraction; restored from the file's declaration list
 * (`int token_is_dict_entry_seperator(char *token)`).
 */
int token_is_dict_entry_seperator(char *token)
{
	if (!token)
		return 0;

	if (strcmp(token, ",") == 0)
		return 1;

	if (strcmp(token, "\n") == 0)
		return 1;

	if (strcmp(token, ";") == 0)
		return 1;

	return 0;
}
150
/*
 * Map a digit character to its numeric value as a float:
 * '0'-'9' -> 0..9, 'a'-'f' / 'A'-'F' -> 10..15.  Any other character
 * maps to 0.0f.
 *
 * Bug fix: the hex branches previously returned c - 'a' / c - 'A'
 * (i.e. 0..5) instead of 10..15, so every hex literal containing a
 * letter digit was parsed to the wrong value by token_to_float().
 */
float digit_to_float(char c)
{
	if ('0' <= c && c <= '9')
		return (float)(c - '0');
	else if ('a' <= c && c <= 'f')
		return (float)(c - 'a' + 10);
	else if ('A' <= c && c <= 'F')
		return (float)(c - 'A' + 10);

	return 0.0f;
}
162
/*
 * Convert a numeric token to a float.  Handles decimal ("3.25"),
 * binary ("0b101") and hexadecimal ("0x1f") forms; a '.' switches
 * accumulation into the fractional part, which is divided back down
 * by the radix once per fractional digit.  Digits are converted via
 * digit_to_float().  Assumes the token already passed
 * token_is_number(); returns 0 for NULL.
 */
float token_to_float(char *token)
{
	if (!token)
		return 0;

	int len = strlen(token);

	float value = 0.0;
	float radix = 10.0f;
	int frac_digits = 0;
	int in_fraction = 0;
	int start = 0;

	/* A "0b" / "0x" prefix selects the radix and skips two chars. */
	if (len > 2 && token[0] == '0')
	{
		if (token[1] == 'b')
		{
			radix = 2.0f;
			start = 2;
		}
		else if (token[1] == 'x')
		{
			radix = 16.0f;
			start = 2;
		}
	}

	for (int i = start; i < len; i++)
	{
		if (token[i] == '.')
		{
			in_fraction = 1;
		}
		else
		{
			value = value * radix + digit_to_float(token[i]);
			frac_digits += in_fraction;
		}
	}

	/* Undo the radix scaling applied to digits after the point. */
	for (; frac_digits > 0; frac_digits--)
		value /= radix;

	return value;
}
214
/*
 * Return 1 when token is a valid identifier: one or more characters,
 * each a letter, underscore, or (except in first position) a digit.
 * NULL and the empty string are rejected.
 */
int token_is_name(char *token)
{
	if (!token || token[0] == 0)
		return 0;

	for (int i = 0; token[i]; i++)
	{
		char c = token[i];
		int letterish = (('A' <= c && c <= 'Z') ||
				 ('a' <= c && c <= 'z') || c == '_');
		int digit = ('0' <= c && c <= '9');

		if (!letterish && !(digit && i > 0))
			return 0;
	}

	return 1;
}
238
240{
241 if (!list)
242 return ERR_NULL_PTR;
243
244 if (*list)
245 *list = (*list)->next;
246 else
247 return NO_ERROR;
248
249 kest_token_ll *current = *list;
250
251 if (!current)
252 return NO_ERROR;
253
254 int cont;
255 while (current && current->data)
256 {
257 cont = 0;
258
259 if (current->data[0] == '\n' || current->data[0] == ' ' || current->data[0] == '\t')
260 cont = 1;
261
262 if (!cont)
263 break;
264
265 current = current->next;
266 }
267
268 *list = current;
269 return NO_ERROR;
270}
271
273{
274 if (!list)
275 return ERR_NULL_PTR;
276
277 kest_token_ll *current = *list;
278
279 if (!current)
280 return NO_ERROR;
281
282 int cont;
283 while (current && current->data)
284 {
285 cont = 0;
286
287 if (current->data[0] == '\n' || current->data[0] == ' ' || current->data[0] == '\t')
288 cont = 1;
289
290 if (!cont)
291 break;
292
293 current = current->next;
294 }
295
296 *list = current;
297 return NO_ERROR;
298}
299
300
301int kest_token_ll_safe_append(kest_token_ll **list_ptr, char *x, int line, int index)
302{
303 if (!list_ptr)
304 return ERR_NULL_PTR;
305
306 kest_token_ll *node = kest_alloc(sizeof(kest_token_ll));
307
308 if (!node)
309 return ERR_ALLOC_FAIL;
310
311 node->data = x;
312 node->line = line;
313 node->index = index;
314 node->next = NULL;
315
316 if (*list_ptr)
317 {
318 kest_token_ll *current = *list_ptr;
319
320 while (current->next)
321 current = current->next;
322
323 current->next = node;
324 }
325 else
326 {
327 *list_ptr = node;
328 }
329
330 return NO_ERROR;
331}
332
333int kest_token_ll_safe_aappend(kest_token_ll **list_ptr, char *x, int line, int index)
334{
335 if (!list_ptr)
336 return ERR_NULL_PTR;
337
339
340 if (!node)
341 return ERR_ALLOC_FAIL;
342
343 node->data = x;
344 node->line = line;
345 node->index = index;
346 node->next = NULL;
347
348 if (*list_ptr)
349 {
350 kest_token_ll *current = *list_ptr;
351
352 while (current->next)
353 current = current->next;
354
355 current->next = node;
356 }
357 else
358 {
359 *list_ptr = node;
360 }
361
362 return NO_ERROR;
363}
364
/*
 * Per-character tokenizer state machine.  Given the next input
 * character c and the current state in *state_ptr, update the state
 * and return a TOKENIZER_POLICY_* action telling the main tokenize
 * loop what to do with c.  Returns -1 when state_ptr is NULL.
 *
 * NOTE(review): this listing was recovered from generated
 * documentation and many physical lines were dropped — in particular
 * the `return TOKENIZER_POLICY_*;` statements and the
 * `case TOKENIZER_STATE_*:` labels inside the switch.  Each gap is
 * marked below; the code as shown is incomplete and must be
 * reconciled with the original source before building.
 */
int tokenizer_policy(char c, int *state_ptr)
{
	if (!state_ptr)
		return -1;

	int state = *state_ptr;

	/* End of input (NUL or EOF) terminates tokenization. */
	if (c == 0 || c == EOF)
	{
		*state_ptr = TOKENIZER_STATE_DONE;
		/* NOTE(review): dropped line — presumably a return. */
	}

	/* Inside a string literal: '"' closes it, '\\' starts an escape. */
	if (state == TOKENIZER_STATE_STRING)
	{
		if (c == '"')
		{
			*state_ptr = TOKENIZER_STATE_IDLE;
			/* NOTE(review): dropped line — presumably a return. */
		}

		if (c == '\\')
		{
			*state_ptr = TOKENIZER_STATE_STRESC;
			/* NOTE(review): dropped line — presumably a return. */
		}

		if (c == '\n')
			/* NOTE(review): body of this branch was dropped. */

	}

	/* After a backslash escape, return to plain string state. */
	if (state == TOKENIZER_STATE_STRESC)
	{
		*state_ptr = TOKENIZER_STATE_STRING;
		/* NOTE(review): dropped line — presumably a return. */
	}

	/* Whitespace outside IDLE ends the current token. */
	if (state != TOKENIZER_STATE_IDLE && (c == ' ' || c == '\t'))
	{
		*state_ptr = TOKENIZER_STATE_IDLE;
		/* NOTE(review): dropped line — presumably a return. */
	}

	/* Structural single-character tokens; '.' counts only when it is
	 * not part of a numeric literal already in progress. */
	if (c == '\n' || char_is_bracket(c) || c == ':' || c == ',' ||
	    (c == '.' && (state != TOKENIZER_STATE_LEADING_ZERO && state != TOKENIZER_STATE_NUMBER && state != TOKENIZER_STATE_NUMBER_BIN && state != TOKENIZER_STATE_NUMBER_HEX)))
	{
		*state_ptr = TOKENIZER_STATE_IDLE;
		/* NOTE(review): dropped line — presumably a return. */
	}

	switch (state)
	{
	/* NOTE(review): dropped case label — likely TOKENIZER_STATE_IDLE. */
		if (c == ' ' || c == '\t') return TOKENIZER_POLICY_DISCARD;

		if (char_is_letter(c))
		{
			*state_ptr = TOKENIZER_STATE_NAME;
			/* NOTE(review): dropped line — presumably a return. */
		}

		if (char_is_number(c))
		{
			if (c == '0')
				*state_ptr = TOKENIZER_STATE_LEADING_ZERO;
			else
				*state_ptr = TOKENIZER_STATE_NUMBER;

			/* NOTE(review): dropped line — presumably a return. */
		}

		if (c == '"')
		{
			*state_ptr = TOKENIZER_STATE_STRING;
			/* NOTE(review): dropped line — presumably a return. */
		}

		/* NOTE(review): dropped lines — likely a return and/or the
		 * next case label (name state). */

		if (c == '_' || char_is_alphanumeric(c))
			/* NOTE(review): dropped line — presumably a return. */

		if (char_is_in_string(c, " \t"))
		{
			*state_ptr = TOKENIZER_STATE_IDLE;
			/* NOTE(review): dropped line — presumably a return. */
		}

		/* NOTE(review): dropped lines — likely a return and/or the
		 * next case label (leading-zero state). */

		if (c == 'b')
		{
			*state_ptr = TOKENIZER_STATE_NUMBER_BIN;
			/* NOTE(review): dropped line — presumably a return. */
		}

		if (c == 'x')
		{
			*state_ptr = TOKENIZER_STATE_NUMBER_HEX;
			/* NOTE(review): dropped line — presumably a return. */
		}

		if (c == '.')
		{
			*state_ptr = TOKENIZER_STATE_NUMBER;
			/* NOTE(review): dropped line — presumably a return. */
		}

		if (!char_is_in_string(c, "0123456789"))
		{
			*state_ptr = TOKENIZER_STATE_IDLE;
			/* NOTE(review): dropped line — presumably a return. */
		}

		/* NOTE(review): dropped lines — likely a return and/or the
		 * next case label(s) for the number/hex states. */

		if (char_is_in_string(c, "abcdefABCDEF"))
			/* NOTE(review): dropped line — presumably a return. */

		if (char_is_in_string(c, ".123456789"))
			/* NOTE(review): dropped line — presumably a return. */

		/* NOTE(review): dropped line — likely the binary-number case
		 * label. */
		if (c == '0' || c == '1' || c == '.')
			/* NOTE(review): dropped line — presumably a return. */

		*state_ptr = TOKENIZER_STATE_IDLE;
		/* NOTE(review): dropped line — presumably a return. */
	}

	/* NOTE(review): dropped line — presumably the final return. */
}
503
/*
 * Tokenize ps->content into the ps->tokens list, after checking that
 * the content starts with the required 4-character version string
 * (ver_str).  Tracks line numbers for error reporting and stores the
 * final line count in ps->n_lines.
 *
 * NOTE(review): recovered listing — the function's signature line was
 * dropped by the extraction; per the file's declaration list it is
 * `int kest_tokenize_content(kest_eff_parsing_state *ps)`.  The
 * `case TOKENIZER_POLICY_*:` labels of the switch and a few other
 * statements were dropped as well; each gap is marked below.
 */
{
	if (!ps)
		return ERR_NULL_PTR;

	char buf[256];		/* accumulator for the token being built */

	int line = 1;		/* 1-based source line of the current char */
	int line_char = 4;	/* column on current line; starts past version string */
	int token_index = 0;	/* position of the token start within the line */
	int new_line = 0;	/* NOTE(review): unused in the visible code */
	int buf_pos = 0;
	char c;
	int C;
	int policy;
	int file_pos = 0;

	int state = TOKENIZER_STATE_IDLE;

	/* The content must begin with the 4-character version string. */
	buf[0] = ps->content[file_pos++];
	buf[1] = ps->content[file_pos++];
	buf[2] = ps->content[file_pos++];
	buf[3] = ps->content[file_pos++];
	buf[4] = 0;

	if (strcmp(ver_str, buf) != 0)
	{
		KEST_PRINTF("'%c' (%d), '%c' (%d), '%c' (%d), '%c' (%d)\n", buf[0], buf[0], buf[1], buf[1], buf[2], buf[2], buf[3], buf[3]);
		kest_parser_error_at_line(ps, 1, "Version string \"%s\" required at start of file; instead, file starts with \"%s\"", ver_str, buf);
		return ERR_BAD_ARGS;
	}

	/* NOTE(review): dropped line here in the extraction. */

	while (state != TOKENIZER_STATE_DONE)
	{
		C = ps->content[file_pos++];
		c = (char)C;

		if (c == 0)
		{
			state = TOKENIZER_STATE_DONE;
			/* NOTE(review): dropped line — policy presumably set here. */
		}
		else
		{
			policy = tokenizer_policy(c, &state);
		}

		switch (policy)
		{
		/* NOTE(review): dropped case label (a TOKENIZER_POLICY_*). */
			break;
		/* NOTE(review): dropped case label — accumulate c into buf. */
			buf[buf_pos++] = c;
			break;

		/* NOTE(review): dropped case label — flush any pending token,
		 * then emit c as a single-character token. */
			if (buf_pos)
			{
				buf[buf_pos++] = 0;
				kest_token_ll_safe_aappend(&ps->tokens, kest_parser_strndup(buf, buf_pos), line, token_index);
				token_index += buf_pos;
				buf_pos = 0;
			}
			buf[0] = c;
			buf[1] = 0;
			kest_token_ll_safe_aappend(&ps->tokens, kest_parser_strndup(buf, 1), line, token_index);
			token_index += 1;
			break;
		/* NOTE(review): dropped case label — begin a new token with c. */
			buf_pos = 0;
			buf[buf_pos++] = c;
			break;
		/* NOTE(review): dropped case label — accept c, then close the
		 * token and append it. */
			buf[buf_pos++] = c;
			/* NOTE(review): dropped line here in the extraction. */
			buf[buf_pos++] = 0;
			kest_token_ll_safe_aappend(&ps->tokens, kest_parser_strndup(buf, buf_pos), line, token_index);
			token_index += buf_pos;
			buf_pos = 0;
			break;
		/* NOTE(review): dropped case label — complain about an
		 * unexpected character (newline rendered as "\n"). */
			if (c == '\n')
			{
				buf[0] = '\\';
				buf[1] = 'n';
				buf[2] = 0;
			}
			else
			{
				buf[0] = c;
				buf[1] = 0;
			}
			kest_parser_error_at_line(ps, line, "Unexpected \"%s\"", buf);
			return ERR_BAD_ARGS;
		}

		/* Track line / column position for error reporting. */
		if (c == '\n')
		{
			line = line + 1;
			line_char = 0;
			token_index = 0;
		}
		else
		{
			line_char++;
		}
	}

	/* Flush any token still pending at end of input. */
	if (buf_pos)
	{
		buf[buf_pos++] = 0;
		kest_token_ll_safe_aappend(&ps->tokens, kest_parser_strndup(buf, buf_pos), line, token_index);
	}

	ps->n_lines = line;

	return NO_ERROR;
}
624
626{
627 if (!start)
628 return NULL;
629
630 kest_token_ll *res = NULL;
631 kest_token_ll *current = start;
632
633 while (current && current != end)
634 {
635 kest_token_ll_safe_aappend(&res, current->data, current->line, current->index);
636 current = current->next;
637 }
638
639 return res;
640}
void * kest_alloc(size_t size)
Definition kest_alloc.c:11
char * kest_parser_strndup(const char *str, int n)
void * kest_parser_alloc(size_t size)
void kest_parser_error_at_line(kest_eff_parsing_state *ps, int line, const char *msg,...)
const char * ver_str
#define ERR_ALLOC_FAIL
#define ERR_BAD_ARGS
#define NO_ERROR
#define ERR_NULL_PTR
#define IMPLEMENT_LINKED_PTR_LIST(X)
#define KEST_PRINTF(...)
Definition kest_printf.h:10
int kest_token_ll_safe_aappend(kest_token_ll **list_ptr, char *x, int line, int index)
int char_is_in_string(char c, const char *str)
int token_is_newline(char *str)
int kest_token_ll_safe_append(kest_token_ll **list_ptr, char *x, int line, int index)
int token_is_int(char *token)
int char_is_bracket(char c)
float digit_to_float(char c)
int char_is_number(char c)
float token_to_float(char *token)
int token_is_name(char *token)
int token_is_dict_entry_seperator(char *token)
int kest_token_ll_advance(kest_token_ll **list)
int token_is_char(char *str, char c)
int char_is_alphanumeric(char c)
int kest_tokenize_content(kest_eff_parsing_state *ps)
int tokenizer_policy(char c, int *state_ptr)
int kest_token_ll_skip_ws(kest_token_ll **list)
kest_token_ll * kest_token_span_to_ll(kest_token_ll *start, kest_token_ll *end)
int char_is_letter(char c)
int token_is_number(char *token)
#define TOKENIZER_STATE_DONE
#define TOKENIZER_STATE_NAME
#define TOKENIZER_STATE_IDLE
#define TOKENIZER_STATE_NUMBER_BIN
#define TOKENIZER_POLICY_DISCARD
#define TOKENIZER_STATE_STRESC
#define TOKENIZER_POLICY_COMPLAIN
#define TOKENIZER_POLICY_END_ACCEPT
#define TOKENIZER_POLICY_SINGULAR
#define TOKENIZER_STATE_LEADING_ZERO
#define TOKENIZER_POLICY_BEGIN
#define TOKENIZER_POLICY_ACCEPT
#define TOKENIZER_POLICY_END_DISCARD
#define TOKENIZER_STATE_NUMBER
#define TOKENIZER_STATE_STRING
#define TOKENIZER_STATE_NUMBER_HEX
struct kest_token_ll * next