summary refs log tree commit diff
path: root/lex.c
diff options
context:
space:
mode:
Diffstat (limited to 'lex.c')
-rw-r--r-- lex.c 247
1 files changed, 247 insertions, 0 deletions
diff --git a/lex.c b/lex.c
new file mode 100644
index 0000000..0ff1f20
--- /dev/null
+++ b/lex.c
@@ -0,0 +1,247 @@
+#include "lex.h"
+#include "errloc.h"
+#include <errno.h>
+#include <stdarg.h>
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+
+/*
+ * Release the text buffer owned by a token.
+ *
+ * Only tk->s is freed; the struct tk itself is left alone. The pointer is
+ * cleared afterwards so that releasing the same token again degrades to a
+ * harmless free(NULL) instead of a double free.
+ */
+static void free_tk(struct tk *tk)
+{
+    free(tk->s);
+    tk->s = NULL;
+}
+
+/*
+ * Report a token whose text failed numeric validation.
+ *
+ * tk  - offending token; tk->s is quoted in the message.
+ * end - end pointer left behind by the strto*() call that rejected tk->s.
+ *
+ * errno takes precedence: when it is set, its strerror(3) text is appended
+ * to the message and end is not inspected. Otherwise a message is emitted
+ * only if end points at a leftover character.
+ *
+ * Always returns -1, so callers can write "return invnum(...)".
+ */
+static int invnum(const struct tk *tk, char *end)
+{
+    const int err = errno;
+
+    if (err)
+    {
+        errloc(tk, "invalid number: %s (%s)", tk->s, strerror(err));
+        return -1;
+    }
+
+    if (*end)
+        errloc(tk, "invalid number: %s", tk->s);
+
+    return -1;
+}
+
+/*
+ * Check that the text of a NUM token is a complete integer literal.
+ *
+ * The text is first parsed with strtoll(3) using base 0. If that fails and
+ * the text does not start with '-', strtoull(3) is given a second chance.
+ * The converted value is discarded; only errno and the end pointer are
+ * inspected.
+ *
+ * Returns 0 when the text is accepted, otherwise the result of invnum().
+ */
+static int validnum(const struct tk *tk)
+{
+    char *end;
+
+    errno = 0;
+    strtoll(tk->s, &end, 0);
+
+    if (!errno && !*end)
+        return 0;
+    else if (*tk->s == '-')
+        return invnum(tk, end);
+
+    errno = 0;
+    strtoull(tk->s, &end, 0);
+
+    if (!errno && !*end)
+        return 0;
+
+    return invnum(tk, end);
+}
+
+/*
+ * Finish the token currently being accumulated in lex->tk.
+ *
+ * Does nothing (and returns 0) while no token is pending (type UNDEF).
+ * Otherwise the text buffer is resized to lex->len + 1 bytes and
+ * NUL-terminated, NUM tokens are validated, and the token is appended to
+ * lex->tokens. The pending token is then reset to an empty one that keeps
+ * the previous location, and lex->len goes back to 0.
+ *
+ * Returns 0 on success and -1 on allocation failure, on an invalid number
+ * or when the pending token has the ANY type.
+ */
+static int fintok(struct lex *lex)
+{
+    struct tk *const cur = &lex->tk;
+    const struct tk blank = {.loc = cur->loc};
+    const size_t want = lex->ntok + 1;
+    struct tk *grown;
+    char *text;
+
+    if (cur->type == UNDEF)
+        return 0;
+
+    text = realloc(cur->s, lex->len + 1);
+
+    if (!text)
+    {
+        perror("realloc(3)");
+        return -1;
+    }
+
+    text[lex->len] = '\0';
+    cur->s = text;
+
+    if (cur->type == ANY)
+    {
+        errloc(cur, "%s: unreachable", __func__);
+        return -1;
+    }
+    else if (cur->type == NUM && validnum(cur))
+        return -1;
+
+    grown = realloc(lex->tokens, want * sizeof *grown);
+
+    if (!grown)
+    {
+        perror("realloc(3)");
+        return -1;
+    }
+
+    /* The finished token takes over the text buffer; lex->tk starts anew. */
+    grown[lex->ntok++] = *cur;
+    lex->tokens = grown;
+    lex->tk = blank;
+    lex->len = 0;
+    return 0;
+}
+
+/*
+ * Tell whether c lies in the range '!'..'~' (inclusive), i.e. whether it
+ * can be shown as-is in a diagnostic. The space character is outside the
+ * range. Returns 1 if so, 0 otherwise.
+ */
+static int printable(char c)
+{
+    if (c < '!')
+        return 0;
+
+    return c <= '~';
+}
+
+/*
+ * Report an invalid input character at the lexer's current location.
+ * Characters accepted by printable() are shown verbatim; anything else is
+ * shown as a hexadecimal byte value.
+ */
+static void invch(const struct lex *l, char c)
+{
+    if (!printable(c))
+    {
+        errcloc(l, "invalid character: (%#hhx)", c);
+        return;
+    }
+
+    errcloc(l, "invalid character: %c", c);
+}
+
+/*
+ * Feed one character to the token currently being accumulated.
+ *
+ * What happens depends on the state of lex->tk:
+ *  - UNDEF: the token location is set to the lexer location and c selects
+ *    the token type: '"' opens a literal (the quote is not stored), a sign
+ *    or digit starts a number, '_' or a letter starts an identifier.
+ *    Anything else is rejected.
+ *  - LIT:   '"' closes the literal via fintok(); any other character is
+ *    stored.
+ *  - NUM:   only '"' is rejected here; the text is validated as a whole
+ *    once the token is finished.
+ *  - ID:    only '_', digits and letters are accepted.
+ *  - ANY:   reported as unreachable.
+ *
+ * Accepted characters are appended to tk->s, growing the buffer by one
+ * byte. Returns 0 on success and -1 on an invalid character or allocation
+ * failure.
+ */
+static int ch(char c, struct lex *lex)
+{
+    struct tk *const cur = &lex->tk;
+    const int digit = c >= '0' && c <= '9';
+    const int word = c == '_'
+        || (c >= 'a' && c <= 'z')
+        || (c >= 'A' && c <= 'Z');
+    int bad = 0;
+    char *grown;
+
+    switch (cur->type)
+    {
+        case UNDEF:
+            cur->loc = lex->loc;
+
+            if (c == '\"')
+            {
+                cur->type = LIT;
+                return 0;
+            }
+
+            if (c == '-' || c == '+' || digit)
+                cur->type = NUM;
+            else if (word)
+                cur->type = ID;
+            else
+                bad = 1;
+
+            break;
+
+        case LIT:
+            if (c == '\"')
+                return fintok(lex);
+
+            break;
+
+        case NUM:
+            bad = c == '\"';
+            break;
+
+        case ID:
+            /* Quotes and signs are covered too: none is a word character. */
+            bad = !(word || digit);
+            break;
+
+        case ANY:
+            errcloc(lex, "%s: unreachable", __func__);
+            return -1;
+    }
+
+    if (bad)
+    {
+        invch(lex, c);
+        return -1;
+    }
+
+    grown = realloc(cur->s, lex->len + 1);
+
+    if (!grown)
+    {
+        perror("realloc(3)");
+        return -1;
+    }
+
+    grown[lex->len++] = c;
+    cur->s = grown;
+    return 0;
+}
+
+/*
+ * Process one input character.
+ *
+ * The column counter is advanced first. Then:
+ *  - '*' starts a comment that lasts until the end of the line and
+ *    finishes the pending token, unless a string literal is open, in
+ *    which case it is ordinary literal text.
+ *  - '\n' finishes the pending token, moves to the next line, resets the
+ *    column and ends any comment. A literal still open at this point is
+ *    an error.
+ *  - ' ' and '\t' finish the pending token, except inside a literal where
+ *    they are stored like any other character.
+ *  - any other character is handed to ch().
+ * While a comment is active every character except '\n' is discarded.
+ *
+ * Returns 0 on success and -1 on error.
+ */
+static int clex(char c, struct lex *lex)
+{
+    struct tk *tk = &lex->tk;
+    struct loc *loc = &lex->loc;
+
+    ++loc->col;
+
+    switch (c)
+    {
+        case '*':
+            /*
+             * Starting a comment here would silently finish a literal that
+             * was never closed, which the '\n' case below rejects.
+             */
+            if (tk->type == LIT)
+                return ch(c, lex);
+
+            lex->comment = 1;
+            return fintok(lex);
+
+        case '\n':
+            if (tk->type == LIT)
+            {
+                errcloc(lex, "unterminated literal");
+                return -1;
+            }
+            else if (fintok(lex))
+                return -1;
+
+            loc->line++;
+            loc->col = lex->comment = 0;
+            return 0;
+
+        case ' ':
+        case '\t':
+            if (tk->type != LIT)
+                return lex->comment ? 0 : fintok(lex);
+            /* fallthrough: whitespace is part of an open literal */
+        default:
+            return lex->comment ? 0 : ch(c, lex);
+    }
+}
+
+/*
+ * Release everything owned by a lexer: the text of every finished token,
+ * the token array itself and the text of the token still being
+ * accumulated. The struct lex itself is not freed.
+ *
+ * Pointers and counters are reset afterwards, so calling lex_free() again
+ * on the same object is harmless. A NULL lex is ignored.
+ */
+void lex_free(struct lex *lex)
+{
+    if (!lex)
+        return;
+
+    for (size_t i = 0; i < lex->ntok; i++)
+        free_tk(&lex->tokens[i]);
+
+    free(lex->tokens);
+    lex->tokens = NULL;
+    lex->ntok = 0;
+
+    free_tk(&lex->tk);
+    lex->tk.s = NULL;
+    lex->len = 0;
+}
+
+/*
+ * Tell whether tk lies past the last token produced by the lexer.
+ *
+ * tk is expected to point into lex->tokens (or one past its end). Returns
+ * non-zero when its index is not below lex->ntok. A NULL tk, or a lexer
+ * without a token array, counts as end of input; this avoids subtracting
+ * null pointers.
+ */
+int lex_eof(const struct lex *lex, const struct tk *tk)
+{
+    if (!tk || !lex->tokens)
+        return 1;
+
+    /* Compare as unsigned on purpose: the index is tested against a count. */
+    return (size_t)(tk - lex->tokens) >= lex->ntok;
+}
+
+/*
+ * Tokenize a whole stream.
+ *
+ * l - lexer state; finished tokens are appended to l->tokens. Line
+ *     numbering starts at 1.
+ * f - input stream, read one character at a time until EOF.
+ *
+ * Input is not required to end with a newline: a token still pending at
+ * EOF is finished like any other, and a literal still open at EOF is
+ * reported as unterminated.
+ *
+ * Returns 0 on success and -1 on a lexical error, a read error or an
+ * allocation failure.
+ */
+int lex(struct lex *l, FILE *f)
+{
+    int c;
+
+    l->loc.line = 1;
+
+    while ((c = fgetc(f)) != EOF)
+        if (clex(c, l))
+            return -1;
+
+    /* fgetc() returns EOF on failure as well; tell the two apart. */
+    if (ferror(f))
+    {
+        perror("fgetc(3)");
+        return -1;
+    }
+    else if (l->tk.type == LIT)
+    {
+        errcloc(l, "unterminated literal");
+        return -1;
+    }
+
+    return fintok(l);
+}