Lexer for the Algol 68 front-end. Signed-off-by: Jose E. Marchesi <[email protected]> Co-authored-by: Marcel van der Veer <[email protected]> --- gcc/algol68/a68-parser-scanner.cc | 2277 +++++++++++++++++++++++++++++ 1 file changed, 2277 insertions(+) create mode 100644 gcc/algol68/a68-parser-scanner.cc
diff --git a/gcc/algol68/a68-parser-scanner.cc b/gcc/algol68/a68-parser-scanner.cc new file mode 100644 index 00000000000..d3f7a93abe3 --- /dev/null +++ b/gcc/algol68/a68-parser-scanner.cc @@ -0,0 +1,2277 @@ +/* Context-dependent ALGOL 68 tokeniser. + Copyright (C) 2001-2023 J. Marcel van der Veer. + Copyright (C) 2025 Jose E. Marchesi. + + Original implementation by J. Marcel van der Veer. + Adapted for GCC by Jose E. Marchesi. + + GCC is free software; you can redistribute it and/or modify it + under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 3, or (at your option) + any later version. + + GCC is distributed in the hope that it will be useful, but WITHOUT + ANY WARRANTY; without even the implied warranty of MERCHANTABILITY + or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public + License for more details. + + You should have received a copy of the GNU General Public License + along with GCC; see the file COPYING3. If not see + <http://www.gnu.org/licenses/>. */ + +/* Context-dependent ALGOL 68 tokeniser. */ + + +#define INCLUDE_MEMORY +#include "config.h" +#include "system.h" +#include "coretypes.h" +#include "diagnostic.h" +#include "options.h" +#include "vec.h" + +#include "a68.h" + +/* A few forward references of static functions defined in this file. */ + +static void include_files (LINE_T *top); + +/* Standard prelude and postlude for source files. + + We need several versions for the several supported stropping regimes. */ + +static const char * +upper_prelude_start[] = { + "BEGIN", + " BEGIN", + NO_TEXT +}; + +static const char * +upper_postlude[] = { + " END;", + " stop: SKIP", + "END", + NO_TEXT +}; + +static const char * +supper_prelude_start[] = { + "begin", + " begin", + NO_TEXT +}; + +static const char * +supper_postlude[] = { + " end;", + " stop: skip", + "end", + NO_TEXT +}; + +/* Macros. */ + +#define NULL_CHAR '\0' +#define STOP_CHAR 127 +#define FORMFEED_CHAR '\f' +#define CR_CHAR '\r' +#define QUOTE_CHAR '"' +#define APOSTROPHE_CHAR '\'' +#define BACKSLASH_CHAR '\\' +#define NEWLINE_CHAR '\n' +#define EXPONENT_CHAR 'e' +#define RADIX_CHAR 'r' +#define POINT_CHAR '.' +#define TAB_CHAR '\t' + +#define MAX_RESTART 256 + +#define EOL(c) ((c) == NEWLINE_CHAR || (c) == NULL_CHAR) +#define SCAN_ERROR(c, u, v, txt) if (c) \ + do \ + { \ + a68_scan_error (u, v, txt); \ + } \ + while (0) + + +#define SCAN_DIGITS(c) \ + while (ISDIGIT (c)) \ + { \ + (sym++)[0] = (c); \ + (c) = next_char (ref_l, ref_s, true); \ + } + +#define SCAN_EXPONENT_PART(c) \ + do \ + { \ + (sym++)[0] = EXPONENT_CHAR; \ + (c) = next_char (ref_l, ref_s, true); \ + if ((c) == '+' || (c) == '-') { \ + (sym++)[0] = (c); \ + (c) = next_char (ref_l, ref_s, true); \ + } \ + SCAN_ERROR (!ISDIGIT (c), *start_l, *start_c, \ + "invalid exponent digit"); \ + SCAN_DIGITS (c); \ + } \ + while (0) + +/* Read bytes from file into buffer. */ + +static ssize_t +io_read (FILE *file, void *buf, size_t n) +{ + int fd = fileno (file); + size_t to_do = n; + int restarts = 0; + char *z = (char *) buf; + while (to_do > 0) + { + ssize_t bytes_read; + + errno = 0; + bytes_read = read (fd, z, to_do); + if (bytes_read < 0) + { + if (errno == EINTR) + { + /* interrupt, retry. */ + bytes_read = 0; + if (restarts++ > MAX_RESTART) + { + return -1; + } + } + else + { + /* read error. */ + return -1; + } + } + else if (bytes_read == 0) + { + /* EOF_CHAR */ + break; + } + to_do -= (size_t) bytes_read; + z += bytes_read; + } + + /* return >= 0 */ + return (ssize_t) n - (ssize_t) to_do; +} + +/* Save scanner state, for character look-ahead. */ + +static void +save_state (LINE_T *ref_l, char *ref_s, char ch) +{ + SCAN_STATE_L (&A68_JOB) = ref_l; + SCAN_STATE_S (&A68_JOB) = ref_s; + SCAN_STATE_C (&A68_JOB) = ch; +} + +/* Restore scanner state, for character look-ahead. */ + +static void +restore_state (LINE_T **ref_l, char **ref_s, char *ch) +{ + *ref_l = SCAN_STATE_L (&A68_JOB); + *ref_s = SCAN_STATE_S (&A68_JOB); + *ch = SCAN_STATE_C (&A68_JOB); +} + +/* New_source_line. */ + +static LINE_T * +new_source_line (void) +{ + LINE_T *z = (LINE_T *) xmalloc (sizeof (LINE_T)); + + MARKER (z)[0] = '\0'; + STRING (z) = NO_TEXT; + FILENAME (z) = NO_TEXT; + NUMBER (z) = 0; + NEXT (z) = NO_LINE; + PREVIOUS (z) = NO_LINE; + return z; +} + +/* Append a source line to the internal source file. */ + +static void +append_source_line (const char *str, LINE_T **ref_l, int *line_num, + const char *filename) +{ + LINE_T *z = new_source_line (); + + /* Link line into the chain. */ + STRING (z) = xstrdup (str); + FILENAME (z) = filename; + NUMBER (z) = (*line_num)++; + NEXT (z) = NO_LINE; + PREVIOUS (z) = *ref_l; + if (TOP_LINE (&A68_JOB) == NO_LINE) + TOP_LINE (&A68_JOB) = z; + if (*ref_l != NO_LINE) + NEXT (*ref_l) = z; + *ref_l = z; +} + +/* Append environment source lines. */ + +static void +append_environ (const char *str[], LINE_T **ref_l, int *line_num, const char *name) +{ + for (int k = 0; str[k] != NO_TEXT; k++) + { + int zero_line_num = 0; + (*line_num)++; + append_source_line (str[k], ref_l, &zero_line_num, name); + } +} + +/* + * Scanner, tokenises the source code. + */ + +/* Emit a diagnostic if CH is an unworthy character. */ + +static void +unworthy (LINE_T *u, char *v, char ch) +{ + if (ISPRINT (ch)) + { + if (snprintf (A68 (edit_line), SNPRINTF_SIZE, "*%s", + "unworthy character") < 0) + gcc_unreachable (); + } + else + { + if (snprintf (A68 (edit_line), SNPRINTF_SIZE, "*%s %c", + "unworthy character", ch) < 0) + gcc_unreachable (); + } + + a68_scan_error (u, v, A68 (edit_line)); +} + +/* Concatenate lines that terminate in '\' with next line. */ + +static void +concatenate_lines (LINE_T * top) +{ + LINE_T *q; + /* Work from bottom backwards. */ + for (q = top; q != NO_LINE && NEXT (q) != NO_LINE; FORWARD (q)) + ; + + for (; q != NO_LINE; BACKWARD (q)) + { + char *z = STRING (q); + size_t len = strlen (z); + + if (len >= 2 + && z[len - 2] == BACKSLASH_CHAR + && z[len - 1] == NEWLINE_CHAR + && NEXT (q) != NO_LINE + && STRING (NEXT (q)) != NO_TEXT) + { + z[len - 2] = '\0'; + len += (int) strlen (STRING (NEXT (q))); + z = (char *) xmalloc (len + 1); + a68_bufcpy (z, STRING (q), len + 1); + a68_bufcat (z, STRING (NEXT (q)), len + 1); + STRING (NEXT (q))[0] = '\0'; + STRING (q) = z; + } + } +} + +/* Size of source file. */ + +static int +get_source_size (void) +{ + FILE *f = FILE_SOURCE_FD (&A68_JOB); + return (int) lseek (fileno (f), 0, SEEK_END); +} + +/* Read source file FILENAME and make internal copy. */ + +static bool +read_source_file (const char *filename) +{ + struct stat statbuf; + LINE_T *ref_l = NO_LINE; + int line_num = 0; + size_t k; + size_t bytes_read; + ssize_t l; + size_t source_file_size; + char *buffer; + FILE *f; + bool ret = true; + + /* First open the given file. */ + if (!(FILE_SOURCE_FD (&A68_JOB) = fopen (filename, "r"))) + fatal_error (UNKNOWN_LOCATION, "could not open source file %s", + filename); + FILE_SOURCE_NAME (&A68_JOB) = xstrdup (filename); + f = FILE_SOURCE_FD (&A68_JOB); + + if (fstat (fileno (f), &statbuf) + || !(S_ISREG (statbuf.st_mode) || S_ISCHR (statbuf.st_mode))) + fatal_error (UNKNOWN_LOCATION, "specified file %s is a directory", + filename); + + if ((source_file_size = get_source_size ()) == 0) + { + /* The source file is empty. */ + ret = false; + goto done; + } + + /* Allocate A68_PARSER (scan_buf), which is an auxiliary buffer used by the + scanner known to be big enough to hold any string contained in the source + file. */ + A68_PARSER (max_scan_buf_length) = source_file_size + 1; + A68_PARSER (max_scan_buf_length) += 1024; /* For the environment. */ + A68_PARSER (scan_buf) = (char *) xmalloc (A68_PARSER (max_scan_buf_length)); + + /* Prelude. */ + append_environ (OPTION_STROPPING (&A68_JOB) == UPPER_STROPPING + ? upper_prelude_start : supper_prelude_start, + &ref_l, &line_num, "prelude"); + + /* Read the file into a single buffer, so we save on system calls. */ + line_num = 1; + errno = 0; + buffer = (char *) xmalloc (8 + source_file_size); + if (lseek (fileno (f), 0, SEEK_SET) < 0) + gcc_unreachable (); + errno = 0; + bytes_read = io_read (f, buffer, source_file_size); + gcc_assert (errno == 0 && bytes_read == source_file_size); + + /* Link all lines into the list. */ + k = 0; + while (k < source_file_size) + { + l = 0; + A68_PARSER (scan_buf)[0] = '\0'; + while (k < source_file_size && buffer[k] != NEWLINE_CHAR) + { + if (k < source_file_size - 1 + && buffer[k] == CR_CHAR && buffer[k + 1] == NEWLINE_CHAR) + k++; + else + { + A68_PARSER (scan_buf)[l++] = buffer[k++]; + A68_PARSER (scan_buf)[l] = '\0'; + } + } + A68_PARSER (scan_buf)[l++] = NEWLINE_CHAR; + A68_PARSER (scan_buf)[l] = '\0'; + if (k < source_file_size) + k++; + append_source_line (A68_PARSER (scan_buf), &ref_l, &line_num, + FILE_SOURCE_NAME (&A68_JOB)); + SCAN_ERROR (l != (ssize_t) strlen (A68_PARSER (scan_buf)), + NO_LINE, NO_TEXT, "invalid characters in source file"); + } + + /* Postlude. */ + append_environ (OPTION_STROPPING (&A68_JOB) == UPPER_STROPPING + ? upper_postlude : supper_postlude, + &ref_l, &line_num, "postlude"); + + /* Concatenate lines that end with \. */ + concatenate_lines (TOP_LINE (&A68_JOB)); + + /* Include files. */ + include_files (TOP_LINE (&A68_JOB)); + + done: + if (fclose (FILE_SOURCE_FD (&A68_JOB)) != 0) + gcc_unreachable (); + return ret; +} + +/* Get next character from internal copy of source file. + + If ALLOW_TYPO is true then typographical display features are skipped. + + If ALLOW_ONE_UNDER is true then a single underscore character is + skipped. */ + +static char +next_char (LINE_T **ref_l, char **ref_s, bool allow_typo, + bool allow_one_under = false, bool *found_under = NULL) +{ + char ch; + + /* Empty source. */ + if (*ref_l == NO_LINE) + return STOP_CHAR; + + if ((*ref_s)[0] == NEWLINE_CHAR || (*ref_s)[0] == '\0') + { + /* Go to new line. */ + *ref_l = NEXT (*ref_l); + if (*ref_l == NO_LINE) + return STOP_CHAR; + *ref_s = STRING (*ref_l); + } + else + (*ref_s)++; + + /* Deliver next char. */ + ch = (*ref_s)[0]; + if ((allow_typo && (ISSPACE (ch) || ch == FORMFEED_CHAR)) + || (allow_one_under && ch == '_')) + { + if (ch == '_' && found_under != NULL) + *found_under = true; + ch = next_char (ref_l, ref_s, allow_typo); + } + return ch; +} + +/* Find first character that can start a valid symbol. */ + +static void +get_good_char (char *ref_c, LINE_T **ref_l, char **ref_s) +{ + while (*ref_c != STOP_CHAR && (ISSPACE (*ref_c) || (*ref_c == '\0'))) + *ref_c = next_char (ref_l, ref_s, false); +} + +/* Case insensitive strncmp for at most the number of chars in V. */ + +static int +streq (const char *u, const char *v) +{ + int diff; + for (diff = 0; diff == 0 && u[0] != NULL_CHAR && v[0] != NULL_CHAR; u++, v++) + diff = ((int) TOLOWER (u[0])) - ((int) TOLOWER (v[0])); + return diff; +} + +/* Case insensitive strncmp for at most N chars. */ + +static int +strneq (const char *u, const char *v, size_t n) +{ + int diff; + size_t pos = 0; + for (diff = 0; + diff == 0 && u[0] != NULL_CHAR && v[0] != NULL_CHAR && pos < n; + u++, v++, pos++) + diff = ((int) TOLOWER (u[0])) - ((int) TOLOWER (v[0])); + return diff; +} + + +/* Determine whether u is bold tag v, independent of stropping regime. */ + +static bool +is_bold (char *u, const char *v) +{ + size_t len = strlen (v); + + if (OPTION_STROPPING (&A68_JOB) == UPPER_STROPPING) + /* UPPER stropping. */ + return strncmp (u, v, len) == 0 && !ISUPPER (u[len]); + else + /* SUPPER stropping. */ + return (strlen (u) >= len + && ISLOWER (u[0]) + && strneq (u, v, len) == 0 + && !ISALPHA (u[len]) + && !ISDIGIT (u[len])); +} + +/* Skip a string denotation. + + This function returns true if the end of the string denotation is found. + Returns false otherwise. */ + +static bool +skip_string (LINE_T **top, char **ch) +{ + LINE_T *u = *top; + char *v = *ch; + v++; + while (u != NO_LINE) + { + while (v[0] != NULL_CHAR) + { + if (v[0] == QUOTE_CHAR && v[1] != QUOTE_CHAR) + { + *top = u; + *ch = &v[1]; + return true; + } + else if (v[0] == QUOTE_CHAR && v[1] == QUOTE_CHAR) + { + v += 2; + } + else + { + v++; + } + } + FORWARD (u); + if (u != NO_LINE) { + v = &(STRING (u)[0]); + } else { + v = NO_TEXT; + } + } + return false; +} + +/* Skip a comment. + + This function returns true if the end of the comment is found. Returns + false otherwise. */ + +static bool +skip_comment (LINE_T **top, char **ch, int delim) +{ + LINE_T *u = *top; + char *v = *ch; + int nesting_level = 1; + v++; + while (u != NO_LINE) + { + while (v[0] != NULL_CHAR) + { + LINE_T *l = u; + char *c = v; + + if (v[0] == QUOTE_CHAR && skip_string (&l, &c) + && (delim == BOLD_COMMENT_BEGIN_SYMBOL || delim == BRIEF_COMMENT_BEGIN_SYMBOL)) + { + u = l; + v = c; + } + else if (is_bold (v, "COMMENT") && delim == BOLD_COMMENT_SYMBOL) + { + *top = u; + *ch = &v[1]; + return true; + } + else if (is_bold (v, "CO") && delim == STYLE_I_COMMENT_SYMBOL) + { + *top = u; + *ch = &v[1]; + return true; + } + else if (v[0] == '#' && delim == STYLE_II_COMMENT_SYMBOL) + { + *top = u; + *ch = &v[1]; + return true; + } + else if (is_bold (v, "ETON") && delim == BOLD_COMMENT_BEGIN_SYMBOL) + { + gcc_assert (nesting_level > 0); + nesting_level -= 1; + if (nesting_level == 0) + { + *top = u; + *ch = &v[1]; + return true; + } + } + else if (v[0] == '}' && delim == BRIEF_COMMENT_BEGIN_SYMBOL) + { + gcc_assert (nesting_level > 0); + nesting_level -= 1; + if (nesting_level == 0) + { + *top = u; + *ch = &v[1]; + return true; + } + } + else + { + if ((is_bold (v, "NOTE") && delim == BOLD_COMMENT_BEGIN_SYMBOL) + || (v[0] == '{' && delim == BRIEF_COMMENT_BEGIN_SYMBOL)) + { + nesting_level += 1; + } + + v++; + } + } + FORWARD (u); + if (u != NO_LINE) + v = &(STRING (u)[0]); + else + v = NO_TEXT; + } + + return false; +} + +/* Skip rest of pragmat. + + This function returns true if the end of the pragmat is found, false + otherwise. */ + +static bool +skip_pragmat (LINE_T **top, char **ch, int delim, bool whitespace) +{ + LINE_T *u = *top; + char *v = *ch; + while (u != NO_LINE) + { + while (v[0] != NULL_CHAR) + { + if (is_bold (v, "PRAGMAT") && delim == BOLD_PRAGMAT_SYMBOL) + { + *top = u; + *ch = &v[1]; + return true; + } + else if (is_bold (v, "PR") && delim == STYLE_I_PRAGMAT_SYMBOL) + { + *top = u; + *ch = &v[1]; + return true; + } + else + { + if (whitespace && !ISSPACE (v[0]) && v[0] != NEWLINE_CHAR) + { + SCAN_ERROR (true, u, v, "error in pragment"); + } + else if (OPTION_STROPPING (&A68_JOB) == UPPER_STROPPING && ISUPPER (v[0])) + { + /* Skip a bold word as you may trigger on REPR, for + instance. */ + while (ISUPPER (v[0])) + v++; + } + else if (OPTION_STROPPING (&A68_JOB) == SUPPER_STROPPING && ISLOWER (v[0])) + { + /* Skip a tag as you may trigger on expr, for instance. */ + while (ISLOWER (v[0]) || ISDIGIT (v[0]) || v[0] == '_') + v++; + } + else + { + v++; + } + } + } + + FORWARD (u); + if (u != NO_LINE) + v = &(STRING (u)[0]); + else + v = NO_TEXT; + } + + return false; +} + +/* Return pointer to next token within pragmat. */ + +static char * +get_pragmat_item (LINE_T **top, char **ch) +{ + LINE_T *u = *top; + char *v = *ch; + while (u != NO_LINE) + { + while (v[0] != NULL_CHAR) + { + if (!ISSPACE (v[0]) && v[0] != NEWLINE_CHAR) + { + *top = u; + *ch = v; + return v; + } + else + { + v++; + } + } + FORWARD (u); + if (u != NO_LINE) + v = &(STRING (u)[0]); + else + v = NO_TEXT; + } + + return NO_TEXT; +} + +/* Scan for the next pragmat and yield the first item within it. */ + +static char * +next_preprocessor_item (LINE_T **top, char **ch, int *delim) +{ + LINE_T *u = *top; + char *v = *ch; + *delim = 0; + while (u != NO_LINE) + { + while (v[0] != NULL_CHAR) + { + LINE_T *start_l = u; + char *start_c = v; + + if (v[0] == QUOTE_CHAR) + { + /* Skip string denotation. */ + SCAN_ERROR (!skip_string (&u, &v), start_l, start_c, + "unterminated string"); + } + else if (a68_find_keyword (A68 (top_keyword), "COMMENT") != NO_KEYWORD + && is_bold (v, "COMMENT")) + { + /* Skip comment. */ + SCAN_ERROR (!skip_comment (&u, &v, BOLD_COMMENT_SYMBOL), start_l, start_c, + "unterminated comment"); + } + else if (a68_find_keyword (A68 (top_keyword), "CO") != NO_KEYWORD + && is_bold (v, "CO")) + { + /* skip comment. */ + SCAN_ERROR (!skip_comment (&u, &v, STYLE_I_COMMENT_SYMBOL), start_l, start_c, + "unterminated comment"); + } + else if (a68_find_keyword (A68 (top_keyword), "#") != NO_KEYWORD + && v[0] == '#') + { + SCAN_ERROR (!skip_comment (&u, &v, STYLE_II_COMMENT_SYMBOL), start_l, start_c, + "unterminated comment"); + } + else if (a68_find_keyword (A68 (top_keyword), "NOTE") != NO_KEYWORD + && is_bold (v, "NOTE")) + { + SCAN_ERROR (!skip_comment (&u, &v, BOLD_COMMENT_BEGIN_SYMBOL), start_l, start_c, + "unterminated comment"); + } + else if (a68_find_keyword (A68 (top_keyword), "{") != NO_KEYWORD + && v[0] == '{') + { + SCAN_ERROR (!skip_comment (&u, &v, BRIEF_COMMENT_BEGIN_SYMBOL), start_l, start_c, + "unterminated comment"); + } + else if (is_bold (v, "PRAGMAT") || is_bold (v, "PR")) + { + /* We caught a PRAGMAT. */ + char *item; + if (is_bold (v, "PRAGMAT")) + { + *delim = BOLD_PRAGMAT_SYMBOL; + v = &v[strlen ("PRAGMAT")]; + } + else if (is_bold (v, "PR")) + { + *delim = STYLE_I_PRAGMAT_SYMBOL; + v = &v[strlen ("PR")]; + } + item = get_pragmat_item (&u, &v); + SCAN_ERROR (item == NO_TEXT, start_l, start_c, + "unterminated pragmat"); + + if (streq (item, "INCLUDE") == 0) + { + /* Item "INCLUDE" includes a file. */ + *top = u; + *ch = v; + return item; + } + else + { + /* Unrecognised item - probably options handled later by the + tokeniser. */ + SCAN_ERROR (!skip_pragmat (&u, &v, *delim, false), start_l, start_c, + "unterminated pragmat"); + } + } + else if (OPTION_STROPPING (&A68_JOB) == UPPER_STROPPING && ISUPPER (v[0])) + { + /* Skip a bold word as you may trigger on REPR, for instance. */ + while (ISUPPER (v[0])) + v++; + } + else if (OPTION_STROPPING (&A68_JOB) == SUPPER_STROPPING && ISLOWER (v[0])) + { + /* Skip a tag as you may trigger on expr, for instance. */ + while (ISLOWER (v[0]) || ISDIGIT (v[0]) || v[0] == '_') + v++; + } + else + { + v++; + } + } + + FORWARD (u); + if (u != NO_LINE) + v = &(STRING (u)[0]); + else + v = NO_TEXT; + } + + *top = u; + *ch = v; + return NO_TEXT; +} + +/* Concatenate the two paths P1 and P2. */ + +static char * +a68_relpath (const char *p1, const char *p2, const char *fn) +{ +#if defined(__GNU__) + /* The Hurd doesn't define PATH_MAX. */ +# define PATH_MAX 4096 +#endif + + char q[PATH_MAX + 1]; + a68_bufcpy (q, p1, PATH_MAX); + a68_bufcat (q, "/", PATH_MAX); + a68_bufcat (q, p2, PATH_MAX); + a68_bufcat (q, "/", PATH_MAX); + a68_bufcat (q, fn, PATH_MAX); + /* Home directory shortcut ~ is a shell extension. */ + if (strchr (q, '~') != NO_TEXT) { + return NO_TEXT; + } + char *r = (char *) xmalloc (PATH_MAX + 1); + gcc_assert (r != NULL); + /* Error handling in the caller! */ + errno = 0; + r = lrealpath (q); + return r; +} + +/* Return true if we can open the file for reading. False otherwise. */ + +static bool +file_read_p (const char *filename) +{ + return access (filename, R_OK) == 0 ? true : false; +} + +/* Find a file to include into the current source being parsed. Search the file + system for FILENAME and return a string with the file path. If the file is + not found, return NULL. + + When FILENAME is not an absolute path we first try to find it relative to the + current file being parsed (CURFILE). Failing to do that we use the search + paths provided by the -I option. */ + +static char * +find_include_file (const char *curfile, const char *filename) +{ + char *filepath = NO_TEXT; + char *tmpfpath = NO_TEXT; + char *fnbdir = ldirname (filename); + const char *incfile = lbasename (filename); + + if (fnbdir == NULL || incfile == NULL) + gcc_unreachable (); + + if (!IS_ABSOLUTE_PATH (filename)) + { + char *sourcedir = ldirname (curfile); + + if (sourcedir == NULL || fnbdir == NULL) + gcc_unreachable (); + + if (strlen (sourcedir) == 0 && strlen (fnbdir) == 0) + { + free (sourcedir); + sourcedir = (char *) xmalloc (2); + a68_bufcpy (sourcedir, ".", 2); + } + + tmpfpath = a68_relpath (sourcedir, fnbdir, incfile); + if (file_read_p (tmpfpath)) + { + filepath = tmpfpath; + goto cleanup; + } + + for (unsigned ix = 0; ix != vec_safe_length (A68_INCLUDE_PATHS); ix++) + { + const char *include_dir = (*(A68_INCLUDE_PATHS))[ix]; + tmpfpath = a68_relpath (include_dir, fnbdir, incfile); + if (!IS_ABSOLUTE_PATH (tmpfpath)) + tmpfpath = a68_relpath (sourcedir, fnbdir, incfile); + if (file_read_p (tmpfpath)) + { + filepath = tmpfpath; + goto cleanup; + } + } + + cleanup: + free (sourcedir); + goto end; + } + else + { + size_t fnwid = (int) strlen (filename) + 1; + tmpfpath = (char *) xmalloc ((size_t) fnwid); + a68_bufcpy (tmpfpath, filename, fnwid); + + if (file_read_p (tmpfpath)) + { + filepath = tmpfpath; + goto end; + } + } + +end: + free (fnbdir); + return filepath; +} + +/* Include files. + This function handles the INCLUDE pragmat in the source file. */ + +static void +include_files (LINE_T *top) +{ + /* syntax: PR include "filename" PR + + The file gets inserted before the line containing the pragmat. In this way + correct line numbers are preserved which helps diagnostics. A file that + has been included will not be included a second time - it will be ignored. + A rigorous fail-safe, but there is no mechanism to prevent recursive + includes in A68 source code. User reports do not indicate sophisticated + use of INCLUDE, so this is fine for now. + */ + + bool make_pass = true; + while (make_pass) + { + LINE_T *s, *t, *u = top; + char *v = &(STRING (u)[0]); + make_pass = false; + errno = 0; + while (u != NO_LINE) + { + int pr_lim; + char *item = next_preprocessor_item (&u, &v, &pr_lim); + LINE_T *start_l = u; + char *start_c = v; + /* Search for PR include "filename" PR. */ + if (item != NO_TEXT && streq (item, "INCLUDE") == 0) + { + FILE *fp; + int fd; + size_t fsize, k; + int n, linum, bytes_read; + char *fbuf, delim; + BUFFER fnb; + char *fn = NO_TEXT; + /* Skip to filename. */ + while (ISALPHA (v[0])) + v++; + while (ISSPACE (v[0])) + v++; + /* Scan quoted filename. */ + SCAN_ERROR ((v[0] != QUOTE_CHAR && v[0] != '\''), start_l, start_c, + "incorrect filename"); + delim = (v++)[0]; + n = 0; + fnb[0] = NULL_CHAR; + /* Scan Algol 68 string (note: "" denotes a ", while in C it + concatenates). */ + do + { + SCAN_ERROR (EOL (v[0]), start_l, start_c, + "incorrect filename"); + SCAN_ERROR (n == BUFFER_SIZE - 1, start_l, start_c, + "incorrect filename"); + if (v[0] == delim) + { + while (v[0] == delim && v[1] == delim) + { + SCAN_ERROR (n == BUFFER_SIZE - 1, start_l, start_c, + "incorrect filename"); + fnb[n++] = delim; + fnb[n] = NULL_CHAR; + v += 2; + } + } + else if (ISPRINT (v[0])) + { + fnb[n++] = *(v++); + fnb[n] = NULL_CHAR; + } + else + { + SCAN_ERROR (true, start_l, start_c, + "incorrect filename"); + } + } + while (v[0] != delim); + + /* Insist that the pragmat is closed properly. */ + v = &v[1]; + SCAN_ERROR (!skip_pragmat (&u, &v, pr_lim, true), start_l, start_c, + "unterminated pragmat"); + SCAN_ERROR (n == 0, start_l, start_c, + "incorrect filename"); + + char *sourcefile = NO_TEXT; + if (FILENAME (u) != NO_TEXT) + { + sourcefile = xstrdup (FILENAME (u)); + } + else + { + sourcefile = (char *) xmalloc (2); + a68_bufcpy (sourcefile, ".", 1); + } + fn = find_include_file (sourcefile, fnb); + free (sourcefile); + + /* Do not check errno, since errno may be undefined here + after a successful call. */ + if (fn != NO_TEXT) + a68_bufcpy (fnb, fn, BUFFER_SIZE); + else + { + SCAN_ERROR (true, start_l, start_c, + "included file not found"); + } + size_t fnwid = (int) strlen (fnb) + 1; + fn = (char *) xmalloc ((size_t) fnwid); + a68_bufcpy (fn, fnb, fnwid); + + /* Ignore the file when included more than once. */ + for (t = top; t != NO_LINE; t = NEXT (t)) + { + if (strcmp (FILENAME (t), fn) == 0) + goto search_next_pragmat; + } + t = NO_LINE; + + /* Access the file. */ + errno = 0; + fp = fopen (fn, "r"); + SCAN_ERROR (fp == NULL, start_l, start_c, + "error opening included file"); + fd = fileno (fp); + errno = 0; + off_t off = lseek (fd, 0, SEEK_END); + gcc_assert (off >= 0); + fsize = (size_t) off; + SCAN_ERROR (errno != 0, start_l, start_c, + "error while reading file"); + fbuf = (char *) xmalloc (8 + fsize); + errno = 0; + if (lseek (fd, 0, SEEK_SET) < 0) + gcc_unreachable (); + SCAN_ERROR (errno != 0, start_l, start_c, + "error while reading file"); + errno = 0; + bytes_read = (int) io_read (fp, fbuf, (size_t) fsize); + SCAN_ERROR (errno != 0 || (size_t) bytes_read != fsize, start_l, start_c, + "error while reading file"); + + /* Buffer still usable?. */ + if (fsize > A68_PARSER (max_scan_buf_length)) + { + A68_PARSER (max_scan_buf_length) = fsize; + A68_PARSER (scan_buf) = (char *) xmalloc (8 + A68_PARSER (max_scan_buf_length)); + } + + /* Link all lines into the list. */ + linum = 1; + s = u; + t = PREVIOUS (u); + k = 0; + if (fsize == 0) + { + /* If file is empty, insert single empty line. */ + A68_PARSER (scan_buf)[0] = NEWLINE_CHAR; + A68_PARSER (scan_buf)[1] = NULL_CHAR; + append_source_line (A68_PARSER (scan_buf), &t, &linum, fn); + } + else + { + while (k < fsize) + { + n = 0; + A68_PARSER (scan_buf)[0] = NULL_CHAR; + while (k < fsize && fbuf[k] != NEWLINE_CHAR) + { + SCAN_ERROR ((ISCNTRL (fbuf[k]) && !ISSPACE (fbuf[k])) + || fbuf[k] == STOP_CHAR, + start_l, start_c, + "invalid characters in included file"); + A68_PARSER (scan_buf)[n++] = fbuf[k++]; + A68_PARSER (scan_buf)[n] = NULL_CHAR; + } + A68_PARSER (scan_buf)[n++] = NEWLINE_CHAR; + A68_PARSER (scan_buf)[n] = NULL_CHAR; + if (k < fsize) + k++; + append_source_line (A68_PARSER (scan_buf), &t, &linum, fn); + } + } + + /* Conclude and go find another include directive, if any. */ + NEXT (t) = s; + PREVIOUS (s) = t; + concatenate_lines (top); + if (fclose (fp) != 0) + gcc_unreachable (); + make_pass = true; + } + search_next_pragmat: + { (void) 0; }; + } + } +} + +/* Handle a pragment (pragmat or comment). */ + +static char * +pragment (int type, LINE_T **ref_l, char **ref_c) +{ +#define INIT_BUFFER \ + do \ + { \ + chars_in_buf = 0; \ + A68_PARSER (scan_buf)[chars_in_buf] = '\0'; \ + } \ + while (0) + +#define ADD_ONE_CHAR(CH) \ + do \ + { \ + A68_PARSER (scan_buf)[chars_in_buf ++] = (CH); \ + A68_PARSER (scan_buf)[chars_in_buf] = '\0'; \ + } \ + while (0) + + const char *term_s = NO_TEXT; + const char *beg_s = NO_TEXT; + char c = **ref_c, *start_c = *ref_c; + char *z = NO_TEXT; + LINE_T *start_l = *ref_l; + int beg_s_length, term_s_length, chars_in_buf; + bool stop, pragmat = false; + + /* Set terminator to look for. */ + if (OPTION_STROPPING (&A68_JOB) == UPPER_STROPPING) + { + if (type == STYLE_I_COMMENT_SYMBOL) + term_s = "CO"; + else if (type == STYLE_II_COMMENT_SYMBOL) + term_s = "#"; + else if (type == BOLD_COMMENT_SYMBOL) + term_s = "COMMENT"; + else if (type == BOLD_COMMENT_BEGIN_SYMBOL) + { + beg_s = "NOTE"; + term_s = "ETON"; + } + else if (type == BRIEF_COMMENT_BEGIN_SYMBOL) + { + beg_s = "{"; + term_s = "}"; + } + else if (type == STYLE_I_PRAGMAT_SYMBOL) + { + term_s = "PR"; + pragmat = true; + } + else if (type == BOLD_PRAGMAT_SYMBOL) + { + term_s = "PRAGMAT"; + pragmat = true; + } + } + else + { + /* SUPPER stropping. */ + if (type == STYLE_I_COMMENT_SYMBOL) + term_s = "co"; + else if (type == STYLE_II_COMMENT_SYMBOL) + term_s = "#"; + else if (type == BOLD_COMMENT_SYMBOL) + term_s = "comment"; + else if (type == BOLD_COMMENT_BEGIN_SYMBOL) + { + beg_s = "note"; + term_s = "eton"; + } + else if (type == BRIEF_COMMENT_BEGIN_SYMBOL) + { + beg_s = "{"; + term_s = "}"; + } + else if (type == STYLE_I_PRAGMAT_SYMBOL) + { + term_s = "pr"; + pragmat = true; + } + else if (type == BOLD_PRAGMAT_SYMBOL) + { + term_s = "pragmat"; + pragmat = true; + } + } + + beg_s_length = (beg_s != NO_TEXT ? (int) strlen (beg_s) : 0); + term_s_length = (int) strlen (term_s); + + /* Scan for terminator. */ + bool nestable_comment = (beg_s != NO_TEXT); + int nesting_level = 1; + INIT_BUFFER; + stop = false; + while (stop == false) + { + SCAN_ERROR (c == STOP_CHAR, start_l, start_c, + "unterminated pragment"); + + /* A ".." or '..' delimited string in a PRAGMAT, or + a ".." in a nestable comment. */ + if ((pragmat && (c == QUOTE_CHAR || c == '\'')) + || (nestable_comment && c == QUOTE_CHAR)) + { + char delim = c; + bool eos = false; + ADD_ONE_CHAR (c); + c = next_char (ref_l, ref_c, false); + while (!eos) + { + SCAN_ERROR (EOL (c), start_l, start_c, + "string within pragment exceeds end of line"); + + if (c == delim) + { + ADD_ONE_CHAR (delim); + save_state (*ref_l, *ref_c, c); + c = next_char (ref_l, ref_c, false); + if (c == delim) + c = next_char (ref_l, ref_c, false); + else + { + restore_state (ref_l, ref_c, &c); + eos = true; + } + } + else if (ISPRINT (c)) + { + ADD_ONE_CHAR (c); + c = next_char (ref_l, ref_c, false); + } + else + unworthy (start_l, start_c, c); + } + } + else if (EOL (c)) + ADD_ONE_CHAR (NEWLINE_CHAR); + else if (ISPRINT (c) || ISSPACE (c)) + ADD_ONE_CHAR (c); + + if (nestable_comment && chars_in_buf >= beg_s_length) + { + /* If we find another instance of the nestable begin mark, bump the + nesting level and continue scanning. */ + if (strcmp (beg_s, + &(A68_PARSER (scan_buf)[chars_in_buf - beg_s_length])) == 0) + { + nesting_level += 1; + goto nextchar; + } + } + + if (chars_in_buf >= term_s_length) + { + /* Check whether we encountered the terminator. Mind nesting if + necessary. */ + if (strcmp (term_s, + &(A68_PARSER (scan_buf)[chars_in_buf - term_s_length])) == 0) + { + if (nestable_comment) + { + gcc_assert (nesting_level > 0); + nesting_level -= 1; + stop = (nesting_level == 0); + } + else + stop = true; + } + } + + nextchar: + c = next_char (ref_l, ref_c, false); + } + + A68_PARSER (scan_buf)[chars_in_buf - term_s_length] = '\0'; + z = a68_new_string (term_s, A68_PARSER (scan_buf), term_s, NO_TEXT); + return z; +#undef ADD_ONE_CHAR +#undef INIT_BUFFER +} + +/* Whether input shows exponent character. */ + +static bool +is_exp_char (LINE_T **ref_l, char **ref_s, char *ch) +{ + bool ret = false; + + char exp_syms[3]; + + /* Note that this works for both UPPER and SUPPER stropping regimes. */ + exp_syms[0] = EXPONENT_CHAR; + exp_syms[1] = TOUPPER (EXPONENT_CHAR); + exp_syms[2] = '\0'; + + save_state (*ref_l, *ref_s, *ch); + if (strchr (exp_syms, *ch) != NO_TEXT) + { + *ch = next_char (ref_l, ref_s, true); + ret = (strchr ("+-0123456789", *ch) != NO_TEXT); + } + restore_state (ref_l, ref_s, ch); + return ret; +} + +/* Whether input shows radix character. */ + +static bool +is_radix_char (LINE_T **ref_l, char **ref_s, char *ch) +{ + bool ret = false; + + save_state (*ref_l, *ref_s, *ch); + /* Note that this works for both UPPER and SUPPER stropping regimes. */ + if (*ch == RADIX_CHAR) + { + *ch = next_char (ref_l, ref_s, true); + ret = (strchr ("0123456789abcdef", *ch) != NO_TEXT); + } + restore_state (ref_l, ref_s, ch); + return ret; +} + +/* Whether input shows decimal point. */ + +static bool +is_decimal_point (LINE_T **ref_l, char **ref_s, char *ch) +{ + bool ret = false; + + save_state (*ref_l, *ref_s, *ch); + if (*ch == POINT_CHAR) + { + char exp_syms[3]; + + /* Note that this works for both UPPER and SUPPER stropping regimes. */ + exp_syms[0] = EXPONENT_CHAR; + exp_syms[1] = TOUPPER (EXPONENT_CHAR); + exp_syms[2] = '\0'; + + *ch = next_char (ref_l, ref_s, true); + if (strchr (exp_syms, *ch) != NO_TEXT) + { + *ch = next_char (ref_l, ref_s, true); + ret = (strchr ("+-0123456789", *ch) != NO_TEXT); + } + else + ret = (strchr ("0123456789", *ch) != NO_TEXT); + } + restore_state (ref_l, ref_s, ch); + return ret; +} + +/* Attribute for format item. */ + +static enum a68_attribute +get_format_item (char ch) +{ + switch (TOLOWER (ch)) + { + case 'a': + return FORMAT_ITEM_A; + case 'b': + return FORMAT_ITEM_B; + case 'c': + return FORMAT_ITEM_C; + case 'd': + return FORMAT_ITEM_D; + case 'e': + return FORMAT_ITEM_E; + case 'f': + return FORMAT_ITEM_F; + case 'g': + return FORMAT_ITEM_G; + case 'h': + return FORMAT_ITEM_H; + case 'i': + return FORMAT_ITEM_I; + case 'j': + return FORMAT_ITEM_J; + case 'k': + return FORMAT_ITEM_K; + case 'l': + case '/': + return FORMAT_ITEM_L; + case 'm': + return FORMAT_ITEM_M; + case 'n': + return FORMAT_ITEM_N; + case 'o': + return FORMAT_ITEM_O; + case 'p': + return FORMAT_ITEM_P; + case 'q': + return FORMAT_ITEM_Q; + case 'r': + return FORMAT_ITEM_R; + case 's': + return FORMAT_ITEM_S; + case 't': + return FORMAT_ITEM_T; + case 'u': + return FORMAT_ITEM_U; + case 'v': + return FORMAT_ITEM_V; + case 'w': + return FORMAT_ITEM_W; + case 'x': + return FORMAT_ITEM_X; + case 'y': + return FORMAT_ITEM_Y; + case 'z': + return FORMAT_ITEM_Z; + case '+': + return FORMAT_ITEM_PLUS; + case '-': + return FORMAT_ITEM_MINUS; + case POINT_CHAR: + return FORMAT_ITEM_POINT; + case '%': + return FORMAT_ITEM_ESCAPE; + default: + return STOP; + } +} + +/* Get next token from internal copy of source file. + + The kind of token is set via the passed pointer ATTR. + The contents of token is set in the scan_buf via SYM. + + The recognized tokens are, by reported ATTR: + + <unset> + End of file. + FORMAT_ITEM_* + Item in a format. + STATIC_REPLICATOR + INT denotation for a static replicator in a format. + BOLD_TAG + Bold tag. + IDENTIFIER + A "lower case" identifier. + IDENTIFIER_WITH_UNDERSCORES + A "lower case" identifier whose's at least one taggle + was found adjacent to an underscore. + REAL_DENOTATION + A REAL denotation. + POINT_SYMBOL + . + BITS_DENOTATION + A BITS denotation like 16rffff + INT_DENOTATION + An INT denotation. + ROW_CHAR_DENOTATION + A STRING denotation. + LITERAL + A literal denotation in a format. + STOP + Single-character symbols #$()[]{},;@|: + := /= :=: :/=: + The character is placed in SYM. + EQUALS_SYMBOL + The equality symbol. + OPERATOR + A predefined operator. +*/ + +static void +get_next_token (bool in_format, + LINE_T **ref_l, char **ref_s, + LINE_T **start_l, char **start_c, enum a68_attribute *att) +{ + char c = **ref_s; + char *sym = A68_PARSER (scan_buf); + + sym[0] = '\0'; + get_good_char (&c, ref_l, ref_s); + *start_l = *ref_l; + *start_c = *ref_s; + if (c == STOP_CHAR) + { + /* We are at EOF. */ + (sym++)[0] = STOP_CHAR; + sym[0] = '\0'; + return; + } + + if (in_format) + { + /* In a format. */ + const char *format_items = "/%\\+-.abcdefghijklmnopqrstuvwxyz"; + if (strchr (format_items, c) != NO_TEXT) + { + /* General format items. */ + (sym++)[0] = c; + sym[0] = NULL_CHAR; + *att = get_format_item (c); + (void) next_char (ref_l, ref_s, false); + return; + } + if (ISDIGIT (c)) + { + /* INT denotation for static replicator. */ + SCAN_DIGITS (c); + sym[0] = NULL_CHAR; + *att = STATIC_REPLICATOR; + return; + } + } + + if (ISUPPER (c)) + { + /* Bold taggles are enabled only in gnu68. */ + bool allow_one_under = !OPTION_STRICT (&A68_JOB); + + if (OPTION_STROPPING (&A68_JOB) == UPPER_STROPPING) + { + /* In UPPER stropping a bold tag is an upper case word. */ + while (ISUPPER (c)) + { + (sym++)[0] = c; + c = next_char (ref_l, ref_s, false, allow_one_under); + } + sym[0] = '\0'; + *att = BOLD_TAG; + } + else + { + /* In SUPPER stropping a bold tag is a capitalized word that may + contain letters and digits. */ + while (ISALPHA (c) || ISDIGIT (c)) + { + (sym++)[0] = c; + c = next_char (ref_l, ref_s, false, allow_one_under); + } + sym[0] = '\0'; + *att = BOLD_TAG; + } + } + else if (ISLOWER (c)) + { + /* In both UPPER and SUPPER stropping regimes a tag is a lower case word + which may contain letters and digits. + + In SUPPER stropping, however, it is not allowed to have blanks + separating the taggles within tags. */ + + bool allow_one_under = true; + bool found_under = false; + bool allow_typo = OPTION_STROPPING (&A68_JOB) != SUPPER_STROPPING; + + /* Lower case word - identifier. */ + while (ISLOWER (c) || ISDIGIT (c)) + { + (sym++)[0] = c; + c = next_char (ref_l, ref_s, allow_typo, allow_one_under, + &found_under); + } + + sym[0] = '\0'; + *att = found_under ? IDENTIFIER_WITH_UNDERSCORES : IDENTIFIER; + } + else if (c == POINT_CHAR) + { + /* Begins with a point symbol - point, L REAL denotation. */ + if (is_decimal_point (ref_l, ref_s, &c)) + { + (sym++)[0] = '0'; + (sym++)[0] = POINT_CHAR; + c = next_char (ref_l, ref_s, true); + SCAN_DIGITS (c); + if (is_exp_char (ref_l, ref_s, &c)) + SCAN_EXPONENT_PART (c); + sym[0] = '\0'; + *att = REAL_DENOTATION; + } + else + { + c = next_char (ref_l, ref_s, true); + (sym++)[0] = POINT_CHAR; + sym[0] = '\0'; + *att = POINT_SYMBOL; + } + } + else if (ISDIGIT (c)) + { + /* Something that begins with a digit: + L INT denotation, L REAL denotation. */ + SCAN_DIGITS (c); + + if (is_decimal_point (ref_l, ref_s, &c)) + { + c = next_char (ref_l, ref_s, true); + if (is_exp_char (ref_l, ref_s, &c)) + { + (sym++)[0] = POINT_CHAR; + (sym++)[0] = '0'; + SCAN_EXPONENT_PART (c); + *att = REAL_DENOTATION; + } + else + { + (sym++)[0] = POINT_CHAR; + SCAN_DIGITS (c); + if (is_exp_char (ref_l, ref_s, &c)) + SCAN_EXPONENT_PART (c); + *att = REAL_DENOTATION; + } + } + else if (is_exp_char (ref_l, ref_s, &c)) + { + SCAN_EXPONENT_PART (c); + *att = REAL_DENOTATION; + } + else if (is_radix_char (ref_l, ref_s, &c)) + { + (sym++)[0] = c; + c = next_char (ref_l, ref_s, true); + /* This is valid for both UPPER and SUPPER stropping. */ + while (ISDIGIT (c) || strchr ("abcdef", c) != NO_TEXT) + { + (sym++)[0] = c; + c = next_char (ref_l, ref_s, true); + } + *att = BITS_DENOTATION; + } + else + { + *att = INT_DENOTATION; + } + sym[0] = '\0'; + } + else if (c == QUOTE_CHAR) + { + /* STRING denotation. */ + bool stop = false; + + while (!stop) + { + c = next_char (ref_l, ref_s, false); + while (c != QUOTE_CHAR && c != STOP_CHAR) + { + if (c == APOSTROPHE_CHAR) + { + (sym++)[0] = c; + c = next_char (ref_l, ref_s, false); + switch (c) + { + case APOSTROPHE_CHAR: + case 'n': + case 'f': + case 'r': + case 't': + (sym++)[0] = c; + break; + case '(': + { + unsigned int num_code_points = 0; + + (sym++)[0] = c; + /* Process code points. */ + while (1) + { + /* Skip white spaces. */ + while (1) + { + c = next_char (ref_l, ref_s, false); + if (!ISSPACE (c)) + break; + } + + /* See if we are done. */ + if (c == ')') + { + SCAN_ERROR (num_code_points == 0, *start_l, *ref_s, + "expected at least one character point in string break"); + (sym++)[0] = c; + break; + } + else if (c == 'u' || c == 'U') + { + (sym++)[0] = c; + /* Process a code point. */ + char u = c; + int numdigits = (u == 'u' ? 4 : 8); + char *startpos = *ref_s; + int i = 0; + do + { + c = next_char (ref_l, ref_s, false); + if (!(ISDIGIT (c) + || ((c >= 'a') && (c <= 'f')) + || ((c >= 'A') && (c <= 'F')))) + { + SCAN_ERROR (true, *start_l, startpos, + (u == 'u' + ? "expected four hex digits in \ +string break character point" + : "expected eight hex digits in \ +string break character point")); + } + (sym++)[0] = c; + i += 1; + } + while (i < numdigits); + + /* Skip white spaces. */ + while (1) + { + c = next_char (ref_l, ref_s, false); + if (!ISSPACE (c)) + break; + } + + /* Comma or end of list. */ + if (c == ')') + { + (sym++)[0] = c; + break; + } + + SCAN_ERROR (c != ',', *start_l, *ref_s, + "expected , or ) in string break"); + } + else + { + SCAN_ERROR (true, *start_l, *ref_s, + "unterminated list of character codes"); + } + } + break; + } + default: + SCAN_ERROR (true, *start_l, *ref_s, "invalid string break sequence"); + } + } + else + { + SCAN_ERROR (EOL (c), *start_l, *start_c, "string exceeds end of line"); + (sym++)[0] = c; + } + c = next_char (ref_l, ref_s, false); + } + SCAN_ERROR (*ref_l == NO_LINE, *start_l, *start_c, "unterminated string"); + c = next_char (ref_l, ref_s, false); + if (c == QUOTE_CHAR) + (sym++)[0] = QUOTE_CHAR; + else + stop = true; + } + sym[0] = '\0'; + *att = (in_format ? LITERAL : ROW_CHAR_DENOTATION); + } + else if (strchr ("#$()[]{},;@", c) != NO_TEXT) + { + /* Single character symbols. */ + (sym++)[0] = c; + (void) next_char (ref_l, ref_s, false); + sym[0] = '\0'; + *att = STOP; + } + else if (c == '|') + { + /* Bar. */ + (sym++)[0] = c; + c = next_char (ref_l, ref_s, false); + if (c == ':') + { + (sym++)[0] = c; + (void) next_char (ref_l, ref_s, false); + } + sym[0] = '\0'; + *att = STOP; + } + else if (c == ':') + { + /* Colon, semicolon, IS, ISNT. */ + (sym++)[0] = c; + c = next_char (ref_l, ref_s, false); + if (c == '=') + { + (sym++)[0] = c; + if ((c = next_char (ref_l, ref_s, false)) == ':') + { + (sym++)[0] = c; + c = next_char (ref_l, ref_s, false); + } + } + else if (c == '/') + { + (sym++)[0] = c; + if ((c = next_char (ref_l, ref_s, false)) == '=') + { + (sym++)[0] = c; + if ((c = next_char (ref_l, ref_s, false)) == ':') + { + (sym++)[0] = c; + c = next_char (ref_l, ref_s, false); + } + } + } + else if (c == ':') + { + (sym++)[0] = c; + if ((c = next_char (ref_l, ref_s, false)) == '=') + (sym++)[0] = c; + } + + sym[0] = '\0'; + *att = STOP; + + } + else if (c == '=') + { + /* Operator starting with "=". */ + char *scanned = sym; + (sym++)[0] = c; + c = next_char (ref_l, ref_s, false); + if (strchr (NOMADS, c) != NO_TEXT) + { + (sym++)[0] = c; + c = next_char (ref_l, ref_s, false); + } + if (c == '=') + { + (sym++)[0] = c; + if (next_char (ref_l, ref_s, false) == ':') + { + (sym++)[0] = ':'; + c = next_char (ref_l, ref_s, false); + if (strlen (sym) < 4 && c == '=') + { + (sym++)[0] = '='; + (void) next_char (ref_l, ref_s, false); + } + } + } + else if (c == ':') + { + (sym++)[0] = c; + sym[0] = '\0'; + if (next_char (ref_l, ref_s, false) == '=') + { + (sym++)[0] = '='; + (void) next_char (ref_l, ref_s, false); + } + else + { + SCAN_ERROR (!(strcmp (scanned, "=:") == 0 || strcmp (scanned, "==:") == 0), + *start_l, *start_c, "invalid operator tag"); + } + } + sym[0] = '\0'; + if (strcmp (scanned, "=") == 0) + *att = EQUALS_SYMBOL; + else + *att = OPERATOR; + } + else if (strchr (MONADS, c) != NO_TEXT || strchr (NOMADS, c) != NO_TEXT) + { + /* Operator. */ + char *scanned = sym; + (sym++)[0] = c; + c = next_char (ref_l, ref_s, false); + if (strchr (NOMADS, c) != NO_TEXT) + { + (sym++)[0] = c; + c = next_char (ref_l, ref_s, false); + } + if (c == '=') + { + (sym++)[0] = c; + if (next_char (ref_l, ref_s, false) == ':') + { + (sym++)[0] = ':'; + c = next_char (ref_l, ref_s, false); + if (strlen (scanned) < 4 && c == '=') + { + (sym++)[0] = '='; + (void) next_char (ref_l, ref_s, false); + } + } + } + else if (c == ':') + { + (sym++)[0] = c; + sym[0] = '\0'; + if (next_char (ref_l, ref_s, false) == '=') + { + (sym++)[0] = '='; + sym[0] = '\0'; + (void) next_char (ref_l, ref_s, false); + } + else + { + SCAN_ERROR (strcmp (&(scanned[1]), "=:") != 0, + *start_l, *start_c, "invalid operator tag"); + } + } + sym[0] = '\0'; + *att = OPERATOR; + } + else + { + /* Afuuus ... strange characters!. */ + unworthy (*start_l, *start_c, (int) c); + } +} + +/* Whether att opens an embedded clause. */ + +static bool +open_nested_clause (int att) +{ + switch (att) + { + case OPEN_SYMBOL: + case BEGIN_SYMBOL: + case PAR_SYMBOL: + case IF_SYMBOL: + case CASE_SYMBOL: + case FOR_SYMBOL: + case FROM_SYMBOL: + case BY_SYMBOL: + case TO_SYMBOL: + case WHILE_SYMBOL: + case DO_SYMBOL: + case SUB_SYMBOL: + return true; + } + return false; +} + +/* Whether att closes an embedded clause. */ + +static bool +close_nested_clause (int att) +{ + switch (att) + { + case CLOSE_SYMBOL: + case END_SYMBOL: + case FI_SYMBOL: + case ESAC_SYMBOL: + case OD_SYMBOL: + case BUS_SYMBOL: + return true; + } + return false; +} + +/* Cast a string to lower case. */ + +static void +make_lower_case (char *p) +{ + for (; p != NO_TEXT && p[0] != '\0'; p++) + p[0] = TOLOWER (p[0]); +} + +/* Cast a string to upper case. */ + +static void +make_upper_case (char *p) +{ + for (; p != NO_TEXT && p[0] != '\0'; p++) + p[0] = TOUPPER (p[0]); +} + +/* Construct a linear list of tokens. */ + +static void +tokenise_source (NODE_T **root, int level, bool in_format, + LINE_T **l, char **s, LINE_T **start_l, + char **start_c) +{ + char *lpr = NO_TEXT; + int lprt = 0; + + while (l != NO_VAR && !A68_PARSER (stop_scanner)) + { + enum a68_attribute att = STOP; + get_next_token (in_format, l, s, start_l, start_c, &att); + + if (A68_PARSER (scan_buf)[0] == STOP_CHAR) + A68_PARSER (stop_scanner) = true; + else if (strlen (A68_PARSER (scan_buf)) > 0 || att == ROW_CHAR_DENOTATION || att == LITERAL) + { + KEYWORD_T *kw; + const char *c = NO_TEXT; + bool make_node = true; + const char *trailing = NO_TEXT; + + if (OPTION_STROPPING (&A68_JOB) == UPPER_STROPPING) + { + /* In UPPER stropping all symbols in R9.4.1 are expressed as bold + tags like "BEGIN", or symbols like "@". */ + + /* In this stropping regime there is no need to handle + identifiers for which taggles were adjacent to underscores + specially. */ + if (att != IDENTIFIER && att != IDENTIFIER_WITH_UNDERSCORES) + kw = a68_find_keyword (A68 (top_keyword), A68_PARSER (scan_buf)); + else + kw = NO_KEYWORD; + } + else + { + /* In SUPPER stropping all symbols in R9.4.1 are expressed as + tags like "begin", or symbols like "@". */ + + /* Normalize bold tags to all upper-case letters. */ + if (att == BOLD_TAG) + make_upper_case (A68_PARSER (scan_buf)); + + /* If any of the taggles of the scanned identifier were adjacent + to an underscore, that inhibits interpreting it as a + keyword. */ + if (att != BOLD_TAG && att != IDENTIFIER_WITH_UNDERSCORES) + kw = a68_find_keyword (A68 (top_keyword), A68_PARSER (scan_buf)); + else + kw = NO_KEYWORD; + } + + /* Beyond this point it is irrelevant whether an identifier had + taggles adjacent to an underscore. */ + if (att == IDENTIFIER_WITH_UNDERSCORES) + att = IDENTIFIER; + + if (kw == NO_KEYWORD || att == ROW_CHAR_DENOTATION) + { + if (att == IDENTIFIER) + make_lower_case (A68_PARSER (scan_buf)); + if (att != ROW_CHAR_DENOTATION && att != LITERAL) + { + size_t len = strlen (A68_PARSER (scan_buf)); + while (len >= 1 && A68_PARSER (scan_buf)[len - 1] == '_') + { + trailing = "_"; + A68_PARSER (scan_buf)[len - 1] = NULL_CHAR; + len--; + } + } + c = TEXT (a68_add_token (&A68 (top_token), A68_PARSER (scan_buf))); + } + else + { + if (IS (kw, TO_SYMBOL)) + { + /* Merge GO and TO to GOTO. */ + if (*root != NO_NODE && IS (*root, GO_SYMBOL)) + { + ATTRIBUTE (*root) = GOTO_SYMBOL; + NSYMBOL (*root) = TEXT (a68_find_keyword (A68 (top_keyword), "GOTO")); + make_node = false; + } + else + { + att = ATTRIBUTE (kw); + c = TEXT (kw); + } + } + else + { + if (OPTION_STROPPING (&A68_JOB) == UPPER_STROPPING) + { + if (att == 0 || att == BOLD_TAG) + att = ATTRIBUTE (kw); + } + else + { + if (att == 0 || att == IDENTIFIER) + att = ATTRIBUTE (kw); + } + + c = TEXT (kw); + /* Handle pragments. */ + if (att == STYLE_II_COMMENT_SYMBOL + || att == STYLE_I_COMMENT_SYMBOL + || att == BOLD_COMMENT_SYMBOL + || att == BOLD_COMMENT_BEGIN_SYMBOL + || att == BRIEF_COMMENT_BEGIN_SYMBOL) + { + char *nlpr = pragment (ATTRIBUTE (kw), l, s); + + if (lpr == NO_TEXT || (int) strlen (lpr) == 0) + lpr = nlpr; + else + { + char *stale = lpr; + lpr = a68_new_string (lpr, "\n\n", nlpr, NO_TEXT); + free (stale); + } + lprt = att; + make_node = false; + } + else if (att == STYLE_I_PRAGMAT_SYMBOL + || att == BOLD_PRAGMAT_SYMBOL) + { + char *nlpr = pragment (ATTRIBUTE (kw), l, s); + if (lpr == NO_TEXT || (int) strlen (lpr) == 0) + lpr = nlpr; + else + { + char *stale = lpr; + lpr = a68_new_string (lpr, "\n\n", nlpr, NO_TEXT); + free (stale); + } + lprt = att; + if (!A68_PARSER (stop_scanner)) + make_node = false; + } + } + } + /* Add token to the tree. */ + if (make_node) + { + NODE_T *q = a68_new_node (); + INFO (q) = a68_new_node_info (); + + switch (att) + { + case ASSIGN_SYMBOL: + case END_SYMBOL: + case ESAC_SYMBOL: + case OD_SYMBOL: + case OF_SYMBOL: + case FI_SYMBOL: + case CLOSE_SYMBOL: + case BUS_SYMBOL: + case COLON_SYMBOL: + case COMMA_SYMBOL: + case SEMI_SYMBOL: + GINFO (q) = NO_GINFO; + break; + default: + GINFO (q) = a68_new_genie_info (); + break; + } + + STATUS (q) = (STATUS_MASK_T) 0; + LINE (INFO (q)) = *start_l; + CHAR_IN_LINE (INFO (q)) = *start_c; + PRIO (INFO (q)) = 0; + PROCEDURE_LEVEL (INFO (q)) = 0; + ATTRIBUTE (q) = att; + NSYMBOL (q) = c; + PREVIOUS (q) = *root; + SUB (q) = NEXT (q) = NO_NODE; + TABLE (q) = NO_TABLE; + MOID (q) = NO_MOID; + TAX (q) = NO_TAG; + if (lpr != NO_TEXT) + { + NPRAGMENT (q) = lpr; + NPRAGMENT_TYPE (q) = lprt; + lpr = NO_TEXT; + lprt = 0; + } + if (*root != NO_NODE) + NEXT (*root) = q; + if (TOP_NODE (&A68_JOB) == NO_NODE) + TOP_NODE (&A68_JOB) = q; + *root = q; + if (trailing != NO_TEXT) + a68_warning (q, 0, + "ignoring trailing character H in A", + trailing, att); + } + /* Redirection in tokenising formats. The scanner is a recursive-descent type as + to know when it scans a format text and when not. */ + if (in_format && att == FORMAT_DELIMITER_SYMBOL) + return; + else if (!in_format && att == FORMAT_DELIMITER_SYMBOL) + tokenise_source (root, level + 1, true, l, s, start_l, start_c); + else if (in_format && open_nested_clause (att)) + { + NODE_T *z = PREVIOUS (*root); + + if (z != NO_NODE && a68_is_one_of (z, FORMAT_ITEM_N, FORMAT_ITEM_G, FORMAT_ITEM_H, + FORMAT_ITEM_F, STOP)) + { + tokenise_source (root, level, false, l, s, start_l, start_c); + } + else if (att == OPEN_SYMBOL) + ATTRIBUTE (*root) = FORMAT_OPEN_SYMBOL; + else if (OPTION_BRACKETS (&A68_JOB) && att == SUB_SYMBOL) + ATTRIBUTE (*root) = FORMAT_OPEN_SYMBOL; + } + else if (!in_format && level > 0 && open_nested_clause (att)) + tokenise_source (root, level + 1, false, l, s, start_l, start_c); + else if (!in_format && level > 0 && close_nested_clause (att)) + return; + else if (in_format && att == CLOSE_SYMBOL) + ATTRIBUTE (*root) = FORMAT_CLOSE_SYMBOL; + else if (OPTION_BRACKETS (&A68_JOB) && in_format && att == BUS_SYMBOL) + ATTRIBUTE (*root) = FORMAT_CLOSE_SYMBOL; + } + } +} + +/* Tokenise source file, build initial syntax tree. */ + +bool +a68_lexical_analyser (const char *filename) +{ + LINE_T *l = NO_LINE, *start_l = NO_LINE; + char *s = NO_TEXT, *start_c = NO_TEXT; + NODE_T *root = NO_NODE; + + /* Read the source file into lines. */ + if (!read_source_file (filename)) + return false; + + /* Start tokenising. */ + A68_PARSER (stop_scanner) = false; + if ((l = TOP_LINE (&A68_JOB)) != NO_LINE) + s = STRING (l); + tokenise_source (&root, 0, false, &l, &s, &start_l, &start_c); + + /* Note that A68_PARSER (scan_buf) and A68_PARSER (max_scan_buf_length) are + allocated by read_source_line. */ + free (A68_PARSER (scan_buf)); + A68_PARSER (scan_buf) = NULL; + A68_PARSER (max_scan_buf_length) = 0; + return true; +} -- 2.30.2
