Soar Kernel  9.3.2 08-06-12
 All Data Structures Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros
lexer.cpp
Go to the documentation of this file.
1 #include <portability.h>
2 
3 /*************************************************************************
4  * PLEASE SEE THE FILE "license.txt" (INCLUDED WITH THIS SOFTWARE PACKAGE)
5  * FOR LICENSE AND COPYRIGHT INFORMATION.
6  *************************************************************************/
7 
8 /*************************************************************************
9  *
10  * file: lexer.cpp
11  *
12  * =======================================================================
13  *
14  * lexer.c
15  *
16  * The lexer reads files and returns a stream of lexemes. Get_lexeme() is
17  * the main routine; it looks for the next lexeme in the input, and stores
18  * it in the global variable "lexeme". See the structure definition below.
19  *
20  * Restrictions: the lexer cannot read individual input lines longer than
21  * MAX_LEXER_LINE_LENGTH characters. Thus, a single lexeme can't be longer
22  * than that either.
23  *
24  * The lexer maintains a stack of files being read, in order to handle nested
25  * loads. Start_lex_from_file() and stop_lex_from_file() push and pop the
26  * stack. Immediately after start_lex_from_file(), the current lexeme (global
27  * variable) is undefined. Immediately after stop_lex_from_file(), the
28  * current lexeme is automatically restored to whatever it was just before
29  * the corresponding start_lex_from_file() call.
30  *
31  * Determine_possible_symbol_types_for_string() is a utility routine which
32  * figures out what kind(s) of symbol a given string could represent.
33  *
34  * Print_location_of_most_recent_lexeme() is used to print an indication
35  * of where a parser error occurred. It tries to print out the current
36  * source line with a pointer to where the error was detected.
37  *
38  * Current_lexer_parentheses_level() returns the current level of parentheses
39  * nesting (0 means no open paren's have been encountered).
40  * Skip_ahead_to_balanced_parentheses() eats lexemes until the appropriate
41  * closing paren is found (0 means eat until back at the top level).
42  *
43  * Fake_rparen_at_next_end_of_line() tells the lexer to insert a fake
44  * R_PAREN_LEXEME token the next time it reaches the end of a line.
45  *
46  * Set_lexer_allow_ids() tells the lexer whether to allow identifiers to
47  * be read. If FALSE, things that look like identifiers will be returned
48  * as SYM_CONSTANT_LEXEME's instead.
49  *
50  * BUGBUG There are still problems with Soar not being very friendly
51  * when users have typos in productions, particularly with mismatched
52  * braces and parens. see also parser.c
53  * =======================================================================
54  */
55 /* ======================================================================
56  lexer.c
57 
58  See comments in soarkernel.h for an overview.
59  ====================================================================== */
60 
61 #include <stdlib.h>
62 
63 #include "lexer.h"
64 #include "mem.h"
65 #include "kernel.h"
66 #include "agent.h"
67 #include "print.h"
68 #include "init_soar.h"
69 #include "xml.h"
70 
71 #include <math.h>
72 #include <ctype.h>
73 
74 #include <assert.h>
75 
76 //
77 // These three should be safe for re-entrancy. --JNW--
78 //
79 Bool constituent_char[256]; /* is the character a symbol constituent? */
80 Bool whitespace[256]; /* is the character whitespace? */
81 Bool number_starters[256]; /* could the character initiate a number? */
82 
83 /* ======================================================================
84  Start/Stop Lex from File
85 
86  The lexer maintains a stack of files being read, in order to handle nested
87  loads. Start_lex_from_file() and stop_lex_from_file() push and pop the
88  stack. Immediately after start_lex_from_file(), the current lexeme (agent
89  variable) is undefined. Immediately after stop_lex_from_file(), the
90  current lexeme is automatically restored to whatever it was just before
91  the corresponding start_lex_from_file() call.
92 ====================================================================== */
93 
94 void start_lex_from_file (agent* thisAgent, const char *filename,
95  FILE *already_opened_file) {
96  lexer_source_file *lsf;
97 
98  lsf = static_cast<lexer_source_file_struct *>(allocate_memory (thisAgent, sizeof(lexer_source_file),
100  lsf->saved_lexeme = thisAgent->lexeme;
101  lsf->saved_current_char = thisAgent->current_char;
102  lsf->parent_file = thisAgent->current_file;
103  thisAgent->current_file = lsf;
104  lsf->filename = make_memory_block_for_string (thisAgent, filename);
105  lsf->file = already_opened_file;
106  lsf->fake_rparen_at_eol = FALSE;
107  lsf->allow_ids = TRUE;
108  lsf->parentheses_level = 0;
111  lsf->current_line = 0;
112  lsf->current_column = 0;
113  lsf->buffer[0] = 0;
114  thisAgent->current_char = ' '; /* whitespace--to force immediate read of first line */
115 }
116 
117 void stop_lex_from_file (agent* thisAgent) {
118  lexer_source_file *lsf;
119 
120  if (reading_from_top_level(thisAgent)) {
121  print (thisAgent, "Internal error: tried to stop_lex_from_file at top level\n");
122  return;
123  }
124  lsf = thisAgent->current_file;
125  thisAgent->current_file = thisAgent->current_file->parent_file;
126  thisAgent->current_char = lsf->saved_current_char;
127  thisAgent->lexeme = lsf->saved_lexeme;
128 
129  free_memory_block_for_string (thisAgent, lsf->filename);
130  free_memory (thisAgent, lsf, MISCELLANEOUS_MEM_USAGE);
131 }
132 
133 /* ======================================================================
134  Get next char
135 
136  Get_next_char() gets the next character from the current input file and
137  puts it into the agent variable current_char.
138 ====================================================================== */
139 
140 void get_next_char (agent* thisAgent) {
141  char *s;
142 
143  if ( thisAgent->alternate_input_exit &&
144  (thisAgent->alternate_input_string == NULL) &&
145  (thisAgent->alternate_input_suffix == NULL) ) {
146  thisAgent->current_char = EOF;
147  //assert(0 && "error in lexer.cpp (control_c_handler() used to be called here)");
148  return;
149  }
150 
151  if (thisAgent->alternate_input_string != NULL)
152  {
153  thisAgent->current_char = *thisAgent->alternate_input_string++;
154 
155  if (thisAgent->current_char == '\0')
156  {
157  thisAgent->alternate_input_string = NIL;
158  thisAgent->current_char =
159  *thisAgent->alternate_input_suffix++;
160  }
161  }
162  else if (thisAgent->alternate_input_suffix != NULL)
163  {
164  thisAgent->current_char = *thisAgent->alternate_input_suffix++;
165 
166  if (thisAgent->current_char == '\0')
167  {
168  thisAgent->alternate_input_suffix = NIL;
169 
170  if ( thisAgent->alternate_input_exit ) {
171  thisAgent->current_char = EOF;
172  //assert(0 && "error in lexer.cpp (control_c_handler() used to be called here)");
173  return;
174  }
175 
176  thisAgent->current_char = thisAgent->current_file->buffer
177  [thisAgent->current_file->current_column++];
178  }
179  }
180  else
181  {
182  thisAgent->current_char = thisAgent->current_file->buffer
183  [thisAgent->current_file->current_column++];
184  }
185 
186  if (thisAgent->current_char) return;
187 
188  if ((thisAgent->current_file->current_column == BUFSIZE) &&
189  (thisAgent->current_file->buffer[BUFSIZE-2] != '\n') &&
190  (thisAgent->current_file->buffer[BUFSIZE-2] != EOF)) {
191  char msg[512];
192  SNPRINTF (msg, 512,
193  "lexer.c: Error: line too long (max allowed is %d chars)\nFile %s, line %llu\n",
195  static_cast<long long unsigned>(thisAgent->current_file->current_line));
196  msg[511] = 0; /* ensure null termination */
197 
198  abort_with_fatal_error(thisAgent, msg);
199  }
200 
201  s = fgets (thisAgent->current_file->buffer, BUFSIZE, thisAgent->current_file->file);
202 
203  if (s) {
204  thisAgent->current_file->current_line++;
205  if (reading_from_top_level(thisAgent)) {
207  }
208  } else {
209  /* s==NIL means immediate eof encountered or read error occurred */
210  if (! feof(thisAgent->current_file->file)) {
211  if(reading_from_top_level(thisAgent)) {
212  assert(0 && "error in lexer.cpp (control_c_handler() used to be called here)");
213  return;
214  } else {
215  print (thisAgent, "I/O error while reading file %s; ignoring the rest of it.\n",
216  thisAgent->current_file->filename);
217  }
218  }
219  thisAgent->current_file->buffer[0] = 0;
220  }
221  thisAgent->current_char = thisAgent->current_file->buffer[0];
222  thisAgent->current_file->current_column = 1;
223 }
224 
225 /* ======================================================================
226 
227  Lexer Utility Routines
228 
229 ====================================================================== */
230 
231 /*#define record_position_of_start_of_lexeme() { \
232  thisAgent->current_file->column_of_start_of_last_lexeme = \
233  thisAgent->current_file->current_column - 1; \
234  thisAgent->current_file->line_of_start_of_last_lexeme = \
235  thisAgent->current_file->current_line; }*/
237 {
239  thisAgent->current_file->current_column - 1;
241  thisAgent->current_file->current_line;
242 }
243 
244 /* redefined for Soar 7, want case-sensitivity to match Tcl. KJC 5/96
245 #define store_and_advance() { \
246  thisAgent->lexeme.string[thisAgent->lexeme.length++] = (isupper((char)thisAgent->current_char) ? \
247  tolower((char)thisAgent->current_char) : \
248  (char)thisAgent->current_char); \
249  get_next_char(); }
250 #define store_and_advance() { \
251  thisAgent->lexeme.string[thisAgent->lexeme.length++] = \
252  (char)thisAgent->current_char; \
253  get_next_char(); }*/
254 inline void store_and_advance(agent* thisAgent)
255 {
256  thisAgent->lexeme.string[thisAgent->lexeme.length++] = char(thisAgent->current_char);
257  get_next_char(thisAgent);
258 }
259 
260 /*#define finish() { thisAgent->lexeme.string[thisAgent->lexeme.length]=0; }*/
261 inline void finish(agent* thisAgent)
262 {
263  thisAgent->lexeme.string[thisAgent->lexeme.length]=0;
264 }
265 
266 void read_constituent_string (agent* thisAgent) {
267 #ifdef __SC__
268  char *buf;
269  int i,len;
270 #endif
271 
272  while ((thisAgent->current_char!=EOF) &&
273  constituent_char[static_cast<unsigned char>(thisAgent->current_char)])
274  store_and_advance(thisAgent);
275  finish(thisAgent);
276 }
277 
279  /* --- at entry, current_char=="."; we read the "." and rest of number --- */
280  store_and_advance(thisAgent);
281  while (isdigit(thisAgent->current_char)) store_and_advance(thisAgent); /* string of digits */
282  if ((thisAgent->current_char=='e')||(thisAgent->current_char=='E')) {
283  store_and_advance(thisAgent); /* E */
284  if ((thisAgent->current_char=='+')||(thisAgent->current_char=='-'))
285  store_and_advance(thisAgent); /* optional leading + or - */
286  while (isdigit(thisAgent->current_char)) store_and_advance(thisAgent); /* string of digits */
287  }
288  finish(thisAgent);
289 
290 #ifdef __SC__
291  if (strcmp("soar>",thisAgent->lexeme.string)) { /* if the lexeme doesn't equal "soar>" */
292  if (!(strncmp("soar>",thisAgent->lexeme.string,5))) { /* but the first 5 chars are "soar>" */
293  /* then SIOW messed up so ignore the "soar>" */
294  buf = (char *)allocate_memory(thisAgent, (len=(strlen(thisAgent->lexeme.string)+1))*sizeof(char),STRING_MEM_USAGE);
295  for (i=0;i<=len;i++) {
296  buf[i] = thisAgent->lexeme.string[i];
297  }
298  for (i=5;i<=len;i++) {
299  thisAgent->lexeme.string[i-5] = buf[i];
300  }
301  free_memory_block_for_string(thisAgent, buf);
302  }
303  }
304 #endif
305 }
306 
308  Bool possible_id, possible_var, possible_sc, possible_ic, possible_fc;
309  Bool rereadable;
310 
312  thisAgent->lexeme.length,
313  &possible_id,
314  &possible_var,
315  &possible_sc,
316  &possible_ic,
317  &possible_fc,
318  &rereadable);
319 
320  if (possible_var) {
321  thisAgent->lexeme.type = VARIABLE_LEXEME;
322  return TRUE;
323  }
324 
325  if (possible_ic) {
326  errno = 0;
327  thisAgent->lexeme.type = INT_CONSTANT_LEXEME;
328  thisAgent->lexeme.int_val = strtol (thisAgent->lexeme.string,NULL,10);
329  if (errno) {
330  print (thisAgent, "Error: bad integer (probably too large)\n");
332  thisAgent->lexeme.int_val = 0;
333  }
334  return (errno == 0);
335  }
336 
337  if (possible_fc) {
338  errno = 0;
339  thisAgent->lexeme.type = FLOAT_CONSTANT_LEXEME;
340  thisAgent->lexeme.float_val = strtod (thisAgent->lexeme.string,NULL);
341  if (errno) {
342  print (thisAgent, "Error: bad floating point number\n");
344  thisAgent->lexeme.float_val = 0.0;
345  }
346  return (errno == 0);
347  }
348 
349  if (thisAgent->current_file->allow_ids && possible_id) {
350  // long term identifiers start with @
351  unsigned lti_index = 0;
352  if (thisAgent->lexeme.string[lti_index] == '@') {
353  lti_index += 1;
354  }
355  thisAgent->lexeme.id_letter = static_cast<char>(toupper(thisAgent->lexeme.string[lti_index]));
356  lti_index += 1;
357  errno = 0;
358  thisAgent->lexeme.type = IDENTIFIER_LEXEME;
359  if (!from_c_string(thisAgent->lexeme.id_number, &(thisAgent->lexeme.string[lti_index]))) {
360  print (thisAgent, "Error: bad number for identifier (probably too large)\n");
362  thisAgent->lexeme.id_number = 0;
363  errno = 1;
364  }
365  return (errno == 0);
366  }
367 
368  if (possible_sc) {
369  thisAgent->lexeme.type = SYM_CONSTANT_LEXEME;
370  if (thisAgent->sysparams[PRINT_WARNINGS_SYSPARAM]) {
371  if ( (thisAgent->lexeme.string[0] == '<') ||
372  (thisAgent->lexeme.string[thisAgent->lexeme.length-1] == '>') )
373  {
374  print (thisAgent, "Warning: Suspicious string constant \"%s\"\n", thisAgent->lexeme.string);
376  xml_generate_warning(thisAgent, "Warning: Suspicious string constant");
377  }
378  }
379  return TRUE;
380  }
381 
382  thisAgent->lexeme.type = QUOTED_STRING_LEXEME;
383  return TRUE;
384 }
385 
386 void do_fake_rparen (agent* thisAgent) {
388  thisAgent->lexeme.type = R_PAREN_LEXEME;
389  thisAgent->lexeme.length = 1;
390  thisAgent->lexeme.string[0] = ')';
391  thisAgent->lexeme.string[1] = 0;
392  if (thisAgent->current_file->parentheses_level > 0) thisAgent->current_file->parentheses_level--;
393  thisAgent->current_file->fake_rparen_at_eol = FALSE;
394 }
395 
396 /* ======================================================================
397  Lex such-and-such Routines
398 
399  These routines are called from get_lexeme(). Which routine gets called
400  depends on the first character of the new lexeme being read. Each routine's
401  job is to finish reading the lexeme and store the necessary items in
402  the agent variable "lexeme".
403 ====================================================================== */
404 
405 void lex_unknown (agent* thisAgent);
406 #define lu lex_unknown
407 
408 //
409 // This should be safe for re-entrant code. --JNW--
410 //
411 void (*(lexer_routines[256]))(agent*) =
412 {
413  lu,lu,lu,lu,lu,lu,lu,lu,lu,lu,lu,lu,lu,lu,lu,lu,
414  lu,lu,lu,lu,lu,lu,lu,lu,lu,lu,lu,lu,lu,lu,lu,lu,
415  lu,lu,lu,lu,lu,lu,lu,lu,lu,lu,lu,lu,lu,lu,lu,lu,
416  lu,lu,lu,lu,lu,lu,lu,lu,lu,lu,lu,lu,lu,lu,lu,lu,
417  lu,lu,lu,lu,lu,lu,lu,lu,lu,lu,lu,lu,lu,lu,lu,lu,
418  lu,lu,lu,lu,lu,lu,lu,lu,lu,lu,lu,lu,lu,lu,lu,lu,
419  lu,lu,lu,lu,lu,lu,lu,lu,lu,lu,lu,lu,lu,lu,lu,lu,
420  lu,lu,lu,lu,lu,lu,lu,lu,lu,lu,lu,lu,lu,lu,lu,lu,
421  lu,lu,lu,lu,lu,lu,lu,lu,lu,lu,lu,lu,lu,lu,lu,lu,
422  lu,lu,lu,lu,lu,lu,lu,lu,lu,lu,lu,lu,lu,lu,lu,lu,
423  lu,lu,lu,lu,lu,lu,lu,lu,lu,lu,lu,lu,lu,lu,lu,lu,
424  lu,lu,lu,lu,lu,lu,lu,lu,lu,lu,lu,lu,lu,lu,lu,lu,
425  lu,lu,lu,lu,lu,lu,lu,lu,lu,lu,lu,lu,lu,lu,lu,lu,
426  lu,lu,lu,lu,lu,lu,lu,lu,lu,lu,lu,lu,lu,lu,lu,lu,
427  lu,lu,lu,lu,lu,lu,lu,lu,lu,lu,lu,lu,lu,lu,lu,lu,
428  lu,lu,lu,lu,lu,lu,lu,lu,lu,lu,lu,lu,lu,lu,lu,lu,
429 };
430 
431 void lex_eof (agent* thisAgent) {
432  if (thisAgent->current_file->fake_rparen_at_eol) {
433  do_fake_rparen(thisAgent);
434  return;
435  }
436  store_and_advance(thisAgent);
437  finish(thisAgent);
438  thisAgent->lexeme.type = EOF_LEXEME;
439 }
440 
441 void lex_at (agent* thisAgent) {
442  store_and_advance(thisAgent);
443  finish(thisAgent);
444  thisAgent->lexeme.type = AT_LEXEME;
445 }
446 
447 void lex_tilde (agent* thisAgent) {
448  store_and_advance(thisAgent);
449  finish(thisAgent);
450  thisAgent->lexeme.type = TILDE_LEXEME;
451 }
452 
453 void lex_up_arrow (agent* thisAgent) {
454  store_and_advance(thisAgent);
455  finish(thisAgent);
456  thisAgent->lexeme.type = UP_ARROW_LEXEME;
457 }
458 
459 void lex_lbrace (agent* thisAgent) {
460  store_and_advance(thisAgent);
461  finish(thisAgent);
462  thisAgent->lexeme.type = L_BRACE_LEXEME;
463 }
464 
465 void lex_rbrace (agent* thisAgent) {
466  store_and_advance(thisAgent);
467  finish(thisAgent);
468  thisAgent->lexeme.type = R_BRACE_LEXEME;
469 }
470 
471 void lex_exclamation_point (agent* thisAgent) {
472  store_and_advance(thisAgent);
473  finish(thisAgent);
474  thisAgent->lexeme.type = EXCLAMATION_POINT_LEXEME;
475 }
476 
477 void lex_comma (agent* thisAgent) {
478  store_and_advance(thisAgent);
479  finish(thisAgent);
480  thisAgent->lexeme.type = COMMA_LEXEME;
481 }
482 
483 void lex_equal (agent* thisAgent) {
484  /* Lexeme might be "=", or symbol */
485  /* Note: this routine relies on = being a constituent character */
486 
487  read_constituent_string(thisAgent);
488  if (thisAgent->lexeme.length==1) { thisAgent->lexeme.type = EQUAL_LEXEME; return; }
490 }
491 
492 void lex_ampersand (agent* thisAgent) {
493  /* Lexeme might be "&", or symbol */
494  /* Note: this routine relies on & being a constituent character */
495 
496  read_constituent_string(thisAgent);
497  if (thisAgent->lexeme.length==1) { thisAgent->lexeme.type = AMPERSAND_LEXEME; return; }
499 }
500 
501 void lex_lparen (agent* thisAgent) {
502  store_and_advance(thisAgent);
503  finish(thisAgent);
504  thisAgent->lexeme.type = L_PAREN_LEXEME;
505  thisAgent->current_file->parentheses_level++;
506 }
507 
508 void lex_rparen (agent* thisAgent) {
509  store_and_advance(thisAgent);
510  finish(thisAgent);
511  thisAgent->lexeme.type = R_PAREN_LEXEME;
512  if (thisAgent->current_file->parentheses_level > 0) thisAgent->current_file->parentheses_level--;
513 }
514 
515 void lex_greater (agent* thisAgent) {
516  /* Lexeme might be ">", ">=", ">>", or symbol */
517  /* Note: this routine relies on =,> being constituent characters */
518 
519  read_constituent_string(thisAgent);
520  if (thisAgent->lexeme.length==1) { thisAgent->lexeme.type = GREATER_LEXEME; return; }
521  if (thisAgent->lexeme.length==2) {
522  if (thisAgent->lexeme.string[1]=='>') { thisAgent->lexeme.type = GREATER_GREATER_LEXEME; return;}
523  if (thisAgent->lexeme.string[1]=='=') { thisAgent->lexeme.type = GREATER_EQUAL_LEXEME; return; }
524  }
526 }
527 
528 void lex_less (agent* thisAgent) {
529  /* Lexeme might be "<", "<=", "<=>", "<>", "<<", or variable */
530  /* Note: this routine relies on =,<,> being constituent characters */
531 
532  read_constituent_string(thisAgent);
533  if (thisAgent->lexeme.length==1) { thisAgent->lexeme.type = LESS_LEXEME; return; }
534  if (thisAgent->lexeme.length==2) {
535  if (thisAgent->lexeme.string[1]=='>') { thisAgent->lexeme.type = NOT_EQUAL_LEXEME; return; }
536  if (thisAgent->lexeme.string[1]=='=') { thisAgent->lexeme.type = LESS_EQUAL_LEXEME; return; }
537  if (thisAgent->lexeme.string[1]=='<') { thisAgent->lexeme.type = LESS_LESS_LEXEME; return; }
538  }
539  if (thisAgent->lexeme.length==3) {
540  if ((thisAgent->lexeme.string[1]=='=')&&(thisAgent->lexeme.string[2]=='>'))
541  { thisAgent->lexeme.type = LESS_EQUAL_GREATER_LEXEME; return; }
542  }
544 
545 }
546 
547 void lex_period (agent* thisAgent) {
548  store_and_advance(thisAgent);
549  finish(thisAgent);
550  /* --- if we stopped at '.', it might be a floating-point number, so be
551  careful to check for this case --- */
552  if (isdigit(thisAgent->current_char)) read_rest_of_floating_point_number(thisAgent);
553  if (thisAgent->lexeme.length==1) { thisAgent->lexeme.type = PERIOD_LEXEME; return; }
555 }
556 
557 void lex_plus (agent* thisAgent) {
558  /* Lexeme might be +, number, or symbol */
559  /* Note: this routine relies on various things being constituent chars */
560  int i;
561  Bool could_be_floating_point;
562 
563  read_constituent_string(thisAgent);
564  /* --- if we stopped at '.', it might be a floating-point number, so be
565  careful to check for this case --- */
566  if (thisAgent->current_char=='.') {
567  could_be_floating_point = TRUE;
568  for (i=1; i<thisAgent->lexeme.length; i++)
569  if (! isdigit(thisAgent->lexeme.string[i])) could_be_floating_point = FALSE;
570  if (could_be_floating_point) read_rest_of_floating_point_number(thisAgent);
571  }
572  if (thisAgent->lexeme.length==1) { thisAgent->lexeme.type = PLUS_LEXEME; return; }
574 }
575 
576 void lex_minus (agent* thisAgent) {
577  /* Lexeme might be -, -->, number, or symbol */
578  /* Note: this routine relies on various things being constituent chars */
579  int i;
580  Bool could_be_floating_point;
581 
582  read_constituent_string(thisAgent);
583  /* --- if we stopped at '.', it might be a floating-point number, so be
584  careful to check for this case --- */
585  if (thisAgent->current_char=='.') {
586  could_be_floating_point = TRUE;
587  for (i=1; i<thisAgent->lexeme.length; i++)
588  if (! isdigit(thisAgent->lexeme.string[i])) could_be_floating_point = FALSE;
589  if (could_be_floating_point) read_rest_of_floating_point_number(thisAgent);
590  }
591  if (thisAgent->lexeme.length==1) { thisAgent->lexeme.type = MINUS_LEXEME; return; }
592  if (thisAgent->lexeme.length==3) {
593  if ((thisAgent->lexeme.string[1]=='-')&&(thisAgent->lexeme.string[2]=='>'))
594  { thisAgent->lexeme.type = RIGHT_ARROW_LEXEME; return; }
595  }
597 }
598 
599 void lex_digit (agent* thisAgent) {
600  int i;
601  Bool could_be_floating_point;
602 
603  read_constituent_string(thisAgent);
604  /* --- if we stopped at '.', it might be a floating-point number, so be
605  careful to check for this case --- */
606  if (thisAgent->current_char=='.') {
607  could_be_floating_point = TRUE;
608  for (i=1; i<thisAgent->lexeme.length; i++)
609  if (! isdigit(thisAgent->lexeme.string[i])) could_be_floating_point = FALSE;
610  if (could_be_floating_point) read_rest_of_floating_point_number(thisAgent);
611  }
613 }
614 
615 void lex_unknown (agent* thisAgent) {
616  if(reading_from_top_level(thisAgent) && thisAgent->current_char == 0) {
617  }
618  else {
619  print (thisAgent, "Error: Unknown character encountered by lexer, code=%d\n",
620  thisAgent->current_char);
621  print (thisAgent, "File %s, line %lu, column %lu.\n", thisAgent->current_file->filename,
622  thisAgent->current_file->current_line,
623  thisAgent->current_file->current_column);
624  if (! reading_from_top_level(thisAgent)) {
625  //respond_to_load_errors (thisAgent);
626  if (thisAgent->load_errors_quit)
627  thisAgent->current_char = EOF;
628  }
629  }
630  get_next_char(thisAgent);
631  get_lexeme(thisAgent);
632 }
633 
634 void lex_constituent_string (agent* thisAgent) {
635  read_constituent_string(thisAgent);
637 }
638 
639 void lex_vbar (agent* thisAgent) {
640  thisAgent->lexeme.type = SYM_CONSTANT_LEXEME;
641  get_next_char(thisAgent);
642  do {
643  if ((thisAgent->current_char==EOF)||
644  (thisAgent->lexeme.length==MAX_LEXEME_LENGTH)) {
645  print (thisAgent, "Error: opening '|' without closing '|'\n");
647  /* BUGBUG if reading from top level, don't want to signal EOF */
648  thisAgent->lexeme.type = EOF_LEXEME;
649  thisAgent->lexeme.string[0]=EOF;
650  thisAgent->lexeme.string[1]=0;
651  thisAgent->lexeme.length = 1;
652  return;
653  }
654  if (thisAgent->current_char=='\\') {
655  get_next_char(thisAgent);
656  thisAgent->lexeme.string[thisAgent->lexeme.length++] = char(thisAgent->current_char);
657  get_next_char(thisAgent);
658  } else if (thisAgent->current_char=='|') {
659  get_next_char(thisAgent);
660  break;
661  } else {
662  thisAgent->lexeme.string[thisAgent->lexeme.length++] = char(thisAgent->current_char);
663  get_next_char(thisAgent);
664  }
665  } while(TRUE);
666  thisAgent->lexeme.string[thisAgent->lexeme.length]=0;
667 }
668 
669 void lex_quote (agent* thisAgent) {
670  thisAgent->lexeme.type = QUOTED_STRING_LEXEME;
671  get_next_char(thisAgent);
672  do {
673  if ((thisAgent->current_char==EOF)||(thisAgent->lexeme.length==MAX_LEXEME_LENGTH)) {
674  print (thisAgent, "Error: opening '\"' without closing '\"'\n");
676  /* BUGBUG if reading from top level, don't want to signal EOF */
677  thisAgent->lexeme.type = EOF_LEXEME;
678  thisAgent->lexeme.string[0]=0;
679  thisAgent->lexeme.length = 1;
680  return;
681  }
682  if (thisAgent->current_char=='\\') {
683  get_next_char(thisAgent);
684  thisAgent->lexeme.string[thisAgent->lexeme.length++] = char(thisAgent->current_char);
685  get_next_char(thisAgent);
686  } else if (thisAgent->current_char=='"') {
687  get_next_char(thisAgent);
688  break;
689  } else {
690  thisAgent->lexeme.string[thisAgent->lexeme.length++] = char(thisAgent->current_char);
691  get_next_char(thisAgent);
692  }
693  } while(TRUE);
694  thisAgent->lexeme.string[thisAgent->lexeme.length]=0;
695 }
696 
697 /* AGR 562 begin */
698 
699 /* There are 2 functions here, for 2 different schemes for handling the
700  shell escape.
701  Scheme 1: A '$' signals that all the rest of the text up to the '\n'
702  is to be passed to the system() command verbatim. The whole string,
703  including the '$' as its first character, is stored in a single
704  lexeme which has the type DOLLAR_STRING_LEXEME.
705  Scheme 2: A '$' is a single lexeme, much like a '(' or '&'. All the
706  subsequent lexemes are gotten individually with calls to get_lexeme().
707  This makes it easier to parse the shell command, so that commands like
708  cd, pushd, popd, etc. can be trapped and the equivalent Soar commands
709  executed instead. The problem with this scheme is that pulling the
710  string apart into lexemes eliminates any special spacing the user may
711  have done in specifying the shell command. For that reason, my current
712  plan is to follow scheme 1. AGR 3-Jun-94 */
713 
714 void lex_dollar (agent* thisAgent) {
715  thisAgent->lexeme.type = DOLLAR_STRING_LEXEME;
716  thisAgent->lexeme.string[0] = '$';
717  thisAgent->lexeme.length = 1;
718  get_next_char(thisAgent); /* consume the '$' */
719  while ((thisAgent->current_char!='\n') &&
720  (thisAgent->current_char!=EOF) &&
721  (thisAgent->lexeme.length < MAX_LEXEME_LENGTH-1)) {
722  thisAgent->lexeme.string[thisAgent->lexeme.length++] =
723  char(thisAgent->current_char);
724  get_next_char(thisAgent);
725  }
726  thisAgent->lexeme.string[thisAgent->lexeme.length] = '\0';
727 }
728 
729 /*
730 void lex_dollar (void) {
731  store_and_advance();
732  finish();
733  thisAgent->lexeme.type = DOLLAR_STRING_LEXEME;
734 }
735 */
736 
737 /* AGR 562 end */
738 
739 /* ======================================================================
740  Get lexeme
741 
742  This is the main routine called from outside the lexer. It reads past
743  any whitespace, then calls some lex_xxx routine (using the lexer_routines[]
744  table) based on the first character of the lexeme.
745 ====================================================================== */
746 
747 void get_lexeme (agent* thisAgent) {
748 
749  /* AGR 568 begin */
750  if (thisAgent->lex_alias) {
751  thisAgent->lexeme = thisAgent->lex_alias->lexeme;
752  thisAgent->lex_alias = thisAgent->lex_alias->next;
753  return;
754  }
755  /* AGR 568 end */
756 
757  thisAgent->lexeme.length = 0;
758  thisAgent->lexeme.string[0] = 0;
759 
760 /* AGR 534 The only time a prompt should be printed out is if there's
761  a command being expected; ie. the prompt shouldn't print out if we're
762  in the middle of entering a production. So if we're in the middle of
763  entering a production, then the parentheses level will be > 0, so that's
764  the criteria we will use. AGR 5-Apr-94 */
765 
766  thisAgent->load_errors_quit = FALSE; /* AGR 527c */
767 
768  while (thisAgent->load_errors_quit==FALSE) { /* AGR 527c */
769  if (thisAgent->current_char==EOF) break;
770  if (whitespace[static_cast<unsigned char>(thisAgent->current_char)]) {
771  if (thisAgent->current_char == '\n')
772  {
773  if (thisAgent->current_file->fake_rparen_at_eol) {
774  do_fake_rparen(thisAgent);
775  return;
776  }
777  }
778  get_next_char(thisAgent);
779  continue;
780  }
781 
782 //#ifdef USE_TCL
783  if (thisAgent->current_char==';') {
784  /* --- skip the semi-colon, forces newline in TCL --- */
785  get_next_char(thisAgent); /* consume it */
786  continue;
787  }
788  if (thisAgent->current_char=='#') {
789  /* --- read from hash to end-of-line --- */
790  while ((thisAgent->current_char!='\n') &&
791  (thisAgent->current_char!=EOF))
792  get_next_char(thisAgent);
793  if (thisAgent->current_file->fake_rparen_at_eol) {
794  do_fake_rparen(thisAgent);
795  return;
796  }
797  if (thisAgent->current_char!=EOF) get_next_char(thisAgent);
798  continue;
799  }
800 //#else
801 // if (thisAgent->current_char==';') {
802 // /* --- read from semicolon to end-of-line --- */
803 // while ((thisAgent->current_char!='\n') &&
804 // (thisAgent->current_char!=EOF))
805 // get_next_char(thisAgent);
806 // if (thisAgent->current_file->fake_rparen_at_eol) {
807 // do_fake_rparen(thisAgent);
808 // return;
809 // }
810 // if (thisAgent->current_char!=EOF) get_next_char(thisAgent);
811 // continue;
812 // }
813 // if (thisAgent->current_char=='#') {
814 // /* --- comments surrounded by "#|" and "|#" delimiters --- */
815 // record_position_of_start_of_lexeme(); /* in case of later error mesg. */
816 // get_next_char(thisAgent);
817 // if (thisAgent->current_char!='|') {
818 // print ("Error: '#' not followed by '|'\n");
819 // print_location_of_most_recent_lexeme(thisAgent);
820 // continue;
821 // }
822 // get_next_char(thisAgent); /* consume the vbar */
823 // while (TRUE) {
824 // if (thisAgent->current_char==EOF) {
825 // print ("Error: '#|' without terminating '|#'\n");
826 // print_location_of_most_recent_lexeme(thisAgent);
827 // break;
828 // }
829 // if (thisAgent->current_char!='|') { get_next_char(thisAgent); continue; }
830 // get_next_char(thisAgent);
831 // if (thisAgent->current_char=='#') break;
832 // }
833 // get_next_char(thisAgent); /* consume the closing '#' */
834 // continue; /* continue outer while(TRUE), reading more whitespace */
835 // }
836 //#endif /* USE_TCL */
837  break; /* if no whitespace or comments found, break out of the loop */
838  }
839  /* --- no more whitespace, so go get the actual lexeme --- */
841  if (thisAgent->current_char!=EOF)
842  (*(lexer_routines[static_cast<unsigned char>(thisAgent->current_char)]))(thisAgent);
843  else
844  lex_eof(thisAgent);
845 }
846 
847 /* ======================================================================
848  Init lexer
849 
850  This should be called before anything else in this file. It does all
851  the necessary init stuff for the lexer, and starts the lexer reading from
852  standard input.
853 ====================================================================== */
854 
855 
856 //
857 // This file badly need to be locked. Probably not the whole thing, but certainly the last
858 // call to start_lext_from_file. It does a memory allocation and other things that should
859 // never happen more than once.
860 //
861 void init_lexer (agent* thisAgent)
862 {
863  static bool initialized = false;
864 
865  if(!initialized)
866  {
867  initialized = true;
868 
869  unsigned int i;
870 
871  /* --- setup constituent_char array --- */
872  char extra_constituents[] = "$%&*+-/:<=>?_@";
873  for (i=0; i<256; i++)
874  {
875  //
876  // When i == 1, strchr returns true based on the terminating
877  // character. This is not the intent, so we exclude that case
878  // here.
879  //
880  if((strchr(extra_constituents, i) != 0) && i != 0)
881  {
883  }
884  else
885  {
886  constituent_char[i] = (isalnum(i) != 0);
887  }
888  }
889 
890  // for (i=0; i<strlen(extra_constituents); i++)
891  // {
892  // constituent_char[(int)extra_constituents[i]]=TRUE;
893  // }
894 
895  /* --- setup whitespace array --- */
896  for (i=0; i<256; i++)
897  {
898  whitespace[i] = (isspace(i) != 0);
899  }
900 
901  /* --- setup number_starters array --- */
902  for (i=0; i<256; i++)
903  {
904  switch(i)
905  {
906  case '+':
907  number_starters[(int)'+']=TRUE;
908  break;
909  case '-':
910  number_starters[(int)'-']=TRUE;
911  break;
912  case '.':
913  number_starters[(int)'.']=TRUE;
914  break;
915  default:
916  number_starters[i] = (isdigit(i) != 0);
917  }
918  }
919 
920  /* --- setup lexer_routines array --- */
921  //
922  // I go to some effort here to insure that values do not
923  // get overwritten. That could cause problems in a multi-
924  // threaded sense because values could get switched to one
925  // value and then another. If a value is only ever set to
926  // one thing, resetting it to the same thing should be
927  // perfectly safe.
928  //
929  for (i=0; i<256; i++)
930  {
931  switch(i)
932  {
933  case '@':
934  lexer_routines[(int)'@'] = lex_at;
935  break;
936  case '(':
937  lexer_routines[(int)'('] = lex_lparen;
938  break;
939  case ')':
940  lexer_routines[(int)')'] = lex_rparen;
941  break;
942  case '+':
943  lexer_routines[(int)'+'] = lex_plus;
944  break;
945  case '-':
946  lexer_routines[(int)'-'] = lex_minus;
947  break;
948  case '~':
949  lexer_routines[(int)'~'] = lex_tilde;
950  break;
951  case '^':
953  break;
954  case '{':
955  lexer_routines[(int)'{'] = lex_lbrace;
956  break;
957  case '}':
958  lexer_routines[(int)'}'] = lex_rbrace;
959  break;
960  case '!':
962  break;
963  case '>':
965  break;
966  case '<':
967  lexer_routines[(int)'<'] = lex_less;
968  break;
969  case '=':
970  lexer_routines[(int)'='] = lex_equal;
971  break;
972  case '&':
974  break;
975  case '|':
976  lexer_routines[(int)'|'] = lex_vbar;
977  break;
978  case ',':
979  lexer_routines[(int)','] = lex_comma;
980  break;
981  case '.':
982  lexer_routines[(int)'.'] = lex_period;
983  break;
984  case '"':
985  lexer_routines[(int)'"'] = lex_quote;
986  break;
987  case '$':
988  lexer_routines[(int)'$'] = lex_dollar; /* AGR 562 */
989  break;
990  default:
991  if (isdigit(i))
992  {
994  continue;
995  }
996 
997  if (constituent_char[i])
998  {
1000  continue;
1001  }
1002  }
1003  }
1004  }
1005 
1006  /* --- initially we're reading from the standard input --- */
1007  start_lex_from_file (thisAgent, "[standard input]", stdin);
1008 }
1009 
1010 /* ======================================================================
1011  Print location of most recent lexeme
1012 
1013  This routine is used to print an indication of where a parser or interface
1014  command error occurred. It tries to print out the current source line
1015  with a pointer to where the error was detected. If the current source
1016  line is no longer available, it just prints out the line number instead.
1017 
1018  BUGBUG: if the input line contains any tabs, the pointer comes out in
1019  the wrong place.
1020 ====================================================================== */
1021 
1023  int i;
1024 
1025  if (thisAgent->current_file->line_of_start_of_last_lexeme ==
1026  thisAgent->current_file->current_line) {
1027  /* --- error occurred on current line, so print out the line --- */
1028  if (! reading_from_top_level(thisAgent)) {
1029  print (thisAgent, "File %s, line %lu:\n", thisAgent->current_file->filename,
1030  thisAgent->current_file->current_line);
1031  /* respond_to_load_errors (); AGR 527a */
1032  }
1033  if (thisAgent->current_file->buffer[strlen(thisAgent->current_file->buffer)-1]=='\n')
1034  print_string (thisAgent, thisAgent->current_file->buffer);
1035  else
1036  print (thisAgent, "%s\n",thisAgent->current_file->buffer);
1037  for (i=0; i<thisAgent->current_file->column_of_start_of_last_lexeme; i++)
1038  print_string (thisAgent, "-");
1039  print_string (thisAgent, "^\n");
1040 
1041  if (! reading_from_top_level(thisAgent)) {
1042  //respond_to_load_errors (thisAgent); /* AGR 527a */
1043  if (thisAgent->load_errors_quit)
1044  thisAgent->current_char = EOF;
1045  }
1046 
1047 /* AGR 527a The respond_to_load_errors call came too early (above),
1048  and the "continue" prompt appeared before the offending line was printed
1049  out, so the respond_to_load_errors call was moved here.
1050  AGR 26-Apr-94 */
1051 
1052  } else {
1053  /* --- error occurred on a previous line, so just give the position --- */
1054  print (thisAgent, "File %s, line %lu, column %lu.\n", thisAgent->current_file->filename,
1057  if (! reading_from_top_level(thisAgent)) {
1058  //respond_to_load_errors (thisAgent);
1059  if (thisAgent->load_errors_quit)
1060  thisAgent->current_char = EOF;
1061  }
1062  }
1063 }
1064 
1065 /* ======================================================================
1066  Parentheses Utilities
1067 
1068  Current_lexer_parentheses_level() returns the current level of parentheses
1069  nesting (0 means no open paren's have been encountered).
1070 
1071  Skip_ahead_to_balanced_parentheses() eats lexemes until the appropriate
1072  closing paren is found (0 means eat until back at the top level).
1073 
1074  Fake_rparen_at_next_end_of_line() tells the lexer to insert a fake
1075  R_PAREN_LEXEME token the next time it reaches the end of a line.
1076 ====================================================================== */
1077 
1079  return thisAgent->current_file->parentheses_level;
1080 }
1081 
1083  int parentheses_level) {
1084  while (TRUE) {
1085  if (thisAgent->lexeme.type==EOF_LEXEME) return;
1086  if ((thisAgent->lexeme.type==R_PAREN_LEXEME) &&
1087  (parentheses_level==thisAgent->current_file->parentheses_level)) return;
1088  get_lexeme(thisAgent);
1089  }
1090 }
1091 
1093  thisAgent->current_file->parentheses_level++;
1094  thisAgent->current_file->fake_rparen_at_eol = TRUE;
1095 }
1096 
1097 /* ======================================================================
1098  Set lexer allow ids
1099 
1100  This routine should be called to tell the lexer whether to allow
1101  identifiers to be read. If FALSE, things that look like identifiers
1102  will be returned as SYM_CONSTANT_LEXEME's instead.
1103 ====================================================================== */
1104 
1105 void set_lexer_allow_ids (agent* thisAgent, Bool allow_identifiers) {
1106  thisAgent->current_file->allow_ids = allow_identifiers;
1107 }
1108 
1110  return thisAgent->current_file->allow_ids;
1111 }
1112 
1113 /* ======================================================================
1114  Determine possible symbol types for string
1115 
1116  This is a utility routine which figures out what kind(s) of symbol a
1117  given string could represent. At entry: s, length_of_s represent the
1118  string. At exit: possible_xxx is set to TRUE/FALSE to indicate
1119  whether the given string could represent that kind of symbol; rereadable
1120  is set to TRUE indicating whether the lexer would read the given string
1121  as a symbol with exactly the same name (as opposed to treating it as a
1122  special lexeme like "+", changing upper to lower case, etc.
1123 ====================================================================== */
1124 
1126  size_t length_of_s,
1127  Bool *possible_id,
1128  Bool *possible_var,
1129  Bool *possible_sc,
1130  Bool *possible_ic,
1131  Bool *possible_fc,
1132  Bool *rereadable) {
1133  char *ch;
1134  Bool all_alphanum;
1135 
1136  *possible_id = FALSE;
1137  *possible_var = FALSE;
1138  *possible_sc = FALSE;
1139  *possible_ic = FALSE;
1140  *possible_fc = FALSE;
1141  *rereadable = FALSE;
1142 
1143  /* --- check if it's an integer or floating point number --- */
1144  if (number_starters[static_cast<unsigned char>(*s)]) {
1145  ch = s;
1146  if ((*ch=='+')||(*ch=='-'))
1147  ch++; /* optional leading + or - */
1148  while (isdigit(*ch))
1149  ch++; /* string of digits */
1150  if ((*ch==0)&&(isdigit(*(ch-1))))
1151  *possible_ic = TRUE;
1152  if (*ch=='.') {
1153  ch++; /* decimal point */
1154  while (isdigit(*ch))
1155  ch++; /* string of digits */
1156  if ((*ch=='e')||(*ch=='E')) {
1157  ch++; /* E */
1158  if ((*ch=='+')||(*ch=='-'))
1159  ch++; /* optional leading + or - */
1160  while (isdigit(*ch))
1161  ch++; /* string of digits */
1162  }
1163  if (*ch==0)
1164  *possible_fc = TRUE;
1165  }
1166  }
1167 
1168  /* --- make sure it's entirely constituent characters --- */
1169  for (ch=s; *ch!=0; ch++)
1170  if (! constituent_char[static_cast<unsigned char>(*ch)])
1171  return;
1172 
1173  /* --- check for rereadability --- */
1174  all_alphanum = TRUE;
1175  for (ch=s; *ch!='\0'; ch++) {
1176  if (!isalnum(*ch)) {
1177  all_alphanum = FALSE;
1178  break;
1179  }
1180  }
1181  if ( all_alphanum ||
1182  (length_of_s > LENGTH_OF_LONGEST_SPECIAL_LEXEME) ||
1183  ((length_of_s==1)&&(*s=='*')) )
1184  {
1185  *rereadable = TRUE;
1186  }
1187 
1188  /* --- any string of constituents could be a sym constant --- */
1189  *possible_sc = TRUE;
1190 
1191  /* --- check whether it's a variable --- */
1192  if ((*s=='<')&&(*(s+length_of_s-1)=='>'))
1193  *possible_var = TRUE;
1194 
1195  /* --- check if it's an identifier --- */
1196  // long term identifiers start with @
1197  if (*s == '@') {
1198  ch = s+1;
1199  } else {
1200  ch = s;
1201  }
1202  if (isalpha(*ch) && *(++ch) != '\0') {
1203  /* --- is the rest of the string an integer? --- */
1204  while (isdigit(*ch))
1205  ch++;
1206  if (*ch=='\0')
1207  *possible_id = TRUE;
1208  }
1209 }
1210