git.sur5r.net Git - cc65/blob - src/cc65/scanner.c

   1 /*
   2  * scanner.c
   3  *
   4  * Ullrich von Bassewitz, 07.06.1998
   5  */
   6
   7
   8
   9 #include <stdio.h>
  10 #include <stdlib.h>
  11 #include <string.h>
  12 #include <errno.h>
  13 #include <ctype.h>
  14
  15 #include "ctrans.h"
  16 #include "datatype.h"
  17 #include "error.h"
  18 #include "function.h"
  19 #include "global.h"
  20 #include "ident.h"
  21 #include "input.h"
  22 #include "litpool.h"
  23 #include "preproc.h"
  24 #include "symtab.h"
  25 #include "util.h"
  26 #include "scanner.h"
  27
  28
  29
  30 /*****************************************************************************/
  31 /*                                   data                                    */
  32 /*****************************************************************************/
  33
  34
  35
   36 Token CurTok;           /* The current token; NextToken() copies NextTok into it */
   37 Token NextTok;          /* The next (lookahead) token; filled in by NextToken() */
  38
  39
  40
   41 /* Token types. FindKey() returns TOK_IDENT for a TT_EXT keyword whenever
       * ANSI is non-zero; all other keywords are always recognized.
       */
   42 #define TT_C    0               /* ANSI C token */
   43 #define TT_EXT  1               /* cc65 extension */
  44
  45 /* Token table */
  46 static const struct Keyword {
  47     char*           Key;        /* Keyword name */
  48     unsigned char   Tok;        /* The token */
  49     unsigned char   Type;       /* Token type */
  50 } Keywords [] = {
  51     { "__AX__",         TOK_AX,         TT_C    },
  52     { "__EAX__",        TOK_EAX,        TT_C    },
  53     { "__asm__",        TOK_ASM,        TT_C    },
  54     { "__attribute__",  TOK_ATTRIBUTE,  TT_C    },
  55     { "__far__",        TOK_FAR,        TT_C    },
  56     { "__fastcall__",   TOK_FASTCALL,   TT_C    },
  57     { "asm",            TOK_ASM,        TT_EXT  },
  58     { "auto",           TOK_AUTO,       TT_C    },
  59     { "break",          TOK_BREAK,      TT_C    },
  60     { "case",           TOK_CASE,       TT_C    },
  61     { "char",           TOK_CHAR,       TT_C    },
  62     { "const",          TOK_CONST,      TT_C    },
  63     { "continue",       TOK_CONTINUE,   TT_C    },
  64     { "default",        TOK_DEFAULT,    TT_C    },
  65     { "do",             TOK_DO,         TT_C    },
  66     { "double",         TOK_DOUBLE,     TT_C    },
  67     { "else",           TOK_ELSE,       TT_C    },
  68     { "enum",           TOK_ENUM,       TT_C    },
  69     { "extern",         TOK_EXTERN,     TT_C    },
  70     { "far",            TOK_FAR,        TT_EXT  },
  71     { "fastcall",       TOK_FASTCALL,   TT_EXT  },
  72     { "float",          TOK_FLOAT,      TT_C    },
  73     { "for",            TOK_FOR,        TT_C    },
  74     { "goto",           TOK_GOTO,       TT_C    },
  75     { "if",             TOK_IF,         TT_C    },
  76     { "int",            TOK_INT,        TT_C    },
  77     { "long",           TOK_LONG,       TT_C    },
  78     { "register",       TOK_REGISTER,   TT_C    },
  79     { "return",         TOK_RETURN,     TT_C    },
  80     { "short",          TOK_SHORT,      TT_C    },
  81     { "signed",         TOK_SIGNED,     TT_C    },
  82     { "sizeof",         TOK_SIZEOF,     TT_C    },
  83     { "static",         TOK_STATIC,     TT_C    },
  84     { "struct",         TOK_STRUCT,     TT_C    },
  85     { "switch",         TOK_SWITCH,     TT_C    },
  86     { "typedef",        TOK_TYPEDEF,    TT_C    },
  87     { "union",          TOK_UNION,      TT_C    },
  88     { "unsigned",       TOK_UNSIGNED,   TT_C    },
  89     { "void",           TOK_VOID,       TT_C    },
  90     { "volatile",       TOK_VOLATILE,   TT_C    },
  91     { "while",          TOK_WHILE,      TT_C    },
  92 };
  93 #define KEY_COUNT       (sizeof (Keywords) / sizeof (Keywords [0]))
  94
  95
  96
   97 /* Stuff for determining the type of an integer constant. NextToken() keeps
       * a bit set of these candidate types, clears the ones that the suffix or
       * the value rules out, and then picks the first remaining one in the order
       * int, unsigned int, long, unsigned long.
       */
   98 #define IT_INT          0x01    /* Constant may be of type int */
   99 #define IT_UINT         0x02    /* Constant may be of type unsigned int */
  100 #define IT_LONG         0x04    /* Constant may be of type long */
  101 #define IT_ULONG        0x08    /* Constant may be of type unsigned long */
 102
 103
 104
 105 /*****************************************************************************/
 106 /*                                   code                                    */
 107 /*****************************************************************************/
 108
 109
 110
 111 static int CmpKey (const void* Key, const void* Elem)
 112 /* Compare function for bsearch */
 113 {
 114     return strcmp ((const char*) Key, ((const struct Keyword*) Elem)->Key);
 115 }
 116
 117
 118
 119 static int FindKey (const char* Key)
 120 /* Find a keyword and return the token. Return IDENT if the token is not a
 121  * keyword.
 122  */
 123 {
 124     struct Keyword* K;
 125     K = bsearch (Key, Keywords, KEY_COUNT, sizeof (Keywords [0]), CmpKey);
 126     if (K && (K->Type != TT_EXT || ANSI == 0)) {
 127         return K->Tok;
 128     } else {
 129         return TOK_IDENT;
 130     }
 131 }
 132
 133
 134
 135 static int SkipWhite (void)
 136 /* Skip white space in the input stream, reading and preprocessing new lines
 137  * if necessary. Return 0 if end of file is reached, return 1 otherwise.
 138  */
 139 {
 140     while (1) {
 141         while (CurC == 0) {
 142             if (NextLine () == 0) {
 143                 return 0;
 144             }
 145             Preprocess ();
 146         }
 147         if (CurC == ' ' || CurC == '\r') {
 148             NextChar ();
 149         } else {
 150             return 1;
 151         }
 152     }
 153 }
 154
 155
 156
 157 void SymName (char* s)
 158 /* Get symbol from input stream */
 159 {
 160     unsigned k = 0;
 161     do {
 162         if (k != MAX_IDENTLEN) {
 163             ++k;
 164             *s++ = CurC;
 165         }
 166         NextChar ();
 167     } while (IsIdent (CurC) || isdigit (CurC));
 168     *s = '\0';
 169 }
 170
 171
 172
 173 int IsSym (char *s)
 174 /* Get symbol from input stream or return 0 if not a symbol. */
 175 {
 176     if (IsIdent (CurC)) {
 177         SymName (s);
 178         return 1;
 179     } else {
 180         return 0;
 181     }
 182 }
 183
 184
 185
 186 static void unknown (char C)
 187 /* Error message for unknown character */
 188 {
 189     Error (ERR_INVALID_CHAR, C);
 190     NextChar ();                        /* Skip */
 191 }
 192
 193
 194
 195 static unsigned hexval (int c)
 196 /* Convert a hex digit into a value */
 197 {
 198     if (!isxdigit (c)) {
 199         Error (ERR_ILLEGAL_HEX_DIGIT);
 200     }
 201     if (isdigit (c)) {
 202         return c - '0';
 203     } else {
 204         return toupper (c) - 'A' + 10;
 205     }
 206 }
 207
 208
 209
 210 static void SetTok (int tok)
 211 /* set nxttok and bump line ptr */
 212 {
 213     nxttok = tok;
 214     NextChar ();
 215 }
 216
 217
 218
 219 static int SignExtendChar (int C)
 220 /* Do correct sign extension of a character */
 221 {
 222     if (SignedChars && (C & 0x80) != 0) {
 223         return C | ~0xFF;
 224     } else {
 225         return C & 0xFF;
 226     }
 227 }
 228
 229
 230
 231 static int ParseChar (void)
 232 /* Parse a character. Converts \n into EOL, etc. */
 233 {
 234     int i;
 235     unsigned val;
 236     int C;
 237
 238     /* Check for escape chars */
 239     if (CurC == '\\') {
 240         NextChar ();
 241         switch (CurC) {
 242             case 'b':
 243                 C = '\b';
 244                 break;
 245             case 'f':
 246                 C = '\f';
 247                 break;
 248             case 'r':
 249                 C = '\r';
 250                 break;
 251             case 'n':
 252                 C = '\n';
 253                 break;
 254             case 't':
 255                 C = '\t';
 256                 break;
 257             case '\"':
 258                 C = '\"';
 259                 break;
 260             case '\'':
 261                 C = '\'';
 262                 break;
 263             case '\\':
 264                 C = '\\';
 265                 break;
 266             case 'x':
 267             case 'X':
 268                 /* Hex character constant */
 269                 NextChar ();
 270                 val = hexval (CurC) << 4;
 271                 NextChar ();
 272                 C = val | hexval (CurC);        /* Do not translate */
 273                 break;
 274             case '0':
 275             case '1':
 276                 /* Octal constant */
 277                 i = 0;
 278                 C = CurC - '0';
 279                 while (NextC >= '0' && NextC <= '7' && i++ < 4) {
 280                     NextChar ();
 281                     C = (C << 3) | (CurC - '0');
 282                 }
 283                 break;
 284             default:
 285                 Error (ERR_ILLEGAL_CHARCONST);
 286                 C = ' ';
 287                 break;
 288         }
 289     } else {
 290         C = CurC;
 291     }
 292
 293     /* Skip the character read */
 294     NextChar ();
 295
 296     /* Do correct sign extension */
 297     return SignExtendChar (C);
 298 }
 299
 300
 301
 302 static void CharConst (void)
 303 /* Parse a character constant. */
 304 {
 305     int C;
 306
 307     /* Skip the quote */
 308     NextChar ();
 309
 310     /* Get character */
 311     C = ParseChar ();
 312
 313     /* Check for closing quote */
 314     if (CurC != '\'') {
 315         Error (ERR_QUOTE_EXPECTED);
 316     } else {
 317         /* Skip the quote */
 318         NextChar ();
 319     }
 320
 321     /* Setup values and attributes */
 322     nxttok  = TOK_CCONST;
 323     nxtval  = SignExtendChar (ctrans (C));      /* Translate into target charset */
 324     nxttype = type_int;                         /* Character constants have type int */
 325 }
 326
 327
 328
 329 static void StringConst (void)
 330 /* Parse a quoted string */
 331 {
 332     nxtval = GetLiteralOffs ();
 333     nxttok = TOK_SCONST;
 334
 335     /* Be sure to concatenate strings */
 336     while (CurC == '\"') {
 337
 338         /* Skip the quote char */
 339         NextChar ();
 340
 341         while (CurC != '\"') {
 342             if (CurC == '\0') {
 343                 Error (ERR_UNEXPECTED_NEWLINE);
 344                 break;
 345             }
 346             AddLiteralChar (ParseChar ());
 347         }
 348
 349         /* Skip closing quote char if there was one */
 350         NextChar ();
 351
 352         /* Skip white space, read new input */
 353         SkipWhite ();
 354
 355     }
 356
 357     /* Terminate the string */
 358     AddLiteralChar ('\0');
 359 }
 360
 361
 362
  363 void NextToken (void)
  364 /* Get next token from input stream.
       *
       * The old lookahead token (NextTok) becomes the current token (CurTok),
       * then a new lookahead token is scanned: white space is skipped and,
       * depending on the current character, an integer constant, a keyword or
       * identifier (including the special symbols __FILE__, __LINE__,
       * __fixargs__ and __func__), a string or character constant, or an
       * operator/punctuator is recognized. At the end of the input the next
       * token is TOK_CEOF.
       *
       * NOTE(review): nxttok, nxtval and nxttype are presumably aliases for
       * fields of NextTok declared elsewhere - confirm against scanner.h.
       */
  365 {
  366     ident token;
  367
  368     /* Current token is the lookahead token */
  369     CurTok = NextTok;
  370
  371     /* Remember the starting position of the next token. This is done before
           * white space is skipped, so the recorded line is the one that is
           * current at this point.
           */
  372     NextTok.Pos = GetCurrentLine();
  373
  374     /* Skip spaces and read the next line if needed */
  375     if (SkipWhite () == 0) {
  376         /* End of file reached */
  377         nxttok = TOK_CEOF;
  378         return;
  379     }
  380
  381     /* Determine the next token from the lookahead */
  382     if (isdigit (CurC)) {
  383
  384         /* A number */
  385         int HaveSuffix;         /* True if we have a type suffix */
  386         unsigned types;         /* Possible types */
  387         unsigned base;
  388         unsigned long k;        /* Value */
  389
              /* A decimal constant without suffix is never of type unsigned int */
  390         k     = 0;
  391         base  = 10;
  392         types = IT_INT | IT_LONG | IT_ULONG;
  393
  394         if (CurC == '0') {
  395             /* Octal or hex constants may also be of type unsigned int */
  396             types = IT_INT | IT_UINT | IT_LONG | IT_ULONG;
  397             /* gobble 0 and examine next char */
  398             NextChar ();
  399             if (toupper (CurC) == 'X') {
  400                 base = 16;
                      /* NOTE(review): this assignment has no effect, nxttype is
                       * always set again by the type selection further down.
                       */
  401                 nxttype = type_uint;
  402                 NextChar ();    /* gobble "x" */
  403             } else {
  404                 base = 8;
  405             }
  406         }
              /* Accumulate the digits. NOTE(review): with base 8 the digits 8
               * and 9 are accepted here without a diagnostic, and the value is
               * accumulated without an overflow check.
               */
  407         while (1) {
  408             if (isdigit (CurC)) {
  409                 k = k * base + (CurC - '0');
  410             } else if (base == 16 && isxdigit (CurC)) {
  411                 k = (k << 4) + hexval (CurC);
  412             } else {
  413                 break;          /* not digit */
  414             }
  415             NextChar ();        /* gobble char */
  416         }
  417
  418         /* Check for a suffix: u, l, ul or lu in any case. Assume there is
               * one, the flag is cleared again if none is found.
               */
  419         HaveSuffix = 1;
  420         if (CurC == 'u' || CurC == 'U') {
  421             /* Unsigned type */
  422             NextChar ();
  423             if (toupper (CurC) != 'L') {
  424                 types = IT_UINT | IT_ULONG;
  425             } else {
  426                 NextChar ();
  427                 types = IT_ULONG;
  428             }
  429         } else if (CurC == 'l' || CurC == 'L') {
  430             /* Long type */
  431             NextChar ();
  432             if (toupper (CurC) != 'U') {
  433                 types = IT_LONG | IT_ULONG;
  434             } else {
  435                 NextChar ();
  436                 types = IT_ULONG;
  437             }
  438         } else {
  439             HaveSuffix = 0;
  440         }
  441
  442         /* Check the range to determine the type */
  443         if (k > 0x7FFF) {
  444             /* Out of range for int */
  445             types &= ~IT_INT;
  446             /* If the value is in the range 0x8000..0xFFFF, unsigned int is not
  447              * allowed, and we don't have a type specifying suffix, emit a
  448              * warning.
  449              */
  450             if (k <= 0xFFFF && (types & IT_UINT) == 0 && !HaveSuffix) {
  451                 Warning (WARN_CONSTANT_IS_LONG);
  452             }
  453         }
  454         if (k > 0xFFFF) {
  455             /* Out of range for unsigned int */
  456             types &= ~IT_UINT;
  457         }
  458         if (k > 0x7FFFFFFF) {
  459             /* Out of range for long int */
  460             types &= ~IT_LONG;
  461         }
  462
  463         /* Now set the type string to the smallest type in types */
  464         if (types & IT_INT) {
  465             nxttype = type_int;
  466         } else if (types & IT_UINT) {
  467             nxttype = type_uint;
  468         } else if (types & IT_LONG) {
  469             nxttype = type_long;
  470         } else {
  471             nxttype = type_ulong;
  472         }
  473
  474         /* Set the value and the token */
  475         nxtval = k;
  476         nxttok = TOK_ICONST;
  477         return;
  478     }
  479
          /* Anything that starts like an identifier is read into token */
  480     if (IsSym (token)) {
  481
  482         /* Check for a keyword */
  483         if ((nxttok = FindKey (token)) != TOK_IDENT) {
  484             /* Reserved word found */
  485             return;
  486         }
  487         /* No reserved word, check for special symbols */
  488         if (token [0] == '_') {
  489             /* Special symbols */
  490             if (strcmp (token, "__FILE__") == 0) {
                      /* Name of the current file as string constant */
  491                 nxtval = AddLiteral (GetCurrentFile());
  492                 nxttok = TOK_SCONST;
  493                 return;
  494             } else if (strcmp (token, "__LINE__") == 0) {
                      /* Current line as integer constant of type int */
  495                 nxttok  = TOK_ICONST;
  496                 nxtval  = GetCurrentLine();
  497                 nxttype = type_int;
  498                 return;
  499             } else if (strcmp (token, "__fixargs__") == 0) {
                      /* Result of GetParamSize for the current function as
                       * integer constant of type unsigned int.
                       */
  500                 nxttok  = TOK_ICONST;
  501                 nxtval  = GetParamSize (CurrentFunc);
  502                 nxttype = type_uint;
  503                 return;
  504             } else if (strcmp (token, "__func__") == 0) {
  505                 /* __func__ is only defined in functions. Outside of a
                       * function it falls through and is handled as an ordinary
                       * identifier.
                       */
  506                 if (CurrentFunc) {
  507                     nxtval = AddLiteral (GetFuncName (CurrentFunc));
  508                     nxttok = TOK_SCONST;
  509                     return;
  510                 }
  511             }
  512         }
  513
  514         /* No reserved word but identifier */
  515         strcpy (NextTok.Ident, token);
  516         NextTok.Tok = TOK_IDENT;
  517         return;
  518     }
  519
  520     /* Monstrous switch statement ahead... Operators and punctuators: cases
           * that need lookahead skip the first character with NextChar and
           * inspect the following one; SetTok stores the token and skips the
           * character that completed it, while a plain assignment to nxttok
           * leaves the current character for the next token.
           */
  521     switch (CurC) {
  522
  523         case '!':
  524             NextChar ();
  525             if (CurC == '=') {
  526                 SetTok (TOK_NE);
  527             } else {
  528                 nxttok = TOK_BOOL_NOT;
  529             }
  530             break;
  531
  532         case '\"':
  533             StringConst ();
  534             break;
  535
  536         case '%':
  537             NextChar ();
  538             if (CurC == '=') {
  539                 SetTok (TOK_MOD_ASSIGN);
  540             } else {
  541                 nxttok = TOK_MOD;
  542             }
  543             break;
  544
  545         case '&':
  546             NextChar ();
  547             switch (CurC) {
  548                 case '&':
  549                     SetTok (TOK_BOOL_AND);
  550                     break;
  551                 case '=':
  552                     SetTok (TOK_AND_ASSIGN);
  553                     break;
  554                 default:
  555                     nxttok = TOK_AND;
  556             }
  557             break;
  558
  559         case '\'':
  560             CharConst ();
  561             break;
  562
  563         case '(':
  564             SetTok (TOK_LPAREN);
  565             break;
  566
  567         case ')':
  568             SetTok (TOK_RPAREN);
  569             break;
  570
  571         case '*':
  572             NextChar ();
  573             if (CurC == '=') {
  574                 SetTok (TOK_MUL_ASSIGN);
  575             } else {
  576                 nxttok = TOK_STAR;
  577             }
  578             break;
  579
  580         case '+':
  581             NextChar ();
  582             switch (CurC) {
  583                 case '+':
  584                     SetTok (TOK_INC);
  585                     break;
  586                 case '=':
  587                     SetTok (TOK_PLUS_ASSIGN);
  588                     break;
  589                 default:
  590                     nxttok = TOK_PLUS;
  591             }
  592             break;
  593
  594         case ',':
  595             SetTok (TOK_COMMA);
  596             break;
  597
  598         case '-':
  599             NextChar ();
  600             switch (CurC) {
  601                 case '-':
  602                     SetTok (TOK_DEC);
  603                     break;
  604                 case '=':
  605                     SetTok (TOK_MINUS_ASSIGN);
  606                     break;
  607                 case '>':
  608                     SetTok (TOK_PTR_REF);
  609                     break;
  610                 default:
  611                     nxttok = TOK_MINUS;
  612             }
  613             break;
  614
  615         case '.':
                  /* Either a single dot or an ellipsis ("...") */
  616             NextChar ();
  617             if (CurC == '.') {
  618                 NextChar ();
  619                 if (CurC == '.') {
  620                     SetTok (TOK_ELLIPSIS);
  621                 } else {
                          /* NOTE(review): two dots followed by something else;
                           * nxttok is not assigned on this path.
                           */
  622                     unknown (CurC);
  623                 }
  624             } else {
  625                 nxttok = TOK_DOT;
  626             }
  627             break;
  628
  629         case '/':
                  /* Only "/" and "/=" are handled, comments are not recognized
                   * here. NOTE(review): presumably they are removed before the
                   * scanner sees the line - confirm against Preprocess().
                   */
  630             NextChar ();
  631             if (CurC == '=') {
  632                 SetTok (TOK_DIV_ASSIGN);
  633             } else {
  634                 nxttok = TOK_DIV;
  635             }
  636             break;
  637
  638         case ':':
  639             SetTok (TOK_COLON);
  640             break;
  641
  642         case ';':
  643             SetTok (TOK_SEMI);
  644             break;
  645
  646         case '<':
  647             NextChar ();
  648             switch (CurC) {
  649                 case '=':
  650                     SetTok (TOK_LE);
  651                     break;
  652                 case '<':
  653                     NextChar ();
  654                     if (CurC == '=') {
  655                         SetTok (TOK_SHL_ASSIGN);
  656                     } else {
  657                         nxttok = TOK_SHL;
  658                     }
  659                     break;
  660                 default:
  661                     nxttok = TOK_LT;
  662             }
  663             break;
  664
  665         case '=':
  666             NextChar ();
  667             if (CurC == '=') {
  668                 SetTok (TOK_EQ);
  669             } else {
  670                 nxttok = TOK_ASSIGN;
  671             }
  672             break;
  673
  674         case '>':
  675             NextChar ();
  676             switch (CurC) {
  677                 case '=':
  678                     SetTok (TOK_GE);
  679                     break;
  680                 case '>':
  681                     NextChar ();
  682                     if (CurC == '=') {
  683                         SetTok (TOK_SHR_ASSIGN);
  684                     } else {
  685                         nxttok = TOK_SHR;
  686                     }
  687                     break;
  688                 default:
  689                     nxttok = TOK_GT;
  690             }
  691             break;
  692
  693         case '?':
  694             SetTok (TOK_QUEST);
  695             break;
  696
  697         case '[':
  698             SetTok (TOK_LBRACK);
  699             break;
  700
  701         case ']':
  702             SetTok (TOK_RBRACK);
  703             break;
  704
  705         case '^':
  706             NextChar ();
  707             if (CurC == '=') {
  708                 SetTok (TOK_XOR_ASSIGN);
  709             } else {
  710                 nxttok = TOK_XOR;
  711             }
  712             break;
  713
  714         case '{':
  715             SetTok (TOK_LCURLY);
  716             break;
  717
  718         case '|':
  719             NextChar ();
  720             switch (CurC) {
  721                 case '|':
  722                     SetTok (TOK_BOOL_OR);
  723                     break;
  724                 case '=':
  725                     SetTok (TOK_OR_ASSIGN);
  726                     break;
  727                 default:
  728                     nxttok = TOK_OR;
  729             }
  730             break;
  731
  732         case '}':
  733             SetTok (TOK_RCURLY);
  734             break;
  735
  736         case '~':
  737             SetTok (TOK_COMP);
  738             break;
  739
  740         case '#':
  741             /* Skip it and following whitespace */
  742             do {
  743                 NextChar ();
  744             } while (CurC == ' ');
                  /* The only directive accepted at this point is "pragma".
                   * NOTE(review): TOK_PRAGMA is set even if the error below has
                   * been reported.
                   */
  745             if (!IsSym (token) || strcmp (token, "pragma") != 0) {
  746                 /* OOPS - should not happen */
  747                 Error (ERR_CPP_DIRECTIVE_EXPECTED);
  748             }
  749             nxttok = TOK_PRAGMA;
  750             break;
  751
  752         default:
                  /* Character that cannot start a token: report and skip it.
                   * NOTE(review): nxttok is not assigned on this path.
                   */
  753             unknown (CurC);
  754
  755     }
  756
  757 }
 758
 759
 760
 761 void Consume (token_t Token, unsigned ErrNum)
 762 /* Eat token if it is the next in the input stream, otherwise print an error
 763  * message.
 764  */
 765 {
 766     if (curtok == Token) {
 767         NextToken ();
 768     } else {
 769         Error (ErrNum);
 770     }
 771 }
 772
 773
 774
 775 void ConsumeColon (void)
 776 /* Check for a colon and skip it. */
 777 {
 778     Consume (TOK_COLON, ERR_COLON_EXPECTED);
 779 }
 780
 781
 782
 783 void ConsumeSemi (void)
 784 /* Check for a semicolon and skip it. */
 785 {
 786     /* Try do be smart about typos... */
 787     if (curtok == TOK_SEMI) {
 788         NextToken ();
 789     } else {
 790         Error (ERR_SEMICOLON_EXPECTED);
 791         if (curtok == TOK_COLON || curtok == TOK_COMMA) {
 792             NextToken ();
 793         }
 794     }
 795 }
 796
 797
 798
 799 void ConsumeLParen (void)
 800 /* Check for a left parenthesis and skip it */
 801 {
 802     Consume (TOK_LPAREN, ERR_LPAREN_EXPECTED);
 803 }
 804
 805
 806
 807 void ConsumeRParen (void)
 808 /* Check for a right parenthesis and skip it */
 809 {
 810     Consume (TOK_RPAREN, ERR_RPAREN_EXPECTED);
 811 }
 812
 813
 814
 815 void ConsumeLBrack (void)
 816 /* Check for a left bracket and skip it */
 817 {
 818     Consume (TOK_LBRACK, ERR_LBRACK_EXPECTED);
 819 }
 820
 821
 822
 823 void ConsumeRBrack (void)
 824 /* Check for a right bracket and skip it */
 825 {
 826     Consume (TOK_RBRACK, ERR_RBRACK_EXPECTED);
 827 }
 828
 829
 830
 831 void ConsumeLCurly (void)
 832 /* Check for a left curly brace and skip it */
 833 {
 834     Consume (TOK_LCURLY, ERR_LCURLY_EXPECTED);
 835 }
 836
 837
 838
 839 void ConsumeRCurly (void)
 840 /* Check for a right curly brace and skip it */
 841 {
 842     Consume (TOK_RCURLY, ERR_RCURLY_EXPECTED);
 843 }
 844
 845
 846