git.sur5r.net Git - cc65/blob - src/cc65/scanner.c

   1 /*
   2  * scanner.c
   3  *
   4  * Ullrich von Bassewitz, 07.06.1998
   5  */
   6
   7
   8
   9 #include <stdio.h>
  10 #include <stdlib.h>
  11 #include <string.h>
  12 #include <errno.h>
  13 #include <ctype.h>
  14
  15 #include "ctrans.h"
  16 #include "datatype.h"
  17 #include "error.h"
  18 #include "function.h"
  19 #include "global.h"
  20 #include "ident.h"
  21 #include "input.h"
  22 #include "litpool.h"
  23 #include "preproc.h"
  24 #include "symtab.h"
  25 #include "util.h"
  26 #include "scanner.h"
  27
  28
  29
  30 /*****************************************************************************/
  31 /*                                   data                                    */
  32 /*****************************************************************************/
  33
  34
  35
Token CurTok;           /* The current token; NextToken() copies NextTok into it */
Token NextTok;          /* The next (lookahead) token, set up by NextToken() */
  38
  39
  40
  41 /* Token types */
  42 #define TT_C    0               /* ANSI C token */
  43 #define TT_EXT  1               /* cc65 extension */
  44
  45 /* Token table */
  46 static const struct Keyword {
  47     char*           Key;        /* Keyword name */
  48     unsigned char   Tok;        /* The token */
  49     unsigned char   Type;       /* Token type */
  50 } Keywords [] = {
  51     { "__A__",          TOK_A,          TT_C    },
  52     { "__AX__",         TOK_AX,         TT_C    },
  53     { "__EAX__",        TOK_EAX,        TT_C    },
  54     { "__X__",          TOK_X,          TT_C    },
  55     { "__Y__",          TOK_Y,          TT_C    },
  56     { "__asm__",        TOK_ASM,        TT_C    },
  57     { "__attribute__",  TOK_ATTRIBUTE,  TT_C    },
  58     { "__far__",        TOK_FAR,        TT_C    },
  59     { "__fastcall__",   TOK_FASTCALL,   TT_C    },
  60     { "asm",            TOK_ASM,        TT_EXT  },
  61     { "auto",           TOK_AUTO,       TT_C    },
  62     { "break",          TOK_BREAK,      TT_C    },
  63     { "case",           TOK_CASE,       TT_C    },
  64     { "char",           TOK_CHAR,       TT_C    },
  65     { "const",          TOK_CONST,      TT_C    },
  66     { "continue",       TOK_CONTINUE,   TT_C    },
  67     { "default",        TOK_DEFAULT,    TT_C    },
  68     { "do",             TOK_DO,         TT_C    },
  69     { "double",         TOK_DOUBLE,     TT_C    },
  70     { "else",           TOK_ELSE,       TT_C    },
  71     { "enum",           TOK_ENUM,       TT_C    },
  72     { "extern",         TOK_EXTERN,     TT_C    },
  73     { "far",            TOK_FAR,        TT_EXT  },
  74     { "fastcall",       TOK_FASTCALL,   TT_EXT  },
  75     { "float",          TOK_FLOAT,      TT_C    },
  76     { "for",            TOK_FOR,        TT_C    },
  77     { "goto",           TOK_GOTO,       TT_C    },
  78     { "if",             TOK_IF,         TT_C    },
  79     { "int",            TOK_INT,        TT_C    },
  80     { "long",           TOK_LONG,       TT_C    },
  81     { "register",       TOK_REGISTER,   TT_C    },
  82     { "return",         TOK_RETURN,     TT_C    },
  83     { "short",          TOK_SHORT,      TT_C    },
  84     { "signed",         TOK_SIGNED,     TT_C    },
  85     { "sizeof",         TOK_SIZEOF,     TT_C    },
  86     { "static",         TOK_STATIC,     TT_C    },
  87     { "struct",         TOK_STRUCT,     TT_C    },
  88     { "switch",         TOK_SWITCH,     TT_C    },
  89     { "typedef",        TOK_TYPEDEF,    TT_C    },
  90     { "union",          TOK_UNION,      TT_C    },
  91     { "unsigned",       TOK_UNSIGNED,   TT_C    },
  92     { "void",           TOK_VOID,       TT_C    },
  93     { "volatile",       TOK_VOLATILE,   TT_C    },
  94     { "while",          TOK_WHILE,      TT_C    },
  95 };
  96 #define KEY_COUNT       (sizeof (Keywords) / sizeof (Keywords [0]))
  97
  98
  99
 100 /* Stuff for determining the type of an integer constant */
 101 #define IT_INT          0x01
 102 #define IT_UINT         0x02
 103 #define IT_LONG         0x04
 104 #define IT_ULONG        0x08
 105
 106
 107
 108 /*****************************************************************************/
 109 /*                                   code                                    */
 110 /*****************************************************************************/
 111
 112
 113
 114 static int CmpKey (const void* Key, const void* Elem)
 115 /* Compare function for bsearch */
 116 {
 117     return strcmp ((const char*) Key, ((const struct Keyword*) Elem)->Key);
 118 }
 119
 120
 121
 122 static int FindKey (const char* Key)
 123 /* Find a keyword and return the token. Return IDENT if the token is not a
 124  * keyword.
 125  */
 126 {
 127     struct Keyword* K;
 128     K = bsearch (Key, Keywords, KEY_COUNT, sizeof (Keywords [0]), CmpKey);
 129     if (K && (K->Type != TT_EXT || ANSI == 0)) {
 130         return K->Tok;
 131     } else {
 132         return TOK_IDENT;
 133     }
 134 }
 135
 136
 137
 138 static int SkipWhite (void)
 139 /* Skip white space in the input stream, reading and preprocessing new lines
 140  * if necessary. Return 0 if end of file is reached, return 1 otherwise.
 141  */
 142 {
 143     while (1) {
 144         while (CurC == 0) {
 145             if (NextLine () == 0) {
 146                 return 0;
 147             }
 148             Preprocess ();
 149         }
 150         if (CurC == ' ' || CurC == '\r') {
 151             NextChar ();
 152         } else {
 153             return 1;
 154         }
 155     }
 156 }
 157
 158
 159
 160 void SymName (char* s)
 161 /* Get symbol from input stream */
 162 {
 163     unsigned k = 0;
 164     do {
 165         if (k != MAX_IDENTLEN) {
 166             ++k;
 167             *s++ = CurC;
 168         }
 169         NextChar ();
 170     } while (IsIdent (CurC) || isdigit (CurC));
 171     *s = '\0';
 172 }
 173
 174
 175
 176 int IsSym (char *s)
 177 /* Get symbol from input stream or return 0 if not a symbol. */
 178 {
 179     if (IsIdent (CurC)) {
 180         SymName (s);
 181         return 1;
 182     } else {
 183         return 0;
 184     }
 185 }
 186
 187
 188
 189 static void unknown (char C)
 190 /* Error message for unknown character */
 191 {
 192     Error (ERR_INVALID_CHAR, C);
 193     NextChar ();                        /* Skip */
 194 }
 195
 196
 197
 198 static unsigned hexval (int c)
 199 /* Convert a hex digit into a value */
 200 {
 201     if (!isxdigit (c)) {
 202         Error (ERR_ILLEGAL_HEX_DIGIT);
 203     }
 204     if (isdigit (c)) {
 205         return c - '0';
 206     } else {
 207         return toupper (c) - 'A' + 10;
 208     }
 209 }
 210
 211
 212
 213 static void SetTok (int tok)
 214 /* set nxttok and bump line ptr */
 215 {
 216     nxttok = tok;
 217     NextChar ();
 218 }
 219
 220
 221
 222 static int SignExtendChar (int C)
 223 /* Do correct sign extension of a character */
 224 {
 225     if (SignedChars && (C & 0x80) != 0) {
 226         return C | ~0xFF;
 227     } else {
 228         return C & 0xFF;
 229     }
 230 }
 231
 232
 233
 234 static int ParseChar (void)
 235 /* Parse a character. Converts \n into EOL, etc. */
 236 {
 237     int i;
 238     unsigned val;
 239     int C;
 240
 241     /* Check for escape chars */
 242     if (CurC == '\\') {
 243         NextChar ();
 244         switch (CurC) {
 245             case 'b':
 246                 C = '\b';
 247                 break;
 248             case 'f':
 249                 C = '\f';
 250                 break;
 251             case 'r':
 252                 C = '\r';
 253                 break;
 254             case 'n':
 255                 C = '\n';
 256                 break;
 257             case 't':
 258                 C = '\t';
 259                 break;
 260             case '\"':
 261                 C = '\"';
 262                 break;
 263             case '\'':
 264                 C = '\'';
 265                 break;
 266             case '\\':
 267                 C = '\\';
 268                 break;
 269             case 'x':
 270             case 'X':
 271                 /* Hex character constant */
 272                 NextChar ();
 273                 val = hexval (CurC) << 4;
 274                 NextChar ();
 275                 C = val | hexval (CurC);        /* Do not translate */
 276                 break;
 277             case '0':
 278             case '1':
 279                 /* Octal constant */
 280                 i = 0;
 281                 C = CurC - '0';
 282                 while (NextC >= '0' && NextC <= '7' && i++ < 4) {
 283                     NextChar ();
 284                     C = (C << 3) | (CurC - '0');
 285                 }
 286                 break;
 287             default:
 288                 Error (ERR_ILLEGAL_CHARCONST);
 289                 C = ' ';
 290                 break;
 291         }
 292     } else {
 293         C = CurC;
 294     }
 295
 296     /* Skip the character read */
 297     NextChar ();
 298
 299     /* Do correct sign extension */
 300     return SignExtendChar (C);
 301 }
 302
 303
 304
 305 static void CharConst (void)
 306 /* Parse a character constant. */
 307 {
 308     int C;
 309
 310     /* Skip the quote */
 311     NextChar ();
 312
 313     /* Get character */
 314     C = ParseChar ();
 315
 316     /* Check for closing quote */
 317     if (CurC != '\'') {
 318         Error (ERR_QUOTE_EXPECTED);
 319     } else {
 320         /* Skip the quote */
 321         NextChar ();
 322     }
 323
 324     /* Setup values and attributes */
 325     nxttok  = TOK_CCONST;
 326     nxtval  = SignExtendChar (ctrans (C));      /* Translate into target charset */
 327     nxttype = type_int;                         /* Character constants have type int */
 328 }
 329
 330
 331
 332 static void StringConst (void)
 333 /* Parse a quoted string */
 334 {
 335     nxtval = GetLiteralOffs ();
 336     nxttok = TOK_SCONST;
 337
 338     /* Be sure to concatenate strings */
 339     while (CurC == '\"') {
 340
 341         /* Skip the quote char */
 342         NextChar ();
 343
 344         while (CurC != '\"') {
 345             if (CurC == '\0') {
 346                 Error (ERR_UNEXPECTED_NEWLINE);
 347                 break;
 348             }
 349             AddLiteralChar (ParseChar ());
 350         }
 351
 352         /* Skip closing quote char if there was one */
 353         NextChar ();
 354
 355         /* Skip white space, read new input */
 356         SkipWhite ();
 357
 358     }
 359
 360     /* Terminate the string */
 361     AddLiteralChar ('\0');
 362 }
 363
 364
 365
 366 void NextToken (void)
 367 /* Get next token from input stream */
 368 {
 369     ident token;
 370
 371     /* Current token is the lookahead token */
 372     CurTok = NextTok;
 373
 374     /* Remember the starting position of the next token */
 375     NextTok.Pos = GetCurrentLine();
 376
 377     /* Skip spaces and read the next line if needed */
 378     if (SkipWhite () == 0) {
 379         /* End of file reached */
 380         nxttok = TOK_CEOF;
 381         return;
 382     }
 383
 384     /* Determine the next token from the lookahead */
 385     if (isdigit (CurC)) {
 386
 387         /* A number */
 388         int HaveSuffix;         /* True if we have a type suffix */
 389         unsigned types;         /* Possible types */
 390         unsigned base;
 391         unsigned long k;        /* Value */
 392
 393         k     = 0;
 394         base  = 10;
 395         types = IT_INT | IT_LONG | IT_ULONG;
 396
 397         if (CurC == '0') {
 398             /* Octal or hex constants may also be of type unsigned int */
 399             types = IT_INT | IT_UINT | IT_LONG | IT_ULONG;
 400             /* gobble 0 and examin next char */
 401             NextChar ();
 402             if (toupper (CurC) == 'X') {
 403                 base = 16;
 404                 nxttype = type_uint;
 405                 NextChar ();    /* gobble "x" */
 406             } else {
 407                 base = 8;
 408             }
 409         }
 410         while (1) {
 411             if (isdigit (CurC)) {
 412                 k = k * base + (CurC - '0');
 413             } else if (base == 16 && isxdigit (CurC)) {
 414                 k = (k << 4) + hexval (CurC);
 415             } else {
 416                 break;          /* not digit */
 417             }
 418             NextChar ();        /* gobble char */
 419         }
 420
 421         /* Check for a suffix */
 422         HaveSuffix = 1;
 423         if (CurC == 'u' || CurC == 'U') {
 424             /* Unsigned type */
 425             NextChar ();
 426             if (toupper (CurC) != 'L') {
 427                 types = IT_UINT | IT_ULONG;
 428             } else {
 429                 NextChar ();
 430                 types = IT_ULONG;
 431             }
 432         } else if (CurC == 'l' || CurC == 'L') {
 433             /* Long type */
 434             NextChar ();
 435             if (toupper (CurC) != 'U') {
 436                 types = IT_LONG | IT_ULONG;
 437             } else {
 438                 NextChar ();
 439                 types = IT_ULONG;
 440             }
 441         } else {
 442             HaveSuffix = 0;
 443         }
 444
 445         /* Check the range to determine the type */
 446         if (k > 0x7FFF) {
 447             /* Out of range for int */
 448             types &= ~IT_INT;
 449             /* If the value is in the range 0x8000..0xFFFF, unsigned int is not
 450              * allowed, and we don't have a type specifying suffix, emit a
 451              * warning.
 452              */
 453             if (k <= 0xFFFF && (types & IT_UINT) == 0 && !HaveSuffix) {
 454                 Warning (WARN_CONSTANT_IS_LONG);
 455             }
 456         }
 457         if (k > 0xFFFF) {
 458             /* Out of range for unsigned int */
 459             types &= ~IT_UINT;
 460         }
 461         if (k > 0x7FFFFFFF) {
 462             /* Out of range for long int */
 463             types &= ~IT_LONG;
 464         }
 465
 466         /* Now set the type string to the smallest type in types */
 467         if (types & IT_INT) {
 468             nxttype = type_int;
 469         } else if (types & IT_UINT) {
 470             nxttype = type_uint;
 471         } else if (types & IT_LONG) {
 472             nxttype = type_long;
 473         } else {
 474             nxttype = type_ulong;
 475         }
 476
 477         /* Set the value and the token */
 478         nxtval = k;
 479         nxttok = TOK_ICONST;
 480         return;
 481     }
 482
 483     if (IsSym (token)) {
 484
 485         /* Check for a keyword */
 486         if ((nxttok = FindKey (token)) != TOK_IDENT) {
 487             /* Reserved word found */
 488             return;
 489         }
 490         /* No reserved word, check for special symbols */
 491         if (token [0] == '_') {
 492             /* Special symbols */
 493             if (strcmp (token, "__FILE__") == 0) {
 494                 nxtval = AddLiteral (GetCurrentFile());
 495                 nxttok = TOK_SCONST;
 496                 return;
 497             } else if (strcmp (token, "__LINE__") == 0) {
 498                 nxttok  = TOK_ICONST;
 499                 nxtval  = GetCurrentLine();
 500                 nxttype = type_int;
 501                 return;
 502             } else if (strcmp (token, "__fixargs__") == 0) {
 503                 nxttok  = TOK_ICONST;
 504                 nxtval  = GetParamSize (CurrentFunc);
 505                 nxttype = type_uint;
 506                 return;
 507             } else if (strcmp (token, "__func__") == 0) {
 508                 /* __func__ is only defined in functions */
 509                 if (CurrentFunc) {
 510                     nxtval = AddLiteral (GetFuncName (CurrentFunc));
 511                     nxttok = TOK_SCONST;
 512                     return;
 513                 }
 514             }
 515         }
 516
 517         /* No reserved word but identifier */
 518         strcpy (NextTok.Ident, token);
 519         NextTok.Tok = TOK_IDENT;
 520         return;
 521     }
 522
 523     /* Monstrous switch statement ahead... */
 524     switch (CurC) {
 525
 526         case '!':
 527             NextChar ();
 528             if (CurC == '=') {
 529                 SetTok (TOK_NE);
 530             } else {
 531                 nxttok = TOK_BOOL_NOT;
 532             }
 533             break;
 534
 535         case '\"':
 536             StringConst ();
 537             break;
 538
 539         case '%':
 540             NextChar ();
 541             if (CurC == '=') {
 542                 SetTok (TOK_MOD_ASSIGN);
 543             } else {
 544                 nxttok = TOK_MOD;
 545             }
 546             break;
 547
 548         case '&':
 549             NextChar ();
 550             switch (CurC) {
 551                 case '&':
 552                     SetTok (TOK_BOOL_AND);
 553                     break;
 554                 case '=':
 555                     SetTok (TOK_AND_ASSIGN);
 556                     break;
 557                 default:
 558                     nxttok = TOK_AND;
 559             }
 560             break;
 561
 562         case '\'':
 563             CharConst ();
 564             break;
 565
 566         case '(':
 567             SetTok (TOK_LPAREN);
 568             break;
 569
 570         case ')':
 571             SetTok (TOK_RPAREN);
 572             break;
 573
 574         case '*':
 575             NextChar ();
 576             if (CurC == '=') {
 577                 SetTok (TOK_MUL_ASSIGN);
 578             } else {
 579                 nxttok = TOK_STAR;
 580             }
 581             break;
 582
 583         case '+':
 584             NextChar ();
 585             switch (CurC) {
 586                 case '+':
 587                     SetTok (TOK_INC);
 588                     break;
 589                 case '=':
 590                     SetTok (TOK_PLUS_ASSIGN);
 591                     break;
 592                 default:
 593                     nxttok = TOK_PLUS;
 594             }
 595             break;
 596
 597         case ',':
 598             SetTok (TOK_COMMA);
 599             break;
 600
 601         case '-':
 602             NextChar ();
 603             switch (CurC) {
 604                 case '-':
 605                     SetTok (TOK_DEC);
 606                     break;
 607                 case '=':
 608                     SetTok (TOK_MINUS_ASSIGN);
 609                     break;
 610                 case '>':
 611                     SetTok (TOK_PTR_REF);
 612                     break;
 613                 default:
 614                     nxttok = TOK_MINUS;
 615             }
 616             break;
 617
 618         case '.':
 619             NextChar ();
 620             if (CurC == '.') {
 621                 NextChar ();
 622                 if (CurC == '.') {
 623                     SetTok (TOK_ELLIPSIS);
 624                 } else {
 625                     unknown (CurC);
 626                 }
 627             } else {
 628                 nxttok = TOK_DOT;
 629             }
 630             break;
 631
 632         case '/':
 633             NextChar ();
 634             if (CurC == '=') {
 635                 SetTok (TOK_DIV_ASSIGN);
 636             } else {
 637                 nxttok = TOK_DIV;
 638             }
 639             break;
 640
 641         case ':':
 642             SetTok (TOK_COLON);
 643             break;
 644
 645         case ';':
 646             SetTok (TOK_SEMI);
 647             break;
 648
 649         case '<':
 650             NextChar ();
 651             switch (CurC) {
 652                 case '=':
 653                     SetTok (TOK_LE);
 654                     break;
 655                 case '<':
 656                     NextChar ();
 657                     if (CurC == '=') {
 658                         SetTok (TOK_SHL_ASSIGN);
 659                     } else {
 660                         nxttok = TOK_SHL;
 661                     }
 662                     break;
 663                 default:
 664                     nxttok = TOK_LT;
 665             }
 666             break;
 667
 668         case '=':
 669             NextChar ();
 670             if (CurC == '=') {
 671                 SetTok (TOK_EQ);
 672             } else {
 673                 nxttok = TOK_ASSIGN;
 674             }
 675             break;
 676
 677         case '>':
 678             NextChar ();
 679             switch (CurC) {
 680                 case '=':
 681                     SetTok (TOK_GE);
 682                     break;
 683                 case '>':
 684                     NextChar ();
 685                     if (CurC == '=') {
 686                         SetTok (TOK_SHR_ASSIGN);
 687                     } else {
 688                         nxttok = TOK_SHR;
 689                     }
 690                     break;
 691                 default:
 692                     nxttok = TOK_GT;
 693             }
 694             break;
 695
 696         case '?':
 697             SetTok (TOK_QUEST);
 698             break;
 699
 700         case '[':
 701             SetTok (TOK_LBRACK);
 702             break;
 703
 704         case ']':
 705             SetTok (TOK_RBRACK);
 706             break;
 707
 708         case '^':
 709             NextChar ();
 710             if (CurC == '=') {
 711                 SetTok (TOK_XOR_ASSIGN);
 712             } else {
 713                 nxttok = TOK_XOR;
 714             }
 715             break;
 716
 717         case '{':
 718             SetTok (TOK_LCURLY);
 719             break;
 720
 721         case '|':
 722             NextChar ();
 723             switch (CurC) {
 724                 case '|':
 725                     SetTok (TOK_BOOL_OR);
 726                     break;
 727                 case '=':
 728                     SetTok (TOK_OR_ASSIGN);
 729                     break;
 730                 default:
 731                     nxttok = TOK_OR;
 732             }
 733             break;
 734
 735         case '}':
 736             SetTok (TOK_RCURLY);
 737             break;
 738
 739         case '~':
 740             SetTok (TOK_COMP);
 741             break;
 742
 743         case '#':
 744             /* Skip it and following whitespace */
 745             do {
 746                 NextChar ();
 747             } while (CurC == ' ');
 748             if (!IsSym (token) || strcmp (token, "pragma") != 0) {
 749                 /* OOPS - should not happen */
 750                 Error (ERR_CPP_DIRECTIVE_EXPECTED);
 751             }
 752             nxttok = TOK_PRAGMA;
 753             break;
 754
 755         default:
 756             unknown (CurC);
 757
 758     }
 759
 760 }
 761
 762
 763
 764 void Consume (token_t Token, unsigned ErrNum)
 765 /* Eat token if it is the next in the input stream, otherwise print an error
 766  * message.
 767  */
 768 {
 769     if (curtok == Token) {
 770         NextToken ();
 771     } else {
 772         Error (ErrNum);
 773     }
 774 }
 775
 776
 777
 778 void ConsumeColon (void)
 779 /* Check for a colon and skip it. */
 780 {
 781     Consume (TOK_COLON, ERR_COLON_EXPECTED);
 782 }
 783
 784
 785
 786 void ConsumeSemi (void)
 787 /* Check for a semicolon and skip it. */
 788 {
 789     /* Try do be smart about typos... */
 790     if (curtok == TOK_SEMI) {
 791         NextToken ();
 792     } else {
 793         Error (ERR_SEMICOLON_EXPECTED);
 794         if (curtok == TOK_COLON || curtok == TOK_COMMA) {
 795             NextToken ();
 796         }
 797     }
 798 }
 799
 800
 801
 802 void ConsumeLParen (void)
 803 /* Check for a left parenthesis and skip it */
 804 {
 805     Consume (TOK_LPAREN, ERR_LPAREN_EXPECTED);
 806 }
 807
 808
 809
 810 void ConsumeRParen (void)
 811 /* Check for a right parenthesis and skip it */
 812 {
 813     Consume (TOK_RPAREN, ERR_RPAREN_EXPECTED);
 814 }
 815
 816
 817
 818 void ConsumeLBrack (void)
 819 /* Check for a left bracket and skip it */
 820 {
 821     Consume (TOK_LBRACK, ERR_LBRACK_EXPECTED);
 822 }
 823
 824
 825
 826 void ConsumeRBrack (void)
 827 /* Check for a right bracket and skip it */
 828 {
 829     Consume (TOK_RBRACK, ERR_RBRACK_EXPECTED);
 830 }
 831
 832
 833
 834 void ConsumeLCurly (void)
 835 /* Check for a left curly brace and skip it */
 836 {
 837     Consume (TOK_LCURLY, ERR_LCURLY_EXPECTED);
 838 }
 839
 840
 841
 842 void ConsumeRCurly (void)
 843 /* Check for a right curly brace and skip it */
 844 {
 845     Consume (TOK_RCURLY, ERR_RCURLY_EXPECTED);
 846 }
 847
 848
 849