header {
//package org.erights.e.elang.syntax.antlr;
}

// UPDOC
//The E grammar specifically treats updoc test sequences as comments.  Any line
//whose first non-whitespace character is '?' is considered by the lexer to be
//the start of an UPDOC test case. The test case will include any contiguous
//lines that are whitespace, or that begin with '#' or '>'.

// TODO:
// updoc in the midst of quasi or string
// support doubled '`'
// make keywords (literals) case insensitive
// only do literal resolution for IDENT and funky punctuation

//----------------------------------------------------------------------------
// The E Lexer
//----------------------------------------------------------------------------
//class EALexer extends Lexer("antlr.AstroLexer");
class EALexer extends Lexer("antlr.SwitchingLexer");
options {
    importVocab=E;
    exportVocab=EALexer;
    testLiterals=false;    // don't automatically test for literals
    k=3;                   // three characters of lookahead
    // XXX where does \u0003 come from?
    // XXX should allow Unicode U+10000..U+10FFFF, but can't express that in Java
    charVocabulary='\u0003'..'\uFFFE';
    // without inlining some bitset tests, couldn't do unicode;
    // I need to make ANTLR generate smaller bitsets; see
    // bottom of JavaLexer.java
    codeGenBitsetTestThreshold=20;
    caseSensitiveLiterals = false;
}

tokens {
    SR;
    GE;
    SR_ASSIGN;
}

{
    // quasi-E-source hole marker indexes
    static final int NO_HOLE_INDEX = -1;
    protected int[] valueHoles = {};
    protected int[] patternHoles = {};

    /**
     * Installs the tables of input positions at which quasi holes occur.
     * A null table is treated as "no holes of that kind".
     *
     * @param vh positions of value ("$") holes
     * @param ph positions of pattern ("@") holes
     */
    public void setHoles(int[] vh, int[] ph) {
        valueHoles = (vh == null) ? new int[0] : vh;
        patternHoles = (ph == null) ? new int[0] : ph;
    }

    /**
     * Looks for a hole of the given kind at the current input position
     * plus offset.
     *
     * @param key    "$" selects valueHoles, "@" selects patternHoles
     * @param offset added to the position reported by the input state
     * @return the index of the matching entry in the selected table, or
     *         NO_HOLE_INDEX if there is none (including when key is neither
     *         "$" nor "@")
     * @throws RuntimeException if the selected table is nonempty but the
     *         input state is not a CountingLexerSharedInputState, so the
     *         current position cannot be determined
     */
    protected int holeThere(String key, int offset) {
        int[] table = "$".equals(key) ? valueHoles
                    : "@".equals(key) ? patternHoles
                    : null;
        if (table == null) {
            // Not a hole marker (or no table installed): nothing can match.
            return NO_HOLE_INDEX;
        }
        if (inputState instanceof CountingLexerSharedInputState) {
            int pos = ((CountingLexerSharedInputState)inputState).getPosition() + offset;
            // XXX use a hash table or something
            for (int i = 0; i < table.length; i++) {
                if (table[i] == pos) {
                    return i;
                }
            }
            return NO_HOLE_INDEX;
        } else {
            if (table.length == 0) {
                return NO_HOLE_INDEX;
            } else {
                throw new RuntimeException("E lexer provided with nonempty hole table but no CountingLexerSharedInputState; cannot proceed.");
            }
        }
    }

    // set isFirstInLine whenever we produce a token, and reset it at the
    // beginning of every line
    protected boolean isFirstInLine = true;
    protected Token lastToken = null;

    /**
     * Creates a token of type t and remembers it in lastToken. Any token
     * other than LINESEP clears isFirstInLine.
     */
    protected Token makeToken(int t) {
        if (t != LINESEP) {
            isFirstInLine = false;
        }
        return lastToken = super.makeToken(t);
    }

    /** Starts a new line: sets isFirstInLine and forgets lastToken. */
    public void newline() {
        isFirstInLine = true;
        lastToken = null;
        super.newline();
    }

    //public void traceIn(String rname) throws CharStreamException { }

    /**
     * Converts the hex digits of a Unicode escape into the string they
     * denote. Eight-digit escapes may name code points above U+FFFF; those
     * become a surrogate pair instead of being truncated to 16 bits.
     *
     * @param s token whose text is the hex digits of the escape
     * @throws NumberFormatException if the value is above U+10FFFF
     */
    private String unicodeChar(Token s) {
        // Eight hex digits can exceed Integer.MAX_VALUE, so parse as long.
        long codePoint = Long.parseLong(s.getText(), 16);
        if (codePoint > Character.MAX_CODE_POINT) {
            throw new NumberFormatException(
                "Escaped code point out of range: " + s.getText());
        }
        return new String(Character.toChars((int) codePoint));
    }

    /**
     * Converts the digits of an octal escape into a one-character string.
     *
     * @param s token whose text is the octal digits of the escape
     */
    private String octalChar(Token s) {
        return String.valueOf((char) Integer.parseInt(s.getText(), 8));
    }
}

// OPERATORS

QUASIOPEN:     '`'        {selector.push("quasi");}  ;
RCURLY:        '}'        {selector.exitBrace();}  ;

OPERATOR options {testLiterals=true;}    :
//      '`'        {selector.push("quasi");} |
        '(' BR
    |   ')'
    |   '[' BR
    |   ']'
    |   '{' BR     {selector.enterBrace();}
//  |   '}'        {selector.exitBrace();}

    // quasi-in-E-source
    |   ('$'|'@')
        (   {holeThere($getText, -1) != NO_HOLE_INDEX}?
            {
                // The marker character has already been consumed, so the
                // hole position is one character back (offset -1).
                int i = holeThere($getText, -1);
                int type = "$".equals($getText) ? SOURCE_VALUE_HOLE
                         : "@".equals($getText) ? SOURCE_PATTERN_HOLE
                         : -1; // XXX most appropriate bad-token value?
                $setType(type);
                $setText(Integer.toString(i));
            }
        |   {throw new TokenStreamException("A literal " + $getText + " is not meaningful in E source."); /*XXX most appropriate exception type?*/}
        )
    // XXX todo: everywhere a $ can appear, make sure that holeness is checked

    // a question at the beginning of a line indicates an updoc line, and the line
    // is ignored.
    |   '?' ({isFirstInLine}? UPDOC {$setType(Token.SKIP);} | BR)
    |   ':' BR
    |   ',' BR
    |   '.' BR
    |   ".." BR
    |   "..!" BR
    |   "==" BR
    |   '=' BR
    |   '!' BR
    |   '~' BR
    |   "!=" BR
    |   '/' BR
    |   "//" BR
    |   '+' BR
    |   '-' BR
    |   "++"
    |   "--"
    |   '*' BR
    |   '%' BR
    |   "%%" BR
    |   '^' BR
    |   '|' BR
    |   "||" BR
    |   '&' BR
    |   "&!" BR
    |   "&&" BR
    |   ';'
    |   "**" BR
    // Assign
    |   ":=" BR
    |   "//=" BR
    |   "/=" BR
    |   "+=" BR
    |   "-=" BR
    |   "*=" BR
    |   "%=" BR
    |   "%%=" BR
    |   "**=" BR
    |   "^=" BR
    |   "|=" BR
    |   "&=" BR
    // Other tokens
    |   DOC_COMMENT     {$setType(DOC_COMMENT);}
    |   "->" BR
    |   "=>" BR
    |   "=~" BR
    |   "!~" BR
    |   "::" BR
    //SR:         ">>" BR
    //GE:         ">=" BR
    //SR_ASSIGN : ">>=" BR
    |   '>' (
//          {isFirstInLine}? SKIPLINE
//              {$setType(Token.SKIP);} |
            '>' BR      {$setType(SR);}
        |   '=' BR      {$setType(GE);}
        |   ">=" BR     {$setType(SR_ASSIGN);}
        |
        )               // should have BR, except for terminating a
                        // URI
    ;

LT: ('<' URISCHEME ('>' | ':')) =>
        '<'! URISCHEME
        (   '>'!            {$setType(URIGetter);}
        |   ':' URI '>'!    {$setType(URI);}
        )
    |   a:LT2               {$setToken(a);}
    ;

protected LT2 options {testLiterals=true;} :
        "<" BR
    |   "<<" BR
    |   "<=" BR
    |   "<=>" BR
    |   "<<=" BR
    |   "<-" BR
    ;

// Whitespace -- ignored
WS: (' '|'\t'|'\f'|ESCWS)+ {$setType(Token.SKIP);} ;

protected ESCWS:    '\\' (' '|'\t'|'\f')* EOL ;

protected ANYWS:    ' '|'\t'|'\f'|'\r'|'\n' ;

LINESEP:    (EOL)+ ;

// Single-line comments
SL_COMMENT :    "#" (~('\n'|'\r'))* {$setType(Token.SKIP);} ;

protected SKIPLINE:     (~('\n'|'\r'))* EOL ;

protected UPDOC :
        (~('\n'|'\r'))*
        (   EOL     // must be optional to deal with EOF
            (   ' '|'\t'|'\f'
            // TODO don't include a ? line in the same updoc
            |   ('?'|'#'|'>') (options{greedy=true;}:~('\n'|'\r'))*
            |   EOL
            )*
        )?;

// multiple-line comments
protected DOC_COMMENT
    :   "/**"
        (
            // '\r' '\n' can be matched in one alternative or by matching
            // '\r' in one iteration and '\n' in another.  I am trying to
            // handle any flavor of newline that comes in, but the language
            // that allows both "\r\n" and "\r" and "\n" to all be valid
            // newline is ambiguous.  Consequently, the resulting grammar
            // must be ambiguous.  I'm shutting this warning off.
            options {
                generateAmbigWarnings=false;
            }
        :
            { LA(2)!='/' }? '*'
        |   EOL
        |   ~('*'|'\n'|'\r')
        )*
        '*' '/' BR
    ;

// character literals
CHAR_LITERAL options {testLiterals=false;} :
        '\''! ( ESC | ~('\''|'\n'|'\r'|'\\') ) '\''!
    ;

// string literals
STRING options {testLiterals=false;} :
        '"'! ( ESC | EOL | ~('"'|'\\'|'\n'|'\r') )* '"'!
    ;

// escape sequence -- note that this is protected; it can only be called
//   from another lexer rule -- it will not ever directly return a token to
//   the parser
// There are various ambiguities hushed in this rule.  The optional
// '0'...'9' digit matches should be matched here rather than letting
// them go back to STRING to be matched.  ANTLR does the
// right thing by matching immediately; hence, it's ok to shut off
// the FOLLOW ambig warnings.
protected ESC:
        '\\'!
        (   (!  'n'     {$setText("\n");}
            |   'r'     {$setText("\r");}
            |   't'     {$setText("\t");}
            |   'b'     {$setText("\b");}
            |   'f'     {$setText("\f");}
            |   '"'     {$setText("\"");}
            |   '?'     {$setText("?");}
            |   '\''    {$setText("'");}
            |   '\\'    {$setText("\\");}
            )
        |!  u:ESC_UNICODE   {$setText(unicodeChar(u));}
        |   o:ESC_OCTAL     {$setText(octalChar(o));}
        |!  (' '|'\t'|'\f')* EOL
        )
    ;

// an identifier.  Note that testLiterals is set to true!  This means
// that after we match the rule, we look in the literals table to see
// if it's a literal or really an identifier
// IDENT_S is protected; this isn't
IDENT options {testLiterals=true;} :
        IDENT_S
    ;

// the scheme component of a URI literal
// NOTE: this should not be extended to Unicode; RFC 2396 lists this set specifically.
protected URISCHEME options {testLiterals=false;} :
        ('a'..'z'|'A'..'Z') ('a'..'z'|'A'..'Z'|'0'..'9'|'+'|'-'|'.')*
    ;

// a numeric literal
// NOTE: INT is actually just the token type for the last case.
INT:    ("0x") => "0x"! (HEX_DIGIT)+            { $setType(HEX); }
    |   ('0' ('0'..'9')) => '0'! ('0'..'7')+    { $setType(OCTAL); }
    |   (FLOAT64) => FLOAT64                    { $setType(FLOAT64); }
    |   DEC_NATURAL
    ;

// a decimal natural number (nonnegative integer)
protected DEC_NATURAL:  ('0'..'9')+ ('_'! DEC_NATURAL)? ;

protected FLOAT64:  DEC_NATURAL ('.' DEC_NATURAL (EXPONENT)? | EXPONENT) ;

protected EXPONENT: ('e' | 'E') ('+'|'-')? DEC_NATURAL ;

protected BR:
        ( {_saveIndex=text.length();}:)
        (   ' '
        |   '\t'
        |   "#" (options {greedy=true;}:(~('\n'|'\r'|'#')!))*
        |   EOL
        )*
        ({text.setLength(_saveIndex);}:)
    ;

protected URI:
        (   'a'..'z'|'A'..'Z'|'_'|'0'..'9'
        |';'|'/'|'?'|':'|'@'|'&'|'='|'+'|'$'|','|'-'
        |'.'|'!'|'~'|'*'|'\''|'('|')'|'%'|'\\'|'|'|'#'
        )+
    ;

// ----------------------------------------------------------------------------
// common suffix with quasi.g
// XXX figure out if we can avoid this duplication

protected IDENT_S :
        (XML10Letter | '_')
        (XML10Letter | '_' | XML10Digit | XML10CombiningChar | XML10Extender)*
    ;

protected ESC_UNICODE:
        'u'! HEX_DIGIT HEX_DIGIT HEX_DIGIT HEX_DIGIT
    |   'U'! HEX_DIGIT HEX_DIGIT HEX_DIGIT HEX_DIGIT
             HEX_DIGIT HEX_DIGIT HEX_DIGIT HEX_DIGIT
    ;

protected ESC_OCTAL:
        '0'..'3'
        (   options { warnWhenFollowAmbig = false; }
        :   '0'..'7'
            (   options { warnWhenFollowAmbig = false; }
            :   '0'..'7'
            )?
        )?
    |   '4'..'7'
        (   options { warnWhenFollowAmbig = false; }
        :   '0'..'7'
        )?
    ;

// hexadecimal digit
protected HEX_DIGIT :   ('0'..'9'|'A'..'F'|'a'..'f') ;

// XXX extend to Unicode's line/paragraph separators
protected EOL:
        (options {generateAmbigWarnings=false;} : "\r\n" | '\r' | '\n' )
        { newline(); }
    ;

// ANTLR does not provide access to Unicode character categories. For now, we'll borrow the XML 1.0 definition of identifier characters, so as not to invent something uniquely-broken.

protected XML10Letter:  XML10BaseChar | XML10Ideographic ;

protected XML10BaseChar:
      '\u0041'..'\u005A' | '\u0061'..'\u007A' | '\u00C0'..'\u00D6' | '\u00D8'..'\u00F6'
    | '\u00F8'..'\u00FF' | '\u0100'..'\u0131' | '\u0134'..'\u013E' | '\u0141'..'\u0148'
    | '\u014A'..'\u017E' | '\u0180'..'\u01C3' | '\u01CD'..'\u01F0' | '\u01F4'..'\u01F5'
    | '\u01FA'..'\u0217' | '\u0250'..'\u02A8' | '\u02BB'..'\u02C1' | '\u0386'
    | '\u0388'..'\u038A' | '\u038C' | '\u038E'..'\u03A1' | '\u03A3'..'\u03CE'
    | '\u03D0'..'\u03D6' | '\u03DA' | '\u03DC' | '\u03DE'
    | '\u03E0' | '\u03E2'..'\u03F3' | '\u0401'..'\u040C' | '\u040E'..'\u044F'
    | '\u0451'..'\u045C' | '\u045E'..'\u0481' | '\u0490'..'\u04C4' | '\u04C7'..'\u04C8'
    | '\u04CB'..'\u04CC' | '\u04D0'..'\u04EB' | '\u04EE'..'\u04F5' | '\u04F8'..'\u04F9'
    | '\u0531'..'\u0556' | '\u0559' | '\u0561'..'\u0586' | '\u05D0'..'\u05EA'
    | '\u05F0'..'\u05F2' | '\u0621'..'\u063A' | '\u0641'..'\u064A' | '\u0671'..'\u06B7'
    | '\u06BA'..'\u06BE' | '\u06C0'..'\u06CE' | '\u06D0'..'\u06D3' | '\u06D5'
    | '\u06E5'..'\u06E6' | '\u0905'..'\u0939' | '\u093D' | '\u0958'..'\u0961'
    | '\u0985'..'\u098C' | '\u098F'..'\u0990' | '\u0993'..'\u09A8' | '\u09AA'..'\u09B0'
    | '\u09B2' | '\u09B6'..'\u09B9' | '\u09DC'..'\u09DD' | '\u09DF'..'\u09E1'
    | '\u09F0'..'\u09F1' | '\u0A05'..'\u0A0A' | '\u0A0F'..'\u0A10' | '\u0A13'..'\u0A28'
    | '\u0A2A'..'\u0A30' | '\u0A32'..'\u0A33' | '\u0A35'..'\u0A36' | '\u0A38'..'\u0A39'
    | '\u0A59'..'\u0A5C' | '\u0A5E' | '\u0A72'..'\u0A74' | '\u0A85'..'\u0A8B'
    | '\u0A8D' | '\u0A8F'..'\u0A91' | '\u0A93'..'\u0AA8' | '\u0AAA'..'\u0AB0'
    | '\u0AB2'..'\u0AB3' | '\u0AB5'..'\u0AB9' | '\u0ABD' | '\u0AE0'
    | '\u0B05'..'\u0B0C' | '\u0B0F'..'\u0B10' | '\u0B13'..'\u0B28' | '\u0B2A'..'\u0B30'
    | '\u0B32'..'\u0B33' | '\u0B36'..'\u0B39' | '\u0B3D' | '\u0B5C'..'\u0B5D'
    | '\u0B5F'..'\u0B61' | '\u0B85'..'\u0B8A' | '\u0B8E'..'\u0B90' | '\u0B92'..'\u0B95'
    | '\u0B99'..'\u0B9A' | '\u0B9C' | '\u0B9E'..'\u0B9F' | '\u0BA3'..'\u0BA4'
    | '\u0BA8'..'\u0BAA' | '\u0BAE'..'\u0BB5' | '\u0BB7'..'\u0BB9' | '\u0C05'..'\u0C0C'
    | '\u0C0E'..'\u0C10' | '\u0C12'..'\u0C28' | '\u0C2A'..'\u0C33' | '\u0C35'..'\u0C39'
    | '\u0C60'..'\u0C61' | '\u0C85'..'\u0C8C' | '\u0C8E'..'\u0C90' | '\u0C92'..'\u0CA8'
    | '\u0CAA'..'\u0CB3' | '\u0CB5'..'\u0CB9' | '\u0CDE' | '\u0CE0'..'\u0CE1'
    | '\u0D05'..'\u0D0C' | '\u0D0E'..'\u0D10' | '\u0D12'..'\u0D28' | '\u0D2A'..'\u0D39'
    | '\u0D60'..'\u0D61' | '\u0E01'..'\u0E2E' | '\u0E30' | '\u0E32'..'\u0E33'
    | '\u0E40'..'\u0E45' | '\u0E81'..'\u0E82' | '\u0E84' | '\u0E87'..'\u0E88'
    | '\u0E8A' | '\u0E8D' | '\u0E94'..'\u0E97' | '\u0E99'..'\u0E9F'
    | '\u0EA1'..'\u0EA3' | '\u0EA5' | '\u0EA7' | '\u0EAA'..'\u0EAB'
    | '\u0EAD'..'\u0EAE' | '\u0EB0' | '\u0EB2'..'\u0EB3' | '\u0EBD'
    | '\u0EC0'..'\u0EC4' | '\u0F40'..'\u0F47' | '\u0F49'..'\u0F69' | '\u10A0'..'\u10C5'
    | '\u10D0'..'\u10F6' | '\u1100' | '\u1102'..'\u1103' | '\u1105'..'\u1107'
    | '\u1109' | '\u110B'..'\u110C' | '\u110E'..'\u1112' | '\u113C'
    | '\u113E' | '\u1140' | '\u114C' | '\u114E'
    | '\u1150' | '\u1154'..'\u1155' | '\u1159' | '\u115F'..'\u1161'
    | '\u1163' | '\u1165' | '\u1167' | '\u1169'
    | '\u116D'..'\u116E' | '\u1172'..'\u1173' | '\u1175' | '\u119E'
    | '\u11A8' | '\u11AB' | '\u11AE'..'\u11AF' | '\u11B7'..'\u11B8'
    | '\u11BA' | '\u11BC'..'\u11C2' | '\u11EB' | '\u11F0'
    | '\u11F9' | '\u1E00'..'\u1E9B' | '\u1EA0'..'\u1EF9' | '\u1F00'..'\u1F15'
    | '\u1F18'..'\u1F1D' | '\u1F20'..'\u1F45' | '\u1F48'..'\u1F4D' | '\u1F50'..'\u1F57'
    | '\u1F59' | '\u1F5B' | '\u1F5D' | '\u1F5F'..'\u1F7D'
    | '\u1F80'..'\u1FB4' | '\u1FB6'..'\u1FBC' | '\u1FBE' | '\u1FC2'..'\u1FC4'
    | '\u1FC6'..'\u1FCC' | '\u1FD0'..'\u1FD3' | '\u1FD6'..'\u1FDB' | '\u1FE0'..'\u1FEC'
    | '\u1FF2'..'\u1FF4' | '\u1FF6'..'\u1FFC' | '\u2126' | '\u212A'..'\u212B'
    | '\u212E' | '\u2180'..'\u2182' | '\u3041'..'\u3094' | '\u30A1'..'\u30FA'
    | '\u3105'..'\u312C' | '\uAC00'..'\uD7A3'
    ;

protected XML10Ideographic:
      '\u4E00'..'\u9FA5' | '\u3007' | '\u3021'..'\u3029'
    ;

protected XML10CombiningChar:
      '\u0300'..'\u0345' | '\u0360'..'\u0361' | '\u0483'..'\u0486' | '\u0591'..'\u05A1'
    | '\u05A3'..'\u05B9' | '\u05BB'..'\u05BD' | '\u05BF' | '\u05C1'..'\u05C2'
    | '\u05C4' | '\u064B'..'\u0652' | '\u0670' | '\u06D6'..'\u06DC'
    | '\u06DD'..'\u06DF' | '\u06E0'..'\u06E4' | '\u06E7'..'\u06E8' | '\u06EA'..'\u06ED'
    | '\u0901'..'\u0903' | '\u093C' | '\u093E'..'\u094C' | '\u094D'
    | '\u0951'..'\u0954' | '\u0962'..'\u0963' | '\u0981'..'\u0983' | '\u09BC'
    | '\u09BE' | '\u09BF' | '\u09C0'..'\u09C4' | '\u09C7'..'\u09C8'
    | '\u09CB'..'\u09CD' | '\u09D7' | '\u09E2'..'\u09E3' | '\u0A02'
    | '\u0A3C' | '\u0A3E' | '\u0A3F' | '\u0A40'..'\u0A42'
    | '\u0A47'..'\u0A48' | '\u0A4B'..'\u0A4D' | '\u0A70'..'\u0A71' | '\u0A81'..'\u0A83'
    | '\u0ABC' | '\u0ABE'..'\u0AC5' | '\u0AC7'..'\u0AC9' | '\u0ACB'..'\u0ACD'
    | '\u0B01'..'\u0B03' | '\u0B3C' | '\u0B3E'..'\u0B43' | '\u0B47'..'\u0B48'
    | '\u0B4B'..'\u0B4D' | '\u0B56'..'\u0B57' | '\u0B82'..'\u0B83' | '\u0BBE'..'\u0BC2'
    | '\u0BC6'..'\u0BC8' | '\u0BCA'..'\u0BCD' | '\u0BD7' | '\u0C01'..'\u0C03'
    | '\u0C3E'..'\u0C44' | '\u0C46'..'\u0C48' | '\u0C4A'..'\u0C4D' | '\u0C55'..'\u0C56'
    | '\u0C82'..'\u0C83' | '\u0CBE'..'\u0CC4' | '\u0CC6'..'\u0CC8' | '\u0CCA'..'\u0CCD'
    | '\u0CD5'..'\u0CD6' | '\u0D02'..'\u0D03' | '\u0D3E'..'\u0D43' | '\u0D46'..'\u0D48'
    | '\u0D4A'..'\u0D4D' | '\u0D57' | '\u0E31' | '\u0E34'..'\u0E3A'
    | '\u0E47'..'\u0E4E' | '\u0EB1' | '\u0EB4'..'\u0EB9' | '\u0EBB'..'\u0EBC'
    | '\u0EC8'..'\u0ECD' | '\u0F18'..'\u0F19' | '\u0F35' | '\u0F37'
    | '\u0F39' | '\u0F3E' | '\u0F3F' | '\u0F71'..'\u0F84'
    | '\u0F86'..'\u0F8B' | '\u0F90'..'\u0F95' | '\u0F97' | '\u0F99'..'\u0FAD'
    | '\u0FB1'..'\u0FB7' | '\u0FB9' | '\u20D0'..'\u20DC' | '\u20E1'
    | '\u302A'..'\u302F' | '\u3099' | '\u309A'
    ;

protected XML10Digit:
      '\u0030'..'\u0039' | '\u0660'..'\u0669' | '\u06F0'..'\u06F9' | '\u0966'..'\u096F'
    | '\u09E6'..'\u09EF' | '\u0A66'..'\u0A6F' | '\u0AE6'..'\u0AEF' | '\u0B66'..'\u0B6F'
    | '\u0BE7'..'\u0BEF' | '\u0C66'..'\u0C6F' | '\u0CE6'..'\u0CEF' | '\u0D66'..'\u0D6F'
    | '\u0E50'..'\u0E59' | '\u0ED0'..'\u0ED9' | '\u0F20'..'\u0F29'
    ;

protected XML10Extender:
      '\u00B7' | '\u02D0' | '\u02D1' | '\u0387'
    | '\u0640' | '\u0E46' | '\u0EC6' | '\u3005'
    | '\u3031'..'\u3035' | '\u309D'..'\u309E' | '\u30FC'..'\u30FE'
    ;