; -*- mode: lisp; eval: (progn (modify-syntax-entry ?[ "(]") (modify-syntax-entry ?{ "(}") (modify-syntax-entry ?] ")[") (modify-syntax-entry ?} "){")) -*- ; Copyright 2005 Allen Short, under the terms of the MIT X license ; found at http://www.opensource.org/licenses/mit-license.html ................ (in-package :e.elang.lexer) ;;;; Setup the META stuff. (eval-when (:compile-toplevel :execute :load-toplevel) (defmacro match (x) (etypecase x (character ;; Match a single character `(if (>= __index__ __end__) (when (not __eof__) (setq __index__ __start__) (throw 'emit-token nil)) (when (eql (char __buffer__ __index__) ',x) (incf __index__)))) (string `(let ((__old-index__ __index__)) (or (and ,@(map 'list #'(lambda (c) `(match ,c)) x)) ;match all chars in string (progn (setq __index__ __old-index__) nil)))))) (defmacro match-type (x v) ;; Match a character class. `(if (>= __index__ __end__) (when (not __eof__) (setq __index__ __start__) (throw 'emit-token nil)) (when (typep (char __buffer__ __index__) ',x) (setq ,v (char __buffer__ __index__)) (incf __index__)))) (defun build-expr (x) ;; Generate code from a pattern. ;; Valid pattern chars: ;; !: run a lisp expression ;; ^: emit a token with the given symbol and the current text ;; []: sequence (AND) ;; {}: alternatives (OR) ;; $: Kleene star ;; @: match character class ;; ?: Attempt to parse, backtrack if failed. ;; Since ! doesn't consume any characters, it can be used for side ;; effects desired after matching patterns. (typecase x (meta ; if this is a pattern: (ecase (meta-char x) ; do the action associated with the pattern char (#\! (meta-form x)) (#\^ `(emit ',(meta-form x) __start__ __index__)) (#\[ `(and ,@(mapcar #'build-expr (meta-form x)))) (#\{ `(or ,@(mapcar #'build-expr (meta-form x)))) (#\$ `(not (do ()((not ,(build-expr (meta-form x))))))) (#\@ (let ((f (meta-form x))) `(match-type ,(car f) ,(cadr f)))) (#\? `(let ((__old-index__ __index__)) (or ,(build-expr (meta-form x)) (progn (setq __index__ __old-index__) nil)))))) (t `(match ,x)))) ;if it's not a pattern char, it's a literal. (defmacro matchit (x) (build-expr x)) (defstruct (meta (:print-function (lambda (m s d &aux (char (meta-char m)) (form (meta-form m))) (declare (ignore d)) (ecase char ((#\? #\^ #\@ #\! #\$) (format s "~A~A" char form)) (#\[ (format s "[~{~A~^ ~}]" form)) (#\{ (format s "{~{~A~^ ~}}" form)))))) char form) (defun meta-reader (s c) (make-meta :char c :form (read s))) (mapc #'(lambda (c) (set-macro-character c #'meta-reader)) '(#\@ #\$ #\! #\? #\^)) (set-macro-character #\[ #'(lambda (s c) (make-meta :char c :form (read-delimited-list #\] s t)))) (set-macro-character #\{ #'(lambda (s c) (make-meta :char c :form (read-delimited-list #\} s t)))) (mapc #'(lambda (c) (set-macro-character c (get-macro-character #\) nil))) '(#\] #\}))) (defvar *meta-debug* nil "Should META output debugging information?") ;; (defmacro defmeta (name other-args &body body) ;; `(defun ,name (__buffer__ &optional (__index__ 0) ;; (__end__ (length __buffer__)) ;; ,@other-args) ;; (declare (simple-base-string __buffer__) ;; (fixnum __index__ __end__)) ;; ,@body)) (defmacro meta-labels (functions &body body) (let* ((old-index (gensym "old-index-")) (labels-list (loop for fun in functions collect (destructuring-bind (name extra-args &body body) fun `(,name (&aux (,old-index __index__) ,@extra-args) (when *meta-debug* (format t "~&~S : ~S" ',name ;(subseq __buffer__ (min __index__ (length __buffer__)) (min (length __buffer__) (+ __index__ 10))))) __index__)) (or (progn ,@body) (progn (setf __index__ ,old-index) nil))))))) `(labels ,labels-list ,@body))) ;;;; Some useful character classes. It seemed easier to define these ;;;; as types rather than writing functions to match them. (deftype digit () '(member #\1 #\2 #\3 #\4 #\5 #\6 #\7 #\8 #\9 #\0)) (deftype digit-or-underscore () '(member #\1 #\2 #\3 #\4 #\5 #\6 #\7 #\8 #\9 #\0 \_)) (deftype octal-digit () '(member #\1 #\2 #\3 #\4 #\5 #\6 #\7 #\0)) (deftype hex-digit () '(member #\1 #\2 #\3 #\4 #\5 #\6 #\7 #\8 #\9 #\0 #\A #\B #\C #\D #\E #\F)) (deftype URI () '(or identifier-body (member #\; #\/ #\? #\: #\@ #\& #\= #\+ #\$ #\, #\- #\. #\! #\~ #\* #\\ #\( #\) #\% #\' #\| #\#))) ;;; really need to support unicode here, but that will wait until CL ;;; implementations offer a way to distinguish letter from non-letter ;;; characters (deftype identifier-initial () '(member #\a #\b #\c #\d #\e #\f #\g #\h #\i #\j #\k #\l #\m #\n #\o #\p #\q #\r #\s #\t #\u #\v #\w #\x #\y #\z #\A #\B #\C #\D #\E #\F #\G #\H #\I #\J #\K #\L #\M #\N #\O #\P #\Q #\R #\S #\T #\U #\V #\W #\X #\Y #\Z #\_)) (deftype identifier-body () '(or identifier-initial digit)) (deftype string-literal-char () '(and character (not (member #\" #\\ #\Newline #\Return)))) (deftype char-literal-char () '(and character (not (member #\' #\\ #\Newline #\Return)))) ;; " ;;;; All of E's keywords. IDENT-OR-KEYWORD checks to see if it should ;;;; emit an IDENT token or one of the symbols here. (defparameter e-keywords '(suspects encapsulated define abstract facet defmacro suchthat escape static rely break fn reliant constructor continue uses catch for else is truncatable begin interface as typedef bind throws forall delicate reliance signed var method private pragma oneway function octet supports declare ensure try methods lambda know belief in operator hides an switch meta encapsulates reveal utf8 virtual this believe relies when attribute sake public be thunk delegate extends datatype unum to synchronized case let export guards eventually behalf transient do given unsigned eventual enum implements deprecated protected finally if hidden const module utf16 return match default native suspect dispatch assert fun believes end wstring knows raises accum on def using namespace class struct select inline while encapsulate package volatile obeys)) ;; The actual lexer. Returns a function that takes string input and ;; produces as many tokens as possible from it. Pass :eof at the end. (defun make-lexer () (let (a __index__ __end__ __buffer__ __eof__ __start__ first-in-line (lex-mode-stack nil) (brace-count `(0))) (lambda (input) ;; lexer state: ;; "input" is the new data just received. ;; "__buffer__" is the string being parsed; the unparsed portion ;; of the last input plus the new. ;; "a" is a temp variable used by the macros. ;; "__index__" is the current position in the string. ;; "__end__" is the last position in the string. ;; "__start__" is the beginning of the current token under consideration. ;; "first-in-line" is whether we just emitted a linebreak or not. ;; "lex-mode-stack" tracks the current and previous lex states, ;; either "quasi" or "e". ;; "brace-count" is a list of the number of opened braces that need to be ;; matched in the current lex modes. A close brace encountered in E mode ;; when the most recent brace-count is 0 means it's time to switch to the previous ;; parser mode. (flet ((emit (token token-begin token-end) (unless (equal token 'LINESEP) (setq first-in-line nil)) (throw 'emit-token (cons token (subseq __buffer__ token-begin token-end))))) (meta-labels ((QUASI () (setq __start__ __index__) (matchit { ["${" !(progn (push 'e lex-mode-stack) (push 0 brace-count) t) ^DOLLARCURLY] ["$" !(IDENT) !(incf __start__) ^DOLLARHOLE] ["@{" !(progn (push 'e lex-mode-stack) (push 0 brace-count) t) ^ATCURLY] ["@" !(IDENT) !(incf __start__) ^ATHOLE] [!(QUASIn) $!(QUASIn) ^QUASIBODY] ["`" !(pop lex-mode-stack) ^QUASICLOSE] })) (OPERATOR () (setq __start__ __index__) ;; if E is an orchestra, then operators are "miscellaneous percussion". (matchit { ;; these are matched in order, so longer ones before shorter ["?" !(BR) ^QUESTION] ["(" !(BR) ^LPAREN] ["[" !(BR) ^LBRACK] [")" ^RPAREN] ["]" ^RBRACK] ["%%" !(BR) ^MOD] [":=" !(BR) ^ASSIGN] ["<=>" !(BR) ^ABA] ["//=" !(BR) ^FLOORDIV-ASSIGN] ["<<" !(BR) ^SL] ["++" ^INC] ["<=" !(BR) ^LE] ["--" ^DEC] ["^" !(BR) ^BXOR] ["+=" !(BR) ^PLUS-ASSIGN] ["||" !(BR) ^LOR] ["-=" !(BR) ^MINUS-ASSIGN] ["|" !(BR) ^BOR] ["*=" !(BR) ^STAR-ASSIGN] ["&&" !(BR) ^LAND] ["%=" !(BR) ^REM-ASSIGN] ["&" !(BR) ^BAND] ["%%=" !(BR) ^MOD-ASSIGN] [";" !(BR) ^SEMI] ["|=" !(BR) ^BOR-ASSIGN] ["**" !(BR) ^POW] ["&=" !(BR) ^BAND-ASSIGN] ["&!" !(BR) ^BUTNOT] [">>=" !(BR) ^SR-ASSIGN] ["<-" !(BR) ^SEND] [">>" !(BR) ^SR] ["->" !(BR) ^WHEN] [">=" !(BR) ^GE] ["=>" !(BR) ^MAPSTO] [">" !(BR) ^GT] ["=~" !(BR) ^MATCHBIND] ["!~" !(BR) ^MISMATCH] ["::" !(BR) ^SCOPE] [":" !(BR) ^COLON] ["," !(BR) ^COMMA] ["..!" !(BR) ^TILL] [".." !(BR) ^THRU] ["." !(BR) ^DOT] ["==" !(BR) ^SAME] ["!=" !(BR) ^NOTSAME] ["!" !(BR) ^LNOT] ["~" !(BR) ^BNOT] ["//" !(BR) ^FLOORDIV] ["/" !(BR) ^DIV] ["+" !(BR) ^PLUS] ["-" !(BR) ^MINUS] ["*" !(BR) ^STAR] ["%" !(BR) ^REM] ["=" !(error "Lone = at ~S. Use ':=' for assignment, or '==' for equality." __index__)] })) (CURLY () (setq __start__ __index__) (matchit { ["{" !(BR) !(incf (car brace-count)) ^LCURLY] ["}" !(progn (if (> (car brace-count) 0) (decf (car brace-count)) (progn (pop brace-count) (pop lex-mode-stack))) t) ^RCURLY] })) (LT () (setq __start__ __index__) (matchit ["<" {?[!(IDENT) {[#\> ^URIGetter] [#\: {[!(ANYWS) !(BR) ^URIStart] [@(URI a) $@(URI a) ">" !(emit 'URI (1+ __start__) (1- __index__))] }]}] [!(BR) ^LT] }])) (WS () (matchit [{#\Space #\Tab #\Page !(ESCWS)} !(skip)])) (SL-COMMENT () (matchit ["#" !(consume-until-newline) !(skip)])) (DOC-COMMENT () (setq __start__ __index__) (matchit ["/**" !(consume-doc-comment) !(BR) !(emit 'DOC-COMMENT __index__ __index__)])) (CHAR-LITERAL () (setq __start__ __index__) (matchit [#\' {!(ESC) @(char-literal-char a)} #\' ^CHAR-LITERAL])) (E-STRING () (setq __start__ __index__) (matchit [#\" ${!(ESC) !(EOL) @(string-literal-char a)} #\" ^STRING])) ;; " (E-NUMBER () (setq __start__ __index__) (matchit { ?["0x" @(hex-digit a) $@(hex-digit a) ^HEX] ?["0" @(octal-digit a) $@(octal-digit a) ^OCTAL] ?[!(FLOAT64) ^FLOAT64] [!(POSINT) ^INT] })) (IDENT-OR-KEYWORD (kws) (setq __start__ __index__) (when (IDENT) (setq kws (member (find-symbol (string-upcase (subseq __buffer__ __start__ __index__)) '#.*package*) e-keywords)) (if kws (emit (car kws) __start__ __index__) (progn (if (matchit "=") (if (matchit "=") ; oops this is "==" (progn (decf __index__ 2) (emit 'IDENT __start__ __index__)) (emit 'VERBASSIGN __start__ (- __index__ 1)))) (emit 'IDENT __start__ __index__))))) ;; rules that do not produce tokens (IDENT () ;; doesn't emit from here because other rules call it (matchit [@(identifier-initial a) $@(identifier-body a)])) (ANYWS () (matchit {#\Space #\Tab #\Page #\Return #\Newline})) (ESCWS () (matchit [#\\ ${#\Space #\Tab #\Page} !(EOL)])) (UPDOC () (matchit ?[!(consume-until-newline) $[{ #\Space #\Tab #\Page [{#\? #\# #\>} !(consume-until-newline)]}]])) (ESC () (matchit [#\\ { #\Newline #\Return #\Tab #\Backspace #\Page #\" #\? #\' #\\ ; " [{#\u #\U} @(hex-digit a) @(hex-digit a) @(hex-digit a) @(hex-digit a)] ?[{#\0 #\1 #\2 #\3} @(octal-digit a) @(octal-digit a)] [@(octal-digit a) {@(octal-digit a) []}] [${#\Space #\Tab #\Page} !(EOL) !(setq __index__ __start__)] }])) (POSINT () (matchit [@(digit a) $@(digit-or-underscore a)])) (FLOAT64 () ; need to match at least one of fractional-part and exponent-part (matchit {?[!(POSINT) ["." !(POSINT)] {[{"e" "E"} !(EXPONENT)] []}] [!(POSINT) [{"e" "E"} !(EXPONENT)]]})) (EXPONENT () (matchit [{"+" "-" []} !(POSINT)])) (BR () (if __eof__ t (matchit [${#\Space #\Tab ["#" !(consume-until-newline)] !(EOL)}]))) (EOL () (matchit [{?[#\Return #\Newline] #\Return #\Newline} !(new-line)])) (QUASIn () (matchit {"$$" "$\\" "@@" "@\\" "``" !(QUASI1)})) (QUASI1 () (loop until (or (EOL) (matchit {"`" "$" "@"})) do (incf __index__) finally (decf __index__) ; the end char is a separate token return t)) ;; helper functions (new-line () (setq first-in-line t)) (consume-until-newline () (loop until (or (matchit [{#\Return #\Newline} !(new-line)]) (and (= __index__ __end__) __eof__)) ;EOF is EOL do (progn (when (and (>= __index__ __end__) (not __eof__)) (setq __index__ __start__) (throw 'emit-token nil)) (incf __index__))) t) (consume-doc-comment () (loop do (incf __index__) until (or (>= __index__ __end__) (matchit "*/"))) t) (skip () (throw 'skip-token nil))) (if (eq input :eof) (setq __eof__ t) (setq __buffer__ (concatenate 'string __buffer__ input))) (setq __index__ 0 __start__ 0 __end__ (length __buffer__)) (loop for token = (catch 'emit-token (loop (catch 'skip-token (when (>= __index__ __end__) (setq __start__ __index__) (throw 'emit-token nil)) (if (equal (car lex-mode-stack) 'quasi) (QUASI) (progn (setq __start__ __index__) (matchit { [#\` !(progn (push 'quasi lex-mode-stack) ) ^QUASIOPEN] [!first-in-line "?" !(UPDOC) !(skip)] [!(EOL) $!(EOL) ^LINESEP] !(CURLY) !(LT) !(WS) !(SL-COMMENT) !(CHAR-LITERAL) !(E-STRING) !(DOC-COMMENT) !(IDENT-OR-KEYWORD) !(E-NUMBER) !(OPERATOR) }) (error "Syntax error: no lexer rule matched")))))) while token collecting token finally (setq __buffer__ (subseq __buffer__ __start__ __end__)) )))))) (defun test-one-char-at-a-time (input) (let ((lex (make-lexer))) (append (loop for c across input appending (funcall lex (string c))) (funcall lex :eof)))) (defun test-all-at-once (input) (let ((lex (make-lexer))) (append (funcall lex input) (funcall lex :eof))))