(* Copyright (C) 1992, Digital Equipment Corporation *) (* All rights reserved. *) (* See the file COPYRIGHT for a full description. *) (* Modula-2+ version created by John Ellis in ancient times. *) (* Simplified and ported to Modula-3 by J.Stolfi on May 1990. *) (* Last modified on Sun Mar 1 14:21:53 PST 1992 by meehan *) (* modified on Wed Feb 12 12:37:40 PST 1992 by muller *) (* modified on Wed Nov 20 17:50:40 PST 1991 by stolfi *) INTERFACE Sx; (* Facilities for reading and printing symbolic expressions A symbolic expression is a human-readable ASCII string that encodes a List-like data structure, in a standardized syntax reminiscent of that of LISP. Here are some examples of symbolic expressions: | #| Have a nice day. |# # comment | #T # REF BOOLEAN | 'a' # REF CHAR | 13 # REF INTEGER | -4.0e9 # REF REAL | -4.0d9 # REF LONGREAL | "hello world" # TEXT | #Undefined # the "undefined" value | Wire.Length # SxSymbol.T | (10 23 45) # List.T | [15 32 -4] # REF ARRAY OF REFANY | (Employee (Salary 10000) (Name "John R. Ellis")) This interface provides facilities for reading and printing symbolic expressions composed of integers, characters, booleans, reals, long reals, texts, symbols (Lisp-like atoms), lists, vectors (REF ARRAY OF REFANY). Symbolic expressions provide a simple, flexible, and extensible mechanism for communicating complex structured data between humans and computers. They are particularly useful for encoding data with complex or ill-defined structure, or that must viewed or edited by people. They are also a convenient starting point for small special-purpose languages. Symbolic expresions save programming time, because tools such as parsers and prettyprinters need to be written only once; and user time, because their syntax is straightforward and needs to be learned only once. Clients can easily modify and extend the expression syntax to represent other REF types, or to change the parsing and printing of the standard types. For a detailed description of the standard syntax, and recipes for extending and modifying it, see the comments at the end of this interface. Index: symbolic expressions; atoms; expressions, symbolic CAVEAT: For efficiency, Sx provides preallocated booleans, chars, small integers, and popular floats, which will be reused by Read and other procedures whenever convenient. Thus you should not generally modify the value of any leaf records (Ref.Integer, Ref.Char, Ref.Boolean, ...) contained in any data structure that was read from a synbolic expression. List and vector nodes, however, are always newly allocated and distinct, so they can be modified without worry of aliasing. *) IMPORT SxSymbol, List, Rd, Wr, Thread; TYPE Syntax <: ROOT; (* A syntax table *) (**********************************************************) (* CREATION *) (**********************************************************) PROCEDURE NewInteger (i: INTEGER) : REF (*READONLY*) INTEGER ; PROCEDURE NewChar (c: CHAR) : REF (*READONLY*) CHAR ; PROCEDURE NewBoolean (b: BOOLEAN) : REF (*READONLY*) BOOLEAN ; PROCEDURE NewReal (r: REAL) : REF (*READONLY*) REAL ; PROCEDURE NewLongReal (lr: LONGREAL) : REF (*READONLY*) LONGREAL; (* Returns a REF to a heap record containing the given value. CAVEAT: In spite of the "New" in the name, these procedures may reuse records reurned by previous calls. Clients should never modify the contents of the records returned by these procedures. *) VAR (*CONST*) True: REF BOOLEAN; False: REF BOOLEAN; (* See also List.i3, SxSymbol.i3. *) (**********************************************************) (* PARSING *) (**********************************************************) PROCEDURE Read( rd: Rd.T; root: SxSymbol.T := NIL; syntax: Syntax := NIL; (* DefaultSyntax *) ): REFANY RAISES {ReadError, Rd.EndOfFile, Rd.Failure, Thread.Alerted}; (* Reads one symbolic expression from the reader, leaving the reader positioned at the following character. Symbols are interned relative to "root", and /syntax/ governs the external syntax. Raises ReadError if any syntax error is encountered. Raises Rd.EndOfFile if, after skipping any leading whitespace, end-of-file is encountered; if end-of-file is encountered in the middle of an expression, ReadError is raised. See the comments at the end of this file describing how Read parses its input. *) PROCEDURE FromText( text: TEXT; root: SxSymbol.T := NIL; syntax: Syntax := NIL (* DefaultSyntax *) ): REFANY RAISES {ReadError, Rd.EndOfFile, Thread.Alerted}; (* Parses the given text using Read. Raises ReadError if Read raises ReadError, or if it fails to consume all non-blank characters of /text/. *) PROCEDURE ReadDelimitedList( rd: Rd.T; delim: CHAR; root: SxSymbol.T := NIL; syntax: Syntax := NIL (* DefaultSyntax *) ): List.T RAISES {ReadError, Rd.Failure, Thread.Alerted}; (* Repeatedly reads symbolic expressions from the reader, ignoring whitespace, until /delim/ is encountered, returning the expressions in a list. Raises ReadError if any expression is malformed or incomplete, or end-of-file occurs before /delim/ is found. *) EXCEPTION ReadError(TEXT); (* Raised when Read encounters a syntax error of some sort. *) (**********************************************************) (* PRINTING *) (**********************************************************) (* In each of the printing procedures below, the /wr/ parameter should be an FWr.T. Since an FWr.T has several layers of buffering, the client should use FWr.Flush to force out any trailing output when necessary. Here is an example of use: | wr := FWr.New( Stdio.stdout ); | Sx.Print( wr, ... ); | Sx.Print( wr, ... ); | Wr.PrintF( wr, .... ); | ... | FWr.Flush( wr ); | FWr.Close (wr); As a convenience to clients, if /wr/ is not a FWr.T, the printing procedures will create a new temporary FWr.T /fwr/ on top of /wr/ (with default line width), print the given value to /fw/, and finally flush /fwr/. *) PROCEDURE Print( wr: Wr.T; value: REFANY; elision: Elision := NoElision; root: SxSymbol.T := NIL; syntax: Syntax := NIL (* DefaultSyntax *); ) RAISES {PrintError, Wr.Failure, Thread.Alerted}; (* Prints a symbolic expression to /wr/, eliding the object at depths and lengths greater than that specified by "elision". The syntax table tells how to print objects of each type, and lists beginning with specified symbols. The /root/ parameter is passed along to the actual printing routines specified by the syntax table, and usually controls how any symbols in /value/ get printed. The default syntax table, for instance, specifies that symbols are printed relative to the given /root/; that is, if the symbol's full name is Foo.Bar.Baz.Qux, and /root/'s full name is Foo.Bar, the symbol is printed as Baz.Qux. (The routine raises PrintError if /value/ contains any symbol that is not a descendant of /root/.) *) PROCEDURE ToText( value: REFANY; elision: Elision := NoElision; root: SxSymbol.T := NIL; syntax: Syntax := NIL; (* DefaultSyntax *) ): TEXT RAISES {PrintError, Thread.Alerted}; (* Like Print, but returns a TEXT (with embedded newlines) instead of outputting to a writer. *) PROCEDURE PrintNL( wr: Wr.T; value: REFANY; elision: Elision := NoElision; root: SxSymbol.T := NIL; syntax: Syntax := NIL; (* DefaultSyntax *) ) RAISES {PrintError, Wr.Failure, Thread.Alerted}; (* Like Print, but always terminates the output with a newline and flushes the writer. *) EXCEPTION PrintError (REFANY); (* Raised by Print and related procedures when the arguments are invalid. *) CONST DefaultLineWidth = 75; (* Default line width, used when /wr/ is not a FWr.T. *) (**********************************************************) (* ELISION CONTROL *) (**********************************************************) TYPE Elision = RECORD depth: CARDINAL; length: CARDINAL; END; (* The printing procedures take an elision parameter specifying how much of nested objects to print. Sub-expessions nested at depths greater than or equal to elision.depth are printed simply as "?". (The top-level expression is at depth 0.) Only the first elision.length elements of a list or vector are printed; the others are elided as "...". Only the first n characters of a text are printed, where | n = MAX( 10 * ElisionLength( elision ), 75 ); All characters beyond the first n are replaced by "...". *) CONST NoElision = Elision{depth := LAST(CARDINAL), length := LAST(CARDINAL)}; (* NoElision specifies "print everything". *) (**********************************************************) (* DEFAULT SYNTAX *) (**********************************************************) VAR DefaultSyntax: Syntax; (* The default syntax for Read and Print. *) (**********************************************************) (* UNDEFINED VALUE *) (**********************************************************) TYPE UndefinedType <: REFANY; VAR (*CONST*) Undefined: UndefinedType; (* The cannonical "undefined" value; reads and prints as #Undefined. *) END Sx. (**********************************************************) (* SYMBOLIC EXPRESSION SYNTAX *) (**********************************************************) (* 1 OVERVIEW The standard symbolic expression syntax includes notation for the following basic data types: | REF INTEGER | -23 (decimal) | 16_0ff0 (hex) | 2_11101111 (binary) | 3_-1022 (ternary) | REF REAL | 2.3e9 | -0.5 | +1. | 1e-20 | REF LONGREAL | 2.d-9 | -3.5d | REF CHAR | 'a' | '\012' (octal; ASCII.NL) | '\n' (ASCII.NL) | '\\' (Single backslash character) | REF BOOLEAN | #T | #True | #FALSE | TEXT | "" | "hello world" | "first line\nsecond line\njingle bells\007\007\007 \007\007\007" | SxSymbol.T | a | := | + | $@#! | Wire.Length | List.T | () | (a) | (1 2 3) | ((a b) (c)) | (() () (())) | REF ARRAY OF REFANY | [] | [foo] | [1 2 3] | [[1 2 3] "missing" (+ 6 7)] | Sx.Undefined (a standard "undefined" value) | #U | #UNDEFINED | #Undefined 2 DETAILS A /symbolic expression/ consists of one or more /tokens/, possibly preceded, separated, and/or followed by /whitespaces/. A token can be a /literal/ (boolean, numeric, character, or text), a /symbol/, or a /delimiter/. A delimiter is a parenthesis, a bracket, or a brace. 2.1 Comments and white space A /whitespace/ is a sequence of one or more spaces, tabs, newlines, returns, formfeeds, or comments. Whitespaces are optional before or after a delimiter. There are three types of comments: A /line comment/ starts with "#s", where s is any whitespace character, and extends to the end of the current line, including its newline character (which may be s itself). A line comment may contain any characters other than newline. A /block comment/ is surrounded by "#|" and "|#", and may span more than one line. A block comment may contain an arbitrary sequence of characters, including newlines and nested block comments, but all embedded occurrences of "#|" and "|#" must be properly matched. Note that the sequences "#|" and "|#" are interpreted as the block comment delimiters even if they occur inside embedded line comments or quoted strings. | #| Have a rotten day. |# | (Input x) # this reads the input data | #| | 1. solve system | 2. print answer | #| 2.2 Boolean literals The only Boolean literals are #T, #True, or #TRUE for TRUE, and #F, #False, or #FALSE for FALSE. Case is significant: #tRUE and #true are syntax errors. 2.3 Numeric literals A numeric literal is a token that begins with a digit, optionally preceded by a sign and/or a decimal period. It denotes a value REF INTEGER, REF REAL, or REF LONGREAL, depending of the token's format: A decimal integer literal consists of an optional sign followed by one or more decimal digits. The final value should be assignable to a Modula-3 INTEGER. A generic integer literal consist of an optional sign, followed by an unsigned integer b in the range [2..36], followed by an underscore "_" and one or more digits in the range [0..b-1], giving the number's absolute value in base b. (For b > 10, the digits after 9 are 'a', 'b', ..., 'z', ignoring case.) If a sign is present, the number must be assignable to a Modula-3 INTEGER; otherwise the number must lie in [0..2^w-1], where w = BITSIZE(INTEGER), and will be LOOPHOLE'd to an integer. The distinction between decimal and other bases is purely external, and is lost during parsing: on output, integers are always printed in decimal. A real literal has the form | real => ["+" | "-"] digit+ "." digit* [exponent] | | ["+" | "-"] digit+ exponent where | exponent => ("e" | "E") [ ["+" | "-"] digit+ ] LONGREAL literals have the same syntax, except that the exponent part is mandatory and uses 'd' or 'D' instead of 'e': | longreal => ["+" | "-"] digit+ "." digit* longexponent | | ["+" | "-"] digit+ longexponent where | longexponent => ('d' | 'D') [ ["+" | "-"] digit+ ] Examples: | 0 12 -1 +12556 (decimal integers) | 2_0 16_ff07 -8_00777 (integers in other bases) | 1. -2.0 6e 12.25e12 1.e-10 (reals) | 6d 12.25d12 1.d-10 (long reals) | Note that the integer part must have at least one digit. so these examples are illegal: | .5 +.22 e10 +d0 2.3 Character literals Character literals are delimited by single quotes (apostrophes), as in Modula-3: | 'a' ' ' '|' '\000' '\n' '\'' '\\' Printable ASCII characters (' '..'~') other than apostrople and backslash can be entered directly; all other characters must be entered as escape sequences. The known escapes are | \n newline | \t tab | \r return | \f formfeed | \b backspace | \e ESC character | \ddd the ASCII character represented by octal digits ddd | \x the character "x" for any non-alphanumeric graphic char x If x is a letter other than the above, '\x' is a syntax error; and so are the literals '\0' and '\00'. 2.4 Text literals Text literals follow the Modula-3 syntax. Printable ASCII characters (' '..'~') other than double quotes and backslash can be entered directly; all others (including newline) must be entered as escape sequences, with the same rules as for character literals. 2.5 Symbols Symbols are similar to Lisp atoms: internal objects that are uniquely identified by their printable names. Printing a symbol and then reading it back in will yield the exact same symbol. A vanilla symbol name cannot contain any embedded whitespaces, delimiters (parentheses, brackets, or curly braces), single or double quotes, periods, or sharp characters ('#'). | Hello funny-bone Real.First & - + != Also, a symbol may not begin with a digit, or with a sign followed by a digit. For example, these are NOT valid symbols: | 02 -12 +8 3_1 (integer literals) | 1e -3d 3.0 (float literals) | 1.1.3 2a 4--8 (syntax errors) | +0+ 2_c 5:0 (ditto) These rules can be circumvented by individual characters with "\". Consider the examples | A\ B \+3 \"foo\" The first example is a single symbol with name "A B". The second is the symbol with name "+3", not the integer 3. The third is the symbol with name "\"foo\"", not the text "foo". Another way to quote special characters in symbol names is to enclose all or part of the symbol's name within paired vertical bars "|". So, for example, the symbols | |A B| |+3| |"foo"| are equivalent to | A\ B \+3 \"foo\" In particular, || denotes the symbol whose name is the empty string. One can use As in text strings, only the printable ASCII characters (' '..'~') can be used directly in symbol names, within or without the vertical bars. Non-printable characters must be encoded using the standard character and text escapes such as \n, \t, and \ddd. In particular, | \012 or \n or |\012| or |\n| denotes the symbol whose name is the text "\n". Because of ambiguity with the \ddd notation, symbols whose name begins with a digit must be quoted with ||, not with \. For example, to force 012 and 3-in-1 to be parsed as symbols, you must write |012| and |3-in-1|; the notation \012 means something else, and \0\1\2, 01\2, and \3-in-\1 are syntax errors. 2.6 Compound symbol names Many applications require name spaces with an hierarchical structure, and some require that difeerent threads running in the same address space have separate symbol-naming spaces. For the benefit of such applications, the Read and Print procedures support a tree-structured symbol space. Each symbol is a node in a directed tree structure; it has at most one /parent symbol/, and zero or more /children symbols/. Each symbol has a /short name/ which is unique among all children of the same parent. The /long name/ of a symbol /s/, relative to some ancestor symbol /r/, is the sequence (not concatenation) of the short names of all symbols between /r/ (exclusive) and /s/ (inclusive) Thus, all proper descendants of /r/ have distinct long names relative to /r/. The /full name/ of a symbol /s/ is its long name relative to the root of the containing tree (the only ancestor of /s/ that has no parent). The SxSymbol interface provides a default root symbol, but clients may create more roots if needed; so a an address space may contain several disjoint symbol trees, with independent name spaces. Externally, a long symbol name is printed with dots (".") separating its component short names. When Read parses a symbol token, it break the token at the periods into a sequence of one or more short names, and interprets them as a path in a symbol tree starting from a client-specified /local root/ symbol (which need not be a true root). For example, if the given local root is a symbol with full name Foo.Bar, when Sx.Read parses the characters "Baz.Qux" it will return a symbol whose full name is Foo.Bar.Baz.Qux. Note that all symbols parsed by a call of Sx.Read will be proper descendants of the local root symbol. Conversely, when Sx.Print has to output a given symbol, it will print only its long name relative to a given /local root/ symbol. Sx.Print will raise an exception if it is asked to print a symbol that is not a proper descendant of this local root. One can disable the special meaning of period as a name separator by quoting it, either with backslash or vertical bars; the period is then treated as a plain letter. For example, the symbol C.1 has two component names ("C" and "1"), whereas C\.1 (or |C.1|) has only one component name ("C.1"). In particular, the symbol || has one component name with no characters. Note that .X is parsed as the two-component name ||.X, and ... is the four-component name ||.||.||.|| (weird, no?). 2.7 Lists Lists are indicated with parentheses surrounding an arbitrary number of sub-expressions separated by whitespace: | (1 2 3) (1 ('a' 'b') "hello" 3.0) NIL is represented using the empty list: | () 2.8 Vectors Vectors are like lists, but using brackets: | [1 2 3] [(1 2) 3 [4 5] ] The empty vector [] is not the same as NIL; it is a REF to an array of REFANY with 0 elements. 2.9 Undefined value Some applications that need a distinguished REFANY value in addition to NIL, for example to use as the default value of un-initialized variables. For the benefit of such clients, the Sx package declares the UndefinedType (a private branded REF type), and exports the Undefined constant (an UndefinedType record allocated at startup). This value is externally denoted by #U, #Undefined, or #UNDEFINED. 3 EXTENDING AND MODIFYING THE STANDARD SYNTAX The standard syntax can be extended and modified by clients in various ways. The changes can be effected by passing Read or Print a suitable SxSyntax (or by modifying the default one). The "standard" extensions are based on two schemas, the /curly notation/ and the /sharp notation/. 3.1 The curly notation The curly notation has the general form | "{" symbol expression* "}" where /symbol/ is a symbol name, and the /expression/s are symbolic expressions whose number and type depend on the symbol, in a way specified by the client. For example, suppose I have a Modula-3 ref type called Wire.T that has two parts, an optional name and a list of pin numbers. I could define a curly notation for Wire.T that looks like: | {Wire name: "ground" pins: 35 40 45} or | {Wire pins: 35 40 45} Teaching this notation to Read and Print is very easy. I need only write two Modula-3 procerues: a "parser" that takes a List.T containing the symbolic expressions between the curly braces | (Wire name: "ground" pins: 35 40 45) and returns the corresponding Wire.T; and a "printer" that converts a Wire.T into a List.T. I then "register" these procedures into the syntax table with SxStandardSyntax.DefineCurly. 3.2 Extending the # syntax If the curly-brace notation is not powerful enough for your job, you may consider adding new variants to the # notation. Say you want special syntax for bits strings of arbitrary length, internally represented as REF ARRAY OF BOOLEAN. If possible, you should pick a notation that begins with "#" and some other fixed character that is currently not allowed after "#"; say | #[001001000001000001] You will then have to write a parsing routine, folowing the conventions described in SxSyntax interface, that gobbles the token characters after the "#[", up to and including the "]", and constructs the appropriate REF ARRAY OF BOOLEAN; you then register this procedure under "#[" using SxStandardSyntax.SetSharpReadProcedure. If you want to write such bits strings, you will have to write a compatible printing procedure, as specified in SxStandardSyntax, and register it using SxStandardSyntax.SetSharpPrintProcedure. 3.3 The #<...> convention For some kinds of objects, it makes sense only to have a print syntax without a read syntax. For example, it might be desirable to have a print syntax for Rd.Ts, but a read syntax isn't well-defined. It is recommended that the print syntax begins with #<, preferably followed by the name of the REF type. For example | # Assuming no one registers a SharpReadMacro for #<, the Read procedure will raise ReadError if it tries to read such an object back in. 3.3 Adding other tokens The main reason for using '#' as a starting character for client-defined token types is to allow most other characters to be used in symbol names, without having to quote or escape them. If this is not important, you can pick a notation that starts with a character other than '#'. You just need to write the appropriate parsing procedure, and store it in the appropriate entry of the syntax table. (In the standard syntax table, the characters "(", "[", "{", "#", "'", and "\"" are handled this way.) For instance, suppose you want to implement the Lisp quoting notation, whereby the expression `(A B) gets parsed into the list (quote (A B)). You need to declare ` as an active character, and provide a read macro that calls Read recursively to gobble the (A B), and then puts the (quote ...) around the result. For output, you will also need to define a suitable print method, and register it in the appropriate entry of the 3.4 Hacking deeper With due care, you can modify the standard syntax for most everything by replacing the standard read macros (associated with "{", "(", "[", "'", "\""), or the standard "sharp" read macros (associated with #T, #F, #U, #|, and so on), or the standard routines used to parse symbols and numeric literals. See SxSyntax interface for more details. *)