#!/usr/bin/python2

''' scanner.py -- generates a state machine scanner for the Algol W READ statement

See the Algol W Language Description, section 6.3

--

This file is part of Awe. Copyright 2012 Glyn Webster.

Awe is free software: you can redistribute it and/or modify it
under the terms of the GNU General Public License as published
by the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.

Awe is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public
License along with Awe.  If not, see <http://www.gnu.org/licenses/>.
'''

# Banner written at the top of the generated file, warning against hand edits.
MESSAGE = (
    "/* GENERATED BY scanner.py: DON'T EDIT THIS FILE, EDIT scanner.py */"
    "\n"
)

# C template for one scanner state: a `case` on the state number that wraps a
# `switch` on the current character `c`.  %(STATE)s is the state number and
# %(CASES)s the already-formatted CASE lines for that state.  The `default`
# branch closes the buffer and returns Error.
STATE = (
    '\n'
    '    case %(STATE)s:\n'
    '      switch (c) {\n'
    '%(CASES)s\n'
    '      default: Scanner_close_buffer(scanner); return Error;\n'
    '      }\n'
    '      break;\n'
)

# C template for one transition: %(CONDITION)s is the `case` label(s),
# %(NEXT)s the state to move to, and %(ACTION)s the C statement(s) to run.
CASE = (
    '      %(CONDITION)s'
    ' scanner->state = %(NEXT)s;'
    ' %(ACTION)s'
    ' break;\n'
)


# Shorthand for the various classes of character that the scanner should see.
#
# Each non-blank row is "<code>  <C case labels>": a one-character code that
# the transition table refers to, followed by the C `case` label(s) it expands
# to.  `//` comments are stripped before the rows are parsed.  The `...`
# ranges rely on the GNU C case-range extension.
#
# Every code must appear exactly once: the rows are loaded into a dict, so a
# repeated code would silently replace the earlier row.  Note that `e`
# (exponent markers, including the quote character) and `E` (the letter E)
# are different codes.
#
# NOTE(review): the `c` row starts its high range at '\xAF', while printable
# ISO-8859-1 begins at 0xA0/0xA1 -- confirm whether that gap is intentional.
condition_data = r"""
n  case '\n':                 //various white space
s  case ' ':
!  case EOF:
w  case ' ': case '\n': case EOF:

d  case '0' ... '9':          //parts of numbers
+  case '-': case '+':
.  case '.':
e  case '\'': case 'e': case 'E':
                              //hexadecimal for BITS
#  case '#':
h  case '0' ... '9': case 'A' ... 'F': case 'a' ... 'f':

                              //strings of printable ISO-8859-1 characters
"  case '"':
c  case ' ': case '!':  case '#' ... '~': case '\xAF' ... '\xFF':

I  case 'I': case 'i':        //various letters
T  case 'T': case 't':
R  case 'R': case 'r':
U  case 'U': case 'u':
E  case 'E': case 'e':
F  case 'F': case 'f':
A  case 'A': case 'a':
L  case 'L': case 'l':
S  case 'S': case 's':
""" #"


# The state machine, one transition per row:
#
#     <current state>  <condition code>  <next state>  [<action>]
#
# * States are decimal numbers; the rows for state 0 are the ones headed
#   "Ready for the next constant" below.
# * The condition code is one of the one-character codes from condition_data.
# * The action is the C code pasted into the generated `case`.  When the
#   column is empty, default_action is used (append the current character to
#   the buffer); a lone `;` is an empty C statement, i.e. do nothing.
# * `//` comments and blank lines are stripped before the rows are parsed.
#
# See aweio.c for the meanings of the Scanner_* actions.
# (Basically, they are filling a buffer with the C versions of
# the Algol W constants which will be scanned with 'strtol' and friends.)
#
# NOTE(review): state 204 accepts whitespace and returns Complex without
# having seen an `I`, while states 201 and 202 do not; this looks at odds
# with "This must contain an I" below -- confirm against the language spec.

default_action = 'Scanner_addchar(scanner, c);'

transition_data = r'''

// Ready for the next constant.

  0  d  101  Scanner_start(scanner); Scanner_addchar(scanner, c);         //These enter the number reading states.
  0  e  103  Scanner_start(scanner); Scanner_addstring(scanner, "1.0e");
  0  .  107  Scanner_start(scanner); Scanner_addstring(scanner, "0.");
  0  +  100  Scanner_start(scanner); Scanner_addchar(scanner, c);

  0  "  301  Scanner_start(scanner);                   //These enter string, bits, and TRUE/FALSE constant states.
  0  #  401  Scanner_start(scanner); 
  0  T  501  Scanner_start(scanner); Scanner_addchar(scanner, c);
  0  F  601  Scanner_start(scanner); Scanner_addchar(scanner, c);

  0  s    0  ;              //Leading spaces are ignored
  0  n    0  ;              //Newlines are ignored, but increment the line count
  0  !    0  Scanner_close_buffer(scanner); return Eof;
                            //The end of file, when expecting a constant, raises an exception.

// Numbers: integer, real, or imaginary.
// Algol W lets a ridiculous number of real constant parts be optional.
// An integer or real terminated with a sign rather than a space is the 
// real part of a complex number, the imaginary part will follow (state 200).

100  .  107  Scanner_addstring(scanner, "0.");
100  d  101
100  e  103  Scanner_addstring(scanner, "1.0e");
101  d  101
101  .  102
101  e  103  Scanner_addstring(scanner, ".0e");
101  I  105  ;
101  L  109  ;
101  +  200
101  w    0  Scanner_close_buffer(scanner); return Integer;
102  d  102
102  e  103  Scanner_addstring(scanner, "0e");
102  w    0  Scanner_close_buffer(scanner); return Real;
102  I  105  Scanner_addstring(scanner, "0");
102  L  109  Scanner_addstring(scanner, "0");
102  +  200
103  d  104
103  +  108
104  d  104
104  I  105  ;
104  L  109  ;
104  +  200
104  w    0  Scanner_close_buffer(scanner); return Real;
105  L  106  ;
105  w    0  Scanner_close_buffer(scanner); return Imaginary;
106  w    0  Scanner_close_buffer(scanner); return Imaginary;
107  d  102
108  d  104
109  +  200
109  w    0  Scanner_close_buffer(scanner); return Real;

// Numbers, the imaginary part of a complex number.
// This must contain an I.

200  d  201
200  e  203  Scanner_addstring(scanner, "1.0e");
200  .  207  Scanner_addstring(scanner, "0.");
201  d  201
201  .  202
201  I  205  ;
201  e  203  Scanner_addstring(scanner, ".0e");
202  d  202
202  e  203  Scanner_addstring(scanner, "0e");
202  I  205  Scanner_addstring(scanner, "0");
203  d  204
203  +  209
204  d  204
204  I  205  ;
204  w    0  Scanner_close_buffer(scanner); return Complex;
205  L  206  ;
205  w    0  Scanner_close_buffer(scanner); return Complex;
206  w    0  Scanner_close_buffer(scanner); return Complex;
207  d  202
209  d  204

// Strings, two double quotes represent a double quote.

301  c  301
301  "  302  ;
302  "  301  Scanner_addstring(scanner, "\"\"");
302  w    0  Scanner_close_buffer(scanner); return String; 

// Bits, in hexadecimal.

401  h  401
401  w    0  Scanner_close_buffer(scanner); return Bits;

// "TRUE" and "FALSE"

501  R  502
502  U  503
503  E  504
504  w    0  Scanner_close_buffer(scanner); return Logical;

601  A  602
602  L  603
603  S  604
604  E  605
605  w    0  Scanner_close_buffer(scanner); return Logical;
'''

# Parse the two embedded tables into dictionaries for the code generators.
import re

# All three patterns are multi-line: a `//` comment together with the blanks
# in front of it, one "<code> <case labels>" condition row, and one
# "<state> <code> <next state> [<action>]" transition row.
COMMENT = re.compile(r'(?m) *//.*?$')
CONDITION = re.compile(r'(?m)^ *(.) +(.+) *$')
TRANSITION = re.compile(r'(?m)^ *(\d+) +(.) +(\d+) *(.+)? *$')

# conditions: condition code -> C case labels.  A later row with the same
# code replaces an earlier one.
conditions = dict(
    (row.group(1), row.group(2))
    for row in CONDITION.finditer(COMMENT.sub('', condition_data))
)

# transitions: state -> list of (condition code, next state, action) tuples,
# kept in table order.  Rows without an action get default_action.
transitions = {}
for row in TRANSITION.finditer(COMMENT.sub('', transition_data)):
    state, condition_code, next_state, action = row.groups()
    if state not in transitions:
        transitions[state] = []
    transitions[state].append(
        (condition_code, next_state, action or default_action))

# Output C code for state machine to scanner.inc
#
# One STATE block is emitted per state, each holding one CASE line per
# transition in table order.  States are sorted by numeric value so the
# generated file stays in numeric order whatever the width of the state
# numbers (a plain string sort would place '20' after '100').
state_blocks = []
for state in sorted(transitions, key=int):
    cases = ''.join(
        CASE % {'CONDITION': conditions[code], 'NEXT': next_state, 'ACTION': action}
        for code, next_state, action in transitions[state]
    )
    state_blocks.append(STATE % {'STATE': state, 'CASES': cases})
c_code = ''.join(state_blocks)

# `with` makes sure scanner.inc is flushed and closed even if a write fails.
with open('scanner.inc', 'w') as f:
    f.write(MESSAGE)
    f.write(c_code)

# Output a "dot file" that can be used to visualize the state machine with GraphViz.
def escape(s):
    '''Return `s` with backslash escapes decoded and double quotes escaped.

    Backslash escape sequences in `s` (such as the two characters `\\n`) are
    turned into the characters they denote, then every double quote is
    prefixed with a backslash so the result can be placed inside a
    double-quoted GraphViz label.

    Works on both Python 2 and Python 3: the 'string_escape' codec used on
    Python 2 does not exist on Python 3, where 'unicode_escape' is used
    instead.
    '''
    import codecs
    try:
        # Python 2: same result as the str method s.decode('string_escape').
        decoded = codecs.decode(s, 'string_escape')
    except LookupError:
        # Python 3: 'unicode_escape' decodes bytes as Latin-1, so encode to
        # Latin-1 first; 'backslashreplace' turns any character outside
        # Latin-1 into an escape that the decoder then restores.
        raw = s.encode('latin-1', 'backslashreplace')
        decoded = codecs.decode(raw, 'unicode_escape')
    return decoded.replace('"', '\\"')
# Write one GraphViz edge per transition.  `with` makes sure scanner.dot is
# flushed and closed even if a write fails part-way through.
with open("scanner.dot", 'w') as f:
    f.write('digraph state_machine { rankdir=LR; node [shape = circle];\n')
    # Numeric order, so states of different widths still come out in sequence.
    for state in sorted(transitions, key=int):
        for code, next_state, action in transitions[state]:
            # Only the condition code labels the edge; the action is left
            # out of the label.
            label = escape(code)
            f.write('%s -> %s [label="%s"];\n' % (state, next_state, label))
    f.write('}\n')

#end
