#!/usr/bin/python3
#
# MSX BASIC tokenizer
#
# Copyright © 2020 Pedro Gimeno Fortea
#
#
# Permission is hereby granted, free of charge, to any person obtaining a
# copy of this software and associated documentation files (the "Software"),
# to deal in the Software without restriction, including without limitation
# the rights to use, copy, modify, merge, publish, distribute, sublicense,
# and/or sell copies of the Software, and to permit persons to whom the
# Software is furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
# IN THE SOFTWARE.


import sys, re, io, struct

assert sys.hexversion >= 0x3050300, "Invalid Python version, must be 3.5.3+"

tokens1MSX = [
    "", "END", "FOR", "NEXT", "DATA", "INPUT", "DIM", "READ", "LET",
    "GOTO", "RUN", "IF", "RESTORE", "GOSUB", "RETURN", "REM", "STOP",
    "PRINT", "CLEAR", "LIST", "NEW", "ON", "WAIT", "DEF", "POKE", "CONT",
    "CSAVE", "CLOAD", "OUT", "LPRINT", "LLIST", "CLS", "WIDTH", "ELSE",
    "TRON", "TROFF", "SWAP", "ERASE", "ERROR", "RESUME", "DELETE",
    "AUTO", "RENUM", "DEFSTR", "DEFINT", "DEFSNG", "DEFDBL", "LINE",
    "OPEN", "FIELD", "GET", "PUT", "CLOSE", "LOAD", "MERGE", "FILES",
    "LSET", "RSET", "SAVE", "LFILES", "CIRCLE", "COLOR", "DRAW", "PAINT",
    "BEEP", "PLAY", "PSET", "PRESET", "SOUND", "SCREEN", "VPOKE",
    "SPRITE", "VDP", "BASE", "CALL", "TIME", "KEY", "MAX", "MOTOR",
    "BLOAD", "BSAVE", "DSKO$", "SET", "NAME", "KILL", "IPL", "COPY", "CMD",
    "LOCATE", "TO", "THEN", "TAB(", "STEP", "USR", "FN", "SPC(", "NOT",
    "ERL", "ERR", "STRING$", "USING", "INSTR", "'", "VARPTR", "CSRLIN",
    "ATTR$", "DSKI$", "OFF", "INKEY$", "POINT", ">", "=", "<", "+", "-", "*",
    "/", "^", "AND", "OR", "XOR", "EQV", "IMP", "MOD", "\\", "", "", "",
]

tokens2 = [
    "", "LEFT$", "RIGHT$", "MID$", "SGN", "INT", "ABS", "SQR", "RND", "SIN",
    "LOG", "EXP", "COS", "TAN", "ATN", "FRE", "INP", "POS", "LEN", "STR$",
    "VAL", "ASC", "CHR$", "PEEK", "VPEEK", "SPACE$", "OCT$", "HEX$",
    "LPOS", "BIN$", "CINT", "CSNG", "CDBL", "FIX", "STICK", "STRIG", "PDL",
    "PAD", "DSKF", "FPOS", "CVI", "CVS", "CVD", "EOF", "LOC", "LOF", "MKI$",
    "MKS$", "MKD$", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "",
    "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "",
    "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "",
    "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "",
]

trans = (
    '\x00\x01\x02\x03\x04\x05\x06\x07\x08\x09\x0A\x0B\x0C\x0D\x0E\x0F'
    '\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1A\x1B\x1C\x1D\x1E\x1F'
    ' !"#$%&\'()*+,-./0123456789:;<=>?'
    '@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_'
    '`abcdefghijklmnopqrstuvwxyz{|}~\x7F'
    'ÇüéâäàåçêëèïîìÄÅÉæÆôöòûùÿÖÜ¢£¥₧ƒ'
    'áíóúñѪº¿⌐¬½¼¡«»ÃãĨĩÕõŨũIJij¾∽◊‰¶§'
    '▂▚▆🮂▬🮅▎▞▊🮇🮊🮘🮙🭭🭯🭬🭮🮚🮛▘▗▝▖🮖Δ‡ω█▄▌▐▀'
    'αßΓπΣσµτΦΘΩδ∞φε∩≡±≥≤⌠⌡÷≈°∙·√ⁿ²■█'
)

# Token parsing modes; they affect how integers are parsed
ModeFloat = 0
ModeUint = 1
ModeLiteral = 2

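# For example, after GOTO the tokenizer switches to ModeUint so a number is
# stored as a line-number operand (0x0E prefix) rather than as a float, and
# ModeLiteral (used inside identifiers such as A1) disables number matching
# altogether.
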
# Create charset translation tables from the above
msx_s2b = {}
msx_b2s = {}

for k, v in enumerate(trans):
    b = bytes((k,))
    msx_s2b[v] = b
    msx_b2s[b] = v

def encode(s):
    return b''.join([msx_s2b[c] for c in s])
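
# A minimal sketch (not part of the original file) of the inverse mapping,
# handy when inspecting tokenized output; it simply walks the msx_b2s table
# built above:
#
#   def decode(b):
#       return ''.join([msx_b2s[b[i:i+1]] for i in range(len(b))])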

# Exception for the BASIC errors
class BasicError(Exception):
    pass


def num2bytes(s, tok_mode):
    """Translate a decimal number literal to its tokenized form"""

    assert tok_mode != ModeLiteral
    assert b'e' not in s, "Lower-case exponent in number"

    # Remove embedded spaces and leading zeros
    s = s.replace(b' ', b'')
    assert s
    s = s.lstrip(b'0')
    if s == b'':
        s = b'0'  # don't strip the last character

    # Unsigned int is simple
    if tok_mode == ModeUint:
        return struct.pack('<BH', 0x0E, int(s))

    # Find suffix
    suffix = s[-1:]
    if suffix in {b'%', b'!', b'#'}:
        s = s[:-1]
    else:
        suffix = b''

    # Remove point and exponent after taking note of their position and value
    point = s.find(b'.')
    s = s.replace(b'.', b'')
    ep = s.upper().find(b'E')  # exponent pointer

    exp = 0  # exponent
    has_exp = ep != -1
    if not has_exp:
        ep = len(s)
    else:
        tmp = s[ep + 1:]
        s = s[:ep]
        if tmp not in {b'', b'+', b'-'}:
            exp = int(tmp, 10)

    has_point = point != -1
    if point == -1:
        point = ep
    exp += point + 0x40  # apply bias

    # Remove zeros after the decimal point and subtract the count from exp
    orig_len = len(s)
    s = s.lstrip(b'0')
    exp -= orig_len - len(s)
    point -= orig_len - len(s)
    # Pad with zeros to the right
    s += b'0' * (14 - len(s))

    # Calculate the integer value for the case of % or no point/exp
    nint = 0
    if point > 0:
        # Perform rounding of the 14th decimal
        nint = int(s[:point])
        if point <= 14 and (s[point:14] == b'9' * (14 - point)
                            and s[14:15] >= b'5'):
            nint += 1

    if suffix == b'%' or (not has_point and not has_exp and suffix == b''
                          and 0 <= nint <= 32767):
        if not (0 <= nint <= 32767):
            raise BasicError("Overflow")
        if nint < 10:
            return struct.pack('<B', 0x11 + nint)
        if nint < 256:
            return struct.pack('<BB', 0x0F, nint)
        return struct.pack('<Bh', 0x1C, nint)

    n = int(s[:14])
    if suffix == b'!' or (n % 100000000 == 0 and suffix != b'#'):
        # Handle as single-precision
        n = (n + 50000000) // 100000000

        if n > 999999:
            # Replicate a bug in MS's tokenizer where 9.999995! is tokenized
            # as 1! (uncomment to fix the bug)
            #exp += 1
            n //= 10
        assert 0 <= n <= 999999
        if n == 0: exp = 0
        if exp > 127 or exp < 0:
            raise BasicError("Overflow")
        return struct.pack('>BL', 0x1D, int(str(n), 16) + exp * 0x1000000)

    # Double-precision: round the 15th digit
    if s[14:15] >= b'5':
        n += 1
        if n > 99999999999999:
            exp += 1
            n //= 10
    assert 0 <= n <= 99999999999999
    if n == 0: exp = 0
    if exp > 127 or exp < 0:
        raise BasicError("Overflow")
    return struct.pack('>BLL', 0x1F,
                       int(str(n // 100000000), 16) + exp * 0x1000000,
                       int(str(n % 100000000), 16))
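
# Worked examples (illustrative, hand-derived from the code above):
#   num2bytes(b'10', ModeFloat)  -> b'\x0f\x0a'  (8-bit integer, prefix 0x0F)
#   num2bytes(b'1.5', ModeFloat) -> b'\x1d\x41\x15\x00\x00'
# In the second case, 0x1D marks a single-precision constant, 0x41 is the
# exponent 1 plus the 0x40 bias, and the mantissa digits 150000 end up packed
# as BCD nibbles thanks to the int(str(n), 16) reinterpretation trick.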


def tokenize(src, use_cr=False, type_mode=False, remove_spaces=False):
    """Options:

    type_mode: True if it should behave like typing in a program; False if
        it should behave like LOAD. When True:
        - Spaces at EOL are removed.
        - Entering line numbers >= 65530 raises Syntax Error.
    use_cr: True to split lines at CR and ignore LF (like the real thing),
        False to split lines at LF and ignore CR (multiplatform).
    remove_spaces: True to "minify" the lines by removing all spaces between
        tokens.
    """

    src = encode(src)

    LeadingASCII = re.compile(b'^[A-Z]+')
    tok_translate = {}
    for k, v in enumerate(tokens1MSX):
        v = encode(v)
        assert k < 128 and v not in tok_translate
        if v != b'':
            tok_translate[v] = bytes((k + 128,))

    for k, v in enumerate(tokens2):
        v = encode(v)
        assert k < 128 and v not in tok_translate
        if v != b'':
            tok_translate[v] = bytes((255, k + 128))

    # Special encoding of ' as :REM'
    tok_translate[b"'"] = b':' + tok_translate[b'REM'] + tok_translate[b"'"]
    # Special encoding of ELSE as :ELSE
    tok_translate[b'ELSE'] = b':' + tok_translate[b'ELSE']
    # ? is parsed as PRINT
    tok_translate[b'?'] = tok_translate[b'PRINT']
    # GOTO admits the spelling GO TO (one space), but not e.g. GO  TO with
    # more than one. GOSUB is not affected.
    tok_translate[b'GO TO'] = tok_translate[b'GOTO']
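    # Illustrative entries (derived from the tables above): b'PRINT' maps to
    # the one-byte token b'\x91', while two-byte function tokens such as
    # b'LEFT$' map to b'\xff\x81'.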
    # Create list of keywords sorted by longest, escaped
    L = []
    for v in tokens1MSX + tokens2 + ['GO TO']:
        v = encode(v)
        if LeadingASCII.search(v):
            L.append(v)

    L.sort(key=len, reverse=True)  # longest match first
    # Escape for RE
    L = [re.escape(v) for v in L]

    # Tokenizer for decimal numbers (floats).
    decimal = (
        br'|(?P<dec>(?:'
        # Cautiously avoid matching 1 space alone or 1 trailing space.
        br'(?:[0-9][0-9\ ]*)?\.(?:[0-9\ ]*[0-9])?'  # at least a dot
        br'|(?:[0-9](?:[0-9\ ]*[0-9])?)'  # or at least a digit
        br')'  # not optional
        br'(?:'
        br'\ *[%!\#]'  # suffix
        br'|\ *E\ *[-+]?(?:[0-9\ ]*[0-9])?'  # or exponent, but not both
        br')?)'  # optional
    )
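    # Examples the pattern accepts (MSX allows spaces inside numbers):
    # b'1 000', b'.5', b'1E-3', b'25!'; the suffix and exponent branches are
    # alternatives, so a literal can't carry both.
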
    # Tokenizer for uints: with a prefix up to 6552, read up to 5 digits;
    # from 6553 on, read up to 4, keeping the value below 65530.
    # Used for line numbers, either leading or after GOTO/GOSUB/etc.
    uint = (br'|(?P<dec>'
            br'(?:0[0\ ]*)?'  # leading zeros prefix
            br'(?:0'  # zero
            br'|[1-5](?:\ *[0-9]){4}'  # prefix 1..5, 5 digits
            br'|6\ *[0-4](?:\ *[0-9]){3}'  # prefix 60..64, 5 digits
            br'|6\ *5\ *[0-4](?:\ *[0-9]){2}'  # prefix 650..654, 5 digits
            br'|6\ *5\ *5\ *[0-2](?:\ *[0-9])?'  # prefix 6550..6552, 5 digits
            br'|[1-9](?:\ *[0-9]){,3}'  # rest, 1 to 4 digits
            br'))'
    )
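    # Illustrative behaviour (follows from the pattern above): b'65529'
    # matches whole, while b'65530' only matches b'6553' and leaves the final
    # b'0' in the stream, which mirrors how the MSX editor reads line numbers.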

    tokenizer = (br'(?:'
        br"(?P<rem>(?:REM|').*)"  # comment
        br'|(?P<data>(?:DATA)(?:"[^"]*(?:"|$)|[^:])*)'  # data
        br'|(?P<call>(?:CALL|_)[^:(]*)'  # call
        br'|(?P<kw>' + br'|'.join(L) + br')'  # keywords
        br'|[A-Z](?:\ *[0-9.])*'  # identifier
        br'|&H(?P<hex>[0-9A-F]*)'  # hex number
        br'|&O(?P<oct>[0-7]*)'  # octal number
        #br'|&B(?P<bin>[01]+)'  # binary numbers don't have tokens
        br'%s'  # decimal number
        br'|(?P<str>"(?:[^"]*)(?:"|$))'  # string literal
        b'|(?P<del>[\x80-\xFF])'  # remove those
        br'|.'
        br')'
    )

    tokenizer_uint = re.compile(tokenizer % uint, re.I)
    tokenizer_float = re.compile(tokenizer % decimal, re.I)
    tokenizer_literal = re.compile(tokenizer % b'|(?P<dec>(?!))', re.I)

    del L, uint, decimal, LeadingASCII, tokenizer

    # Control character behaviour:
    # \x00: Terminates the line prematurely (ignores up to the next \x0D).
    # \x01-\x08: Skip it and the next token.
    # \x09: Inserted verbatim, finishes the token.
    # \x0A: Ignored (skipped).
    # \x0B-\x0C: Replaced by spaces.
    # \x0D: EOL.
    # \x0E-\x0F, \x11-\x19, \x1B-\x1F: Inserted verbatim.
    # \x10: Undefined effects. Two effects we got:
    #       - Hang the machine.
    #       - Delete last byte and insert current line's start address.
    # \x1A: EOF. Stops further processing.
    # \x7F: Unknown.

    leadingdigit = re.compile(br'[\ \x01-\x1F\x80-\xFF]*(?:([0-9]?))')
    trunc_at_null = re.compile(br'\x00.*', re.S)
    call_strip = re.compile(br'[^0-\x7F\ (]*')

    # Truncate source at \x1A (^Z)
    src = re.sub(b'\x1A.*', b'', src, flags=re.S)
    if use_cr:
        # realistic
        src = src.split(b'\r')
        ignore = b'\n'
    else:
        # practical
        src = src.split(b'\n')
        ignore = b'\r'

    # First pass: Read the lines and tokenize them into a dict with the line
    # number as the key. Handle line deletion, overwriting, etc.
    PRGLines = {}
    for line in src:
        line = trunc_at_null.sub(b'', line.replace(ignore, b'').lstrip(b' '))
        if type_mode: line = line.rstrip(b' ')
        if line == b'':
            continue

        g = tokenizer_uint.match(line)
        # Error out if the line does not start with a number
        if not g.group('dec'):
            # we can't execute BASIC statements even in typing mode
            raise BasicError("Direct statement in file")

        p = g.end()
        linenum = int(g.group().replace(b' ', b''))
        if linenum != 0:
            # The first space is not removed for line 0
            if line[p:p+1] == b' ':
                p += 1

        g = leadingdigit.match(line, p)
        if type_mode:
            # Entering e.g.
            #   65530
            # alone raises Syntax error. In LOAD mode, it enters 6553 0
            if g.group(1): raise BasicError("Syntax error")
        if not g.group(1) and g.end(0) == len(line):
            # Delete line
            if linenum not in PRGLines:
                raise BasicError("Undefined line number")
            del PRGLines[linenum]
            continue

        lbuf = io.BytesIO()  # Tokenized line buffer
        tok_mode = ModeFloat

        while True:
            nextch = line[p:p+1]
            if tok_mode == ModeUint and (nextch == b':'
                                         or b'A' <= nextch <= b'Z'
                                         or b'a' <= nextch <= b'z'):
                tok_mode = ModeFloat

            if p == len(line): break

            tokenizer = tokenizer_float
            if tok_mode == ModeUint:
                tokenizer = tokenizer_uint
            elif tok_mode == ModeLiteral:
                tokenizer = tokenizer_literal
            g = tokenizer.match(line, p)

            assert g, "No match in tokenizer for %r" % line[p:p+1]
            match = g.group().upper()
            p = g.end()

            # Handle control chars... somewhat
            if match in {b'\x01', b'\x02', b'\x03', b'\x04', b'\x05', b'\x06',
                         b'\x07', b'\x08'}:
                # Eat the next token too (guarding against \x01-\x08 at EOL,
                # where there is no next token to match)
                g = tokenizer.match(line, p)
                if g: p = g.end()
                continue

            if tok_mode != ModeUint and match in {b'\x0B', b'\x0C', b'\x0E',
                    b'\x0F', b'\x10', b'\x11', b'\x12', b'\x13', b'\x14',
                    b'\x15', b'\x16', b'\x17', b'\x18', b'\x19', b'\x1B',
                    b'\x1C', b'\x1D', b'\x1E', b'\x1F'}:
                lbuf.write(b' ')
                tok_mode = ModeFloat
                continue

            # Handle token types
            if g.group('rem'):
                if g.group().startswith(b"'"):
                    lbuf.write(tok_translate[b"'"])
                    lbuf.write(g.group()[1:])
                else:
                    lbuf.write(tok_translate[b'REM'])
                    lbuf.write(g.group()[3:])
            elif g.group('data'):
                lbuf.write(tok_translate[b'DATA'])
                lbuf.write(g.group()[4:])
            elif g.group('call'):
                match = call_strip.sub(b'', match)
                if match.startswith(b'_'):
                    lbuf.write(match)
                else:  # starts with 'CALL'
                    lbuf.write(tok_translate[b'CALL'])
                    lbuf.write(match[4:])
            elif g.group('kw'):
                # keyword
                if match in {b'GOTO', b'GO TO', b'GOSUB', b'THEN', b'ELSE',
                        b'LIST', b'DELETE', b'AUTO', b'LLIST', b'RENUM',
                        b'RESTORE', b'RESUME'}:
                    tok_mode = ModeUint
                else:
                    tok_mode = ModeFloat
                lbuf.write(tok_translate[match])
            elif g.group('hex') is not None:
                # hex literal
                lbuf.write(b'\x0C')
                num = int(g.group('hex'), 16) if g.group('hex') != b'' else 0
                if num > 65535:
                    raise BasicError("Overflow")
                lbuf.write(struct.pack('<H', num))
            elif g.group('oct') is not None:
                # oct literal
                lbuf.write(b'\x0B')
                num = int(g.group('oct'), 8) if g.group('oct') != b'' else 0
                if num > 65535:
                    raise BasicError("Overflow")
                lbuf.write(struct.pack('<H', num))
            elif g.group('dec'):
                # dec literal
                lbuf.write(num2bytes(match, tok_mode))
            elif g.group('str'):
                # string
                lbuf.write(g.group())
            elif g.group('del'):
                # characters > 127 aren't written
                pass
            else:
                if len(match) == 1 and (b'A' <= match <= b'Z'
                                        or b'0' <= match <= b'9'):
                    tok_mode = ModeLiteral  # for identifiers like A1
                elif tok_mode == ModeLiteral:
                    tok_mode = ModeFloat  # revert to float mode otherwise
                if remove_spaces and match == b' ':
                    continue
                if match in tok_translate:
                    # some symbols need to be translated to tokens for some
                    # reason
                    lbuf.write(tok_translate[match])
                else:
                    lbuf.write(match)

        lbuf.write(b'\0')

        PRGLines[linenum] = lbuf.getvalue()

    # Second pass - Write remaining lines in order
    addr = 0x8001
    buf = io.BytesIO()

    for linenum in sorted(PRGLines.keys()):
        line = PRGLines[linenum]
        addr += len(line) + 4
        buf.write(struct.pack('<HH', addr, linenum))
        buf.write(line)

    buf.write(b'\0\0')

    return buf.getvalue()
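
# Illustrative example (hand-checked against the code above): tokenizing the
# one-line program
#   10 PRINT "HI"
# stores, for the line, the next-line address and the line number as
# little-endian words, then the PRINT token (0x91), the rest verbatim and a
# NUL terminator; a final zero word ends the program:
#   tokenize('10 PRINT "HI"\n') -> b'\x0c\x80\x0a\x00\x91 "HI"\x00\x00\x00'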


def main():
    f = open(sys.argv[1], 'r', newline='')
    try:
        lines = f.read()
    finally:
        f.close()
    out = tokenize(lines, use_cr=len(sys.argv) > 3 and sys.argv[3] == '1')
    f = open(sys.argv[2], 'wb')
    try:
        f.write(b'\xFF' + out)
    finally:
        f.close()

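# Usage sketch (inferred from main() above; argument names are mine):
#   python3 <this script> <input text file> <output file> [use_cr]
# A third argument of '1' splits input lines at CR like a real MSX; the
# output is prefixed with the 0xFF tokenized-BASIC header byte.
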
if __name__ == '__main__':
    main()