from __future__ import absolute_import, division, print_function, unicode_literals

import logging
import os

import ply.lex as lex

_MYPY = False
if _MYPY:
    import typing  # noqa: F401 # pylint: disable=import-error,unused-import,useless-suppression


class MultiToken(object):
    """Object used to monkeypatch ply.lex so that we can return multiple
    tokens from one lex operation."""

    def __init__(self, tokens):
        self.type = tokens[0].type
        self.tokens = tokens


# Represents a null value. We want to differentiate between the Python "None"
# and null in several places.
NullToken = object()

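# A minimal sketch of how MultiToken travels through the lexer; the
# two-DEDENT example below is hypothetical. Lexer.token() (defined further
# down) drains the wrapped tokens from its queue one at a time:
#
#     multi = MultiToken([_create_token('DEDENT', '\t', 3, 40),
#                         _create_token('DEDENT', '\t', 3, 40)])
#     multi.type    # 'DEDENT' -- mirrors the first token's type
#     multi.tokens  # two LexTokens, handed back one per token() call

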
class Lexer(object):
    """
    Lexer. Tokenizes stone files.
    """

    states = (
        ('WSIGNORE', 'inclusive'),
    )

    def __init__(self):
        self.lex = None
        self.tokens_queue = None
        # The current indentation "level" rather than a count of spaces.
        self.cur_indent = None
        self._logger = logging.getLogger('stone.stone.lexer')
        self.last_token = None
        # [(error message, line number), ...]
        self.errors = []

    def input(self, file_data, **kwargs):
        """
        Required by ply.yacc for this to quack (duck typing) like a ply lexer.

        :param str file_data: Contents of the file to lex.
        """
        self.lex = lex.lex(module=self, **kwargs)
        self.tokens_queue = []
        self.cur_indent = 0
        # Hack to avoid tokenization bugs caused by files that do not end in a
        # new line.
        self.lex.input(file_data + '\n')

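    # Typical driver sequence (illustrative sketch, not executed here; this
    # is how ply.yacc consumes the lexer via input()/token()):
    #
    #     lexer = Lexer()
    #     lexer.input('namespace files\n')
    #     while True:
    #         tok = lexer.token()
    #         if tok is None:
    #             break  # token stream exhausted
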
    def token(self):
        """
        Returns the next LexToken. Returns None when all tokens have been
        exhausted.
        """
        if self.tokens_queue:
            self.last_token = self.tokens_queue.pop(0)
        else:
            r = self.lex.token()
            if isinstance(r, MultiToken):
                self.tokens_queue.extend(r.tokens)
                self.last_token = self.tokens_queue.pop(0)
            else:
                if r is None and self.cur_indent > 0:
                    if (self.last_token and
                            self.last_token.type not in ('NEWLINE', 'LINE')):
                        newline_token = _create_token(
                            'NEWLINE', '\n', self.lex.lineno, self.lex.lexpos)
                        self.tokens_queue.append(newline_token)
                    dedent_count = self.cur_indent
                    dedent_token = _create_token(
                        'DEDENT', '\t', self.lex.lineno, self.lex.lexpos)
                    self.tokens_queue.extend([dedent_token] * dedent_count)

                    self.cur_indent = 0
                    self.last_token = self.tokens_queue.pop(0)
                else:
                    self.last_token = r
        return self.last_token

    def test(self, data):
        """Logs all tokens for human inspection. Useful for debugging."""
        self.input(data)
        while True:
            token = self.token()
            if not token:
                break
            self._logger.debug('Token %r', token)

    # List of token names
    tokens = (
        'ID',
        'KEYWORD',
        'PATH',
        'DOT',
    )  # type: typing.Tuple[typing.Text, ...]

    # Whitespace tokens
    tokens += (
        'DEDENT',
        'INDENT',
        'NEWLINE',
    )

    # Attribute lists, aliases
    tokens += (
        'COMMA',
        'EQ',
        'LPAR',
        'RPAR',
    )

    # Primitive types
    tokens += (
        'BOOLEAN',
        'FLOAT',
        'INTEGER',
        'NULL',
        'STRING',
    )

    # List notation
    tokens += (
        'LBRACKET',
        'RBRACKET',
    )

    # Map notation
    tokens += (
        'LBRACE',
        'RBRACE',
        'COLON',
    )

    # Nullable-type marker ('?')
    tokens += (
        'Q',
    )

    # Annotation notation
    tokens += (
        'AT',
    )

    # Regular expression rules for simple tokens
    t_DOT = r'\.'
    t_LBRACKET = r'\['
    t_RBRACKET = r'\]'
    t_EQ = r'='
    t_COMMA = r','
    t_Q = r'\?'
    t_LBRACE = r'\{'
    t_RBRACE = r'\}'
    t_COLON = r'\:'
    t_AT = r'@'

    # TODO(kelkabany): Use scoped/conditional lexing to restrict where keywords
    # are identified as such.
    KEYWORDS = [
        'alias',
        'annotation',
        'annotation_type',
        'attrs',
        'by',
        'deprecated',
        'doc',
        'example',
        'error',
        'extends',
        'import',
        'namespace',
        'patch',
        'route',
        'struct',
        'union',
        'union_closed',
    ]

    RESERVED = {
        'annotation': 'ANNOTATION',
        'annotation_type': 'ANNOTATION_TYPE',
        'attrs': 'ATTRS',
        'deprecated': 'DEPRECATED',
        'by': 'BY',
        'extends': 'EXTENDS',
        'import': 'IMPORT',
        'patch': 'PATCH',
        'route': 'ROUTE',
        'struct': 'STRUCT',
        'union': 'UNION',
        'union_closed': 'UNION_CLOSED',
    }

    tokens += tuple(RESERVED.values())

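    # How t_ANY_ID (below) classifies an identifier, in brief:
    #
    #     'struct'  -> token type STRUCT   (keyword with a RESERVED entry)
    #     'alias'   -> token type KEYWORD  (keyword without a RESERVED entry)
    #     'MyType'  -> token type ID       (not a keyword at all)
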
    # Parenthesized spans switch to the WSIGNORE state so that newlines and
    # indentation inside them are not significant.
    def t_LPAR(self, token):
        r'\('
        token.lexer.push_state('WSIGNORE')
        return token

    def t_RPAR(self, token):
        r'\)'
        token.lexer.pop_state()
        return token

    def t_ANY_BOOLEAN(self, token):
        r'\btrue\b|\bfalse\b'
        token.value = (token.value == 'true')
        return token

    def t_ANY_NULL(self, token):
        r'\bnull\b'
        token.value = NullToken
        return token

    # No leading digits
    def t_ANY_ID(self, token):
        r'[a-zA-Z_][a-zA-Z0-9_-]*'
        if token.value in self.KEYWORDS:
            if (token.value == 'annotation_type') and self.cur_indent:
                # annotation_type was added as a reserved keyword relatively
                # late, when there could be identifiers with the same name
                # in existing specs. Because annotation_type-the-keyword can
                # only be used at the beginning of a non-indented line, this
                # check lets both the keyword and the identifier coexist and
                # maintains backward compatibility.
                # Note: this is kind of a hack, and we should get rid of it if
                # the lexer gets better at telling keywords from identifiers
                # in general.
                return token
            token.type = self.RESERVED.get(token.value, 'KEYWORD')
            return token
        else:
            return token

    def t_ANY_PATH(self, token):
        r'\/[/a-zA-Z0-9_-]*'
        return token

    def t_ANY_FLOAT(self, token):
        r'-?\d+(\.\d*(e-?\d+)?|e-?\d+)'
        token.value = float(token.value)
        return token

    def t_ANY_INTEGER(self, token):
        r'-?\d+'
        token.value = int(token.value)
        return token

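    # Numeric literal classification (sketch). t_ANY_FLOAT is defined before
    # t_ANY_INTEGER, so ply tries it first and a number only becomes an
    # INTEGER when the FLOAT rule cannot match:
    #
    #     '-3.14'  -> FLOAT    -3.14
    #     '2e-5'   -> FLOAT     2e-05
    #     '42'     -> INTEGER   42
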
    # Read in a string while respecting the following escape sequences:
    # \", \\, \n, and \t.
    def t_ANY_STRING(self, t):
        r'\"([^\\"]|(\\.))*\"'
        escaped = False
        t.lexer.lineno += t.value.count('\n')
        s = t.value[1:-1]
        new_str = ""
        for c in s:
            if escaped:
                if c == 'n':
                    c = '\n'
                elif c == 't':
                    c = '\t'
                new_str += c
                escaped = False
            elif c == '\\':
                escaped = True
            else:
                new_str += c
        # remove current indentation
        indentation_str = ' ' * _indent_level_to_spaces_count(self.cur_indent)
        lines_without_indentation = [
            line.replace(indentation_str, '', 1)
            for line in new_str.splitlines()]
        t.value = '\n'.join(lines_without_indentation)
        return t

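    # Escape handling example (sketch): a spec containing the literal
    #
    #     "first\tcol\nsecond line"
    #
    # lexes to a STRING token whose value holds a real tab and a real
    # newline. For multi-line literals, leading spaces matching the current
    # indent level are also stripped from each line.
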
    # Ignore comments.
    # There are two types of comments.
    # 1. Comments that take up a full line. These lines are ignored entirely.
    # 2. Comments that come after tokens in the same line. These comments
    #    are ignored, but we still need to emit a NEWLINE since this rule
    #    takes all trailing newlines.
    # Regardless of comment type, the following line must be checked for a
    # DEDENT or INDENT.
    def t_INITIAL_comment(self, token):
        r'[#][^\n]*\n+'
        token.lexer.lineno += token.value.count('\n')
        # Scan backwards from the comment hash to figure out which type of
        # comment this is. If we find a non-ws character, we know it was a
        # partial line. But if we find a newline before a non-ws character,
        # then we know the entire line was a comment.
        i = token.lexpos - 1
        while i >= 0:
            is_full_line_comment = token.lexer.lexdata[i] == '\n'
            is_partial_line_comment = (not is_full_line_comment and
                                       token.lexer.lexdata[i] != ' ')
            if is_full_line_comment or is_partial_line_comment:
                newline_token = _create_token(
                    'NEWLINE', '\n', token.lineno,
                    token.lexpos + len(token.value) - 1)
                newline_token.lexer = token.lexer
                dent_tokens = self._create_tokens_for_next_line_dent(
                    newline_token)
                if is_full_line_comment:
                    # Comment takes the full line so ignore entirely.
                    return dent_tokens
                elif is_partial_line_comment:
                    # Comment is only a partial line. Preserve newline token.
                    if dent_tokens:
                        dent_tokens.tokens.insert(0, newline_token)
                        return dent_tokens
                    else:
                        return newline_token
            i -= 1

    def t_WSIGNORE_comment(self, token):
        r'[#][^\n]*\n+'
        token.lexer.lineno += token.value.count('\n')
        newline_token = _create_token(
            'NEWLINE', '\n', token.lineno,
            token.lexpos + len(token.value) - 1)
        newline_token.lexer = token.lexer
        self._check_for_indent(newline_token)

    # Define a rule so we can track line numbers
    def t_INITIAL_NEWLINE(self, newline_token):
        r'\n+'
        newline_token.lexer.lineno += newline_token.value.count('\n')
        dent_tokens = self._create_tokens_for_next_line_dent(newline_token)
        if dent_tokens:
            dent_tokens.tokens.insert(0, newline_token)
            return dent_tokens
        else:
            return newline_token

    def t_WSIGNORE_NEWLINE(self, newline_token):
        r'\n+'
        newline_token.lexer.lineno += newline_token.value.count('\n')
        self._check_for_indent(newline_token)

    def _create_tokens_for_next_line_dent(self, newline_token):
        """
        Starting from a newline token that isn't followed by another newline
        token, returns any indent or dedent tokens that immediately follow.
        If indentation doesn't change, returns None.
        """
        indent_delta = self._get_next_line_indent_delta(newline_token)
        if indent_delta is None or indent_delta == 0:
            # Next line's indent isn't relevant OR there was no change in
            # indentation.
            return None

        dent_type = 'INDENT' if indent_delta > 0 else 'DEDENT'
        dent_token = _create_token(
            dent_type, '\t', newline_token.lineno + 1,
            newline_token.lexpos + len(newline_token.value))

        tokens = [dent_token] * abs(indent_delta)
        self.cur_indent += indent_delta
        return MultiToken(tokens)

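    # Dent emission example (sketch): with cur_indent == 2, a next line
    # indented by 0 spaces yields a delta of -2, so this returns
    # MultiToken([DEDENT, DEDENT]) and cur_indent drops to 0.
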
    def _check_for_indent(self, newline_token):
        """
        Checks that the line following a newline is indented, otherwise a
        parsing error is generated.
        """
        indent_delta = self._get_next_line_indent_delta(newline_token)
        if indent_delta is None or indent_delta == 1:
            # Next line's indent isn't relevant (e.g. it's a comment) OR
            # next line is correctly indented.
            return None
        else:
            self.errors.append(
                ('Line continuation must increment indent by 1.',
                 newline_token.lexer.lineno))

    def _get_next_line_indent_delta(self, newline_token):
        """
        Returns the change in indentation. The return units are in
        indentations rather than spaces/tabs.

        If the next line's indent isn't relevant (e.g. it's a comment),
        returns None. Since the return value might be 0, the caller should
        explicitly compare the result against None rather than rely on
        truthiness.
        """
        assert newline_token.type == 'NEWLINE', \
            'Can only search for a dent starting from a newline.'
        next_line_pos = newline_token.lexpos + len(newline_token.value)
        if next_line_pos == len(newline_token.lexer.lexdata):
            # Reached end of file
            return None

        line = newline_token.lexer.lexdata[next_line_pos:].split(os.linesep, 1)[0]
        if not line:
            return None
        lstripped_line = line.lstrip()
        lstripped_line_length = len(lstripped_line)
        if lstripped_line_length == 0:
            # If the next line is composed of only spaces, ignore indentation.
            return None
        if lstripped_line[0] == '#':
            # If it's a comment line, ignore indentation.
            return None

        indent = len(line) - lstripped_line_length
        if indent % 4 > 0:
            self.errors.append(
                ('Indent is not divisible by 4.', newline_token.lexer.lineno))
            return None

        indent_delta = indent - _indent_level_to_spaces_count(self.cur_indent)
        return indent_delta // 4

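    # Worked example (sketch): with cur_indent == 1 (i.e. 4 spaces), a next
    # line indented by 12 spaces gives indent_delta = 12 - 4 = 8 spaces,
    # which this method reports as 8 // 4 = +2 indentation levels.
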
    # A string containing ignored characters (spaces and tabs)
    t_ignore = ' \t'

    # Error handling rule
    def t_ANY_error(self, token):
        self._logger.debug('Illegal character %r at line %d',
                           token.value[0], token.lexer.lineno)
        self.errors.append(
            # lstrip('u') drops the "u" prefix from Python 2 unicode reprs.
            ('Illegal character %s.' % repr(token.value[0]).lstrip('u'),
             token.lexer.lineno))
        token.lexer.skip(1)


def _create_token(token_type, value, lineno, lexpos):
    """
    Helper for creating ply.lex.LexToken objects. Unfortunately, LexToken
    does not have a constructor defined to make setting these values easy.
    """
    token = lex.LexToken()
    token.type = token_type
    token.value = value
    token.lineno = lineno
    token.lexpos = lexpos
    return token


def _indent_level_to_spaces_count(indent):
    # Stone uses 4-space indentation; convert an indent level to spaces.
    return indent * 4
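

# Minimal manual check, assuming ply is installed and this module is run
# directly. The spec snippet below is illustrative, not from a real spec.
if __name__ == '__main__':
    logging.basicConfig(level=logging.DEBUG)
    Lexer().test(
        'namespace files\n'
        '\n'
        'struct File\n'
        '    name String\n'
        '    size UInt64\n'
    )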