# microproduct/atmosphericDelay/ISCEApp/site-packages/stone/frontend/lexer.py

from __future__ import absolute_import, division, print_function, unicode_literals
import logging
import os

import ply.lex as lex
_MYPY = False
if _MYPY:
    import typing  # noqa: F401 # pylint: disable=import-error,unused-import,useless-suppression


class MultiToken(object):
    """Object used to monkeypatch ply.lex so that we can return multiple
    tokens from one lex operation."""

    def __init__(self, tokens):
        self.type = tokens[0].type
        self.tokens = tokens
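
    # For example, when a NEWLINE is followed by a change in indentation,
    # t_INITIAL_NEWLINE returns one MultiToken bundling the NEWLINE with one
    # or more INDENT/DEDENT tokens; Lexer.token() drains them from a queue.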


# Represents a null value. We want to differentiate between the Python "None"
# and null in several places.
NullToken = object()
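# Illustrative: a spec default written as `f String? = null` (hedged example;
# the exact spec syntax is defined by the parser) should carry NullToken as
# its value so "default is null" stays distinguishable from "no default"
# (Python None).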


class Lexer(object):
    """
    Lexer. Tokenizes stone files.
    """

    states = (
        ('WSIGNORE', 'inclusive'),
    )
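
    # The WSIGNORE state is entered in t_LPAR and left in t_RPAR below, so
    # that newlines inside parenthesized attribute lists are treated as line
    # continuations instead of emitting INDENT/DEDENT tokens.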

    def __init__(self):
        self.lex = None
        self.tokens_queue = None
        # The current indentation "level" rather than a count of spaces.
        self.cur_indent = None
        self._logger = logging.getLogger('stone.stone.lexer')
        self.last_token = None
        # [(message, line number), ...]
        self.errors = []

    def input(self, file_data, **kwargs):
        """
        Required by ply.yacc for this to quack (duck typing) like a ply lexer.

        :param str file_data: Contents of the file to lex.
        """
        self.lex = lex.lex(module=self, **kwargs)
        self.tokens_queue = []
        self.cur_indent = 0
        # Hack to avoid tokenization bugs caused by files that do not end in
        # a newline.
        self.lex.input(file_data + '\n')
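        # Note: because of the appended '\n', even input whose last line
        # lacks a trailing newline still ends its final token with a NEWLINE
        # before the end-of-file DEDENT logic in token() runs.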

    def token(self):
        """
        Returns the next LexToken. Returns None when all tokens have been
        exhausted.
        """
        if self.tokens_queue:
            self.last_token = self.tokens_queue.pop(0)
        else:
            r = self.lex.token()
            if isinstance(r, MultiToken):
                self.tokens_queue.extend(r.tokens)
                self.last_token = self.tokens_queue.pop(0)
            else:
                if r is None and self.cur_indent > 0:
                    if (self.last_token and
                            self.last_token.type not in ('NEWLINE', 'LINE')):
                        newline_token = _create_token(
                            'NEWLINE', '\n', self.lex.lineno, self.lex.lexpos)
                        self.tokens_queue.append(newline_token)
                    dedent_count = self.cur_indent
                    dedent_token = _create_token(
                        'DEDENT', '\t', self.lex.lineno, self.lex.lexpos)
                    self.tokens_queue.extend([dedent_token] * dedent_count)
                    self.cur_indent = 0
                    self.last_token = self.tokens_queue.pop(0)
                else:
                    self.last_token = r
        return self.last_token

    def test(self, data):
        """Logs all tokens for human inspection. Useful for debugging."""
        self.input(data)
        while True:
            token = self.token()
            if not token:
                break
            self._logger.debug('Token %r', token)

    # List of token names
    tokens = (
        'ID',
        'KEYWORD',
        'PATH',
        'DOT',
    )  # type: typing.Tuple[typing.Text, ...]

    # Whitespace tokens
    tokens += (
        'DEDENT',
        'INDENT',
        'NEWLINE',
    )

    # Attribute lists, aliases
    tokens += (
        'COMMA',
        'EQ',
        'LPAR',
        'RPAR',
    )

    # Primitive types
    tokens += (
        'BOOLEAN',
        'FLOAT',
        'INTEGER',
        'NULL',
        'STRING',
    )

    # List notation
    tokens += (
        'LBRACKET',
        'RBRACKET',
    )

    # Map notation
    tokens += (
        'LBRACE',
        'RBRACE',
        'COLON',
    )

    # Nullable notation ('?')
    tokens += (
        'Q',
    )

    # Annotation notation
    tokens += (
        'AT',
    )

    # Regular expression rules for simple tokens
    t_DOT = r'\.'
    t_LBRACKET = r'\['
    t_RBRACKET = r'\]'
    t_EQ = r'='
    t_COMMA = r','
    t_Q = r'\?'
    t_LBRACE = r'\{'
    t_RBRACE = r'\}'
    t_COLON = r'\:'
    t_AT = r'@'

    # TODO(kelkabany): Use scoped/conditional lexing to restrict where
    # keywords are identified as such.
    KEYWORDS = [
        'alias',
        'annotation',
        'annotation_type',
        'attrs',
        'by',
        'deprecated',
        'doc',
        'example',
        'error',
        'extends',
        'import',
        'namespace',
        'patch',
        'route',
        'struct',
        'union',
        'union_closed',
    ]

    RESERVED = {
        'annotation': 'ANNOTATION',
        'annotation_type': 'ANNOTATION_TYPE',
        'attrs': 'ATTRS',
        'by': 'BY',
        'deprecated': 'DEPRECATED',
        'extends': 'EXTENDS',
        'import': 'IMPORT',
        'patch': 'PATCH',
        'route': 'ROUTE',
        'struct': 'STRUCT',
        'union': 'UNION',
        'union_closed': 'UNION_CLOSED',
    }

    tokens += tuple(RESERVED.values())
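
    # Note: keywords listed in KEYWORDS but absent from RESERVED (e.g.
    # 'alias', 'doc', 'namespace') are emitted with the generic token type
    # 'KEYWORD'; only RESERVED entries get dedicated token types (see
    # t_ANY_ID below).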

    def t_LPAR(self, token):
        r'\('
        token.lexer.push_state('WSIGNORE')
        return token

    def t_RPAR(self, token):
        r'\)'
        token.lexer.pop_state()
        return token

    def t_ANY_BOOLEAN(self, token):
        r'\btrue\b|\bfalse\b'
        token.value = (token.value == 'true')
        return token

    def t_ANY_NULL(self, token):
        r'\bnull\b'
        token.value = NullToken
        return token

    # No leading digits allowed.
    def t_ANY_ID(self, token):
        r'[a-zA-Z_][a-zA-Z0-9_-]*'
        if token.value in self.KEYWORDS:
            if (token.value == 'annotation_type') and self.cur_indent:
                # annotation_type was added as a reserved keyword relatively
                # late, when there could already be identifiers with the same
                # name in existing specs. Because annotation_type-the-keyword
                # can only appear at the beginning of a non-indented line,
                # this check lets the keyword and the identifier coexist,
                # maintaining backward compatibility.
                # Note: this is a hack, and we should remove it if the lexer
                # gets better at telling keywords from identifiers in
                # general.
                return token
            token.type = self.RESERVED.get(token.value, 'KEYWORD')
            return token
        else:
            return token

    def t_ANY_PATH(self, token):
        r'\/[/a-zA-Z0-9_-]*'
        return token
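
    # PATH example (illustrative): '/files/list_folder' lexes as a single
    # PATH token; after the leading '/', the regex permits '/', letters,
    # digits, '_', and '-'.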

    def t_ANY_FLOAT(self, token):
        r'-?\d+(\.\d*(e-?\d+)?|e-?\d+)'
        token.value = float(token.value)
        return token

    def t_ANY_INTEGER(self, token):
        r'-?\d+'
        token.value = int(token.value)
        return token
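
    # Rule order matters in ply: t_ANY_FLOAT is defined before t_ANY_INTEGER
    # so that '1.5' or '2e10' lexes as one FLOAT instead of an INTEGER with
    # trailing garbage. Note the regex requires a digit before the decimal
    # point, so '.5' is not a FLOAT.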

    # Read in a string while respecting the following escape sequences:
    # \", \\, \n, and \t.
    def t_ANY_STRING(self, t):
        r'\"([^\\"]|(\\.))*\"'
        escaped = False
        t.lexer.lineno += t.value.count('\n')
        s = t.value[1:-1]
        new_str = ""
        for c in s:
            if escaped:
                if c == 'n':
                    c = '\n'
                elif c == 't':
                    c = '\t'
                new_str += c
                escaped = False
            else:
                if c == '\\':
                    escaped = True
                else:
                    new_str += c
        # Remove the current indentation from each line of the string.
        indentation_str = ' ' * _indent_level_to_spaces_count(self.cur_indent)
        lines_without_indentation = [
            line.replace(indentation_str, '', 1)
            for line in new_str.splitlines()]
        t.value = '\n'.join(lines_without_indentation)
        return t
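
    # Example (illustrative): the raw spec text "Line1\nLine2" lexes to a
    # value containing a real newline, while an unrecognized escape such as
    # \x drops the backslash and keeps the escaped character ('x').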

    # Ignore comments. There are two types of comments:
    # 1. Comments that take up a full line. These lines are ignored entirely.
    # 2. Comments that come after tokens on the same line. These comments
    #    are ignored, but we still need to emit a NEWLINE since this rule
    #    consumes all trailing newlines.
    # Regardless of comment type, the following line must be checked for a
    # DEDENT or INDENT.
    def t_INITIAL_comment(self, token):
        r'[#][^\n]*\n+'
        token.lexer.lineno += token.value.count('\n')
        # Scan backwards from the comment hash to figure out which type of
        # comment this is. If we find a non-whitespace character first, we
        # know it was a partial-line comment. But, if we find a newline
        # before a non-whitespace character, then the entire line was a
        # comment.
        i = token.lexpos - 1
        while i >= 0:
            is_full_line_comment = token.lexer.lexdata[i] == '\n'
            is_partial_line_comment = (not is_full_line_comment and
                                       token.lexer.lexdata[i] != ' ')
            if is_full_line_comment or is_partial_line_comment:
                newline_token = _create_token(
                    'NEWLINE', '\n', token.lineno,
                    token.lexpos + len(token.value) - 1)
                newline_token.lexer = token.lexer
                dent_tokens = self._create_tokens_for_next_line_dent(
                    newline_token)
                if is_full_line_comment:
                    # Comment takes the full line so ignore it entirely.
                    return dent_tokens
                elif is_partial_line_comment:
                    # Comment is only a partial line. Preserve newline token.
                    if dent_tokens:
                        dent_tokens.tokens.insert(0, newline_token)
                        return dent_tokens
                    else:
                        return newline_token
            i -= 1

    def t_WSIGNORE_comment(self, token):
        r'[#][^\n]*\n+'
        token.lexer.lineno += token.value.count('\n')
        newline_token = _create_token(
            'NEWLINE', '\n', token.lineno,
            token.lexpos + len(token.value) - 1)
        newline_token.lexer = token.lexer
        self._check_for_indent(newline_token)

    # Define a rule so we can track line numbers.
    def t_INITIAL_NEWLINE(self, newline_token):
        r'\n+'
        newline_token.lexer.lineno += newline_token.value.count('\n')
        dent_tokens = self._create_tokens_for_next_line_dent(newline_token)
        if dent_tokens:
            dent_tokens.tokens.insert(0, newline_token)
            return dent_tokens
        else:
            return newline_token

    def t_WSIGNORE_NEWLINE(self, newline_token):
        r'\n+'
        newline_token.lexer.lineno += newline_token.value.count('\n')
        self._check_for_indent(newline_token)

    def _create_tokens_for_next_line_dent(self, newline_token):
        """
        Starting from a newline token that isn't followed by another newline
        token, returns any indent or dedent tokens that immediately follow.
        If indentation doesn't change, returns None.
        """
        indent_delta = self._get_next_line_indent_delta(newline_token)
        if indent_delta is None or indent_delta == 0:
            # Next line's indent isn't relevant OR there was no change in
            # indentation.
            return None

        dent_type = 'INDENT' if indent_delta > 0 else 'DEDENT'
        dent_token = _create_token(
            dent_type, '\t', newline_token.lineno + 1,
            newline_token.lexpos + len(newline_token.value))

        tokens = [dent_token] * abs(indent_delta)
        self.cur_indent += indent_delta
        return MultiToken(tokens)
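
    # Example (illustrative): if cur_indent is 3 and the next line is
    # indented one level (4 spaces), indent_delta is -2, so a MultiToken
    # carrying two DEDENT tokens is returned and cur_indent drops to 1.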

    def _check_for_indent(self, newline_token):
        """
        Checks that the line following a newline is indented, otherwise a
        parsing error is generated.
        """
        indent_delta = self._get_next_line_indent_delta(newline_token)
        if indent_delta is None or indent_delta == 1:
            # Next line's indent isn't relevant (e.g. it's a comment) OR
            # the next line is correctly indented.
            return None
        else:
            self.errors.append(
                ('Line continuation must increment indent by 1.',
                 newline_token.lexer.lineno))

    def _get_next_line_indent_delta(self, newline_token):
        """
        Returns the change in indentation. The return units are in
        indentations rather than spaces/tabs.

        If the next line's indent isn't relevant (e.g. it's a comment),
        returns None. Since the return value might be 0, the caller should
        explicitly check the return type, rather than rely on truthiness.
        """
        assert newline_token.type == 'NEWLINE', \
            'Can only search for a dent starting from a newline.'
        next_line_pos = newline_token.lexpos + len(newline_token.value)
        if next_line_pos == len(newline_token.lexer.lexdata):
            # Reached end of file
            return None
        line = newline_token.lexer.lexdata[next_line_pos:].split(
            os.linesep, 1)[0]
        if not line:
            return None
        lstripped_line = line.lstrip()
        lstripped_line_length = len(lstripped_line)
        if lstripped_line_length == 0:
            # If the next line is composed of only spaces, ignore
            # indentation.
            return None
        if lstripped_line[0] == '#':
            # If it's a comment line, ignore indentation.
            return None
        indent = len(line) - lstripped_line_length
        if indent % 4 > 0:
            self.errors.append(
                ('Indent is not divisible by 4.', newline_token.lexer.lineno))
            return None
        indent_delta = indent - _indent_level_to_spaces_count(self.cur_indent)
        return indent_delta // 4
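
    # Worked example (illustrative): with cur_indent == 1 (4 spaces), a next
    # line indented by 12 spaces yields (12 - 4) // 4 == 2, i.e. two INDENT
    # levels at once.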

    # A string containing ignored characters (spaces and tabs).
    t_ignore = ' \t'

    # Error handling rule
    def t_ANY_error(self, token):
        self._logger.debug('Illegal character %r at line %d',
                           token.value[0], token.lexer.lineno)
        self.errors.append(
            ('Illegal character %s.' % repr(token.value[0]).lstrip('u'),
             token.lexer.lineno))
        token.lexer.skip(1)


def _create_token(token_type, value, lineno, lexpos):
    """
    Helper for creating ply.lex.LexToken objects. Unfortunately, LexToken
    does not define a constructor that would make setting these values easy.
    """
    token = lex.LexToken()
    token.type = token_type
    token.value = value
    token.lineno = lineno
    token.lexpos = lexpos
    return token


def _indent_level_to_spaces_count(indent):
    # One indentation level corresponds to four spaces.
    return indent * 4
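

if __name__ == '__main__':
    # Minimal usage sketch (not part of the original module). The spec
    # snippet below is illustrative; the real grammar is enforced by the
    # parser, not the lexer.
    logging.basicConfig(level=logging.DEBUG)
    _lexer = Lexer()
    _lexer.test(
        'namespace example\n'
        '\n'
        'struct Point\n'
        '    x Int32\n'
        '    y Int32\n')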