# Licensed under the Apache License: http://www.apache.org/licenses/LICENSE-2.0
# For details: https://github.com/nedbat/coveragepy/blob/master/NOTICE.txt
"""Better tokenizing for coverage.py."""
import ast
import keyword
import re
import token
import tokenize
from coverage import env
from coverage.misc import contract


def phys_tokens(toks):
    """Return all physical tokens, even line continuations.

    tokenize.generate_tokens() doesn't return a token for the backslash that
    continues lines. This wrapper provides those tokens so that we can
    re-create a faithful representation of the original source.

    Returns the same values as generate_tokens()

    """
    last_line = None
    last_lineno = -1
    last_ttext = None
    for ttype, ttext, (slineno, scol), (elineno, ecol), ltext in toks:
        if last_lineno != elineno:
            if last_line and last_line.endswith("\\\n"):
                # We are at the beginning of a new line, and the last line
                # ended with a backslash. We probably have to inject a
                # backslash token into the stream. Unfortunately, there's more
                # to figure out. This code::
                #
                #   usage = """\
                #            HEY THERE
                #            """
                #
                # triggers this condition, but the token text is::
                #
                #   '"""\\\nHEY THERE\n"""'
                #
                # so we need to figure out if the backslash is already in the
                # string token or not.
                inject_backslash = True
                if last_ttext.endswith("\\"):
                    inject_backslash = False
                elif ttype == token.STRING:
                    if "\n" in ttext and ttext.split('\n', 1)[0][-1] == '\\':
                        # It's a multi-line string and the first line ends with
                        # a backslash, so we don't need to inject another.
                        inject_backslash = False
                if inject_backslash:
                    # Figure out what column the backslash is in.
                    ccol = len(last_line.split("\n")[-2]) - 1
                    # Yield the token, with a fake token type.
                    yield (
                        99999, "\\\n",
                        (slineno, ccol), (slineno, ccol+2),
                        last_line
                    )
        last_line = ltext
        if ttype not in (tokenize.NEWLINE, tokenize.NL):
            last_ttext = ttext
        yield ttype, ttext, (slineno, scol), (elineno, ecol), ltext
        last_lineno = elineno
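
# A minimal usage sketch (not part of the upstream module): the source string
# below is a made-up example with a backslash continuation.  Between the '+'
# token and the '2' token, phys_tokens() yields a synthetic token with type
# 99999 and text '\\\n' that tokenize.generate_tokens() alone would not emit:
#
#     import io
#     toks = tokenize.generate_tokens(io.StringIO("x = 1 + \\\n    2\n").readline)
#     for tok in phys_tokens(toks):
#         print(tok[0], repr(tok[1]))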


class MatchCaseFinder(ast.NodeVisitor):
    """Helper for finding match/case lines."""
    def __init__(self, source):
        # This will be the set of line numbers that start match or case statements.
        self.match_case_lines = set()
        self.visit(ast.parse(source))

    def visit_Match(self, node):
        """Invoked by ast.NodeVisitor.visit"""
        self.match_case_lines.add(node.lineno)
        for case in node.cases:
            self.match_case_lines.add(case.pattern.lineno)
        self.generic_visit(node)
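
# A minimal usage sketch (not part of the upstream module), using a made-up
# source string; it needs a Python version whose ast module parses match
# statements (3.10+):
#
#     src = "match command:\n    case 'go':\n        print('go')\n"
#     print(MatchCaseFinder(src).match_case_lines)   # expected: {1, 2}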


@contract(source='unicode')
def source_token_lines(source):
    """Generate a series of lines, one for each line in `source`.

    Each line is a list of pairs, each pair is a token::

        [('key', 'def'), ('ws', ' '), ('nam', 'hello'), ('op', '('), ... ]

    Each pair has a token class, and the token text.

    If you concatenate all the token texts, and then join them with newlines,
    you should have your original `source` back, with two differences:
    trailing whitespace is not preserved, and a final line with no newline
    is indistinguishable from a final line with a newline.

    """

    ws_tokens = {token.INDENT, token.DEDENT, token.NEWLINE, tokenize.NL}
    line = []
    col = 0

    source = source.expandtabs(8).replace('\r\n', '\n')
    tokgen = generate_tokens(source)

    if env.PYBEHAVIOR.soft_keywords:
        match_case_lines = MatchCaseFinder(source).match_case_lines

    for ttype, ttext, (sline, scol), (_, ecol), _ in phys_tokens(tokgen):
        mark_start = True
        for part in re.split('(\n)', ttext):
            if part == '\n':
                yield line
                line = []
                col = 0
                mark_end = False
            elif part == '':
                mark_end = False
            elif ttype in ws_tokens:
                mark_end = False
            else:
                if mark_start and scol > col:
                    line.append(("ws", " " * (scol - col)))
                    mark_start = False
                tok_class = tokenize.tok_name.get(ttype, 'xx').lower()[:3]
                if ttype == token.NAME:
                    if keyword.iskeyword(ttext):
                        # Hard keywords are always keywords.
                        tok_class = "key"
                    elif env.PYBEHAVIOR.soft_keywords and keyword.issoftkeyword(ttext):
                        # Soft keywords appear at the start of the line,
                        # on lines that start match or case statements.
                        if len(line) == 0:
                            is_start_of_line = True
                        elif (len(line) == 1) and line[0][0] == "ws":
                            is_start_of_line = True
                        else:
                            is_start_of_line = False
                        if is_start_of_line and sline in match_case_lines:
                            tok_class = "key"
                line.append((tok_class, part))
                mark_end = True
        scol = 0
        if mark_end:
            col = ecol

    if line:
        yield line
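
# A minimal usage sketch (not part of the upstream module), tokenizing a tiny
# made-up source string.  Each yielded value is one source line as a list of
# (class, text) pairs:
#
#     for line in source_token_lines("def hello():\n    return 'hi'\n"):
#         print(line)
#
# The first yielded line looks like
# [('key', 'def'), ('ws', ' '), ('nam', 'hello'), ('op', '('), ('op', ')'), ('op', ':')].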


class CachedTokenizer:
    """A one-element cache around tokenize.generate_tokens.

    When reporting, coverage.py tokenizes files twice, once to find the
    structure of the file, and once to syntax-color it. Tokenizing is
    expensive, and easily cached.

    This is a one-element cache so that our twice-in-a-row tokenizing doesn't
    actually tokenize twice.

    """
    def __init__(self):
        self.last_text = None
        self.last_tokens = None

    @contract(text='unicode')
    def generate_tokens(self, text):
        """A stand-in for `tokenize.generate_tokens`."""
        if text != self.last_text:
            self.last_text = text
            readline = iter(text.splitlines(True)).__next__
            try:
                self.last_tokens = list(tokenize.generate_tokens(readline))
            except:
                self.last_text = None
                raise
        return self.last_tokens


# Create our generate_tokens cache as a callable replacement function.
generate_tokens = CachedTokenizer().generate_tokens
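
# A small sketch (not part of the upstream module) of the caching behavior:
# two back-to-back calls with the same text return the very same list object,
# so the second call does no tokenizing work.
#
#     toks_a = generate_tokens("a = 1\n")
#     toks_b = generate_tokens("a = 1\n")
#     assert toks_a is toks_b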


COOKIE_RE = re.compile(r"^[ \t]*#.*coding[:=][ \t]*([-\w.]+)", flags=re.MULTILINE)


@contract(source='bytes')
def source_encoding(source):
    """Determine the encoding for `source`, according to PEP 263.

    `source` is a byte string: the text of the program.

    Returns a string, the name of the encoding.

    """
    readline = iter(source.splitlines(True)).__next__
    return tokenize.detect_encoding(readline)[0]
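
# A minimal sketch (not part of the upstream module): the encoding comes from
# a PEP 263 coding cookie when one is present, and defaults to UTF-8 otherwise.
#
#     print(source_encoding(b"# -*- coding: iso-8859-1 -*-\nx = 1\n"))   # iso-8859-1
#     print(source_encoding(b"x = 1\n"))                                 # utf-8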


@contract(source='unicode')
def compile_unicode(source, filename, mode):
    """Just like the `compile` builtin, but works on any Unicode string.

    Python 2's compile() builtin has a stupid restriction: if the source string
    is Unicode, then it may not have an encoding declaration in it. Why not?
    Who knows! It also decodes to utf-8, and then tries to interpret those
    utf-8 bytes according to the encoding declaration. Why? Who knows!

    This function neuters the coding declaration, and compiles it.

    """
    source = neuter_encoding_declaration(source)
    code = compile(source, filename, mode)
    return code
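
# A minimal sketch (not part of the upstream module): compiling a made-up
# source string that carries a coding declaration, then running the resulting
# code object.
#
#     code = compile_unicode("# coding: iso-8859-1\nx = 1\n", "<example>", "exec")
#     namespace = {}
#     exec(code, namespace)
#     print(namespace["x"])   # 1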


@contract(source='unicode', returns='unicode')
def neuter_encoding_declaration(source):
    """Return `source`, with any encoding declaration neutered."""
    if COOKIE_RE.search(source):
        source_lines = source.splitlines(True)
        for lineno in range(min(2, len(source_lines))):
            source_lines[lineno] = COOKIE_RE.sub("# (deleted declaration)", source_lines[lineno])
        source = "".join(source_lines)
    return source
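
# A minimal sketch (not part of the upstream module): the coding cookie on the
# first line is rewritten so COOKIE_RE no longer matches it, and the rest of
# the source is unchanged.
#
#     print(neuter_encoding_declaration("# coding: iso-8859-1\nimport os\n"))
#     # prints:
#     # # (deleted declaration)
#     # import os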