# Licensed under the Apache License: http://www.apache.org/licenses/LICENSE-2.0
# For details: https://github.com/nedbat/coveragepy/blob/master/NOTICE.txt
"""Better tokenizing for coverage.py."""
import ast
import keyword
import re
import token
import tokenize
from coverage import env
from coverage.misc import contract


def phys_tokens(toks):
    """Return all physical tokens, even line continuations.

    tokenize.generate_tokens() doesn't return a token for the backslash that
    continues lines. This wrapper provides those tokens so that we can
    re-create a faithful representation of the original source.

    Returns the same values as generate_tokens()

    """
    last_line = None
    last_lineno = -1
    last_ttext = None
    for ttype, ttext, (slineno, scol), (elineno, ecol), ltext in toks:
        if last_lineno != elineno:
            if last_line and last_line.endswith("\\\n"):
                # We are at the beginning of a new line, and the last line
                # ended with a backslash. We probably have to inject a
                # backslash token into the stream. Unfortunately, there's more
                # to figure out. This code::
                #
                #   usage = """\
                #            HEY THERE
                #            """
                #
                # triggers this condition, but the token text is::
                #
                #   '"""\\\nHEY THERE\n"""'
                #
                # so we need to figure out if the backslash is already in the
                # string token or not.
                inject_backslash = True
                if last_ttext.endswith("\\"):
                    inject_backslash = False
                elif ttype == token.STRING:
                    if "\n" in ttext and ttext.split('\n', 1)[0][-1] == '\\':
                        # It's a multi-line string and the first line ends with
                        # a backslash, so we don't need to inject another.
                        inject_backslash = False
                if inject_backslash:
                    # Figure out what column the backslash is in.
                    ccol = len(last_line.split("\n")[-2]) - 1
                    # Yield the token, with a fake token type.
                    yield (
                        99999, "\\\n",
                        (slineno, ccol), (slineno, ccol+2),
                        last_line
                    )
        last_line = ltext
        if ttype not in (tokenize.NEWLINE, tokenize.NL):
            last_ttext = ttext
        yield ttype, ttext, (slineno, scol), (elineno, ecol), ltext
        last_lineno = elineno
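
# A minimal usage sketch (not part of the upstream module): the source string
# below is a made-up example with a backslash continuation.  Between the '+'
# token and the '2' token, phys_tokens() yields a synthetic token with type
# 99999 and text '\\\n' that tokenize.generate_tokens() alone would not emit:
#
#     import io
#     toks = tokenize.generate_tokens(io.StringIO("x = 1 + \\\n    2\n").readline)
#     for tok in phys_tokens(toks):
#         print(tok[0], repr(tok[1]))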


class MatchCaseFinder(ast.NodeVisitor):
    """Helper for finding match/case lines."""
    def __init__(self, source):
        # This will be the set of line numbers that start match or case statements.
        self.match_case_lines = set()
        self.visit(ast.parse(source))

    def visit_Match(self, node):
        """Invoked by ast.NodeVisitor.visit"""
        self.match_case_lines.add(node.lineno)
        for case in node.cases:
            self.match_case_lines.add(case.pattern.lineno)
        self.generic_visit(node)
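
# A minimal usage sketch (not part of the upstream module), using a made-up
# source string; it needs a Python version whose ast module parses match
# statements (3.10+):
#
#     src = "match command:\n    case 'go':\n        print('go')\n"
#     print(MatchCaseFinder(src).match_case_lines)   # expected: {1, 2}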


@contract(source='unicode')
def source_token_lines(source):
    """Generate a series of lines, one for each line in `source`.

    Each line is a list of pairs, each pair is a token::

        [('key', 'def'), ('ws', ' '), ('nam', 'hello'), ('op', '('), ... ]

    Each pair has a token class, and the token text.

    If you concatenate all the token texts, and then join them with newlines,
    you should have your original `source` back, with two differences:
    trailing whitespace is not preserved, and a final line with no newline
    is indistinguishable from a final line with a newline.

    """

    ws_tokens = {token.INDENT, token.DEDENT, token.NEWLINE, tokenize.NL}
    line = []
    col = 0

    source = source.expandtabs(8).replace('\r\n', '\n')
    tokgen = generate_tokens(source)

    if env.PYBEHAVIOR.soft_keywords:
        match_case_lines = MatchCaseFinder(source).match_case_lines

    for ttype, ttext, (sline, scol), (_, ecol), _ in phys_tokens(tokgen):
        mark_start = True
        for part in re.split('(\n)', ttext):
            if part == '\n':
                yield line
                line = []
                col = 0
                mark_end = False
            elif part == '':
                mark_end = False
            elif ttype in ws_tokens:
                mark_end = False
            else:
                if mark_start and scol > col:
                    line.append(("ws", " " * (scol - col)))
                    mark_start = False
                tok_class = tokenize.tok_name.get(ttype, 'xx').lower()[:3]
                if ttype == token.NAME:
                    if keyword.iskeyword(ttext):
                        # Hard keywords are always keywords.
                        tok_class = "key"
                    elif env.PYBEHAVIOR.soft_keywords and keyword.issoftkeyword(ttext):
                        # Soft keywords appear at the start of the line,
                        # on lines that start match or case statements.
                        if len(line) == 0:
                            is_start_of_line = True
                        elif (len(line) == 1) and line[0][0] == "ws":
                            is_start_of_line = True
                        else:
                            is_start_of_line = False
                        if is_start_of_line and sline in match_case_lines:
                            tok_class = "key"
                line.append((tok_class, part))
                mark_end = True
        scol = 0
        if mark_end:
            col = ecol

    if line:
        yield line
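
# A minimal usage sketch (not part of the upstream module), tokenizing a tiny
# made-up source string.  Each yielded value is one source line as a list of
# (class, text) pairs:
#
#     for line in source_token_lines("def hello():\n    return 'hi'\n"):
#         print(line)
#
# The first yielded line looks like
# [('key', 'def'), ('ws', ' '), ('nam', 'hello'), ('op', '('), ('op', ')'), ('op', ':')].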


class CachedTokenizer:
    """A one-element cache around tokenize.generate_tokens.

    When reporting, coverage.py tokenizes files twice, once to find the
    structure of the file, and once to syntax-color it. Tokenizing is
    expensive, and easily cached.

    This is a one-element cache so that our twice-in-a-row tokenizing doesn't
    actually tokenize twice.

    """
    def __init__(self):
        self.last_text = None
        self.last_tokens = None

    @contract(text='unicode')
    def generate_tokens(self, text):
        """A stand-in for `tokenize.generate_tokens`."""
        if text != self.last_text:
            self.last_text = text
            readline = iter(text.splitlines(True)).__next__
            try:
                self.last_tokens = list(tokenize.generate_tokens(readline))
            except:
                self.last_text = None
                raise
        return self.last_tokens


# Create our generate_tokens cache as a callable replacement function.
generate_tokens = CachedTokenizer().generate_tokens
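
# A small sketch (not part of the upstream module) of the caching behavior:
# two back-to-back calls with the same text return the very same list object,
# so the second call does no tokenizing work.
#
#     toks_a = generate_tokens("a = 1\n")
#     toks_b = generate_tokens("a = 1\n")
#     assert toks_a is toks_b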


COOKIE_RE = re.compile(r"^[ \t]*#.*coding[:=][ \t]*([-\w.]+)", flags=re.MULTILINE)


@contract(source='bytes')
def source_encoding(source):
    """Determine the encoding for `source`, according to PEP 263.

    `source` is a byte string: the text of the program.

    Returns a string, the name of the encoding.

    """
    readline = iter(source.splitlines(True)).__next__
    return tokenize.detect_encoding(readline)[0]
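
# A minimal sketch (not part of the upstream module): the encoding comes from
# a PEP 263 coding cookie when one is present, and defaults to UTF-8 otherwise.
#
#     print(source_encoding(b"# -*- coding: iso-8859-1 -*-\nx = 1\n"))   # iso-8859-1
#     print(source_encoding(b"x = 1\n"))                                 # utf-8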


@contract(source='unicode')
def compile_unicode(source, filename, mode):
    """Just like the `compile` builtin, but works on any Unicode string.

    Python 2's compile() builtin has a stupid restriction: if the source string
    is Unicode, then it may not have an encoding declaration in it. Why not?
    Who knows! It also decodes to utf-8, and then tries to interpret those
    utf-8 bytes according to the encoding declaration. Why? Who knows!

    This function neuters the coding declaration, and compiles it.

    """
    source = neuter_encoding_declaration(source)
    code = compile(source, filename, mode)
    return code
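
# A minimal sketch (not part of the upstream module): compiling a made-up
# source string that carries a coding declaration, then running the resulting
# code object.
#
#     code = compile_unicode("# coding: iso-8859-1\nx = 1\n", "<example>", "exec")
#     namespace = {}
#     exec(code, namespace)
#     print(namespace["x"])   # 1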


@contract(source='unicode', returns='unicode')
def neuter_encoding_declaration(source):
    """Return `source`, with any encoding declaration neutered."""
    if COOKIE_RE.search(source):
        source_lines = source.splitlines(True)
        for lineno in range(min(2, len(source_lines))):
            source_lines[lineno] = COOKIE_RE.sub("# (deleted declaration)", source_lines[lineno])
        source = "".join(source_lines)
    return source
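
# A minimal sketch (not part of the upstream module): the coding cookie on the
# first line is rewritten so COOKIE_RE no longer matches it, and the rest of
# the source is unchanged.
#
#     print(neuter_encoding_declaration("# coding: iso-8859-1\nimport os\n"))
#     # prints:
#     # # (deleted declaration)
#     # import os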