Add python venv

This commit is contained in:
Isaac Shoebottom
2022-10-31 10:10:52 -03:00
parent fb1a0435c1
commit a50f49d2c8
913 changed files with 287881 additions and 0 deletions

View File

@ -0,0 +1,111 @@
"""
pip._vendor is for vendoring dependencies of pip to prevent needing pip to
depend on something external.
Files inside of pip._vendor should be considered immutable and should only be
updated to versions from upstream.
"""
from __future__ import absolute_import
import glob
import os.path
import sys
# Downstream redistributors which have debundled our dependencies should also
# patch this value to be true. This will trigger the additional patching
# to cause things like "six" to be available as pip.
DEBUNDLED = False
# By default, look in this directory for a bunch of .whl files which we will
# add to the beginning of sys.path before attempting to import anything. This
# is done to support downstream re-distributors like Debian and Fedora who
# wish to create their own Wheels for our dependencies to aid in debundling.
WHEEL_DIR = os.path.abspath(os.path.dirname(__file__))
# Define a small helper function to alias our vendored modules to the real ones
# if the vendored ones do not exist. This idea of this was taken from
# https://github.com/kennethreitz/requests/pull/2567.
def vendored(modulename):
vendored_name = "{0}.{1}".format(__name__, modulename)
try:
__import__(modulename, globals(), locals(), level=0)
except ImportError:
# We can just silently allow import failures to pass here. If we
# got to this point it means that ``import pip._vendor.whatever``
# failed and so did ``import whatever``. Since we're importing this
# upfront in an attempt to alias imports, not erroring here will
# just mean we get a regular import error whenever pip *actually*
# tries to import one of these modules to use it, which actually
# gives us a better error message than we would have otherwise
# gotten.
pass
else:
sys.modules[vendored_name] = sys.modules[modulename]
base, head = vendored_name.rsplit(".", 1)
setattr(sys.modules[base], head, sys.modules[modulename])
# If we're operating in a debundled setup, then we want to go ahead and trigger
# the aliasing of our vendored libraries as well as looking for wheels to add
# to our sys.path. This will cause all of this code to be a no-op typically
# however downstream redistributors can enable it in a consistent way across
# all platforms.
if DEBUNDLED:
# Actually look inside of WHEEL_DIR to find .whl files and add them to the
# front of our sys.path.
sys.path[:] = glob.glob(os.path.join(WHEEL_DIR, "*.whl")) + sys.path
# Actually alias all of our vendored dependencies.
vendored("cachecontrol")
vendored("certifi")
vendored("colorama")
vendored("distlib")
vendored("distro")
vendored("html5lib")
vendored("six")
vendored("six.moves")
vendored("six.moves.urllib")
vendored("six.moves.urllib.parse")
vendored("packaging")
vendored("packaging.version")
vendored("packaging.specifiers")
vendored("pep517")
vendored("pkg_resources")
vendored("platformdirs")
vendored("progress")
vendored("requests")
vendored("requests.exceptions")
vendored("requests.packages")
vendored("requests.packages.urllib3")
vendored("requests.packages.urllib3._collections")
vendored("requests.packages.urllib3.connection")
vendored("requests.packages.urllib3.connectionpool")
vendored("requests.packages.urllib3.contrib")
vendored("requests.packages.urllib3.contrib.ntlmpool")
vendored("requests.packages.urllib3.contrib.pyopenssl")
vendored("requests.packages.urllib3.exceptions")
vendored("requests.packages.urllib3.fields")
vendored("requests.packages.urllib3.filepost")
vendored("requests.packages.urllib3.packages")
vendored("requests.packages.urllib3.packages.ordered_dict")
vendored("requests.packages.urllib3.packages.six")
vendored("requests.packages.urllib3.packages.ssl_match_hostname")
vendored("requests.packages.urllib3.packages.ssl_match_hostname."
"_implementation")
vendored("requests.packages.urllib3.poolmanager")
vendored("requests.packages.urllib3.request")
vendored("requests.packages.urllib3.response")
vendored("requests.packages.urllib3.util")
vendored("requests.packages.urllib3.util.connection")
vendored("requests.packages.urllib3.util.request")
vendored("requests.packages.urllib3.util.response")
vendored("requests.packages.urllib3.util.retry")
vendored("requests.packages.urllib3.util.ssl_")
vendored("requests.packages.urllib3.util.timeout")
vendored("requests.packages.urllib3.util.url")
vendored("resolvelib")
vendored("tenacity")
vendored("tomli")
vendored("urllib3")

View File

@ -0,0 +1,11 @@
"""CacheControl import Interface.
Make it easy to import from cachecontrol without long namespaces.
"""
__author__ = "Eric Larson"
__email__ = "eric@ionrock.org"
__version__ = "0.12.6"
from .wrapper import CacheControl
from .adapter import CacheControlAdapter
from .controller import CacheController

View File

@ -0,0 +1,57 @@
import logging
from pip._vendor import requests
from pip._vendor.cachecontrol.adapter import CacheControlAdapter
from pip._vendor.cachecontrol.cache import DictCache
from pip._vendor.cachecontrol.controller import logger
from argparse import ArgumentParser
def setup_logging():
logger.setLevel(logging.DEBUG)
handler = logging.StreamHandler()
logger.addHandler(handler)
def get_session():
adapter = CacheControlAdapter(
DictCache(), cache_etags=True, serializer=None, heuristic=None
)
sess = requests.Session()
sess.mount("http://", adapter)
sess.mount("https://", adapter)
sess.cache_controller = adapter.controller
return sess
def get_args():
parser = ArgumentParser()
parser.add_argument("url", help="The URL to try and cache")
return parser.parse_args()
def main(args=None):
args = get_args()
sess = get_session()
# Make a request to get a response
resp = sess.get(args.url)
# Turn on logging
setup_logging()
# try setting the cache
sess.cache_controller.cache_response(resp.request, resp.raw)
# Now try to get it
if sess.cache_controller.cached_request(resp.request):
print("Cached!")
else:
print("Not cached :(")
if __name__ == "__main__":
main()

View File

@ -0,0 +1,133 @@
import types
import functools
import zlib
from pip._vendor.requests.adapters import HTTPAdapter
from .controller import CacheController
from .cache import DictCache
from .filewrapper import CallbackFileWrapper
class CacheControlAdapter(HTTPAdapter):
invalidating_methods = {"PUT", "DELETE"}
def __init__(
self,
cache=None,
cache_etags=True,
controller_class=None,
serializer=None,
heuristic=None,
cacheable_methods=None,
*args,
**kw
):
super(CacheControlAdapter, self).__init__(*args, **kw)
self.cache = DictCache() if cache is None else cache
self.heuristic = heuristic
self.cacheable_methods = cacheable_methods or ("GET",)
controller_factory = controller_class or CacheController
self.controller = controller_factory(
self.cache, cache_etags=cache_etags, serializer=serializer
)
def send(self, request, cacheable_methods=None, **kw):
"""
Send a request. Use the request information to see if it
exists in the cache and cache the response if we need to and can.
"""
cacheable = cacheable_methods or self.cacheable_methods
if request.method in cacheable:
try:
cached_response = self.controller.cached_request(request)
except zlib.error:
cached_response = None
if cached_response:
return self.build_response(request, cached_response, from_cache=True)
# check for etags and add headers if appropriate
request.headers.update(self.controller.conditional_headers(request))
resp = super(CacheControlAdapter, self).send(request, **kw)
return resp
def build_response(
self, request, response, from_cache=False, cacheable_methods=None
):
"""
Build a response by making a request or using the cache.
This will end up calling send and returning a potentially
cached response
"""
cacheable = cacheable_methods or self.cacheable_methods
if not from_cache and request.method in cacheable:
# Check for any heuristics that might update headers
# before trying to cache.
if self.heuristic:
response = self.heuristic.apply(response)
# apply any expiration heuristics
if response.status == 304:
# We must have sent an ETag request. This could mean
# that we've been expired already or that we simply
# have an etag. In either case, we want to try and
# update the cache if that is the case.
cached_response = self.controller.update_cached_response(
request, response
)
if cached_response is not response:
from_cache = True
# We are done with the server response, read a
# possible response body (compliant servers will
# not return one, but we cannot be 100% sure) and
# release the connection back to the pool.
response.read(decode_content=False)
response.release_conn()
response = cached_response
# We always cache the 301 responses
elif response.status == 301:
self.controller.cache_response(request, response)
else:
# Wrap the response file with a wrapper that will cache the
# response when the stream has been consumed.
response._fp = CallbackFileWrapper(
response._fp,
functools.partial(
self.controller.cache_response, request, response
),
)
if response.chunked:
super_update_chunk_length = response._update_chunk_length
def _update_chunk_length(self):
super_update_chunk_length()
if self.chunk_left == 0:
self._fp._close()
response._update_chunk_length = types.MethodType(
_update_chunk_length, response
)
resp = super(CacheControlAdapter, self).build_response(request, response)
# See if we should invalidate the cache.
if request.method in self.invalidating_methods and resp.ok:
cache_url = self.controller.cache_url(request.url)
self.cache.delete(cache_url)
# Give the request a from_cache attr to let people use it
resp.from_cache = from_cache
return resp
def close(self):
self.cache.close()
super(CacheControlAdapter, self).close()

View File

@ -0,0 +1,39 @@
"""
The cache object API for implementing caches. The default is a thread
safe in-memory dictionary.
"""
from threading import Lock
class BaseCache(object):
def get(self, key):
raise NotImplementedError()
def set(self, key, value):
raise NotImplementedError()
def delete(self, key):
raise NotImplementedError()
def close(self):
pass
class DictCache(BaseCache):
def __init__(self, init_dict=None):
self.lock = Lock()
self.data = init_dict or {}
def get(self, key):
return self.data.get(key, None)
def set(self, key, value):
with self.lock:
self.data.update({key: value})
def delete(self, key):
with self.lock:
if key in self.data:
self.data.pop(key)

View File

@ -0,0 +1,2 @@
from .file_cache import FileCache # noqa
from .redis_cache import RedisCache # noqa

View File

@ -0,0 +1,146 @@
import hashlib
import os
from textwrap import dedent
from ..cache import BaseCache
from ..controller import CacheController
try:
FileNotFoundError
except NameError:
# py2.X
FileNotFoundError = (IOError, OSError)
def _secure_open_write(filename, fmode):
# We only want to write to this file, so open it in write only mode
flags = os.O_WRONLY
# os.O_CREAT | os.O_EXCL will fail if the file already exists, so we only
# will open *new* files.
# We specify this because we want to ensure that the mode we pass is the
# mode of the file.
flags |= os.O_CREAT | os.O_EXCL
# Do not follow symlinks to prevent someone from making a symlink that
# we follow and insecurely open a cache file.
if hasattr(os, "O_NOFOLLOW"):
flags |= os.O_NOFOLLOW
# On Windows we'll mark this file as binary
if hasattr(os, "O_BINARY"):
flags |= os.O_BINARY
# Before we open our file, we want to delete any existing file that is
# there
try:
os.remove(filename)
except (IOError, OSError):
# The file must not exist already, so we can just skip ahead to opening
pass
# Open our file, the use of os.O_CREAT | os.O_EXCL will ensure that if a
# race condition happens between the os.remove and this line, that an
# error will be raised. Because we utilize a lockfile this should only
# happen if someone is attempting to attack us.
fd = os.open(filename, flags, fmode)
try:
return os.fdopen(fd, "wb")
except:
# An error occurred wrapping our FD in a file object
os.close(fd)
raise
class FileCache(BaseCache):
def __init__(
self,
directory,
forever=False,
filemode=0o0600,
dirmode=0o0700,
use_dir_lock=None,
lock_class=None,
):
if use_dir_lock is not None and lock_class is not None:
raise ValueError("Cannot use use_dir_lock and lock_class together")
try:
from lockfile import LockFile
from lockfile.mkdirlockfile import MkdirLockFile
except ImportError:
notice = dedent(
"""
NOTE: In order to use the FileCache you must have
lockfile installed. You can install it via pip:
pip install lockfile
"""
)
raise ImportError(notice)
else:
if use_dir_lock:
lock_class = MkdirLockFile
elif lock_class is None:
lock_class = LockFile
self.directory = directory
self.forever = forever
self.filemode = filemode
self.dirmode = dirmode
self.lock_class = lock_class
@staticmethod
def encode(x):
return hashlib.sha224(x.encode()).hexdigest()
def _fn(self, name):
# NOTE: This method should not change as some may depend on it.
# See: https://github.com/ionrock/cachecontrol/issues/63
hashed = self.encode(name)
parts = list(hashed[:5]) + [hashed]
return os.path.join(self.directory, *parts)
def get(self, key):
name = self._fn(key)
try:
with open(name, "rb") as fh:
return fh.read()
except FileNotFoundError:
return None
def set(self, key, value):
name = self._fn(key)
# Make sure the directory exists
try:
os.makedirs(os.path.dirname(name), self.dirmode)
except (IOError, OSError):
pass
with self.lock_class(name) as lock:
# Write our actual file
with _secure_open_write(lock.path, self.filemode) as fh:
fh.write(value)
def delete(self, key):
name = self._fn(key)
if not self.forever:
try:
os.remove(name)
except FileNotFoundError:
pass
def url_to_file_path(url, filecache):
"""Return the file cache path based on the URL.
This does not ensure the file exists!
"""
key = CacheController.cache_url(url)
return filecache._fn(key)

View File

@ -0,0 +1,33 @@
from __future__ import division
from datetime import datetime
from pip._vendor.cachecontrol.cache import BaseCache
class RedisCache(BaseCache):
def __init__(self, conn):
self.conn = conn
def get(self, key):
return self.conn.get(key)
def set(self, key, value, expires=None):
if not expires:
self.conn.set(key, value)
else:
expires = expires - datetime.utcnow()
self.conn.setex(key, int(expires.total_seconds()), value)
def delete(self, key):
self.conn.delete(key)
def clear(self):
"""Helper for clearing all the keys in a database. Use with
caution!"""
for key in self.conn.keys():
self.conn.delete(key)
def close(self):
"""Redis uses connection pooling, no need to close the connection."""
pass

View File

@ -0,0 +1,29 @@
try:
from urllib.parse import urljoin
except ImportError:
from urlparse import urljoin
try:
import cPickle as pickle
except ImportError:
import pickle
# Handle the case where the requests module has been patched to not have
# urllib3 bundled as part of its source.
try:
from pip._vendor.requests.packages.urllib3.response import HTTPResponse
except ImportError:
from pip._vendor.urllib3.response import HTTPResponse
try:
from pip._vendor.requests.packages.urllib3.util import is_fp_closed
except ImportError:
from pip._vendor.urllib3.util import is_fp_closed
# Replicate some six behaviour
try:
text_type = unicode
except NameError:
text_type = str

View File

@ -0,0 +1,376 @@
"""
The httplib2 algorithms ported for use with requests.
"""
import logging
import re
import calendar
import time
from email.utils import parsedate_tz
from pip._vendor.requests.structures import CaseInsensitiveDict
from .cache import DictCache
from .serialize import Serializer
logger = logging.getLogger(__name__)
URI = re.compile(r"^(([^:/?#]+):)?(//([^/?#]*))?([^?#]*)(\?([^#]*))?(#(.*))?")
def parse_uri(uri):
"""Parses a URI using the regex given in Appendix B of RFC 3986.
(scheme, authority, path, query, fragment) = parse_uri(uri)
"""
groups = URI.match(uri).groups()
return (groups[1], groups[3], groups[4], groups[6], groups[8])
class CacheController(object):
"""An interface to see if request should cached or not.
"""
def __init__(
self, cache=None, cache_etags=True, serializer=None, status_codes=None
):
self.cache = DictCache() if cache is None else cache
self.cache_etags = cache_etags
self.serializer = serializer or Serializer()
self.cacheable_status_codes = status_codes or (200, 203, 300, 301)
@classmethod
def _urlnorm(cls, uri):
"""Normalize the URL to create a safe key for the cache"""
(scheme, authority, path, query, fragment) = parse_uri(uri)
if not scheme or not authority:
raise Exception("Only absolute URIs are allowed. uri = %s" % uri)
scheme = scheme.lower()
authority = authority.lower()
if not path:
path = "/"
# Could do syntax based normalization of the URI before
# computing the digest. See Section 6.2.2 of Std 66.
request_uri = query and "?".join([path, query]) or path
defrag_uri = scheme + "://" + authority + request_uri
return defrag_uri
@classmethod
def cache_url(cls, uri):
return cls._urlnorm(uri)
def parse_cache_control(self, headers):
known_directives = {
# https://tools.ietf.org/html/rfc7234#section-5.2
"max-age": (int, True),
"max-stale": (int, False),
"min-fresh": (int, True),
"no-cache": (None, False),
"no-store": (None, False),
"no-transform": (None, False),
"only-if-cached": (None, False),
"must-revalidate": (None, False),
"public": (None, False),
"private": (None, False),
"proxy-revalidate": (None, False),
"s-maxage": (int, True),
}
cc_headers = headers.get("cache-control", headers.get("Cache-Control", ""))
retval = {}
for cc_directive in cc_headers.split(","):
if not cc_directive.strip():
continue
parts = cc_directive.split("=", 1)
directive = parts[0].strip()
try:
typ, required = known_directives[directive]
except KeyError:
logger.debug("Ignoring unknown cache-control directive: %s", directive)
continue
if not typ or not required:
retval[directive] = None
if typ:
try:
retval[directive] = typ(parts[1].strip())
except IndexError:
if required:
logger.debug(
"Missing value for cache-control " "directive: %s",
directive,
)
except ValueError:
logger.debug(
"Invalid value for cache-control directive " "%s, must be %s",
directive,
typ.__name__,
)
return retval
def cached_request(self, request):
"""
Return a cached response if it exists in the cache, otherwise
return False.
"""
cache_url = self.cache_url(request.url)
logger.debug('Looking up "%s" in the cache', cache_url)
cc = self.parse_cache_control(request.headers)
# Bail out if the request insists on fresh data
if "no-cache" in cc:
logger.debug('Request header has "no-cache", cache bypassed')
return False
if "max-age" in cc and cc["max-age"] == 0:
logger.debug('Request header has "max_age" as 0, cache bypassed')
return False
# Request allows serving from the cache, let's see if we find something
cache_data = self.cache.get(cache_url)
if cache_data is None:
logger.debug("No cache entry available")
return False
# Check whether it can be deserialized
resp = self.serializer.loads(request, cache_data)
if not resp:
logger.warning("Cache entry deserialization failed, entry ignored")
return False
# If we have a cached 301, return it immediately. We don't
# need to test our response for other headers b/c it is
# intrinsically "cacheable" as it is Permanent.
# See:
# https://tools.ietf.org/html/rfc7231#section-6.4.2
#
# Client can try to refresh the value by repeating the request
# with cache busting headers as usual (ie no-cache).
if resp.status == 301:
msg = (
'Returning cached "301 Moved Permanently" response '
"(ignoring date and etag information)"
)
logger.debug(msg)
return resp
headers = CaseInsensitiveDict(resp.headers)
if not headers or "date" not in headers:
if "etag" not in headers:
# Without date or etag, the cached response can never be used
# and should be deleted.
logger.debug("Purging cached response: no date or etag")
self.cache.delete(cache_url)
logger.debug("Ignoring cached response: no date")
return False
now = time.time()
date = calendar.timegm(parsedate_tz(headers["date"]))
current_age = max(0, now - date)
logger.debug("Current age based on date: %i", current_age)
# TODO: There is an assumption that the result will be a
# urllib3 response object. This may not be best since we
# could probably avoid instantiating or constructing the
# response until we know we need it.
resp_cc = self.parse_cache_control(headers)
# determine freshness
freshness_lifetime = 0
# Check the max-age pragma in the cache control header
if "max-age" in resp_cc:
freshness_lifetime = resp_cc["max-age"]
logger.debug("Freshness lifetime from max-age: %i", freshness_lifetime)
# If there isn't a max-age, check for an expires header
elif "expires" in headers:
expires = parsedate_tz(headers["expires"])
if expires is not None:
expire_time = calendar.timegm(expires) - date
freshness_lifetime = max(0, expire_time)
logger.debug("Freshness lifetime from expires: %i", freshness_lifetime)
# Determine if we are setting freshness limit in the
# request. Note, this overrides what was in the response.
if "max-age" in cc:
freshness_lifetime = cc["max-age"]
logger.debug(
"Freshness lifetime from request max-age: %i", freshness_lifetime
)
if "min-fresh" in cc:
min_fresh = cc["min-fresh"]
# adjust our current age by our min fresh
current_age += min_fresh
logger.debug("Adjusted current age from min-fresh: %i", current_age)
# Return entry if it is fresh enough
if freshness_lifetime > current_age:
logger.debug('The response is "fresh", returning cached response')
logger.debug("%i > %i", freshness_lifetime, current_age)
return resp
# we're not fresh. If we don't have an Etag, clear it out
if "etag" not in headers:
logger.debug('The cached response is "stale" with no etag, purging')
self.cache.delete(cache_url)
# return the original handler
return False
def conditional_headers(self, request):
cache_url = self.cache_url(request.url)
resp = self.serializer.loads(request, self.cache.get(cache_url))
new_headers = {}
if resp:
headers = CaseInsensitiveDict(resp.headers)
if "etag" in headers:
new_headers["If-None-Match"] = headers["ETag"]
if "last-modified" in headers:
new_headers["If-Modified-Since"] = headers["Last-Modified"]
return new_headers
def cache_response(self, request, response, body=None, status_codes=None):
"""
Algorithm for caching requests.
This assumes a requests Response object.
"""
# From httplib2: Don't cache 206's since we aren't going to
# handle byte range requests
cacheable_status_codes = status_codes or self.cacheable_status_codes
if response.status not in cacheable_status_codes:
logger.debug(
"Status code %s not in %s", response.status, cacheable_status_codes
)
return
response_headers = CaseInsensitiveDict(response.headers)
# If we've been given a body, our response has a Content-Length, that
# Content-Length is valid then we can check to see if the body we've
# been given matches the expected size, and if it doesn't we'll just
# skip trying to cache it.
if (
body is not None
and "content-length" in response_headers
and response_headers["content-length"].isdigit()
and int(response_headers["content-length"]) != len(body)
):
return
cc_req = self.parse_cache_control(request.headers)
cc = self.parse_cache_control(response_headers)
cache_url = self.cache_url(request.url)
logger.debug('Updating cache with response from "%s"', cache_url)
# Delete it from the cache if we happen to have it stored there
no_store = False
if "no-store" in cc:
no_store = True
logger.debug('Response header has "no-store"')
if "no-store" in cc_req:
no_store = True
logger.debug('Request header has "no-store"')
if no_store and self.cache.get(cache_url):
logger.debug('Purging existing cache entry to honor "no-store"')
self.cache.delete(cache_url)
if no_store:
return
# https://tools.ietf.org/html/rfc7234#section-4.1:
# A Vary header field-value of "*" always fails to match.
# Storing such a response leads to a deserialization warning
# during cache lookup and is not allowed to ever be served,
# so storing it can be avoided.
if "*" in response_headers.get("vary", ""):
logger.debug('Response header has "Vary: *"')
return
# If we've been given an etag, then keep the response
if self.cache_etags and "etag" in response_headers:
logger.debug("Caching due to etag")
self.cache.set(
cache_url, self.serializer.dumps(request, response, body=body)
)
# Add to the cache any 301s. We do this before looking that
# the Date headers.
elif response.status == 301:
logger.debug("Caching permanant redirect")
self.cache.set(cache_url, self.serializer.dumps(request, response))
# Add to the cache if the response headers demand it. If there
# is no date header then we can't do anything about expiring
# the cache.
elif "date" in response_headers:
# cache when there is a max-age > 0
if "max-age" in cc and cc["max-age"] > 0:
logger.debug("Caching b/c date exists and max-age > 0")
self.cache.set(
cache_url, self.serializer.dumps(request, response, body=body)
)
# If the request can expire, it means we should cache it
# in the meantime.
elif "expires" in response_headers:
if response_headers["expires"]:
logger.debug("Caching b/c of expires header")
self.cache.set(
cache_url, self.serializer.dumps(request, response, body=body)
)
def update_cached_response(self, request, response):
"""On a 304 we will get a new set of headers that we want to
update our cached value with, assuming we have one.
This should only ever be called when we've sent an ETag and
gotten a 304 as the response.
"""
cache_url = self.cache_url(request.url)
cached_response = self.serializer.loads(request, self.cache.get(cache_url))
if not cached_response:
# we didn't have a cached response
return response
# Lets update our headers with the headers from the new request:
# http://tools.ietf.org/html/draft-ietf-httpbis-p4-conditional-26#section-4.1
#
# The server isn't supposed to send headers that would make
# the cached body invalid. But... just in case, we'll be sure
# to strip out ones we know that might be problmatic due to
# typical assumptions.
excluded_headers = ["content-length"]
cached_response.headers.update(
dict(
(k, v)
for k, v in response.headers.items()
if k.lower() not in excluded_headers
)
)
# we want a 200 b/c we have content via the cache
cached_response.status = 200
# update our cache
self.cache.set(cache_url, self.serializer.dumps(request, cached_response))
return cached_response

View File

@ -0,0 +1,80 @@
from io import BytesIO
class CallbackFileWrapper(object):
"""
Small wrapper around a fp object which will tee everything read into a
buffer, and when that file is closed it will execute a callback with the
contents of that buffer.
All attributes are proxied to the underlying file object.
This class uses members with a double underscore (__) leading prefix so as
not to accidentally shadow an attribute.
"""
def __init__(self, fp, callback):
self.__buf = BytesIO()
self.__fp = fp
self.__callback = callback
def __getattr__(self, name):
# The vaguaries of garbage collection means that self.__fp is
# not always set. By using __getattribute__ and the private
# name[0] allows looking up the attribute value and raising an
# AttributeError when it doesn't exist. This stop thigns from
# infinitely recursing calls to getattr in the case where
# self.__fp hasn't been set.
#
# [0] https://docs.python.org/2/reference/expressions.html#atom-identifiers
fp = self.__getattribute__("_CallbackFileWrapper__fp")
return getattr(fp, name)
def __is_fp_closed(self):
try:
return self.__fp.fp is None
except AttributeError:
pass
try:
return self.__fp.closed
except AttributeError:
pass
# We just don't cache it then.
# TODO: Add some logging here...
return False
def _close(self):
if self.__callback:
self.__callback(self.__buf.getvalue())
# We assign this to None here, because otherwise we can get into
# really tricky problems where the CPython interpreter dead locks
# because the callback is holding a reference to something which
# has a __del__ method. Setting this to None breaks the cycle
# and allows the garbage collector to do it's thing normally.
self.__callback = None
def read(self, amt=None):
data = self.__fp.read(amt)
self.__buf.write(data)
if self.__is_fp_closed():
self._close()
return data
def _safe_read(self, amt):
data = self.__fp._safe_read(amt)
if amt == 2 and data == b"\r\n":
# urllib executes this read to toss the CRLF at the end
# of the chunk.
return data
self.__buf.write(data)
if self.__is_fp_closed():
self._close()
return data

View File

@ -0,0 +1,135 @@
import calendar
import time
from email.utils import formatdate, parsedate, parsedate_tz
from datetime import datetime, timedelta
TIME_FMT = "%a, %d %b %Y %H:%M:%S GMT"
def expire_after(delta, date=None):
date = date or datetime.utcnow()
return date + delta
def datetime_to_header(dt):
return formatdate(calendar.timegm(dt.timetuple()))
class BaseHeuristic(object):
def warning(self, response):
"""
Return a valid 1xx warning header value describing the cache
adjustments.
The response is provided too allow warnings like 113
http://tools.ietf.org/html/rfc7234#section-5.5.4 where we need
to explicitly say response is over 24 hours old.
"""
return '110 - "Response is Stale"'
def update_headers(self, response):
"""Update the response headers with any new headers.
NOTE: This SHOULD always include some Warning header to
signify that the response was cached by the client, not
by way of the provided headers.
"""
return {}
def apply(self, response):
updated_headers = self.update_headers(response)
if updated_headers:
response.headers.update(updated_headers)
warning_header_value = self.warning(response)
if warning_header_value is not None:
response.headers.update({"Warning": warning_header_value})
return response
class OneDayCache(BaseHeuristic):
"""
Cache the response by providing an expires 1 day in the
future.
"""
def update_headers(self, response):
headers = {}
if "expires" not in response.headers:
date = parsedate(response.headers["date"])
expires = expire_after(timedelta(days=1), date=datetime(*date[:6]))
headers["expires"] = datetime_to_header(expires)
headers["cache-control"] = "public"
return headers
class ExpiresAfter(BaseHeuristic):
"""
Cache **all** requests for a defined time period.
"""
def __init__(self, **kw):
self.delta = timedelta(**kw)
def update_headers(self, response):
expires = expire_after(self.delta)
return {"expires": datetime_to_header(expires), "cache-control": "public"}
def warning(self, response):
tmpl = "110 - Automatically cached for %s. Response might be stale"
return tmpl % self.delta
class LastModified(BaseHeuristic):
"""
If there is no Expires header already, fall back on Last-Modified
using the heuristic from
http://tools.ietf.org/html/rfc7234#section-4.2.2
to calculate a reasonable value.
Firefox also does something like this per
https://developer.mozilla.org/en-US/docs/Web/HTTP/Caching_FAQ
http://lxr.mozilla.org/mozilla-release/source/netwerk/protocol/http/nsHttpResponseHead.cpp#397
Unlike mozilla we limit this to 24-hr.
"""
cacheable_by_default_statuses = {
200, 203, 204, 206, 300, 301, 404, 405, 410, 414, 501
}
def update_headers(self, resp):
headers = resp.headers
if "expires" in headers:
return {}
if "cache-control" in headers and headers["cache-control"] != "public":
return {}
if resp.status not in self.cacheable_by_default_statuses:
return {}
if "date" not in headers or "last-modified" not in headers:
return {}
date = calendar.timegm(parsedate_tz(headers["date"]))
last_modified = parsedate(headers["last-modified"])
if date is None or last_modified is None:
return {}
now = time.time()
current_age = max(0, now - date)
delta = date - calendar.timegm(last_modified)
freshness_lifetime = max(0, min(delta / 10, 24 * 3600))
if freshness_lifetime <= current_age:
return {}
expires = date + freshness_lifetime
return {"expires": time.strftime(TIME_FMT, time.gmtime(expires))}
def warning(self, resp):
return None

View File

@ -0,0 +1,188 @@
import base64
import io
import json
import zlib
from pip._vendor import msgpack
from pip._vendor.requests.structures import CaseInsensitiveDict
from .compat import HTTPResponse, pickle, text_type
def _b64_decode_bytes(b):
return base64.b64decode(b.encode("ascii"))
def _b64_decode_str(s):
return _b64_decode_bytes(s).decode("utf8")
class Serializer(object):
def dumps(self, request, response, body=None):
response_headers = CaseInsensitiveDict(response.headers)
if body is None:
body = response.read(decode_content=False)
# NOTE: 99% sure this is dead code. I'm only leaving it
# here b/c I don't have a test yet to prove
# it. Basically, before using
# `cachecontrol.filewrapper.CallbackFileWrapper`,
# this made an effort to reset the file handle. The
# `CallbackFileWrapper` short circuits this code by
# setting the body as the content is consumed, the
# result being a `body` argument is *always* passed
# into cache_response, and in turn,
# `Serializer.dump`.
response._fp = io.BytesIO(body)
# NOTE: This is all a bit weird, but it's really important that on
# Python 2.x these objects are unicode and not str, even when
# they contain only ascii. The problem here is that msgpack
# understands the difference between unicode and bytes and we
# have it set to differentiate between them, however Python 2
# doesn't know the difference. Forcing these to unicode will be
# enough to have msgpack know the difference.
data = {
u"response": {
u"body": body,
u"headers": dict(
(text_type(k), text_type(v)) for k, v in response.headers.items()
),
u"status": response.status,
u"version": response.version,
u"reason": text_type(response.reason),
u"strict": response.strict,
u"decode_content": response.decode_content,
}
}
# Construct our vary headers
data[u"vary"] = {}
if u"vary" in response_headers:
varied_headers = response_headers[u"vary"].split(",")
for header in varied_headers:
header = text_type(header).strip()
header_value = request.headers.get(header, None)
if header_value is not None:
header_value = text_type(header_value)
data[u"vary"][header] = header_value
return b",".join([b"cc=4", msgpack.dumps(data, use_bin_type=True)])
def loads(self, request, data):
# Short circuit if we've been given an empty set of data
if not data:
return
# Determine what version of the serializer the data was serialized
# with
try:
ver, data = data.split(b",", 1)
except ValueError:
ver = b"cc=0"
# Make sure that our "ver" is actually a version and isn't a false
# positive from a , being in the data stream.
if ver[:3] != b"cc=":
data = ver + data
ver = b"cc=0"
# Get the version number out of the cc=N
ver = ver.split(b"=", 1)[-1].decode("ascii")
# Dispatch to the actual load method for the given version
try:
return getattr(self, "_loads_v{}".format(ver))(request, data)
except AttributeError:
# This is a version we don't have a loads function for, so we'll
# just treat it as a miss and return None
return
def prepare_response(self, request, cached):
"""Verify our vary headers match and construct a real urllib3
HTTPResponse object.
"""
# Special case the '*' Vary value as it means we cannot actually
# determine if the cached response is suitable for this request.
# This case is also handled in the controller code when creating
# a cache entry, but is left here for backwards compatibility.
if "*" in cached.get("vary", {}):
return
# Ensure that the Vary headers for the cached response match our
# request
for header, value in cached.get("vary", {}).items():
if request.headers.get(header, None) != value:
return
body_raw = cached["response"].pop("body")
headers = CaseInsensitiveDict(data=cached["response"]["headers"])
if headers.get("transfer-encoding", "") == "chunked":
headers.pop("transfer-encoding")
cached["response"]["headers"] = headers
try:
body = io.BytesIO(body_raw)
except TypeError:
# This can happen if cachecontrol serialized to v1 format (pickle)
# using Python 2. A Python 2 str(byte string) will be unpickled as
# a Python 3 str (unicode string), which will cause the above to
# fail with:
#
# TypeError: 'str' does not support the buffer interface
body = io.BytesIO(body_raw.encode("utf8"))
return HTTPResponse(body=body, preload_content=False, **cached["response"])
def _loads_v0(self, request, data):
# The original legacy cache data. This doesn't contain enough
# information to construct everything we need, so we'll treat this as
# a miss.
return
def _loads_v1(self, request, data):
try:
cached = pickle.loads(data)
except ValueError:
return
return self.prepare_response(request, cached)
def _loads_v2(self, request, data):
try:
cached = json.loads(zlib.decompress(data).decode("utf8"))
except (ValueError, zlib.error):
return
# We need to decode the items that we've base64 encoded
cached["response"]["body"] = _b64_decode_bytes(cached["response"]["body"])
cached["response"]["headers"] = dict(
(_b64_decode_str(k), _b64_decode_str(v))
for k, v in cached["response"]["headers"].items()
)
cached["response"]["reason"] = _b64_decode_str(cached["response"]["reason"])
cached["vary"] = dict(
(_b64_decode_str(k), _b64_decode_str(v) if v is not None else v)
for k, v in cached["vary"].items()
)
return self.prepare_response(request, cached)
def _loads_v3(self, request, data):
# Due to Python 2 encoding issues, it's impossible to know for sure
# exactly how to load v3 entries, thus we'll treat these as a miss so
# that they get rewritten out as v4 entries.
return
def _loads_v4(self, request, data):
try:
cached = msgpack.loads(data, raw=False)
except ValueError:
return
return self.prepare_response(request, cached)

View File

@ -0,0 +1,29 @@
from .adapter import CacheControlAdapter
from .cache import DictCache
def CacheControl(
sess,
cache=None,
cache_etags=True,
serializer=None,
heuristic=None,
controller_class=None,
adapter_class=None,
cacheable_methods=None,
):
cache = DictCache() if cache is None else cache
adapter_class = adapter_class or CacheControlAdapter
adapter = adapter_class(
cache,
cache_etags=cache_etags,
serializer=serializer,
heuristic=heuristic,
controller_class=controller_class,
cacheable_methods=cacheable_methods,
)
sess.mount("http://", adapter)
sess.mount("https://", adapter)
return sess

View File

@ -0,0 +1,3 @@
from .core import contents, where
__version__ = "2021.05.30"

View File

@ -0,0 +1,12 @@
import argparse
from pip._vendor.certifi import contents, where
parser = argparse.ArgumentParser()
parser.add_argument("-c", "--contents", action="store_true")
args = parser.parse_args()
if args.contents:
print(contents())
else:
print(where())

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,76 @@
# -*- coding: utf-8 -*-
"""
certifi.py
~~~~~~~~~~
This module returns the installation location of cacert.pem or its contents.
"""
import os
class _PipPatchedCertificate(Exception):
pass
try:
# Return a certificate file on disk for a standalone pip zipapp running in
# an isolated build environment to use. Passing --cert to the standalone
# pip does not work since requests calls where() unconditionally on import.
_PIP_STANDALONE_CERT = os.environ.get("_PIP_STANDALONE_CERT")
if _PIP_STANDALONE_CERT:
def where():
return _PIP_STANDALONE_CERT
raise _PipPatchedCertificate()
from importlib.resources import path as get_path, read_text
_CACERT_CTX = None
_CACERT_PATH = None
def where():
# This is slightly terrible, but we want to delay extracting the file
# in cases where we're inside of a zipimport situation until someone
# actually calls where(), but we don't want to re-extract the file
# on every call of where(), so we'll do it once then store it in a
# global variable.
global _CACERT_CTX
global _CACERT_PATH
if _CACERT_PATH is None:
# This is slightly janky, the importlib.resources API wants you to
# manage the cleanup of this file, so it doesn't actually return a
# path, it returns a context manager that will give you the path
# when you enter it and will do any cleanup when you leave it. In
# the common case of not needing a temporary file, it will just
# return the file system location and the __exit__() is a no-op.
#
# We also have to hold onto the actual context manager, because
# it will do the cleanup whenever it gets garbage collected, so
# we will also store that at the global level as well.
_CACERT_CTX = get_path("pip._vendor.certifi", "cacert.pem")
_CACERT_PATH = str(_CACERT_CTX.__enter__())
return _CACERT_PATH
except _PipPatchedCertificate:
pass
except ImportError:
# This fallback will work for Python versions prior to 3.7 that lack the
# importlib.resources module but relies on the existing `where` function
# so won't address issues with environments like PyOxidizer that don't set
# __file__ on modules.
def read_text(_module, _path, encoding="ascii"):
with open(where(), "r", encoding=encoding) as data:
return data.read()
# If we don't have importlib.resources, then we will just do the old logic
# of assuming we're on the filesystem and munge the path directly.
def where():
f = os.path.dirname(__file__)
return os.path.join(f, "cacert.pem")
def contents():
return read_text("certifi", "cacert.pem", encoding="ascii")

View File

@ -0,0 +1,83 @@
######################## BEGIN LICENSE BLOCK ########################
# This library is free software; you can redistribute it and/or
# modify it under the terms of the GNU Lesser General Public
# License as published by the Free Software Foundation; either
# version 2.1 of the License, or (at your option) any later version.
#
# This library is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
# Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public
# License along with this library; if not, write to the Free Software
# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
# 02110-1301 USA
######################### END LICENSE BLOCK #########################
from .universaldetector import UniversalDetector
from .enums import InputState
from .version import __version__, VERSION
__all__ = ['UniversalDetector', 'detect', 'detect_all', '__version__', 'VERSION']
def detect(byte_str):
"""
Detect the encoding of the given byte string.
:param byte_str: The byte sequence to examine.
:type byte_str: ``bytes`` or ``bytearray``
"""
if not isinstance(byte_str, bytearray):
if not isinstance(byte_str, bytes):
raise TypeError('Expected object of type bytes or bytearray, got: '
'{}'.format(type(byte_str)))
else:
byte_str = bytearray(byte_str)
detector = UniversalDetector()
detector.feed(byte_str)
return detector.close()
def detect_all(byte_str):
"""
Detect all the possible encodings of the given byte string.
:param byte_str: The byte sequence to examine.
:type byte_str: ``bytes`` or ``bytearray``
"""
if not isinstance(byte_str, bytearray):
if not isinstance(byte_str, bytes):
raise TypeError('Expected object of type bytes or bytearray, got: '
'{}'.format(type(byte_str)))
else:
byte_str = bytearray(byte_str)
detector = UniversalDetector()
detector.feed(byte_str)
detector.close()
if detector._input_state == InputState.HIGH_BYTE:
results = []
for prober in detector._charset_probers:
if prober.get_confidence() > detector.MINIMUM_THRESHOLD:
charset_name = prober.charset_name
lower_charset_name = prober.charset_name.lower()
# Use Windows encoding name instead of ISO-8859 if we saw any
# extra Windows-specific bytes
if lower_charset_name.startswith('iso-8859'):
if detector._has_win_bytes:
charset_name = detector.ISO_WIN_MAP.get(lower_charset_name,
charset_name)
results.append({
'encoding': charset_name,
'confidence': prober.get_confidence(),
'language': prober.language,
})
if len(results) > 0:
return sorted(results, key=lambda result: -result['confidence'])
return [detector.result]

View File

@ -0,0 +1,386 @@
######################## BEGIN LICENSE BLOCK ########################
# The Original Code is Mozilla Communicator client code.
#
# The Initial Developer of the Original Code is
# Netscape Communications Corporation.
# Portions created by the Initial Developer are Copyright (C) 1998
# the Initial Developer. All Rights Reserved.
#
# Contributor(s):
# Mark Pilgrim - port to Python
#
# This library is free software; you can redistribute it and/or
# modify it under the terms of the GNU Lesser General Public
# License as published by the Free Software Foundation; either
# version 2.1 of the License, or (at your option) any later version.
#
# This library is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
# Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public
# License along with this library; if not, write to the Free Software
# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
# 02110-1301 USA
######################### END LICENSE BLOCK #########################
# Big5 frequency table
# by Taiwan's Mandarin Promotion Council
# <http://www.edu.tw:81/mandr/>
#
# 128 --> 0.42261
# 256 --> 0.57851
# 512 --> 0.74851
# 1024 --> 0.89384
# 2048 --> 0.97583
#
# Ideal Distribution Ratio = 0.74851/(1-0.74851) =2.98
# Random Distribution Ration = 512/(5401-512)=0.105
#
# Typical Distribution Ratio about 25% of Ideal one, still much higher than RDR
BIG5_TYPICAL_DISTRIBUTION_RATIO = 0.75
#Char to FreqOrder table
BIG5_TABLE_SIZE = 5376
BIG5_CHAR_TO_FREQ_ORDER = (
1,1801,1506, 255,1431, 198, 9, 82, 6,5008, 177, 202,3681,1256,2821, 110, # 16
3814, 33,3274, 261, 76, 44,2114, 16,2946,2187,1176, 659,3971, 26,3451,2653, # 32
1198,3972,3350,4202, 410,2215, 302, 590, 361,1964, 8, 204, 58,4510,5009,1932, # 48
63,5010,5011, 317,1614, 75, 222, 159,4203,2417,1480,5012,3555,3091, 224,2822, # 64
3682, 3, 10,3973,1471, 29,2787,1135,2866,1940, 873, 130,3275,1123, 312,5013, # 80
4511,2052, 507, 252, 682,5014, 142,1915, 124, 206,2947, 34,3556,3204, 64, 604, # 96
5015,2501,1977,1978, 155,1991, 645, 641,1606,5016,3452, 337, 72, 406,5017, 80, # 112
630, 238,3205,1509, 263, 939,1092,2654, 756,1440,1094,3453, 449, 69,2987, 591, # 128
179,2096, 471, 115,2035,1844, 60, 50,2988, 134, 806,1869, 734,2036,3454, 180, # 144
995,1607, 156, 537,2907, 688,5018, 319,1305, 779,2145, 514,2379, 298,4512, 359, # 160
2502, 90,2716,1338, 663, 11, 906,1099,2553, 20,2441, 182, 532,1716,5019, 732, # 176
1376,4204,1311,1420,3206, 25,2317,1056, 113, 399, 382,1950, 242,3455,2474, 529, # 192
3276, 475,1447,3683,5020, 117, 21, 656, 810,1297,2300,2334,3557,5021, 126,4205, # 208
706, 456, 150, 613,4513, 71,1118,2037,4206, 145,3092, 85, 835, 486,2115,1246, # 224
1426, 428, 727,1285,1015, 800, 106, 623, 303,1281,5022,2128,2359, 347,3815, 221, # 240
3558,3135,5023,1956,1153,4207, 83, 296,1199,3093, 192, 624, 93,5024, 822,1898, # 256
2823,3136, 795,2065, 991,1554,1542,1592, 27, 43,2867, 859, 139,1456, 860,4514, # 272
437, 712,3974, 164,2397,3137, 695, 211,3037,2097, 195,3975,1608,3559,3560,3684, # 288
3976, 234, 811,2989,2098,3977,2233,1441,3561,1615,2380, 668,2077,1638, 305, 228, # 304
1664,4515, 467, 415,5025, 262,2099,1593, 239, 108, 300, 200,1033, 512,1247,2078, # 320
5026,5027,2176,3207,3685,2682, 593, 845,1062,3277, 88,1723,2038,3978,1951, 212, # 336
266, 152, 149, 468,1899,4208,4516, 77, 187,5028,3038, 37, 5,2990,5029,3979, # 352
5030,5031, 39,2524,4517,2908,3208,2079, 55, 148, 74,4518, 545, 483,1474,1029, # 368
1665, 217,1870,1531,3138,1104,2655,4209, 24, 172,3562, 900,3980,3563,3564,4519, # 384
32,1408,2824,1312, 329, 487,2360,2251,2717, 784,2683, 4,3039,3351,1427,1789, # 400
188, 109, 499,5032,3686,1717,1790, 888,1217,3040,4520,5033,3565,5034,3352,1520, # 416
3687,3981, 196,1034, 775,5035,5036, 929,1816, 249, 439, 38,5037,1063,5038, 794, # 432
3982,1435,2301, 46, 178,3278,2066,5039,2381,5040, 214,1709,4521, 804, 35, 707, # 448
324,3688,1601,2554, 140, 459,4210,5041,5042,1365, 839, 272, 978,2262,2580,3456, # 464
2129,1363,3689,1423, 697, 100,3094, 48, 70,1231, 495,3139,2196,5043,1294,5044, # 480
2080, 462, 586,1042,3279, 853, 256, 988, 185,2382,3457,1698, 434,1084,5045,3458, # 496
314,2625,2788,4522,2335,2336, 569,2285, 637,1817,2525, 757,1162,1879,1616,3459, # 512
287,1577,2116, 768,4523,1671,2868,3566,2526,1321,3816, 909,2418,5046,4211, 933, # 528
3817,4212,2053,2361,1222,4524, 765,2419,1322, 786,4525,5047,1920,1462,1677,2909, # 544
1699,5048,4526,1424,2442,3140,3690,2600,3353,1775,1941,3460,3983,4213, 309,1369, # 560
1130,2825, 364,2234,1653,1299,3984,3567,3985,3986,2656, 525,1085,3041, 902,2001, # 576
1475, 964,4527, 421,1845,1415,1057,2286, 940,1364,3141, 376,4528,4529,1381, 7, # 592
2527, 983,2383, 336,1710,2684,1846, 321,3461, 559,1131,3042,2752,1809,1132,1313, # 608
265,1481,1858,5049, 352,1203,2826,3280, 167,1089, 420,2827, 776, 792,1724,3568, # 624
4214,2443,3281,5050,4215,5051, 446, 229, 333,2753, 901,3818,1200,1557,4530,2657, # 640
1921, 395,2754,2685,3819,4216,1836, 125, 916,3209,2626,4531,5052,5053,3820,5054, # 656
5055,5056,4532,3142,3691,1133,2555,1757,3462,1510,2318,1409,3569,5057,2146, 438, # 672
2601,2910,2384,3354,1068, 958,3043, 461, 311,2869,2686,4217,1916,3210,4218,1979, # 688
383, 750,2755,2627,4219, 274, 539, 385,1278,1442,5058,1154,1965, 384, 561, 210, # 704
98,1295,2556,3570,5059,1711,2420,1482,3463,3987,2911,1257, 129,5060,3821, 642, # 720
523,2789,2790,2658,5061, 141,2235,1333, 68, 176, 441, 876, 907,4220, 603,2602, # 736
710, 171,3464, 404, 549, 18,3143,2398,1410,3692,1666,5062,3571,4533,2912,4534, # 752
5063,2991, 368,5064, 146, 366, 99, 871,3693,1543, 748, 807,1586,1185, 22,2263, # 768
379,3822,3211,5065,3212, 505,1942,2628,1992,1382,2319,5066, 380,2362, 218, 702, # 784
1818,1248,3465,3044,3572,3355,3282,5067,2992,3694, 930,3283,3823,5068, 59,5069, # 800
585, 601,4221, 497,3466,1112,1314,4535,1802,5070,1223,1472,2177,5071, 749,1837, # 816
690,1900,3824,1773,3988,1476, 429,1043,1791,2236,2117, 917,4222, 447,1086,1629, # 832
5072, 556,5073,5074,2021,1654, 844,1090, 105, 550, 966,1758,2828,1008,1783, 686, # 848
1095,5075,2287, 793,1602,5076,3573,2603,4536,4223,2948,2302,4537,3825, 980,2503, # 864
544, 353, 527,4538, 908,2687,2913,5077, 381,2629,1943,1348,5078,1341,1252, 560, # 880
3095,5079,3467,2870,5080,2054, 973, 886,2081, 143,4539,5081,5082, 157,3989, 496, # 896
4224, 57, 840, 540,2039,4540,4541,3468,2118,1445, 970,2264,1748,1966,2082,4225, # 912
3144,1234,1776,3284,2829,3695, 773,1206,2130,1066,2040,1326,3990,1738,1725,4226, # 928
279,3145, 51,1544,2604, 423,1578,2131,2067, 173,4542,1880,5083,5084,1583, 264, # 944
610,3696,4543,2444, 280, 154,5085,5086,5087,1739, 338,1282,3096, 693,2871,1411, # 960
1074,3826,2445,5088,4544,5089,5090,1240, 952,2399,5091,2914,1538,2688, 685,1483, # 976
4227,2475,1436, 953,4228,2055,4545, 671,2400, 79,4229,2446,3285, 608, 567,2689, # 992
3469,4230,4231,1691, 393,1261,1792,2401,5092,4546,5093,5094,5095,5096,1383,1672, # 1008
3827,3213,1464, 522,1119, 661,1150, 216, 675,4547,3991,1432,3574, 609,4548,2690, # 1024
2402,5097,5098,5099,4232,3045, 0,5100,2476, 315, 231,2447, 301,3356,4549,2385, # 1040
5101, 233,4233,3697,1819,4550,4551,5102, 96,1777,1315,2083,5103, 257,5104,1810, # 1056
3698,2718,1139,1820,4234,2022,1124,2164,2791,1778,2659,5105,3097, 363,1655,3214, # 1072
5106,2993,5107,5108,5109,3992,1567,3993, 718, 103,3215, 849,1443, 341,3357,2949, # 1088
1484,5110,1712, 127, 67, 339,4235,2403, 679,1412, 821,5111,5112, 834, 738, 351, # 1104
2994,2147, 846, 235,1497,1881, 418,1993,3828,2719, 186,1100,2148,2756,3575,1545, # 1120
1355,2950,2872,1377, 583,3994,4236,2581,2995,5113,1298,3699,1078,2557,3700,2363, # 1136
78,3829,3830, 267,1289,2100,2002,1594,4237, 348, 369,1274,2197,2178,1838,4552, # 1152
1821,2830,3701,2757,2288,2003,4553,2951,2758, 144,3358, 882,4554,3995,2759,3470, # 1168
4555,2915,5114,4238,1726, 320,5115,3996,3046, 788,2996,5116,2831,1774,1327,2873, # 1184
3997,2832,5117,1306,4556,2004,1700,3831,3576,2364,2660, 787,2023, 506, 824,3702, # 1200
534, 323,4557,1044,3359,2024,1901, 946,3471,5118,1779,1500,1678,5119,1882,4558, # 1216
165, 243,4559,3703,2528, 123, 683,4239, 764,4560, 36,3998,1793, 589,2916, 816, # 1232
626,1667,3047,2237,1639,1555,1622,3832,3999,5120,4000,2874,1370,1228,1933, 891, # 1248
2084,2917, 304,4240,5121, 292,2997,2720,3577, 691,2101,4241,1115,4561, 118, 662, # 1264
5122, 611,1156, 854,2386,1316,2875, 2, 386, 515,2918,5123,5124,3286, 868,2238, # 1280
1486, 855,2661, 785,2216,3048,5125,1040,3216,3578,5126,3146, 448,5127,1525,5128, # 1296
2165,4562,5129,3833,5130,4242,2833,3579,3147, 503, 818,4001,3148,1568, 814, 676, # 1312
1444, 306,1749,5131,3834,1416,1030, 197,1428, 805,2834,1501,4563,5132,5133,5134, # 1328
1994,5135,4564,5136,5137,2198, 13,2792,3704,2998,3149,1229,1917,5138,3835,2132, # 1344
5139,4243,4565,2404,3580,5140,2217,1511,1727,1120,5141,5142, 646,3836,2448, 307, # 1360
5143,5144,1595,3217,5145,5146,5147,3705,1113,1356,4002,1465,2529,2530,5148, 519, # 1376
5149, 128,2133, 92,2289,1980,5150,4003,1512, 342,3150,2199,5151,2793,2218,1981, # 1392
3360,4244, 290,1656,1317, 789, 827,2365,5152,3837,4566, 562, 581,4004,5153, 401, # 1408
4567,2252, 94,4568,5154,1399,2794,5155,1463,2025,4569,3218,1944,5156, 828,1105, # 1424
4245,1262,1394,5157,4246, 605,4570,5158,1784,2876,5159,2835, 819,2102, 578,2200, # 1440
2952,5160,1502, 436,3287,4247,3288,2836,4005,2919,3472,3473,5161,2721,2320,5162, # 1456
5163,2337,2068, 23,4571, 193, 826,3838,2103, 699,1630,4248,3098, 390,1794,1064, # 1472
3581,5164,1579,3099,3100,1400,5165,4249,1839,1640,2877,5166,4572,4573, 137,4250, # 1488
598,3101,1967, 780, 104, 974,2953,5167, 278, 899, 253, 402, 572, 504, 493,1339, # 1504
5168,4006,1275,4574,2582,2558,5169,3706,3049,3102,2253, 565,1334,2722, 863, 41, # 1520
5170,5171,4575,5172,1657,2338, 19, 463,2760,4251, 606,5173,2999,3289,1087,2085, # 1536
1323,2662,3000,5174,1631,1623,1750,4252,2691,5175,2878, 791,2723,2663,2339, 232, # 1552
2421,5176,3001,1498,5177,2664,2630, 755,1366,3707,3290,3151,2026,1609, 119,1918, # 1568
3474, 862,1026,4253,5178,4007,3839,4576,4008,4577,2265,1952,2477,5179,1125, 817, # 1584
4254,4255,4009,1513,1766,2041,1487,4256,3050,3291,2837,3840,3152,5180,5181,1507, # 1600
5182,2692, 733, 40,1632,1106,2879, 345,4257, 841,2531, 230,4578,3002,1847,3292, # 1616
3475,5183,1263, 986,3476,5184, 735, 879, 254,1137, 857, 622,1300,1180,1388,1562, # 1632
4010,4011,2954, 967,2761,2665,1349, 592,2134,1692,3361,3003,1995,4258,1679,4012, # 1648
1902,2188,5185, 739,3708,2724,1296,1290,5186,4259,2201,2202,1922,1563,2605,2559, # 1664
1871,2762,3004,5187, 435,5188, 343,1108, 596, 17,1751,4579,2239,3477,3709,5189, # 1680
4580, 294,3582,2955,1693, 477, 979, 281,2042,3583, 643,2043,3710,2631,2795,2266, # 1696
1031,2340,2135,2303,3584,4581, 367,1249,2560,5190,3585,5191,4582,1283,3362,2005, # 1712
240,1762,3363,4583,4584, 836,1069,3153, 474,5192,2149,2532, 268,3586,5193,3219, # 1728
1521,1284,5194,1658,1546,4260,5195,3587,3588,5196,4261,3364,2693,1685,4262, 961, # 1744
1673,2632, 190,2006,2203,3841,4585,4586,5197, 570,2504,3711,1490,5198,4587,2633, # 1760
3293,1957,4588, 584,1514, 396,1045,1945,5199,4589,1968,2449,5200,5201,4590,4013, # 1776
619,5202,3154,3294, 215,2007,2796,2561,3220,4591,3221,4592, 763,4263,3842,4593, # 1792
5203,5204,1958,1767,2956,3365,3712,1174, 452,1477,4594,3366,3155,5205,2838,1253, # 1808
2387,2189,1091,2290,4264, 492,5206, 638,1169,1825,2136,1752,4014, 648, 926,1021, # 1824
1324,4595, 520,4596, 997, 847,1007, 892,4597,3843,2267,1872,3713,2405,1785,4598, # 1840
1953,2957,3103,3222,1728,4265,2044,3714,4599,2008,1701,3156,1551, 30,2268,4266, # 1856
5207,2027,4600,3589,5208, 501,5209,4267, 594,3478,2166,1822,3590,3479,3591,3223, # 1872
829,2839,4268,5210,1680,3157,1225,4269,5211,3295,4601,4270,3158,2341,5212,4602, # 1888
4271,5213,4015,4016,5214,1848,2388,2606,3367,5215,4603, 374,4017, 652,4272,4273, # 1904
375,1140, 798,5216,5217,5218,2366,4604,2269, 546,1659, 138,3051,2450,4605,5219, # 1920
2254, 612,1849, 910, 796,3844,1740,1371, 825,3845,3846,5220,2920,2562,5221, 692, # 1936
444,3052,2634, 801,4606,4274,5222,1491, 244,1053,3053,4275,4276, 340,5223,4018, # 1952
1041,3005, 293,1168, 87,1357,5224,1539, 959,5225,2240, 721, 694,4277,3847, 219, # 1968
1478, 644,1417,3368,2666,1413,1401,1335,1389,4019,5226,5227,3006,2367,3159,1826, # 1984
730,1515, 184,2840, 66,4607,5228,1660,2958, 246,3369, 378,1457, 226,3480, 975, # 2000
4020,2959,1264,3592, 674, 696,5229, 163,5230,1141,2422,2167, 713,3593,3370,4608, # 2016
4021,5231,5232,1186, 15,5233,1079,1070,5234,1522,3224,3594, 276,1050,2725, 758, # 2032
1126, 653,2960,3296,5235,2342, 889,3595,4022,3104,3007, 903,1250,4609,4023,3481, # 2048
3596,1342,1681,1718, 766,3297, 286, 89,2961,3715,5236,1713,5237,2607,3371,3008, # 2064
5238,2962,2219,3225,2880,5239,4610,2505,2533, 181, 387,1075,4024, 731,2190,3372, # 2080
5240,3298, 310, 313,3482,2304, 770,4278, 54,3054, 189,4611,3105,3848,4025,5241, # 2096
1230,1617,1850, 355,3597,4279,4612,3373, 111,4280,3716,1350,3160,3483,3055,4281, # 2112
2150,3299,3598,5242,2797,4026,4027,3009, 722,2009,5243,1071, 247,1207,2343,2478, # 2128
1378,4613,2010, 864,1437,1214,4614, 373,3849,1142,2220, 667,4615, 442,2763,2563, # 2144
3850,4028,1969,4282,3300,1840, 837, 170,1107, 934,1336,1883,5244,5245,2119,4283, # 2160
2841, 743,1569,5246,4616,4284, 582,2389,1418,3484,5247,1803,5248, 357,1395,1729, # 2176
3717,3301,2423,1564,2241,5249,3106,3851,1633,4617,1114,2086,4285,1532,5250, 482, # 2192
2451,4618,5251,5252,1492, 833,1466,5253,2726,3599,1641,2842,5254,1526,1272,3718, # 2208
4286,1686,1795, 416,2564,1903,1954,1804,5255,3852,2798,3853,1159,2321,5256,2881, # 2224
4619,1610,1584,3056,2424,2764, 443,3302,1163,3161,5257,5258,4029,5259,4287,2506, # 2240
3057,4620,4030,3162,2104,1647,3600,2011,1873,4288,5260,4289, 431,3485,5261, 250, # 2256
97, 81,4290,5262,1648,1851,1558, 160, 848,5263, 866, 740,1694,5264,2204,2843, # 2272
3226,4291,4621,3719,1687, 950,2479, 426, 469,3227,3720,3721,4031,5265,5266,1188, # 2288
424,1996, 861,3601,4292,3854,2205,2694, 168,1235,3602,4293,5267,2087,1674,4622, # 2304
3374,3303, 220,2565,1009,5268,3855, 670,3010, 332,1208, 717,5269,5270,3603,2452, # 2320
4032,3375,5271, 513,5272,1209,2882,3376,3163,4623,1080,5273,5274,5275,5276,2534, # 2336
3722,3604, 815,1587,4033,4034,5277,3605,3486,3856,1254,4624,1328,3058,1390,4035, # 2352
1741,4036,3857,4037,5278, 236,3858,2453,3304,5279,5280,3723,3859,1273,3860,4625, # 2368
5281, 308,5282,4626, 245,4627,1852,2480,1307,2583, 430, 715,2137,2454,5283, 270, # 2384
199,2883,4038,5284,3606,2727,1753, 761,1754, 725,1661,1841,4628,3487,3724,5285, # 2400
5286, 587, 14,3305, 227,2608, 326, 480,2270, 943,2765,3607, 291, 650,1884,5287, # 2416
1702,1226, 102,1547, 62,3488, 904,4629,3489,1164,4294,5288,5289,1224,1548,2766, # 2432
391, 498,1493,5290,1386,1419,5291,2056,1177,4630, 813, 880,1081,2368, 566,1145, # 2448
4631,2291,1001,1035,2566,2609,2242, 394,1286,5292,5293,2069,5294, 86,1494,1730, # 2464
4039, 491,1588, 745, 897,2963, 843,3377,4040,2767,2884,3306,1768, 998,2221,2070, # 2480
397,1827,1195,1970,3725,3011,3378, 284,5295,3861,2507,2138,2120,1904,5296,4041, # 2496
2151,4042,4295,1036,3490,1905, 114,2567,4296, 209,1527,5297,5298,2964,2844,2635, # 2512
2390,2728,3164, 812,2568,5299,3307,5300,1559, 737,1885,3726,1210, 885, 28,2695, # 2528
3608,3862,5301,4297,1004,1780,4632,5302, 346,1982,2222,2696,4633,3863,1742, 797, # 2544
1642,4043,1934,1072,1384,2152, 896,4044,3308,3727,3228,2885,3609,5303,2569,1959, # 2560
4634,2455,1786,5304,5305,5306,4045,4298,1005,1308,3728,4299,2729,4635,4636,1528, # 2576
2610, 161,1178,4300,1983, 987,4637,1101,4301, 631,4046,1157,3229,2425,1343,1241, # 2592
1016,2243,2570, 372, 877,2344,2508,1160, 555,1935, 911,4047,5307, 466,1170, 169, # 2608
1051,2921,2697,3729,2481,3012,1182,2012,2571,1251,2636,5308, 992,2345,3491,1540, # 2624
2730,1201,2071,2406,1997,2482,5309,4638, 528,1923,2191,1503,1874,1570,2369,3379, # 2640
3309,5310, 557,1073,5311,1828,3492,2088,2271,3165,3059,3107, 767,3108,2799,4639, # 2656
1006,4302,4640,2346,1267,2179,3730,3230, 778,4048,3231,2731,1597,2667,5312,4641, # 2672
5313,3493,5314,5315,5316,3310,2698,1433,3311, 131, 95,1504,4049, 723,4303,3166, # 2688
1842,3610,2768,2192,4050,2028,2105,3731,5317,3013,4051,1218,5318,3380,3232,4052, # 2704
4304,2584, 248,1634,3864, 912,5319,2845,3732,3060,3865, 654, 53,5320,3014,5321, # 2720
1688,4642, 777,3494,1032,4053,1425,5322, 191, 820,2121,2846, 971,4643, 931,3233, # 2736
135, 664, 783,3866,1998, 772,2922,1936,4054,3867,4644,2923,3234, 282,2732, 640, # 2752
1372,3495,1127, 922, 325,3381,5323,5324, 711,2045,5325,5326,4055,2223,2800,1937, # 2768
4056,3382,2224,2255,3868,2305,5327,4645,3869,1258,3312,4057,3235,2139,2965,4058, # 2784
4059,5328,2225, 258,3236,4646, 101,1227,5329,3313,1755,5330,1391,3314,5331,2924, # 2800
2057, 893,5332,5333,5334,1402,4305,2347,5335,5336,3237,3611,5337,5338, 878,1325, # 2816
1781,2801,4647, 259,1385,2585, 744,1183,2272,4648,5339,4060,2509,5340, 684,1024, # 2832
4306,5341, 472,3612,3496,1165,3315,4061,4062, 322,2153, 881, 455,1695,1152,1340, # 2848
660, 554,2154,4649,1058,4650,4307, 830,1065,3383,4063,4651,1924,5342,1703,1919, # 2864
5343, 932,2273, 122,5344,4652, 947, 677,5345,3870,2637, 297,1906,1925,2274,4653, # 2880
2322,3316,5346,5347,4308,5348,4309, 84,4310, 112, 989,5349, 547,1059,4064, 701, # 2896
3613,1019,5350,4311,5351,3497, 942, 639, 457,2306,2456, 993,2966, 407, 851, 494, # 2912
4654,3384, 927,5352,1237,5353,2426,3385, 573,4312, 680, 921,2925,1279,1875, 285, # 2928
790,1448,1984, 719,2168,5354,5355,4655,4065,4066,1649,5356,1541, 563,5357,1077, # 2944
5358,3386,3061,3498, 511,3015,4067,4068,3733,4069,1268,2572,3387,3238,4656,4657, # 2960
5359, 535,1048,1276,1189,2926,2029,3167,1438,1373,2847,2967,1134,2013,5360,4313, # 2976
1238,2586,3109,1259,5361, 700,5362,2968,3168,3734,4314,5363,4315,1146,1876,1907, # 2992
4658,2611,4070, 781,2427, 132,1589, 203, 147, 273,2802,2407, 898,1787,2155,4071, # 3008
4072,5364,3871,2803,5365,5366,4659,4660,5367,3239,5368,1635,3872, 965,5369,1805, # 3024
2699,1516,3614,1121,1082,1329,3317,4073,1449,3873, 65,1128,2848,2927,2769,1590, # 3040
3874,5370,5371, 12,2668, 45, 976,2587,3169,4661, 517,2535,1013,1037,3240,5372, # 3056
3875,2849,5373,3876,5374,3499,5375,2612, 614,1999,2323,3877,3110,2733,2638,5376, # 3072
2588,4316, 599,1269,5377,1811,3735,5378,2700,3111, 759,1060, 489,1806,3388,3318, # 3088
1358,5379,5380,2391,1387,1215,2639,2256, 490,5381,5382,4317,1759,2392,2348,5383, # 3104
4662,3878,1908,4074,2640,1807,3241,4663,3500,3319,2770,2349, 874,5384,5385,3501, # 3120
3736,1859, 91,2928,3737,3062,3879,4664,5386,3170,4075,2669,5387,3502,1202,1403, # 3136
3880,2969,2536,1517,2510,4665,3503,2511,5388,4666,5389,2701,1886,1495,1731,4076, # 3152
2370,4667,5390,2030,5391,5392,4077,2702,1216, 237,2589,4318,2324,4078,3881,4668, # 3168
4669,2703,3615,3504, 445,4670,5393,5394,5395,5396,2771, 61,4079,3738,1823,4080, # 3184
5397, 687,2046, 935, 925, 405,2670, 703,1096,1860,2734,4671,4081,1877,1367,2704, # 3200
3389, 918,2106,1782,2483, 334,3320,1611,1093,4672, 564,3171,3505,3739,3390, 945, # 3216
2641,2058,4673,5398,1926, 872,4319,5399,3506,2705,3112, 349,4320,3740,4082,4674, # 3232
3882,4321,3741,2156,4083,4675,4676,4322,4677,2408,2047, 782,4084, 400, 251,4323, # 3248
1624,5400,5401, 277,3742, 299,1265, 476,1191,3883,2122,4324,4325,1109, 205,5402, # 3264
2590,1000,2157,3616,1861,5403,5404,5405,4678,5406,4679,2573, 107,2484,2158,4085, # 3280
3507,3172,5407,1533, 541,1301, 158, 753,4326,2886,3617,5408,1696, 370,1088,4327, # 3296
4680,3618, 579, 327, 440, 162,2244, 269,1938,1374,3508, 968,3063, 56,1396,3113, # 3312
2107,3321,3391,5409,1927,2159,4681,3016,5410,3619,5411,5412,3743,4682,2485,5413, # 3328
2804,5414,1650,4683,5415,2613,5416,5417,4086,2671,3392,1149,3393,4087,3884,4088, # 3344
5418,1076, 49,5419, 951,3242,3322,3323, 450,2850, 920,5420,1812,2805,2371,4328, # 3360
1909,1138,2372,3885,3509,5421,3243,4684,1910,1147,1518,2428,4685,3886,5422,4686, # 3376
2393,2614, 260,1796,3244,5423,5424,3887,3324, 708,5425,3620,1704,5426,3621,1351, # 3392
1618,3394,3017,1887, 944,4329,3395,4330,3064,3396,4331,5427,3744, 422, 413,1714, # 3408
3325, 500,2059,2350,4332,2486,5428,1344,1911, 954,5429,1668,5430,5431,4089,2409, # 3424
4333,3622,3888,4334,5432,2307,1318,2512,3114, 133,3115,2887,4687, 629, 31,2851, # 3440
2706,3889,4688, 850, 949,4689,4090,2970,1732,2089,4335,1496,1853,5433,4091, 620, # 3456
3245, 981,1242,3745,3397,1619,3746,1643,3326,2140,2457,1971,1719,3510,2169,5434, # 3472
3246,5435,5436,3398,1829,5437,1277,4690,1565,2048,5438,1636,3623,3116,5439, 869, # 3488
2852, 655,3890,3891,3117,4092,3018,3892,1310,3624,4691,5440,5441,5442,1733, 558, # 3504
4692,3747, 335,1549,3065,1756,4336,3748,1946,3511,1830,1291,1192, 470,2735,2108, # 3520
2806, 913,1054,4093,5443,1027,5444,3066,4094,4693, 982,2672,3399,3173,3512,3247, # 3536
3248,1947,2807,5445, 571,4694,5446,1831,5447,3625,2591,1523,2429,5448,2090, 984, # 3552
4695,3749,1960,5449,3750, 852, 923,2808,3513,3751, 969,1519, 999,2049,2325,1705, # 3568
5450,3118, 615,1662, 151, 597,4095,2410,2326,1049, 275,4696,3752,4337, 568,3753, # 3584
3626,2487,4338,3754,5451,2430,2275, 409,3249,5452,1566,2888,3514,1002, 769,2853, # 3600
194,2091,3174,3755,2226,3327,4339, 628,1505,5453,5454,1763,2180,3019,4096, 521, # 3616
1161,2592,1788,2206,2411,4697,4097,1625,4340,4341, 412, 42,3119, 464,5455,2642, # 3632
4698,3400,1760,1571,2889,3515,2537,1219,2207,3893,2643,2141,2373,4699,4700,3328, # 3648
1651,3401,3627,5456,5457,3628,2488,3516,5458,3756,5459,5460,2276,2092, 460,5461, # 3664
4701,5462,3020, 962, 588,3629, 289,3250,2644,1116, 52,5463,3067,1797,5464,5465, # 3680
5466,1467,5467,1598,1143,3757,4342,1985,1734,1067,4702,1280,3402, 465,4703,1572, # 3696
510,5468,1928,2245,1813,1644,3630,5469,4704,3758,5470,5471,2673,1573,1534,5472, # 3712
5473, 536,1808,1761,3517,3894,3175,2645,5474,5475,5476,4705,3518,2929,1912,2809, # 3728
5477,3329,1122, 377,3251,5478, 360,5479,5480,4343,1529, 551,5481,2060,3759,1769, # 3744
2431,5482,2930,4344,3330,3120,2327,2109,2031,4706,1404, 136,1468,1479, 672,1171, # 3760
3252,2308, 271,3176,5483,2772,5484,2050, 678,2736, 865,1948,4707,5485,2014,4098, # 3776
2971,5486,2737,2227,1397,3068,3760,4708,4709,1735,2931,3403,3631,5487,3895, 509, # 3792
2854,2458,2890,3896,5488,5489,3177,3178,4710,4345,2538,4711,2309,1166,1010, 552, # 3808
681,1888,5490,5491,2972,2973,4099,1287,1596,1862,3179, 358, 453, 736, 175, 478, # 3824
1117, 905,1167,1097,5492,1854,1530,5493,1706,5494,2181,3519,2292,3761,3520,3632, # 3840
4346,2093,4347,5495,3404,1193,2489,4348,1458,2193,2208,1863,1889,1421,3331,2932, # 3856
3069,2182,3521, 595,2123,5496,4100,5497,5498,4349,1707,2646, 223,3762,1359, 751, # 3872
3121, 183,3522,5499,2810,3021, 419,2374, 633, 704,3897,2394, 241,5500,5501,5502, # 3888
838,3022,3763,2277,2773,2459,3898,1939,2051,4101,1309,3122,2246,1181,5503,1136, # 3904
2209,3899,2375,1446,4350,2310,4712,5504,5505,4351,1055,2615, 484,3764,5506,4102, # 3920
625,4352,2278,3405,1499,4353,4103,5507,4104,4354,3253,2279,2280,3523,5508,5509, # 3936
2774, 808,2616,3765,3406,4105,4355,3123,2539, 526,3407,3900,4356, 955,5510,1620, # 3952
4357,2647,2432,5511,1429,3766,1669,1832, 994, 928,5512,3633,1260,5513,5514,5515, # 3968
1949,2293, 741,2933,1626,4358,2738,2460, 867,1184, 362,3408,1392,5516,5517,4106, # 3984
4359,1770,1736,3254,2934,4713,4714,1929,2707,1459,1158,5518,3070,3409,2891,1292, # 4000
1930,2513,2855,3767,1986,1187,2072,2015,2617,4360,5519,2574,2514,2170,3768,2490, # 4016
3332,5520,3769,4715,5521,5522, 666,1003,3023,1022,3634,4361,5523,4716,1814,2257, # 4032
574,3901,1603, 295,1535, 705,3902,4362, 283, 858, 417,5524,5525,3255,4717,4718, # 4048
3071,1220,1890,1046,2281,2461,4107,1393,1599, 689,2575, 388,4363,5526,2491, 802, # 4064
5527,2811,3903,2061,1405,2258,5528,4719,3904,2110,1052,1345,3256,1585,5529, 809, # 4080
5530,5531,5532, 575,2739,3524, 956,1552,1469,1144,2328,5533,2329,1560,2462,3635, # 4096
3257,4108, 616,2210,4364,3180,2183,2294,5534,1833,5535,3525,4720,5536,1319,3770, # 4112
3771,1211,3636,1023,3258,1293,2812,5537,5538,5539,3905, 607,2311,3906, 762,2892, # 4128
1439,4365,1360,4721,1485,3072,5540,4722,1038,4366,1450,2062,2648,4367,1379,4723, # 4144
2593,5541,5542,4368,1352,1414,2330,2935,1172,5543,5544,3907,3908,4724,1798,1451, # 4160
5545,5546,5547,5548,2936,4109,4110,2492,2351, 411,4111,4112,3637,3333,3124,4725, # 4176
1561,2674,1452,4113,1375,5549,5550, 47,2974, 316,5551,1406,1591,2937,3181,5552, # 4192
1025,2142,3125,3182, 354,2740, 884,2228,4369,2412, 508,3772, 726,3638, 996,2433, # 4208
3639, 729,5553, 392,2194,1453,4114,4726,3773,5554,5555,2463,3640,2618,1675,2813, # 4224
919,2352,2975,2353,1270,4727,4115, 73,5556,5557, 647,5558,3259,2856,2259,1550, # 4240
1346,3024,5559,1332, 883,3526,5560,5561,5562,5563,3334,2775,5564,1212, 831,1347, # 4256
4370,4728,2331,3909,1864,3073, 720,3910,4729,4730,3911,5565,4371,5566,5567,4731, # 4272
5568,5569,1799,4732,3774,2619,4733,3641,1645,2376,4734,5570,2938, 669,2211,2675, # 4288
2434,5571,2893,5572,5573,1028,3260,5574,4372,2413,5575,2260,1353,5576,5577,4735, # 4304
3183, 518,5578,4116,5579,4373,1961,5580,2143,4374,5581,5582,3025,2354,2355,3912, # 4320
516,1834,1454,4117,2708,4375,4736,2229,2620,1972,1129,3642,5583,2776,5584,2976, # 4336
1422, 577,1470,3026,1524,3410,5585,5586, 432,4376,3074,3527,5587,2594,1455,2515, # 4352
2230,1973,1175,5588,1020,2741,4118,3528,4737,5589,2742,5590,1743,1361,3075,3529, # 4368
2649,4119,4377,4738,2295, 895, 924,4378,2171, 331,2247,3076, 166,1627,3077,1098, # 4384
5591,1232,2894,2231,3411,4739, 657, 403,1196,2377, 542,3775,3412,1600,4379,3530, # 4400
5592,4740,2777,3261, 576, 530,1362,4741,4742,2540,2676,3776,4120,5593, 842,3913, # 4416
5594,2814,2032,1014,4121, 213,2709,3413, 665, 621,4380,5595,3777,2939,2435,5596, # 4432
2436,3335,3643,3414,4743,4381,2541,4382,4744,3644,1682,4383,3531,1380,5597, 724, # 4448
2282, 600,1670,5598,1337,1233,4745,3126,2248,5599,1621,4746,5600, 651,4384,5601, # 4464
1612,4385,2621,5602,2857,5603,2743,2312,3078,5604, 716,2464,3079, 174,1255,2710, # 4480
4122,3645, 548,1320,1398, 728,4123,1574,5605,1891,1197,3080,4124,5606,3081,3082, # 4496
3778,3646,3779, 747,5607, 635,4386,4747,5608,5609,5610,4387,5611,5612,4748,5613, # 4512
3415,4749,2437, 451,5614,3780,2542,2073,4388,2744,4389,4125,5615,1764,4750,5616, # 4528
4390, 350,4751,2283,2395,2493,5617,4391,4126,2249,1434,4127, 488,4752, 458,4392, # 4544
4128,3781, 771,1330,2396,3914,2576,3184,2160,2414,1553,2677,3185,4393,5618,2494, # 4560
2895,2622,1720,2711,4394,3416,4753,5619,2543,4395,5620,3262,4396,2778,5621,2016, # 4576
2745,5622,1155,1017,3782,3915,5623,3336,2313, 201,1865,4397,1430,5624,4129,5625, # 4592
5626,5627,5628,5629,4398,1604,5630, 414,1866, 371,2595,4754,4755,3532,2017,3127, # 4608
4756,1708, 960,4399, 887, 389,2172,1536,1663,1721,5631,2232,4130,2356,2940,1580, # 4624
5632,5633,1744,4757,2544,4758,4759,5634,4760,5635,2074,5636,4761,3647,3417,2896, # 4640
4400,5637,4401,2650,3418,2815, 673,2712,2465, 709,3533,4131,3648,4402,5638,1148, # 4656
502, 634,5639,5640,1204,4762,3649,1575,4763,2623,3783,5641,3784,3128, 948,3263, # 4672
121,1745,3916,1110,5642,4403,3083,2516,3027,4132,3785,1151,1771,3917,1488,4133, # 4688
1987,5643,2438,3534,5644,5645,2094,5646,4404,3918,1213,1407,2816, 531,2746,2545, # 4704
3264,1011,1537,4764,2779,4405,3129,1061,5647,3786,3787,1867,2897,5648,2018, 120, # 4720
4406,4407,2063,3650,3265,2314,3919,2678,3419,1955,4765,4134,5649,3535,1047,2713, # 4736
1266,5650,1368,4766,2858, 649,3420,3920,2546,2747,1102,2859,2679,5651,5652,2000, # 4752
5653,1111,3651,2977,5654,2495,3921,3652,2817,1855,3421,3788,5655,5656,3422,2415, # 4768
2898,3337,3266,3653,5657,2577,5658,3654,2818,4135,1460, 856,5659,3655,5660,2899, # 4784
2978,5661,2900,3922,5662,4408, 632,2517, 875,3923,1697,3924,2296,5663,5664,4767, # 4800
3028,1239, 580,4768,4409,5665, 914, 936,2075,1190,4136,1039,2124,5666,5667,5668, # 4816
5669,3423,1473,5670,1354,4410,3925,4769,2173,3084,4137, 915,3338,4411,4412,3339, # 4832
1605,1835,5671,2748, 398,3656,4413,3926,4138, 328,1913,2860,4139,3927,1331,4414, # 4848
3029, 937,4415,5672,3657,4140,4141,3424,2161,4770,3425, 524, 742, 538,3085,1012, # 4864
5673,5674,3928,2466,5675, 658,1103, 225,3929,5676,5677,4771,5678,4772,5679,3267, # 4880
1243,5680,4142, 963,2250,4773,5681,2714,3658,3186,5682,5683,2596,2332,5684,4774, # 4896
5685,5686,5687,3536, 957,3426,2547,2033,1931,2941,2467, 870,2019,3659,1746,2780, # 4912
2781,2439,2468,5688,3930,5689,3789,3130,3790,3537,3427,3791,5690,1179,3086,5691, # 4928
3187,2378,4416,3792,2548,3188,3131,2749,4143,5692,3428,1556,2549,2297, 977,2901, # 4944
2034,4144,1205,3429,5693,1765,3430,3189,2125,1271, 714,1689,4775,3538,5694,2333, # 4960
3931, 533,4417,3660,2184, 617,5695,2469,3340,3539,2315,5696,5697,3190,5698,5699, # 4976
3932,1988, 618, 427,2651,3540,3431,5700,5701,1244,1690,5702,2819,4418,4776,5703, # 4992
3541,4777,5704,2284,1576, 473,3661,4419,3432, 972,5705,3662,5706,3087,5707,5708, # 5008
4778,4779,5709,3793,4145,4146,5710, 153,4780, 356,5711,1892,2902,4420,2144, 408, # 5024
803,2357,5712,3933,5713,4421,1646,2578,2518,4781,4782,3934,5714,3935,4422,5715, # 5040
2416,3433, 752,5716,5717,1962,3341,2979,5718, 746,3030,2470,4783,4423,3794, 698, # 5056
4784,1893,4424,3663,2550,4785,3664,3936,5719,3191,3434,5720,1824,1302,4147,2715, # 5072
3937,1974,4425,5721,4426,3192, 823,1303,1288,1236,2861,3542,4148,3435, 774,3938, # 5088
5722,1581,4786,1304,2862,3939,4787,5723,2440,2162,1083,3268,4427,4149,4428, 344, # 5104
1173, 288,2316, 454,1683,5724,5725,1461,4788,4150,2597,5726,5727,4789, 985, 894, # 5120
5728,3436,3193,5729,1914,2942,3795,1989,5730,2111,1975,5731,4151,5732,2579,1194, # 5136
425,5733,4790,3194,1245,3796,4429,5734,5735,2863,5736, 636,4791,1856,3940, 760, # 5152
1800,5737,4430,2212,1508,4792,4152,1894,1684,2298,5738,5739,4793,4431,4432,2213, # 5168
479,5740,5741, 832,5742,4153,2496,5743,2980,2497,3797, 990,3132, 627,1815,2652, # 5184
4433,1582,4434,2126,2112,3543,4794,5744, 799,4435,3195,5745,4795,2113,1737,3031, # 5200
1018, 543, 754,4436,3342,1676,4796,4797,4154,4798,1489,5746,3544,5747,2624,2903, # 5216
4155,5748,5749,2981,5750,5751,5752,5753,3196,4799,4800,2185,1722,5754,3269,3270, # 5232
1843,3665,1715, 481, 365,1976,1857,5755,5756,1963,2498,4801,5757,2127,3666,3271, # 5248
433,1895,2064,2076,5758, 602,2750,5759,5760,5761,5762,5763,3032,1628,3437,5764, # 5264
3197,4802,4156,2904,4803,2519,5765,2551,2782,5766,5767,5768,3343,4804,2905,5769, # 5280
4805,5770,2864,4806,4807,1221,2982,4157,2520,5771,5772,5773,1868,1990,5774,5775, # 5296
5776,1896,5777,5778,4808,1897,4158, 318,5779,2095,4159,4437,5780,5781, 485,5782, # 5312
938,3941, 553,2680, 116,5783,3942,3667,5784,3545,2681,2783,3438,3344,2820,5785, # 5328
3668,2943,4160,1747,2944,2983,5786,5787, 207,5788,4809,5789,4810,2521,5790,3033, # 5344
890,3669,3943,5791,1878,3798,3439,5792,2186,2358,3440,1652,5793,5794,5795, 941, # 5360
2299, 208,3546,4161,2020, 330,4438,3944,2906,2499,3799,4439,4811,5796,5797,5798, # 5376
)

View File

@ -0,0 +1,47 @@
######################## BEGIN LICENSE BLOCK ########################
# The Original Code is Mozilla Communicator client code.
#
# The Initial Developer of the Original Code is
# Netscape Communications Corporation.
# Portions created by the Initial Developer are Copyright (C) 1998
# the Initial Developer. All Rights Reserved.
#
# Contributor(s):
# Mark Pilgrim - port to Python
#
# This library is free software; you can redistribute it and/or
# modify it under the terms of the GNU Lesser General Public
# License as published by the Free Software Foundation; either
# version 2.1 of the License, or (at your option) any later version.
#
# This library is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
# Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public
# License along with this library; if not, write to the Free Software
# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
# 02110-1301 USA
######################### END LICENSE BLOCK #########################
from .mbcharsetprober import MultiByteCharSetProber
from .codingstatemachine import CodingStateMachine
from .chardistribution import Big5DistributionAnalysis
from .mbcssm import BIG5_SM_MODEL
class Big5Prober(MultiByteCharSetProber):
def __init__(self):
super(Big5Prober, self).__init__()
self.coding_sm = CodingStateMachine(BIG5_SM_MODEL)
self.distribution_analyzer = Big5DistributionAnalysis()
self.reset()
@property
def charset_name(self):
return "Big5"
@property
def language(self):
return "Chinese"

View File

@ -0,0 +1,233 @@
######################## BEGIN LICENSE BLOCK ########################
# The Original Code is Mozilla Communicator client code.
#
# The Initial Developer of the Original Code is
# Netscape Communications Corporation.
# Portions created by the Initial Developer are Copyright (C) 1998
# the Initial Developer. All Rights Reserved.
#
# Contributor(s):
# Mark Pilgrim - port to Python
#
# This library is free software; you can redistribute it and/or
# modify it under the terms of the GNU Lesser General Public
# License as published by the Free Software Foundation; either
# version 2.1 of the License, or (at your option) any later version.
#
# This library is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
# Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public
# License along with this library; if not, write to the Free Software
# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
# 02110-1301 USA
######################### END LICENSE BLOCK #########################
from .euctwfreq import (EUCTW_CHAR_TO_FREQ_ORDER, EUCTW_TABLE_SIZE,
EUCTW_TYPICAL_DISTRIBUTION_RATIO)
from .euckrfreq import (EUCKR_CHAR_TO_FREQ_ORDER, EUCKR_TABLE_SIZE,
EUCKR_TYPICAL_DISTRIBUTION_RATIO)
from .gb2312freq import (GB2312_CHAR_TO_FREQ_ORDER, GB2312_TABLE_SIZE,
GB2312_TYPICAL_DISTRIBUTION_RATIO)
from .big5freq import (BIG5_CHAR_TO_FREQ_ORDER, BIG5_TABLE_SIZE,
BIG5_TYPICAL_DISTRIBUTION_RATIO)
from .jisfreq import (JIS_CHAR_TO_FREQ_ORDER, JIS_TABLE_SIZE,
JIS_TYPICAL_DISTRIBUTION_RATIO)
class CharDistributionAnalysis(object):
ENOUGH_DATA_THRESHOLD = 1024
SURE_YES = 0.99
SURE_NO = 0.01
MINIMUM_DATA_THRESHOLD = 3
def __init__(self):
# Mapping table to get frequency order from char order (get from
# GetOrder())
self._char_to_freq_order = None
self._table_size = None # Size of above table
# This is a constant value which varies from language to language,
# used in calculating confidence. See
# http://www.mozilla.org/projects/intl/UniversalCharsetDetection.html
# for further detail.
self.typical_distribution_ratio = None
self._done = None
self._total_chars = None
self._freq_chars = None
self.reset()
def reset(self):
"""reset analyser, clear any state"""
# If this flag is set to True, detection is done and conclusion has
# been made
self._done = False
self._total_chars = 0 # Total characters encountered
# The number of characters whose frequency order is less than 512
self._freq_chars = 0
def feed(self, char, char_len):
"""feed a character with known length"""
if char_len == 2:
# we only care about 2-bytes character in our distribution analysis
order = self.get_order(char)
else:
order = -1
if order >= 0:
self._total_chars += 1
# order is valid
if order < self._table_size:
if 512 > self._char_to_freq_order[order]:
self._freq_chars += 1
def get_confidence(self):
"""return confidence based on existing data"""
# if we didn't receive any character in our consideration range,
# return negative answer
if self._total_chars <= 0 or self._freq_chars <= self.MINIMUM_DATA_THRESHOLD:
return self.SURE_NO
if self._total_chars != self._freq_chars:
r = (self._freq_chars / ((self._total_chars - self._freq_chars)
* self.typical_distribution_ratio))
if r < self.SURE_YES:
return r
# normalize confidence (we don't want to be 100% sure)
return self.SURE_YES
def got_enough_data(self):
# It is not necessary to receive all data to draw conclusion.
# For charset detection, certain amount of data is enough
return self._total_chars > self.ENOUGH_DATA_THRESHOLD
def get_order(self, byte_str):
# We do not handle characters based on the original encoding string,
# but convert this encoding string to a number, here called order.
# This allows multiple encodings of a language to share one frequency
# table.
return -1
class EUCTWDistributionAnalysis(CharDistributionAnalysis):
def __init__(self):
super(EUCTWDistributionAnalysis, self).__init__()
self._char_to_freq_order = EUCTW_CHAR_TO_FREQ_ORDER
self._table_size = EUCTW_TABLE_SIZE
self.typical_distribution_ratio = EUCTW_TYPICAL_DISTRIBUTION_RATIO
def get_order(self, byte_str):
# for euc-TW encoding, we are interested
# first byte range: 0xc4 -- 0xfe
# second byte range: 0xa1 -- 0xfe
# no validation needed here. State machine has done that
first_char = byte_str[0]
if first_char >= 0xC4:
return 94 * (first_char - 0xC4) + byte_str[1] - 0xA1
else:
return -1
class EUCKRDistributionAnalysis(CharDistributionAnalysis):
def __init__(self):
super(EUCKRDistributionAnalysis, self).__init__()
self._char_to_freq_order = EUCKR_CHAR_TO_FREQ_ORDER
self._table_size = EUCKR_TABLE_SIZE
self.typical_distribution_ratio = EUCKR_TYPICAL_DISTRIBUTION_RATIO
def get_order(self, byte_str):
# for euc-KR encoding, we are interested
# first byte range: 0xb0 -- 0xfe
# second byte range: 0xa1 -- 0xfe
# no validation needed here. State machine has done that
first_char = byte_str[0]
if first_char >= 0xB0:
return 94 * (first_char - 0xB0) + byte_str[1] - 0xA1
else:
return -1
class GB2312DistributionAnalysis(CharDistributionAnalysis):
def __init__(self):
super(GB2312DistributionAnalysis, self).__init__()
self._char_to_freq_order = GB2312_CHAR_TO_FREQ_ORDER
self._table_size = GB2312_TABLE_SIZE
self.typical_distribution_ratio = GB2312_TYPICAL_DISTRIBUTION_RATIO
def get_order(self, byte_str):
# for GB2312 encoding, we are interested
# first byte range: 0xb0 -- 0xfe
# second byte range: 0xa1 -- 0xfe
# no validation needed here. State machine has done that
first_char, second_char = byte_str[0], byte_str[1]
if (first_char >= 0xB0) and (second_char >= 0xA1):
return 94 * (first_char - 0xB0) + second_char - 0xA1
else:
return -1
class Big5DistributionAnalysis(CharDistributionAnalysis):
def __init__(self):
super(Big5DistributionAnalysis, self).__init__()
self._char_to_freq_order = BIG5_CHAR_TO_FREQ_ORDER
self._table_size = BIG5_TABLE_SIZE
self.typical_distribution_ratio = BIG5_TYPICAL_DISTRIBUTION_RATIO
def get_order(self, byte_str):
# for big5 encoding, we are interested
# first byte range: 0xa4 -- 0xfe
# second byte range: 0x40 -- 0x7e , 0xa1 -- 0xfe
# no validation needed here. State machine has done that
first_char, second_char = byte_str[0], byte_str[1]
if first_char >= 0xA4:
if second_char >= 0xA1:
return 157 * (first_char - 0xA4) + second_char - 0xA1 + 63
else:
return 157 * (first_char - 0xA4) + second_char - 0x40
else:
return -1
class SJISDistributionAnalysis(CharDistributionAnalysis):
def __init__(self):
super(SJISDistributionAnalysis, self).__init__()
self._char_to_freq_order = JIS_CHAR_TO_FREQ_ORDER
self._table_size = JIS_TABLE_SIZE
self.typical_distribution_ratio = JIS_TYPICAL_DISTRIBUTION_RATIO
def get_order(self, byte_str):
# for sjis encoding, we are interested
# first byte range: 0x81 -- 0x9f , 0xe0 -- 0xfe
# second byte range: 0x40 -- 0x7e, 0x81 -- oxfe
# no validation needed here. State machine has done that
first_char, second_char = byte_str[0], byte_str[1]
if (first_char >= 0x81) and (first_char <= 0x9F):
order = 188 * (first_char - 0x81)
elif (first_char >= 0xE0) and (first_char <= 0xEF):
order = 188 * (first_char - 0xE0 + 31)
else:
return -1
order = order + second_char - 0x40
if second_char > 0x7F:
order = -1
return order
class EUCJPDistributionAnalysis(CharDistributionAnalysis):
def __init__(self):
super(EUCJPDistributionAnalysis, self).__init__()
self._char_to_freq_order = JIS_CHAR_TO_FREQ_ORDER
self._table_size = JIS_TABLE_SIZE
self.typical_distribution_ratio = JIS_TYPICAL_DISTRIBUTION_RATIO
def get_order(self, byte_str):
# for euc-JP encoding, we are interested
# first byte range: 0xa0 -- 0xfe
# second byte range: 0xa1 -- 0xfe
# no validation needed here. State machine has done that
char = byte_str[0]
if char >= 0xA0:
return 94 * (char - 0xA1) + byte_str[1] - 0xa1
else:
return -1

View File

@ -0,0 +1,107 @@
######################## BEGIN LICENSE BLOCK ########################
# The Original Code is Mozilla Communicator client code.
#
# The Initial Developer of the Original Code is
# Netscape Communications Corporation.
# Portions created by the Initial Developer are Copyright (C) 1998
# the Initial Developer. All Rights Reserved.
#
# Contributor(s):
# Mark Pilgrim - port to Python
#
# This library is free software; you can redistribute it and/or
# modify it under the terms of the GNU Lesser General Public
# License as published by the Free Software Foundation; either
# version 2.1 of the License, or (at your option) any later version.
#
# This library is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
# Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public
# License along with this library; if not, write to the Free Software
# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
# 02110-1301 USA
######################### END LICENSE BLOCK #########################
from .enums import ProbingState
from .charsetprober import CharSetProber
class CharSetGroupProber(CharSetProber):
def __init__(self, lang_filter=None):
super(CharSetGroupProber, self).__init__(lang_filter=lang_filter)
self._active_num = 0
self.probers = []
self._best_guess_prober = None
def reset(self):
super(CharSetGroupProber, self).reset()
self._active_num = 0
for prober in self.probers:
if prober:
prober.reset()
prober.active = True
self._active_num += 1
self._best_guess_prober = None
@property
def charset_name(self):
if not self._best_guess_prober:
self.get_confidence()
if not self._best_guess_prober:
return None
return self._best_guess_prober.charset_name
@property
def language(self):
if not self._best_guess_prober:
self.get_confidence()
if not self._best_guess_prober:
return None
return self._best_guess_prober.language
def feed(self, byte_str):
for prober in self.probers:
if not prober:
continue
if not prober.active:
continue
state = prober.feed(byte_str)
if not state:
continue
if state == ProbingState.FOUND_IT:
self._best_guess_prober = prober
self._state = ProbingState.FOUND_IT
return self.state
elif state == ProbingState.NOT_ME:
prober.active = False
self._active_num -= 1
if self._active_num <= 0:
self._state = ProbingState.NOT_ME
return self.state
return self.state
def get_confidence(self):
state = self.state
if state == ProbingState.FOUND_IT:
return 0.99
elif state == ProbingState.NOT_ME:
return 0.01
best_conf = 0.0
self._best_guess_prober = None
for prober in self.probers:
if not prober:
continue
if not prober.active:
self.logger.debug('%s not active', prober.charset_name)
continue
conf = prober.get_confidence()
self.logger.debug('%s %s confidence = %s', prober.charset_name, prober.language, conf)
if best_conf < conf:
best_conf = conf
self._best_guess_prober = prober
if not self._best_guess_prober:
return 0.0
return best_conf

View File

@ -0,0 +1,145 @@
######################## BEGIN LICENSE BLOCK ########################
# The Original Code is Mozilla Universal charset detector code.
#
# The Initial Developer of the Original Code is
# Netscape Communications Corporation.
# Portions created by the Initial Developer are Copyright (C) 2001
# the Initial Developer. All Rights Reserved.
#
# Contributor(s):
# Mark Pilgrim - port to Python
# Shy Shalom - original C code
#
# This library is free software; you can redistribute it and/or
# modify it under the terms of the GNU Lesser General Public
# License as published by the Free Software Foundation; either
# version 2.1 of the License, or (at your option) any later version.
#
# This library is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
# Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public
# License along with this library; if not, write to the Free Software
# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
# 02110-1301 USA
######################### END LICENSE BLOCK #########################
import logging
import re
from .enums import ProbingState
class CharSetProber(object):
SHORTCUT_THRESHOLD = 0.95
def __init__(self, lang_filter=None):
self._state = None
self.lang_filter = lang_filter
self.logger = logging.getLogger(__name__)
def reset(self):
self._state = ProbingState.DETECTING
@property
def charset_name(self):
return None
def feed(self, buf):
pass
@property
def state(self):
return self._state
def get_confidence(self):
return 0.0
@staticmethod
def filter_high_byte_only(buf):
buf = re.sub(b'([\x00-\x7F])+', b' ', buf)
return buf
@staticmethod
def filter_international_words(buf):
"""
We define three types of bytes:
alphabet: english alphabets [a-zA-Z]
international: international characters [\x80-\xFF]
marker: everything else [^a-zA-Z\x80-\xFF]
The input buffer can be thought to contain a series of words delimited
by markers. This function works to filter all words that contain at
least one international character. All contiguous sequences of markers
are replaced by a single space ascii character.
This filter applies to all scripts which do not use English characters.
"""
filtered = bytearray()
# This regex expression filters out only words that have at-least one
# international character. The word may include one marker character at
# the end.
words = re.findall(b'[a-zA-Z]*[\x80-\xFF]+[a-zA-Z]*[^a-zA-Z\x80-\xFF]?',
buf)
for word in words:
filtered.extend(word[:-1])
# If the last character in the word is a marker, replace it with a
# space as markers shouldn't affect our analysis (they are used
# similarly across all languages and may thus have similar
# frequencies).
last_char = word[-1:]
if not last_char.isalpha() and last_char < b'\x80':
last_char = b' '
filtered.extend(last_char)
return filtered
@staticmethod
def filter_with_english_letters(buf):
"""
Returns a copy of ``buf`` that retains only the sequences of English
alphabet and high byte characters that are not between <> characters.
Also retains English alphabet and high byte characters immediately
before occurrences of >.
This filter can be applied to all scripts which contain both English
characters and extended ASCII characters, but is currently only used by
``Latin1Prober``.
"""
filtered = bytearray()
in_tag = False
prev = 0
for curr in range(len(buf)):
# Slice here to get bytes instead of an int with Python 3
buf_char = buf[curr:curr + 1]
# Check if we're coming out of or entering an HTML tag
if buf_char == b'>':
in_tag = False
elif buf_char == b'<':
in_tag = True
# If current character is not extended-ASCII and not alphabetic...
if buf_char < b'\x80' and not buf_char.isalpha():
# ...and we're not in a tag
if curr > prev and not in_tag:
# Keep everything after last non-extended-ASCII,
# non-alphabetic character
filtered.extend(buf[prev:curr])
# Output a space to delimit stretch we kept
filtered.extend(b' ')
prev = curr + 1
# If we're not in a tag...
if not in_tag:
# Keep everything after last non-extended-ASCII, non-alphabetic
# character
filtered.extend(buf[prev:])
return filtered

View File

@ -0,0 +1,84 @@
"""
Script which takes one or more file paths and reports on their detected
encodings
Example::
% chardetect somefile someotherfile
somefile: windows-1252 with confidence 0.5
someotherfile: ascii with confidence 1.0
If no paths are provided, it takes its input from stdin.
"""
from __future__ import absolute_import, print_function, unicode_literals
import argparse
import sys
from pip._vendor.chardet import __version__
from pip._vendor.chardet.compat import PY2
from pip._vendor.chardet.universaldetector import UniversalDetector
def description_of(lines, name='stdin'):
"""
Return a string describing the probable encoding of a file or
list of strings.
:param lines: The lines to get the encoding of.
:type lines: Iterable of bytes
:param name: Name of file or collection of lines
:type name: str
"""
u = UniversalDetector()
for line in lines:
line = bytearray(line)
u.feed(line)
# shortcut out of the loop to save reading further - particularly useful if we read a BOM.
if u.done:
break
u.close()
result = u.result
if PY2:
name = name.decode(sys.getfilesystemencoding(), 'ignore')
if result['encoding']:
return '{}: {} with confidence {}'.format(name, result['encoding'],
result['confidence'])
else:
return '{}: no result'.format(name)
def main(argv=None):
"""
Handles command line arguments and gets things started.
:param argv: List of arguments, as if specified on the command-line.
If None, ``sys.argv[1:]`` is used instead.
:type argv: list of str
"""
# Get command line arguments
parser = argparse.ArgumentParser(
description="Takes one or more file paths and reports their detected \
encodings")
parser.add_argument('input',
help='File whose encoding we would like to determine. \
(default: stdin)',
type=argparse.FileType('rb'), nargs='*',
default=[sys.stdin if PY2 else sys.stdin.buffer])
parser.add_argument('--version', action='version',
version='%(prog)s {}'.format(__version__))
args = parser.parse_args(argv)
for f in args.input:
if f.isatty():
print("You are running chardetect interactively. Press " +
"CTRL-D twice at the start of a blank line to signal the " +
"end of your input. If you want help, run chardetect " +
"--help\n", file=sys.stderr)
print(description_of(f, f.name))
if __name__ == '__main__':
main()

View File

@ -0,0 +1,88 @@
######################## BEGIN LICENSE BLOCK ########################
# The Original Code is mozilla.org code.
#
# The Initial Developer of the Original Code is
# Netscape Communications Corporation.
# Portions created by the Initial Developer are Copyright (C) 1998
# the Initial Developer. All Rights Reserved.
#
# Contributor(s):
# Mark Pilgrim - port to Python
#
# This library is free software; you can redistribute it and/or
# modify it under the terms of the GNU Lesser General Public
# License as published by the Free Software Foundation; either
# version 2.1 of the License, or (at your option) any later version.
#
# This library is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
# Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public
# License along with this library; if not, write to the Free Software
# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
# 02110-1301 USA
######################### END LICENSE BLOCK #########################
import logging
from .enums import MachineState
class CodingStateMachine(object):
"""
A state machine to verify a byte sequence for a particular encoding. For
each byte the detector receives, it will feed that byte to every active
state machine available, one byte at a time. The state machine changes its
state based on its previous state and the byte it receives. There are 3
states in a state machine that are of interest to an auto-detector:
START state: This is the state to start with, or a legal byte sequence
(i.e. a valid code point) for character has been identified.
ME state: This indicates that the state machine identified a byte sequence
that is specific to the charset it is designed for and that
there is no other possible encoding which can contain this byte
sequence. This will to lead to an immediate positive answer for
the detector.
ERROR state: This indicates the state machine identified an illegal byte
sequence for that encoding. This will lead to an immediate
negative answer for this encoding. Detector will exclude this
encoding from consideration from here on.
"""
def __init__(self, sm):
self._model = sm
self._curr_byte_pos = 0
self._curr_char_len = 0
self._curr_state = None
self.logger = logging.getLogger(__name__)
self.reset()
def reset(self):
self._curr_state = MachineState.START
def next_state(self, c):
# for each byte we get its class
# if it is first byte, we also get byte length
byte_class = self._model['class_table'][c]
if self._curr_state == MachineState.START:
self._curr_byte_pos = 0
self._curr_char_len = self._model['char_len_table'][byte_class]
# from byte's class and state_table, we get its next state
curr_state = (self._curr_state * self._model['class_factor']
+ byte_class)
self._curr_state = self._model['state_table'][curr_state]
self._curr_byte_pos += 1
return self._curr_state
def get_current_charlen(self):
return self._curr_char_len
def get_coding_state_machine(self):
return self._model['name']
@property
def language(self):
return self._model['language']

View File

@ -0,0 +1,36 @@
######################## BEGIN LICENSE BLOCK ########################
# Contributor(s):
# Dan Blanchard
# Ian Cordasco
#
# This library is free software; you can redistribute it and/or
# modify it under the terms of the GNU Lesser General Public
# License as published by the Free Software Foundation; either
# version 2.1 of the License, or (at your option) any later version.
#
# This library is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
# Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public
# License along with this library; if not, write to the Free Software
# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
# 02110-1301 USA
######################### END LICENSE BLOCK #########################
import sys
if sys.version_info < (3, 0):
PY2 = True
PY3 = False
string_types = (str, unicode)
text_type = unicode
iteritems = dict.iteritems
else:
PY2 = False
PY3 = True
string_types = (bytes, str)
text_type = str
iteritems = dict.items

View File

@ -0,0 +1,49 @@
######################## BEGIN LICENSE BLOCK ########################
# The Original Code is mozilla.org code.
#
# The Initial Developer of the Original Code is
# Netscape Communications Corporation.
# Portions created by the Initial Developer are Copyright (C) 1998
# the Initial Developer. All Rights Reserved.
#
# Contributor(s):
# Mark Pilgrim - port to Python
#
# This library is free software; you can redistribute it and/or
# modify it under the terms of the GNU Lesser General Public
# License as published by the Free Software Foundation; either
# version 2.1 of the License, or (at your option) any later version.
#
# This library is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
# Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public
# License along with this library; if not, write to the Free Software
# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
# 02110-1301 USA
######################### END LICENSE BLOCK #########################
from .chardistribution import EUCKRDistributionAnalysis
from .codingstatemachine import CodingStateMachine
from .mbcharsetprober import MultiByteCharSetProber
from .mbcssm import CP949_SM_MODEL
class CP949Prober(MultiByteCharSetProber):
def __init__(self):
super(CP949Prober, self).__init__()
self.coding_sm = CodingStateMachine(CP949_SM_MODEL)
# NOTE: CP949 is a superset of EUC-KR, so the distribution should be
# not different.
self.distribution_analyzer = EUCKRDistributionAnalysis()
self.reset()
@property
def charset_name(self):
return "CP949"
@property
def language(self):
return "Korean"

View File

@ -0,0 +1,76 @@
"""
All of the Enums that are used throughout the chardet package.
:author: Dan Blanchard (dan.blanchard@gmail.com)
"""
class InputState(object):
"""
This enum represents the different states a universal detector can be in.
"""
PURE_ASCII = 0
ESC_ASCII = 1
HIGH_BYTE = 2
class LanguageFilter(object):
"""
This enum represents the different language filters we can apply to a
``UniversalDetector``.
"""
CHINESE_SIMPLIFIED = 0x01
CHINESE_TRADITIONAL = 0x02
JAPANESE = 0x04
KOREAN = 0x08
NON_CJK = 0x10
ALL = 0x1F
CHINESE = CHINESE_SIMPLIFIED | CHINESE_TRADITIONAL
CJK = CHINESE | JAPANESE | KOREAN
class ProbingState(object):
"""
This enum represents the different states a prober can be in.
"""
DETECTING = 0
FOUND_IT = 1
NOT_ME = 2
class MachineState(object):
"""
This enum represents the different states a state machine can be in.
"""
START = 0
ERROR = 1
ITS_ME = 2
class SequenceLikelihood(object):
"""
This enum represents the likelihood of a character following the previous one.
"""
NEGATIVE = 0
UNLIKELY = 1
LIKELY = 2
POSITIVE = 3
@classmethod
def get_num_categories(cls):
""":returns: The number of likelihood categories in the enum."""
return 4
class CharacterCategory(object):
"""
This enum represents the different categories language models for
``SingleByteCharsetProber`` put characters into.
Anything less than CONTROL is considered a letter.
"""
UNDEFINED = 255
LINE_BREAK = 254
SYMBOL = 253
DIGIT = 252
CONTROL = 251

View File

@ -0,0 +1,101 @@
######################## BEGIN LICENSE BLOCK ########################
# The Original Code is mozilla.org code.
#
# The Initial Developer of the Original Code is
# Netscape Communications Corporation.
# Portions created by the Initial Developer are Copyright (C) 1998
# the Initial Developer. All Rights Reserved.
#
# Contributor(s):
# Mark Pilgrim - port to Python
#
# This library is free software; you can redistribute it and/or
# modify it under the terms of the GNU Lesser General Public
# License as published by the Free Software Foundation; either
# version 2.1 of the License, or (at your option) any later version.
#
# This library is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
# Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public
# License along with this library; if not, write to the Free Software
# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
# 02110-1301 USA
######################### END LICENSE BLOCK #########################
from .charsetprober import CharSetProber
from .codingstatemachine import CodingStateMachine
from .enums import LanguageFilter, ProbingState, MachineState
from .escsm import (HZ_SM_MODEL, ISO2022CN_SM_MODEL, ISO2022JP_SM_MODEL,
ISO2022KR_SM_MODEL)
class EscCharSetProber(CharSetProber):
"""
This CharSetProber uses a "code scheme" approach for detecting encodings,
whereby easily recognizable escape or shift sequences are relied on to
identify these encodings.
"""
def __init__(self, lang_filter=None):
super(EscCharSetProber, self).__init__(lang_filter=lang_filter)
self.coding_sm = []
if self.lang_filter & LanguageFilter.CHINESE_SIMPLIFIED:
self.coding_sm.append(CodingStateMachine(HZ_SM_MODEL))
self.coding_sm.append(CodingStateMachine(ISO2022CN_SM_MODEL))
if self.lang_filter & LanguageFilter.JAPANESE:
self.coding_sm.append(CodingStateMachine(ISO2022JP_SM_MODEL))
if self.lang_filter & LanguageFilter.KOREAN:
self.coding_sm.append(CodingStateMachine(ISO2022KR_SM_MODEL))
self.active_sm_count = None
self._detected_charset = None
self._detected_language = None
self._state = None
self.reset()
def reset(self):
super(EscCharSetProber, self).reset()
for coding_sm in self.coding_sm:
if not coding_sm:
continue
coding_sm.active = True
coding_sm.reset()
self.active_sm_count = len(self.coding_sm)
self._detected_charset = None
self._detected_language = None
@property
def charset_name(self):
return self._detected_charset
@property
def language(self):
return self._detected_language
def get_confidence(self):
if self._detected_charset:
return 0.99
else:
return 0.00
def feed(self, byte_str):
for c in byte_str:
for coding_sm in self.coding_sm:
if not coding_sm or not coding_sm.active:
continue
coding_state = coding_sm.next_state(c)
if coding_state == MachineState.ERROR:
coding_sm.active = False
self.active_sm_count -= 1
if self.active_sm_count <= 0:
self._state = ProbingState.NOT_ME
return self.state
elif coding_state == MachineState.ITS_ME:
self._state = ProbingState.FOUND_IT
self._detected_charset = coding_sm.get_coding_state_machine()
self._detected_language = coding_sm.language
return self.state
return self.state

View File

@ -0,0 +1,246 @@
######################## BEGIN LICENSE BLOCK ########################
# The Original Code is mozilla.org code.
#
# The Initial Developer of the Original Code is
# Netscape Communications Corporation.
# Portions created by the Initial Developer are Copyright (C) 1998
# the Initial Developer. All Rights Reserved.
#
# Contributor(s):
# Mark Pilgrim - port to Python
#
# This library is free software; you can redistribute it and/or
# modify it under the terms of the GNU Lesser General Public
# License as published by the Free Software Foundation; either
# version 2.1 of the License, or (at your option) any later version.
#
# This library is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
# Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public
# License along with this library; if not, write to the Free Software
# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
# 02110-1301 USA
######################### END LICENSE BLOCK #########################
from .enums import MachineState
HZ_CLS = (
1,0,0,0,0,0,0,0, # 00 - 07
0,0,0,0,0,0,0,0, # 08 - 0f
0,0,0,0,0,0,0,0, # 10 - 17
0,0,0,1,0,0,0,0, # 18 - 1f
0,0,0,0,0,0,0,0, # 20 - 27
0,0,0,0,0,0,0,0, # 28 - 2f
0,0,0,0,0,0,0,0, # 30 - 37
0,0,0,0,0,0,0,0, # 38 - 3f
0,0,0,0,0,0,0,0, # 40 - 47
0,0,0,0,0,0,0,0, # 48 - 4f
0,0,0,0,0,0,0,0, # 50 - 57
0,0,0,0,0,0,0,0, # 58 - 5f
0,0,0,0,0,0,0,0, # 60 - 67
0,0,0,0,0,0,0,0, # 68 - 6f
0,0,0,0,0,0,0,0, # 70 - 77
0,0,0,4,0,5,2,0, # 78 - 7f
1,1,1,1,1,1,1,1, # 80 - 87
1,1,1,1,1,1,1,1, # 88 - 8f
1,1,1,1,1,1,1,1, # 90 - 97
1,1,1,1,1,1,1,1, # 98 - 9f
1,1,1,1,1,1,1,1, # a0 - a7
1,1,1,1,1,1,1,1, # a8 - af
1,1,1,1,1,1,1,1, # b0 - b7
1,1,1,1,1,1,1,1, # b8 - bf
1,1,1,1,1,1,1,1, # c0 - c7
1,1,1,1,1,1,1,1, # c8 - cf
1,1,1,1,1,1,1,1, # d0 - d7
1,1,1,1,1,1,1,1, # d8 - df
1,1,1,1,1,1,1,1, # e0 - e7
1,1,1,1,1,1,1,1, # e8 - ef
1,1,1,1,1,1,1,1, # f0 - f7
1,1,1,1,1,1,1,1, # f8 - ff
)
HZ_ST = (
MachineState.START,MachineState.ERROR, 3,MachineState.START,MachineState.START,MachineState.START,MachineState.ERROR,MachineState.ERROR,# 00-07
MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,# 08-0f
MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ERROR,MachineState.ERROR,MachineState.START,MachineState.START, 4,MachineState.ERROR,# 10-17
5,MachineState.ERROR, 6,MachineState.ERROR, 5, 5, 4,MachineState.ERROR,# 18-1f
4,MachineState.ERROR, 4, 4, 4,MachineState.ERROR, 4,MachineState.ERROR,# 20-27
4,MachineState.ITS_ME,MachineState.START,MachineState.START,MachineState.START,MachineState.START,MachineState.START,MachineState.START,# 28-2f
)
HZ_CHAR_LEN_TABLE = (0, 0, 0, 0, 0, 0)
HZ_SM_MODEL = {'class_table': HZ_CLS,
'class_factor': 6,
'state_table': HZ_ST,
'char_len_table': HZ_CHAR_LEN_TABLE,
'name': "HZ-GB-2312",
'language': 'Chinese'}
ISO2022CN_CLS = (
2,0,0,0,0,0,0,0, # 00 - 07
0,0,0,0,0,0,0,0, # 08 - 0f
0,0,0,0,0,0,0,0, # 10 - 17
0,0,0,1,0,0,0,0, # 18 - 1f
0,0,0,0,0,0,0,0, # 20 - 27
0,3,0,0,0,0,0,0, # 28 - 2f
0,0,0,0,0,0,0,0, # 30 - 37
0,0,0,0,0,0,0,0, # 38 - 3f
0,0,0,4,0,0,0,0, # 40 - 47
0,0,0,0,0,0,0,0, # 48 - 4f
0,0,0,0,0,0,0,0, # 50 - 57
0,0,0,0,0,0,0,0, # 58 - 5f
0,0,0,0,0,0,0,0, # 60 - 67
0,0,0,0,0,0,0,0, # 68 - 6f
0,0,0,0,0,0,0,0, # 70 - 77
0,0,0,0,0,0,0,0, # 78 - 7f
2,2,2,2,2,2,2,2, # 80 - 87
2,2,2,2,2,2,2,2, # 88 - 8f
2,2,2,2,2,2,2,2, # 90 - 97
2,2,2,2,2,2,2,2, # 98 - 9f
2,2,2,2,2,2,2,2, # a0 - a7
2,2,2,2,2,2,2,2, # a8 - af
2,2,2,2,2,2,2,2, # b0 - b7
2,2,2,2,2,2,2,2, # b8 - bf
2,2,2,2,2,2,2,2, # c0 - c7
2,2,2,2,2,2,2,2, # c8 - cf
2,2,2,2,2,2,2,2, # d0 - d7
2,2,2,2,2,2,2,2, # d8 - df
2,2,2,2,2,2,2,2, # e0 - e7
2,2,2,2,2,2,2,2, # e8 - ef
2,2,2,2,2,2,2,2, # f0 - f7
2,2,2,2,2,2,2,2, # f8 - ff
)
ISO2022CN_ST = (
MachineState.START, 3,MachineState.ERROR,MachineState.START,MachineState.START,MachineState.START,MachineState.START,MachineState.START,# 00-07
MachineState.START,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,# 08-0f
MachineState.ERROR,MachineState.ERROR,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,# 10-17
MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR, 4,MachineState.ERROR,# 18-1f
MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ITS_ME,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,# 20-27
5, 6,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,# 28-2f
MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ITS_ME,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,# 30-37
MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ITS_ME,MachineState.ERROR,MachineState.START,# 38-3f
)
ISO2022CN_CHAR_LEN_TABLE = (0, 0, 0, 0, 0, 0, 0, 0, 0)
ISO2022CN_SM_MODEL = {'class_table': ISO2022CN_CLS,
'class_factor': 9,
'state_table': ISO2022CN_ST,
'char_len_table': ISO2022CN_CHAR_LEN_TABLE,
'name': "ISO-2022-CN",
'language': 'Chinese'}
ISO2022JP_CLS = (
2,0,0,0,0,0,0,0, # 00 - 07
0,0,0,0,0,0,2,2, # 08 - 0f
0,0,0,0,0,0,0,0, # 10 - 17
0,0,0,1,0,0,0,0, # 18 - 1f
0,0,0,0,7,0,0,0, # 20 - 27
3,0,0,0,0,0,0,0, # 28 - 2f
0,0,0,0,0,0,0,0, # 30 - 37
0,0,0,0,0,0,0,0, # 38 - 3f
6,0,4,0,8,0,0,0, # 40 - 47
0,9,5,0,0,0,0,0, # 48 - 4f
0,0,0,0,0,0,0,0, # 50 - 57
0,0,0,0,0,0,0,0, # 58 - 5f
0,0,0,0,0,0,0,0, # 60 - 67
0,0,0,0,0,0,0,0, # 68 - 6f
0,0,0,0,0,0,0,0, # 70 - 77
0,0,0,0,0,0,0,0, # 78 - 7f
2,2,2,2,2,2,2,2, # 80 - 87
2,2,2,2,2,2,2,2, # 88 - 8f
2,2,2,2,2,2,2,2, # 90 - 97
2,2,2,2,2,2,2,2, # 98 - 9f
2,2,2,2,2,2,2,2, # a0 - a7
2,2,2,2,2,2,2,2, # a8 - af
2,2,2,2,2,2,2,2, # b0 - b7
2,2,2,2,2,2,2,2, # b8 - bf
2,2,2,2,2,2,2,2, # c0 - c7
2,2,2,2,2,2,2,2, # c8 - cf
2,2,2,2,2,2,2,2, # d0 - d7
2,2,2,2,2,2,2,2, # d8 - df
2,2,2,2,2,2,2,2, # e0 - e7
2,2,2,2,2,2,2,2, # e8 - ef
2,2,2,2,2,2,2,2, # f0 - f7
2,2,2,2,2,2,2,2, # f8 - ff
)
ISO2022JP_ST = (
MachineState.START, 3,MachineState.ERROR,MachineState.START,MachineState.START,MachineState.START,MachineState.START,MachineState.START,# 00-07
MachineState.START,MachineState.START,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,# 08-0f
MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,# 10-17
MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ERROR,MachineState.ERROR,# 18-1f
MachineState.ERROR, 5,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR, 4,MachineState.ERROR,MachineState.ERROR,# 20-27
MachineState.ERROR,MachineState.ERROR,MachineState.ERROR, 6,MachineState.ITS_ME,MachineState.ERROR,MachineState.ITS_ME,MachineState.ERROR,# 28-2f
MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ITS_ME,MachineState.ITS_ME,# 30-37
MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ITS_ME,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,# 38-3f
MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ITS_ME,MachineState.ERROR,MachineState.START,MachineState.START,# 40-47
)
ISO2022JP_CHAR_LEN_TABLE = (0, 0, 0, 0, 0, 0, 0, 0, 0, 0)
ISO2022JP_SM_MODEL = {'class_table': ISO2022JP_CLS,
'class_factor': 10,
'state_table': ISO2022JP_ST,
'char_len_table': ISO2022JP_CHAR_LEN_TABLE,
'name': "ISO-2022-JP",
'language': 'Japanese'}
ISO2022KR_CLS = (
2,0,0,0,0,0,0,0, # 00 - 07
0,0,0,0,0,0,0,0, # 08 - 0f
0,0,0,0,0,0,0,0, # 10 - 17
0,0,0,1,0,0,0,0, # 18 - 1f
0,0,0,0,3,0,0,0, # 20 - 27
0,4,0,0,0,0,0,0, # 28 - 2f
0,0,0,0,0,0,0,0, # 30 - 37
0,0,0,0,0,0,0,0, # 38 - 3f
0,0,0,5,0,0,0,0, # 40 - 47
0,0,0,0,0,0,0,0, # 48 - 4f
0,0,0,0,0,0,0,0, # 50 - 57
0,0,0,0,0,0,0,0, # 58 - 5f
0,0,0,0,0,0,0,0, # 60 - 67
0,0,0,0,0,0,0,0, # 68 - 6f
0,0,0,0,0,0,0,0, # 70 - 77
0,0,0,0,0,0,0,0, # 78 - 7f
2,2,2,2,2,2,2,2, # 80 - 87
2,2,2,2,2,2,2,2, # 88 - 8f
2,2,2,2,2,2,2,2, # 90 - 97
2,2,2,2,2,2,2,2, # 98 - 9f
2,2,2,2,2,2,2,2, # a0 - a7
2,2,2,2,2,2,2,2, # a8 - af
2,2,2,2,2,2,2,2, # b0 - b7
2,2,2,2,2,2,2,2, # b8 - bf
2,2,2,2,2,2,2,2, # c0 - c7
2,2,2,2,2,2,2,2, # c8 - cf
2,2,2,2,2,2,2,2, # d0 - d7
2,2,2,2,2,2,2,2, # d8 - df
2,2,2,2,2,2,2,2, # e0 - e7
2,2,2,2,2,2,2,2, # e8 - ef
2,2,2,2,2,2,2,2, # f0 - f7
2,2,2,2,2,2,2,2, # f8 - ff
)
ISO2022KR_ST = (
MachineState.START, 3,MachineState.ERROR,MachineState.START,MachineState.START,MachineState.START,MachineState.ERROR,MachineState.ERROR,# 00-07
MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,# 08-0f
MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR, 4,MachineState.ERROR,MachineState.ERROR,# 10-17
MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR, 5,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,# 18-1f
MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ITS_ME,MachineState.START,MachineState.START,MachineState.START,MachineState.START,# 20-27
)
ISO2022KR_CHAR_LEN_TABLE = (0, 0, 0, 0, 0, 0)
ISO2022KR_SM_MODEL = {'class_table': ISO2022KR_CLS,
'class_factor': 6,
'state_table': ISO2022KR_ST,
'char_len_table': ISO2022KR_CHAR_LEN_TABLE,
'name': "ISO-2022-KR",
'language': 'Korean'}

View File

@ -0,0 +1,92 @@
######################## BEGIN LICENSE BLOCK ########################
# The Original Code is mozilla.org code.
#
# The Initial Developer of the Original Code is
# Netscape Communications Corporation.
# Portions created by the Initial Developer are Copyright (C) 1998
# the Initial Developer. All Rights Reserved.
#
# Contributor(s):
# Mark Pilgrim - port to Python
#
# This library is free software; you can redistribute it and/or
# modify it under the terms of the GNU Lesser General Public
# License as published by the Free Software Foundation; either
# version 2.1 of the License, or (at your option) any later version.
#
# This library is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
# Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public
# License along with this library; if not, write to the Free Software
# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
# 02110-1301 USA
######################### END LICENSE BLOCK #########################
from .enums import ProbingState, MachineState
from .mbcharsetprober import MultiByteCharSetProber
from .codingstatemachine import CodingStateMachine
from .chardistribution import EUCJPDistributionAnalysis
from .jpcntx import EUCJPContextAnalysis
from .mbcssm import EUCJP_SM_MODEL
class EUCJPProber(MultiByteCharSetProber):
def __init__(self):
super(EUCJPProber, self).__init__()
self.coding_sm = CodingStateMachine(EUCJP_SM_MODEL)
self.distribution_analyzer = EUCJPDistributionAnalysis()
self.context_analyzer = EUCJPContextAnalysis()
self.reset()
def reset(self):
super(EUCJPProber, self).reset()
self.context_analyzer.reset()
@property
def charset_name(self):
return "EUC-JP"
@property
def language(self):
return "Japanese"
def feed(self, byte_str):
for i in range(len(byte_str)):
# PY3K: byte_str is a byte array, so byte_str[i] is an int, not a byte
coding_state = self.coding_sm.next_state(byte_str[i])
if coding_state == MachineState.ERROR:
self.logger.debug('%s %s prober hit error at byte %s',
self.charset_name, self.language, i)
self._state = ProbingState.NOT_ME
break
elif coding_state == MachineState.ITS_ME:
self._state = ProbingState.FOUND_IT
break
elif coding_state == MachineState.START:
char_len = self.coding_sm.get_current_charlen()
if i == 0:
self._last_char[1] = byte_str[0]
self.context_analyzer.feed(self._last_char, char_len)
self.distribution_analyzer.feed(self._last_char, char_len)
else:
self.context_analyzer.feed(byte_str[i - 1:i + 1],
char_len)
self.distribution_analyzer.feed(byte_str[i - 1:i + 1],
char_len)
self._last_char[0] = byte_str[-1]
if self.state == ProbingState.DETECTING:
if (self.context_analyzer.got_enough_data() and
(self.get_confidence() > self.SHORTCUT_THRESHOLD)):
self._state = ProbingState.FOUND_IT
return self.state
def get_confidence(self):
context_conf = self.context_analyzer.get_confidence()
distrib_conf = self.distribution_analyzer.get_confidence()
return max(context_conf, distrib_conf)

View File

@ -0,0 +1,195 @@
######################## BEGIN LICENSE BLOCK ########################
# The Original Code is Mozilla Communicator client code.
#
# The Initial Developer of the Original Code is
# Netscape Communications Corporation.
# Portions created by the Initial Developer are Copyright (C) 1998
# the Initial Developer. All Rights Reserved.
#
# Contributor(s):
# Mark Pilgrim - port to Python
#
# This library is free software; you can redistribute it and/or
# modify it under the terms of the GNU Lesser General Public
# License as published by the Free Software Foundation; either
# version 2.1 of the License, or (at your option) any later version.
#
# This library is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
# Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public
# License along with this library; if not, write to the Free Software
# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
# 02110-1301 USA
######################### END LICENSE BLOCK #########################
# Sampling from about 20M text materials include literature and computer technology
# 128 --> 0.79
# 256 --> 0.92
# 512 --> 0.986
# 1024 --> 0.99944
# 2048 --> 0.99999
#
# Idea Distribution Ratio = 0.98653 / (1-0.98653) = 73.24
# Random Distribution Ration = 512 / (2350-512) = 0.279.
#
# Typical Distribution Ratio
EUCKR_TYPICAL_DISTRIBUTION_RATIO = 6.0
EUCKR_TABLE_SIZE = 2352
# Char to FreqOrder table ,
EUCKR_CHAR_TO_FREQ_ORDER = (
13, 130, 120,1396, 481,1719,1720, 328, 609, 212,1721, 707, 400, 299,1722, 87,
1397,1723, 104, 536,1117,1203,1724,1267, 685,1268, 508,1725,1726,1727,1728,1398,
1399,1729,1730,1731, 141, 621, 326,1057, 368,1732, 267, 488, 20,1733,1269,1734,
945,1400,1735, 47, 904,1270,1736,1737, 773, 248,1738, 409, 313, 786, 429,1739,
116, 987, 813,1401, 683, 75,1204, 145,1740,1741,1742,1743, 16, 847, 667, 622,
708,1744,1745,1746, 966, 787, 304, 129,1747, 60, 820, 123, 676,1748,1749,1750,
1751, 617,1752, 626,1753,1754,1755,1756, 653,1757,1758,1759,1760,1761,1762, 856,
344,1763,1764,1765,1766, 89, 401, 418, 806, 905, 848,1767,1768,1769, 946,1205,
709,1770,1118,1771, 241,1772,1773,1774,1271,1775, 569,1776, 999,1777,1778,1779,
1780, 337, 751,1058, 28, 628, 254,1781, 177, 906, 270, 349, 891,1079,1782, 19,
1783, 379,1784, 315,1785, 629, 754,1402, 559,1786, 636, 203,1206,1787, 710, 567,
1788, 935, 814,1789,1790,1207, 766, 528,1791,1792,1208,1793,1794,1795,1796,1797,
1403,1798,1799, 533,1059,1404,1405,1156,1406, 936, 884,1080,1800, 351,1801,1802,
1803,1804,1805, 801,1806,1807,1808,1119,1809,1157, 714, 474,1407,1810, 298, 899,
885,1811,1120, 802,1158,1812, 892,1813,1814,1408, 659,1815,1816,1121,1817,1818,
1819,1820,1821,1822, 319,1823, 594, 545,1824, 815, 937,1209,1825,1826, 573,1409,
1022,1827,1210,1828,1829,1830,1831,1832,1833, 556, 722, 807,1122,1060,1834, 697,
1835, 900, 557, 715,1836,1410, 540,1411, 752,1159, 294, 597,1211, 976, 803, 770,
1412,1837,1838, 39, 794,1413, 358,1839, 371, 925,1840, 453, 661, 788, 531, 723,
544,1023,1081, 869, 91,1841, 392, 430, 790, 602,1414, 677,1082, 457,1415,1416,
1842,1843, 475, 327,1024,1417, 795, 121,1844, 733, 403,1418,1845,1846,1847, 300,
119, 711,1212, 627,1848,1272, 207,1849,1850, 796,1213, 382,1851, 519,1852,1083,
893,1853,1854,1855, 367, 809, 487, 671,1856, 663,1857,1858, 956, 471, 306, 857,
1859,1860,1160,1084,1861,1862,1863,1864,1865,1061,1866,1867,1868,1869,1870,1871,
282, 96, 574,1872, 502,1085,1873,1214,1874, 907,1875,1876, 827, 977,1419,1420,
1421, 268,1877,1422,1878,1879,1880, 308,1881, 2, 537,1882,1883,1215,1884,1885,
127, 791,1886,1273,1423,1887, 34, 336, 404, 643,1888, 571, 654, 894, 840,1889,
0, 886,1274, 122, 575, 260, 908, 938,1890,1275, 410, 316,1891,1892, 100,1893,
1894,1123, 48,1161,1124,1025,1895, 633, 901,1276,1896,1897, 115, 816,1898, 317,
1899, 694,1900, 909, 734,1424, 572, 866,1425, 691, 85, 524,1010, 543, 394, 841,
1901,1902,1903,1026,1904,1905,1906,1907,1908,1909, 30, 451, 651, 988, 310,1910,
1911,1426, 810,1216, 93,1912,1913,1277,1217,1914, 858, 759, 45, 58, 181, 610,
269,1915,1916, 131,1062, 551, 443,1000, 821,1427, 957, 895,1086,1917,1918, 375,
1919, 359,1920, 687,1921, 822,1922, 293,1923,1924, 40, 662, 118, 692, 29, 939,
887, 640, 482, 174,1925, 69,1162, 728,1428, 910,1926,1278,1218,1279, 386, 870,
217, 854,1163, 823,1927,1928,1929,1930, 834,1931, 78,1932, 859,1933,1063,1934,
1935,1936,1937, 438,1164, 208, 595,1938,1939,1940,1941,1219,1125,1942, 280, 888,
1429,1430,1220,1431,1943,1944,1945,1946,1947,1280, 150, 510,1432,1948,1949,1950,
1951,1952,1953,1954,1011,1087,1955,1433,1043,1956, 881,1957, 614, 958,1064,1065,
1221,1958, 638,1001, 860, 967, 896,1434, 989, 492, 553,1281,1165,1959,1282,1002,
1283,1222,1960,1961,1962,1963, 36, 383, 228, 753, 247, 454,1964, 876, 678,1965,
1966,1284, 126, 464, 490, 835, 136, 672, 529, 940,1088,1435, 473,1967,1968, 467,
50, 390, 227, 587, 279, 378, 598, 792, 968, 240, 151, 160, 849, 882,1126,1285,
639,1044, 133, 140, 288, 360, 811, 563,1027, 561, 142, 523,1969,1970,1971, 7,
103, 296, 439, 407, 506, 634, 990,1972,1973,1974,1975, 645,1976,1977,1978,1979,
1980,1981, 236,1982,1436,1983,1984,1089, 192, 828, 618, 518,1166, 333,1127,1985,
818,1223,1986,1987,1988,1989,1990,1991,1992,1993, 342,1128,1286, 746, 842,1994,
1995, 560, 223,1287, 98, 8, 189, 650, 978,1288,1996,1437,1997, 17, 345, 250,
423, 277, 234, 512, 226, 97, 289, 42, 167,1998, 201,1999,2000, 843, 836, 824,
532, 338, 783,1090, 182, 576, 436,1438,1439, 527, 500,2001, 947, 889,2002,2003,
2004,2005, 262, 600, 314, 447,2006, 547,2007, 693, 738,1129,2008, 71,1440, 745,
619, 688,2009, 829,2010,2011, 147,2012, 33, 948,2013,2014, 74, 224,2015, 61,
191, 918, 399, 637,2016,1028,1130, 257, 902,2017,2018,2019,2020,2021,2022,2023,
2024,2025,2026, 837,2027,2028,2029,2030, 179, 874, 591, 52, 724, 246,2031,2032,
2033,2034,1167, 969,2035,1289, 630, 605, 911,1091,1168,2036,2037,2038,1441, 912,
2039, 623,2040,2041, 253,1169,1290,2042,1442, 146, 620, 611, 577, 433,2043,1224,
719,1170, 959, 440, 437, 534, 84, 388, 480,1131, 159, 220, 198, 679,2044,1012,
819,1066,1443, 113,1225, 194, 318,1003,1029,2045,2046,2047,2048,1067,2049,2050,
2051,2052,2053, 59, 913, 112,2054, 632,2055, 455, 144, 739,1291,2056, 273, 681,
499,2057, 448,2058,2059, 760,2060,2061, 970, 384, 169, 245,1132,2062,2063, 414,
1444,2064,2065, 41, 235,2066, 157, 252, 877, 568, 919, 789, 580,2067, 725,2068,
2069,1292,2070,2071,1445,2072,1446,2073,2074, 55, 588, 66,1447, 271,1092,2075,
1226,2076, 960,1013, 372,2077,2078,2079,2080,2081,1293,2082,2083,2084,2085, 850,
2086,2087,2088,2089,2090, 186,2091,1068, 180,2092,2093,2094, 109,1227, 522, 606,
2095, 867,1448,1093, 991,1171, 926, 353,1133,2096, 581,2097,2098,2099,1294,1449,
1450,2100, 596,1172,1014,1228,2101,1451,1295,1173,1229,2102,2103,1296,1134,1452,
949,1135,2104,2105,1094,1453,1454,1455,2106,1095,2107,2108,2109,2110,2111,2112,
2113,2114,2115,2116,2117, 804,2118,2119,1230,1231, 805,1456, 405,1136,2120,2121,
2122,2123,2124, 720, 701,1297, 992,1457, 927,1004,2125,2126,2127,2128,2129,2130,
22, 417,2131, 303,2132, 385,2133, 971, 520, 513,2134,1174, 73,1096, 231, 274,
962,1458, 673,2135,1459,2136, 152,1137,2137,2138,2139,2140,1005,1138,1460,1139,
2141,2142,2143,2144, 11, 374, 844,2145, 154,1232, 46,1461,2146, 838, 830, 721,
1233, 106,2147, 90, 428, 462, 578, 566,1175, 352,2148,2149, 538,1234, 124,1298,
2150,1462, 761, 565,2151, 686,2152, 649,2153, 72, 173,2154, 460, 415,2155,1463,
2156,1235, 305,2157,2158,2159,2160,2161,2162, 579,2163,2164,2165,2166,2167, 747,
2168,2169,2170,2171,1464, 669,2172,2173,2174,2175,2176,1465,2177, 23, 530, 285,
2178, 335, 729,2179, 397,2180,2181,2182,1030,2183,2184, 698,2185,2186, 325,2187,
2188, 369,2189, 799,1097,1015, 348,2190,1069, 680,2191, 851,1466,2192,2193, 10,
2194, 613, 424,2195, 979, 108, 449, 589, 27, 172, 81,1031, 80, 774, 281, 350,
1032, 525, 301, 582,1176,2196, 674,1045,2197,2198,1467, 730, 762,2199,2200,2201,
2202,1468,2203, 993,2204,2205, 266,1070, 963,1140,2206,2207,2208, 664,1098, 972,
2209,2210,2211,1177,1469,1470, 871,2212,2213,2214,2215,2216,1471,2217,2218,2219,
2220,2221,2222,2223,2224,2225,2226,2227,1472,1236,2228,2229,2230,2231,2232,2233,
2234,2235,1299,2236,2237, 200,2238, 477, 373,2239,2240, 731, 825, 777,2241,2242,
2243, 521, 486, 548,2244,2245,2246,1473,1300, 53, 549, 137, 875, 76, 158,2247,
1301,1474, 469, 396,1016, 278, 712,2248, 321, 442, 503, 767, 744, 941,1237,1178,
1475,2249, 82, 178,1141,1179, 973,2250,1302,2251, 297,2252,2253, 570,2254,2255,
2256, 18, 450, 206,2257, 290, 292,1142,2258, 511, 162, 99, 346, 164, 735,2259,
1476,1477, 4, 554, 343, 798,1099,2260,1100,2261, 43, 171,1303, 139, 215,2262,
2263, 717, 775,2264,1033, 322, 216,2265, 831,2266, 149,2267,1304,2268,2269, 702,
1238, 135, 845, 347, 309,2270, 484,2271, 878, 655, 238,1006,1478,2272, 67,2273,
295,2274,2275, 461,2276, 478, 942, 412,2277,1034,2278,2279,2280, 265,2281, 541,
2282,2283,2284,2285,2286, 70, 852,1071,2287,2288,2289,2290, 21, 56, 509, 117,
432,2291,2292, 331, 980, 552,1101, 148, 284, 105, 393,1180,1239, 755,2293, 187,
2294,1046,1479,2295, 340,2296, 63,1047, 230,2297,2298,1305, 763,1306, 101, 800,
808, 494,2299,2300,2301, 903,2302, 37,1072, 14, 5,2303, 79, 675,2304, 312,
2305,2306,2307,2308,2309,1480, 6,1307,2310,2311,2312, 1, 470, 35, 24, 229,
2313, 695, 210, 86, 778, 15, 784, 592, 779, 32, 77, 855, 964,2314, 259,2315,
501, 380,2316,2317, 83, 981, 153, 689,1308,1481,1482,1483,2318,2319, 716,1484,
2320,2321,2322,2323,2324,2325,1485,2326,2327, 128, 57, 68, 261,1048, 211, 170,
1240, 31,2328, 51, 435, 742,2329,2330,2331, 635,2332, 264, 456,2333,2334,2335,
425,2336,1486, 143, 507, 263, 943,2337, 363, 920,1487, 256,1488,1102, 243, 601,
1489,2338,2339,2340,2341,2342,2343,2344, 861,2345,2346,2347,2348,2349,2350, 395,
2351,1490,1491, 62, 535, 166, 225,2352,2353, 668, 419,1241, 138, 604, 928,2354,
1181,2355,1492,1493,2356,2357,2358,1143,2359, 696,2360, 387, 307,1309, 682, 476,
2361,2362, 332, 12, 222, 156,2363, 232,2364, 641, 276, 656, 517,1494,1495,1035,
416, 736,1496,2365,1017, 586,2366,2367,2368,1497,2369, 242,2370,2371,2372,1498,
2373, 965, 713,2374,2375,2376,2377, 740, 982,1499, 944,1500,1007,2378,2379,1310,
1501,2380,2381,2382, 785, 329,2383,2384,1502,2385,2386,2387, 932,2388,1503,2389,
2390,2391,2392,1242,2393,2394,2395,2396,2397, 994, 950,2398,2399,2400,2401,1504,
1311,2402,2403,2404,2405,1049, 749,2406,2407, 853, 718,1144,1312,2408,1182,1505,
2409,2410, 255, 516, 479, 564, 550, 214,1506,1507,1313, 413, 239, 444, 339,1145,
1036,1508,1509,1314,1037,1510,1315,2411,1511,2412,2413,2414, 176, 703, 497, 624,
593, 921, 302,2415, 341, 165,1103,1512,2416,1513,2417,2418,2419, 376,2420, 700,
2421,2422,2423, 258, 768,1316,2424,1183,2425, 995, 608,2426,2427,2428,2429, 221,
2430,2431,2432,2433,2434,2435,2436,2437, 195, 323, 726, 188, 897, 983,1317, 377,
644,1050, 879,2438, 452,2439,2440,2441,2442,2443,2444, 914,2445,2446,2447,2448,
915, 489,2449,1514,1184,2450,2451, 515, 64, 427, 495,2452, 583,2453, 483, 485,
1038, 562, 213,1515, 748, 666,2454,2455,2456,2457, 334,2458, 780, 996,1008, 705,
1243,2459,2460,2461,2462,2463, 114,2464, 493,1146, 366, 163,1516, 961,1104,2465,
291,2466,1318,1105,2467,1517, 365,2468, 355, 951,1244,2469,1319,2470, 631,2471,
2472, 218,1320, 364, 320, 756,1518,1519,1321,1520,1322,2473,2474,2475,2476, 997,
2477,2478,2479,2480, 665,1185,2481, 916,1521,2482,2483,2484, 584, 684,2485,2486,
797,2487,1051,1186,2488,2489,2490,1522,2491,2492, 370,2493,1039,1187, 65,2494,
434, 205, 463,1188,2495, 125, 812, 391, 402, 826, 699, 286, 398, 155, 781, 771,
585,2496, 590, 505,1073,2497, 599, 244, 219, 917,1018, 952, 646,1523,2498,1323,
2499,2500, 49, 984, 354, 741,2501, 625,2502,1324,2503,1019, 190, 357, 757, 491,
95, 782, 868,2504,2505,2506,2507,2508,2509, 134,1524,1074, 422,1525, 898,2510,
161,2511,2512,2513,2514, 769,2515,1526,2516,2517, 411,1325,2518, 472,1527,2519,
2520,2521,2522,2523,2524, 985,2525,2526,2527,2528,2529,2530, 764,2531,1245,2532,
2533, 25, 204, 311,2534, 496,2535,1052,2536,2537,2538,2539,2540,2541,2542, 199,
704, 504, 468, 758, 657,1528, 196, 44, 839,1246, 272, 750,2543, 765, 862,2544,
2545,1326,2546, 132, 615, 933,2547, 732,2548,2549,2550,1189,1529,2551, 283,1247,
1053, 607, 929,2552,2553,2554, 930, 183, 872, 616,1040,1147,2555,1148,1020, 441,
249,1075,2556,2557,2558, 466, 743,2559,2560,2561, 92, 514, 426, 420, 526,2562,
2563,2564,2565,2566,2567,2568, 185,2569,2570,2571,2572, 776,1530, 658,2573, 362,
2574, 361, 922,1076, 793,2575,2576,2577,2578,2579,2580,1531, 251,2581,2582,2583,
2584,1532, 54, 612, 237,1327,2585,2586, 275, 408, 647, 111,2587,1533,1106, 465,
3, 458, 9, 38,2588, 107, 110, 890, 209, 26, 737, 498,2589,1534,2590, 431,
202, 88,1535, 356, 287,1107, 660,1149,2591, 381,1536, 986,1150, 445,1248,1151,
974,2592,2593, 846,2594, 446, 953, 184,1249,1250, 727,2595, 923, 193, 883,2596,
2597,2598, 102, 324, 539, 817,2599, 421,1041,2600, 832,2601, 94, 175, 197, 406,
2602, 459,2603,2604,2605,2606,2607, 330, 555,2608,2609,2610, 706,1108, 389,2611,
2612,2613,2614, 233,2615, 833, 558, 931, 954,1251,2616,2617,1537, 546,2618,2619,
1009,2620,2621,2622,1538, 690,1328,2623, 955,2624,1539,2625,2626, 772,2627,2628,
2629,2630,2631, 924, 648, 863, 603,2632,2633, 934,1540, 864, 865,2634, 642,1042,
670,1190,2635,2636,2637,2638, 168,2639, 652, 873, 542,1054,1541,2640,2641,2642, # 512, 256
)

View File

@ -0,0 +1,47 @@
######################## BEGIN LICENSE BLOCK ########################
# The Original Code is mozilla.org code.
#
# The Initial Developer of the Original Code is
# Netscape Communications Corporation.
# Portions created by the Initial Developer are Copyright (C) 1998
# the Initial Developer. All Rights Reserved.
#
# Contributor(s):
# Mark Pilgrim - port to Python
#
# This library is free software; you can redistribute it and/or
# modify it under the terms of the GNU Lesser General Public
# License as published by the Free Software Foundation; either
# version 2.1 of the License, or (at your option) any later version.
#
# This library is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
# Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public
# License along with this library; if not, write to the Free Software
# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
# 02110-1301 USA
######################### END LICENSE BLOCK #########################
from .mbcharsetprober import MultiByteCharSetProber
from .codingstatemachine import CodingStateMachine
from .chardistribution import EUCKRDistributionAnalysis
from .mbcssm import EUCKR_SM_MODEL
class EUCKRProber(MultiByteCharSetProber):
def __init__(self):
super(EUCKRProber, self).__init__()
self.coding_sm = CodingStateMachine(EUCKR_SM_MODEL)
self.distribution_analyzer = EUCKRDistributionAnalysis()
self.reset()
@property
def charset_name(self):
return "EUC-KR"
@property
def language(self):
return "Korean"

View File

@ -0,0 +1,387 @@
######################## BEGIN LICENSE BLOCK ########################
# The Original Code is Mozilla Communicator client code.
#
# The Initial Developer of the Original Code is
# Netscape Communications Corporation.
# Portions created by the Initial Developer are Copyright (C) 1998
# the Initial Developer. All Rights Reserved.
#
# Contributor(s):
# Mark Pilgrim - port to Python
#
# This library is free software; you can redistribute it and/or
# modify it under the terms of the GNU Lesser General Public
# License as published by the Free Software Foundation; either
# version 2.1 of the License, or (at your option) any later version.
#
# This library is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
# Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public
# License along with this library; if not, write to the Free Software
# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
# 02110-1301 USA
######################### END LICENSE BLOCK #########################
# EUCTW frequency table
# Converted from big5 work
# by Taiwan's Mandarin Promotion Council
# <http:#www.edu.tw:81/mandr/>
# 128 --> 0.42261
# 256 --> 0.57851
# 512 --> 0.74851
# 1024 --> 0.89384
# 2048 --> 0.97583
#
# Idea Distribution Ratio = 0.74851/(1-0.74851) =2.98
# Random Distribution Ration = 512/(5401-512)=0.105
#
# Typical Distribution Ratio about 25% of Ideal one, still much higher than RDR
EUCTW_TYPICAL_DISTRIBUTION_RATIO = 0.75
# Char to FreqOrder table ,
EUCTW_TABLE_SIZE = 5376
EUCTW_CHAR_TO_FREQ_ORDER = (
1,1800,1506, 255,1431, 198, 9, 82, 6,7310, 177, 202,3615,1256,2808, 110, # 2742
3735, 33,3241, 261, 76, 44,2113, 16,2931,2184,1176, 659,3868, 26,3404,2643, # 2758
1198,3869,3313,4060, 410,2211, 302, 590, 361,1963, 8, 204, 58,4296,7311,1931, # 2774
63,7312,7313, 317,1614, 75, 222, 159,4061,2412,1480,7314,3500,3068, 224,2809, # 2790
3616, 3, 10,3870,1471, 29,2774,1135,2852,1939, 873, 130,3242,1123, 312,7315, # 2806
4297,2051, 507, 252, 682,7316, 142,1914, 124, 206,2932, 34,3501,3173, 64, 604, # 2822
7317,2494,1976,1977, 155,1990, 645, 641,1606,7318,3405, 337, 72, 406,7319, 80, # 2838
630, 238,3174,1509, 263, 939,1092,2644, 756,1440,1094,3406, 449, 69,2969, 591, # 2854
179,2095, 471, 115,2034,1843, 60, 50,2970, 134, 806,1868, 734,2035,3407, 180, # 2870
995,1607, 156, 537,2893, 688,7320, 319,1305, 779,2144, 514,2374, 298,4298, 359, # 2886
2495, 90,2707,1338, 663, 11, 906,1099,2545, 20,2436, 182, 532,1716,7321, 732, # 2902
1376,4062,1311,1420,3175, 25,2312,1056, 113, 399, 382,1949, 242,3408,2467, 529, # 2918
3243, 475,1447,3617,7322, 117, 21, 656, 810,1297,2295,2329,3502,7323, 126,4063, # 2934
706, 456, 150, 613,4299, 71,1118,2036,4064, 145,3069, 85, 835, 486,2114,1246, # 2950
1426, 428, 727,1285,1015, 800, 106, 623, 303,1281,7324,2127,2354, 347,3736, 221, # 2966
3503,3110,7325,1955,1153,4065, 83, 296,1199,3070, 192, 624, 93,7326, 822,1897, # 2982
2810,3111, 795,2064, 991,1554,1542,1592, 27, 43,2853, 859, 139,1456, 860,4300, # 2998
437, 712,3871, 164,2392,3112, 695, 211,3017,2096, 195,3872,1608,3504,3505,3618, # 3014
3873, 234, 811,2971,2097,3874,2229,1441,3506,1615,2375, 668,2076,1638, 305, 228, # 3030
1664,4301, 467, 415,7327, 262,2098,1593, 239, 108, 300, 200,1033, 512,1247,2077, # 3046
7328,7329,2173,3176,3619,2673, 593, 845,1062,3244, 88,1723,2037,3875,1950, 212, # 3062
266, 152, 149, 468,1898,4066,4302, 77, 187,7330,3018, 37, 5,2972,7331,3876, # 3078
7332,7333, 39,2517,4303,2894,3177,2078, 55, 148, 74,4304, 545, 483,1474,1029, # 3094
1665, 217,1869,1531,3113,1104,2645,4067, 24, 172,3507, 900,3877,3508,3509,4305, # 3110
32,1408,2811,1312, 329, 487,2355,2247,2708, 784,2674, 4,3019,3314,1427,1788, # 3126
188, 109, 499,7334,3620,1717,1789, 888,1217,3020,4306,7335,3510,7336,3315,1520, # 3142
3621,3878, 196,1034, 775,7337,7338, 929,1815, 249, 439, 38,7339,1063,7340, 794, # 3158
3879,1435,2296, 46, 178,3245,2065,7341,2376,7342, 214,1709,4307, 804, 35, 707, # 3174
324,3622,1601,2546, 140, 459,4068,7343,7344,1365, 839, 272, 978,2257,2572,3409, # 3190
2128,1363,3623,1423, 697, 100,3071, 48, 70,1231, 495,3114,2193,7345,1294,7346, # 3206
2079, 462, 586,1042,3246, 853, 256, 988, 185,2377,3410,1698, 434,1084,7347,3411, # 3222
314,2615,2775,4308,2330,2331, 569,2280, 637,1816,2518, 757,1162,1878,1616,3412, # 3238
287,1577,2115, 768,4309,1671,2854,3511,2519,1321,3737, 909,2413,7348,4069, 933, # 3254
3738,7349,2052,2356,1222,4310, 765,2414,1322, 786,4311,7350,1919,1462,1677,2895, # 3270
1699,7351,4312,1424,2437,3115,3624,2590,3316,1774,1940,3413,3880,4070, 309,1369, # 3286
1130,2812, 364,2230,1653,1299,3881,3512,3882,3883,2646, 525,1085,3021, 902,2000, # 3302
1475, 964,4313, 421,1844,1415,1057,2281, 940,1364,3116, 376,4314,4315,1381, 7, # 3318
2520, 983,2378, 336,1710,2675,1845, 321,3414, 559,1131,3022,2742,1808,1132,1313, # 3334
265,1481,1857,7352, 352,1203,2813,3247, 167,1089, 420,2814, 776, 792,1724,3513, # 3350
4071,2438,3248,7353,4072,7354, 446, 229, 333,2743, 901,3739,1200,1557,4316,2647, # 3366
1920, 395,2744,2676,3740,4073,1835, 125, 916,3178,2616,4317,7355,7356,3741,7357, # 3382
7358,7359,4318,3117,3625,1133,2547,1757,3415,1510,2313,1409,3514,7360,2145, 438, # 3398
2591,2896,2379,3317,1068, 958,3023, 461, 311,2855,2677,4074,1915,3179,4075,1978, # 3414
383, 750,2745,2617,4076, 274, 539, 385,1278,1442,7361,1154,1964, 384, 561, 210, # 3430
98,1295,2548,3515,7362,1711,2415,1482,3416,3884,2897,1257, 129,7363,3742, 642, # 3446
523,2776,2777,2648,7364, 141,2231,1333, 68, 176, 441, 876, 907,4077, 603,2592, # 3462
710, 171,3417, 404, 549, 18,3118,2393,1410,3626,1666,7365,3516,4319,2898,4320, # 3478
7366,2973, 368,7367, 146, 366, 99, 871,3627,1543, 748, 807,1586,1185, 22,2258, # 3494
379,3743,3180,7368,3181, 505,1941,2618,1991,1382,2314,7369, 380,2357, 218, 702, # 3510
1817,1248,3418,3024,3517,3318,3249,7370,2974,3628, 930,3250,3744,7371, 59,7372, # 3526
585, 601,4078, 497,3419,1112,1314,4321,1801,7373,1223,1472,2174,7374, 749,1836, # 3542
690,1899,3745,1772,3885,1476, 429,1043,1790,2232,2116, 917,4079, 447,1086,1629, # 3558
7375, 556,7376,7377,2020,1654, 844,1090, 105, 550, 966,1758,2815,1008,1782, 686, # 3574
1095,7378,2282, 793,1602,7379,3518,2593,4322,4080,2933,2297,4323,3746, 980,2496, # 3590
544, 353, 527,4324, 908,2678,2899,7380, 381,2619,1942,1348,7381,1341,1252, 560, # 3606
3072,7382,3420,2856,7383,2053, 973, 886,2080, 143,4325,7384,7385, 157,3886, 496, # 3622
4081, 57, 840, 540,2038,4326,4327,3421,2117,1445, 970,2259,1748,1965,2081,4082, # 3638
3119,1234,1775,3251,2816,3629, 773,1206,2129,1066,2039,1326,3887,1738,1725,4083, # 3654
279,3120, 51,1544,2594, 423,1578,2130,2066, 173,4328,1879,7386,7387,1583, 264, # 3670
610,3630,4329,2439, 280, 154,7388,7389,7390,1739, 338,1282,3073, 693,2857,1411, # 3686
1074,3747,2440,7391,4330,7392,7393,1240, 952,2394,7394,2900,1538,2679, 685,1483, # 3702
4084,2468,1436, 953,4085,2054,4331, 671,2395, 79,4086,2441,3252, 608, 567,2680, # 3718
3422,4087,4088,1691, 393,1261,1791,2396,7395,4332,7396,7397,7398,7399,1383,1672, # 3734
3748,3182,1464, 522,1119, 661,1150, 216, 675,4333,3888,1432,3519, 609,4334,2681, # 3750
2397,7400,7401,7402,4089,3025, 0,7403,2469, 315, 231,2442, 301,3319,4335,2380, # 3766
7404, 233,4090,3631,1818,4336,4337,7405, 96,1776,1315,2082,7406, 257,7407,1809, # 3782
3632,2709,1139,1819,4091,2021,1124,2163,2778,1777,2649,7408,3074, 363,1655,3183, # 3798
7409,2975,7410,7411,7412,3889,1567,3890, 718, 103,3184, 849,1443, 341,3320,2934, # 3814
1484,7413,1712, 127, 67, 339,4092,2398, 679,1412, 821,7414,7415, 834, 738, 351, # 3830
2976,2146, 846, 235,1497,1880, 418,1992,3749,2710, 186,1100,2147,2746,3520,1545, # 3846
1355,2935,2858,1377, 583,3891,4093,2573,2977,7416,1298,3633,1078,2549,3634,2358, # 3862
78,3750,3751, 267,1289,2099,2001,1594,4094, 348, 369,1274,2194,2175,1837,4338, # 3878
1820,2817,3635,2747,2283,2002,4339,2936,2748, 144,3321, 882,4340,3892,2749,3423, # 3894
4341,2901,7417,4095,1726, 320,7418,3893,3026, 788,2978,7419,2818,1773,1327,2859, # 3910
3894,2819,7420,1306,4342,2003,1700,3752,3521,2359,2650, 787,2022, 506, 824,3636, # 3926
534, 323,4343,1044,3322,2023,1900, 946,3424,7421,1778,1500,1678,7422,1881,4344, # 3942
165, 243,4345,3637,2521, 123, 683,4096, 764,4346, 36,3895,1792, 589,2902, 816, # 3958
626,1667,3027,2233,1639,1555,1622,3753,3896,7423,3897,2860,1370,1228,1932, 891, # 3974
2083,2903, 304,4097,7424, 292,2979,2711,3522, 691,2100,4098,1115,4347, 118, 662, # 3990
7425, 611,1156, 854,2381,1316,2861, 2, 386, 515,2904,7426,7427,3253, 868,2234, # 4006
1486, 855,2651, 785,2212,3028,7428,1040,3185,3523,7429,3121, 448,7430,1525,7431, # 4022
2164,4348,7432,3754,7433,4099,2820,3524,3122, 503, 818,3898,3123,1568, 814, 676, # 4038
1444, 306,1749,7434,3755,1416,1030, 197,1428, 805,2821,1501,4349,7435,7436,7437, # 4054
1993,7438,4350,7439,7440,2195, 13,2779,3638,2980,3124,1229,1916,7441,3756,2131, # 4070
7442,4100,4351,2399,3525,7443,2213,1511,1727,1120,7444,7445, 646,3757,2443, 307, # 4086
7446,7447,1595,3186,7448,7449,7450,3639,1113,1356,3899,1465,2522,2523,7451, 519, # 4102
7452, 128,2132, 92,2284,1979,7453,3900,1512, 342,3125,2196,7454,2780,2214,1980, # 4118
3323,7455, 290,1656,1317, 789, 827,2360,7456,3758,4352, 562, 581,3901,7457, 401, # 4134
4353,2248, 94,4354,1399,2781,7458,1463,2024,4355,3187,1943,7459, 828,1105,4101, # 4150
1262,1394,7460,4102, 605,4356,7461,1783,2862,7462,2822, 819,2101, 578,2197,2937, # 4166
7463,1502, 436,3254,4103,3255,2823,3902,2905,3425,3426,7464,2712,2315,7465,7466, # 4182
2332,2067, 23,4357, 193, 826,3759,2102, 699,1630,4104,3075, 390,1793,1064,3526, # 4198
7467,1579,3076,3077,1400,7468,4105,1838,1640,2863,7469,4358,4359, 137,4106, 598, # 4214
3078,1966, 780, 104, 974,2938,7470, 278, 899, 253, 402, 572, 504, 493,1339,7471, # 4230
3903,1275,4360,2574,2550,7472,3640,3029,3079,2249, 565,1334,2713, 863, 41,7473, # 4246
7474,4361,7475,1657,2333, 19, 463,2750,4107, 606,7476,2981,3256,1087,2084,1323, # 4262
2652,2982,7477,1631,1623,1750,4108,2682,7478,2864, 791,2714,2653,2334, 232,2416, # 4278
7479,2983,1498,7480,2654,2620, 755,1366,3641,3257,3126,2025,1609, 119,1917,3427, # 4294
862,1026,4109,7481,3904,3760,4362,3905,4363,2260,1951,2470,7482,1125, 817,4110, # 4310
4111,3906,1513,1766,2040,1487,4112,3030,3258,2824,3761,3127,7483,7484,1507,7485, # 4326
2683, 733, 40,1632,1106,2865, 345,4113, 841,2524, 230,4364,2984,1846,3259,3428, # 4342
7486,1263, 986,3429,7487, 735, 879, 254,1137, 857, 622,1300,1180,1388,1562,3907, # 4358
3908,2939, 967,2751,2655,1349, 592,2133,1692,3324,2985,1994,4114,1679,3909,1901, # 4374
2185,7488, 739,3642,2715,1296,1290,7489,4115,2198,2199,1921,1563,2595,2551,1870, # 4390
2752,2986,7490, 435,7491, 343,1108, 596, 17,1751,4365,2235,3430,3643,7492,4366, # 4406
294,3527,2940,1693, 477, 979, 281,2041,3528, 643,2042,3644,2621,2782,2261,1031, # 4422
2335,2134,2298,3529,4367, 367,1249,2552,7493,3530,7494,4368,1283,3325,2004, 240, # 4438
1762,3326,4369,4370, 836,1069,3128, 474,7495,2148,2525, 268,3531,7496,3188,1521, # 4454
1284,7497,1658,1546,4116,7498,3532,3533,7499,4117,3327,2684,1685,4118, 961,1673, # 4470
2622, 190,2005,2200,3762,4371,4372,7500, 570,2497,3645,1490,7501,4373,2623,3260, # 4486
1956,4374, 584,1514, 396,1045,1944,7502,4375,1967,2444,7503,7504,4376,3910, 619, # 4502
7505,3129,3261, 215,2006,2783,2553,3189,4377,3190,4378, 763,4119,3763,4379,7506, # 4518
7507,1957,1767,2941,3328,3646,1174, 452,1477,4380,3329,3130,7508,2825,1253,2382, # 4534
2186,1091,2285,4120, 492,7509, 638,1169,1824,2135,1752,3911, 648, 926,1021,1324, # 4550
4381, 520,4382, 997, 847,1007, 892,4383,3764,2262,1871,3647,7510,2400,1784,4384, # 4566
1952,2942,3080,3191,1728,4121,2043,3648,4385,2007,1701,3131,1551, 30,2263,4122, # 4582
7511,2026,4386,3534,7512, 501,7513,4123, 594,3431,2165,1821,3535,3432,3536,3192, # 4598
829,2826,4124,7514,1680,3132,1225,4125,7515,3262,4387,4126,3133,2336,7516,4388, # 4614
4127,7517,3912,3913,7518,1847,2383,2596,3330,7519,4389, 374,3914, 652,4128,4129, # 4630
375,1140, 798,7520,7521,7522,2361,4390,2264, 546,1659, 138,3031,2445,4391,7523, # 4646
2250, 612,1848, 910, 796,3765,1740,1371, 825,3766,3767,7524,2906,2554,7525, 692, # 4662
444,3032,2624, 801,4392,4130,7526,1491, 244,1053,3033,4131,4132, 340,7527,3915, # 4678
1041,2987, 293,1168, 87,1357,7528,1539, 959,7529,2236, 721, 694,4133,3768, 219, # 4694
1478, 644,1417,3331,2656,1413,1401,1335,1389,3916,7530,7531,2988,2362,3134,1825, # 4710
730,1515, 184,2827, 66,4393,7532,1660,2943, 246,3332, 378,1457, 226,3433, 975, # 4726
3917,2944,1264,3537, 674, 696,7533, 163,7534,1141,2417,2166, 713,3538,3333,4394, # 4742
3918,7535,7536,1186, 15,7537,1079,1070,7538,1522,3193,3539, 276,1050,2716, 758, # 4758
1126, 653,2945,3263,7539,2337, 889,3540,3919,3081,2989, 903,1250,4395,3920,3434, # 4774
3541,1342,1681,1718, 766,3264, 286, 89,2946,3649,7540,1713,7541,2597,3334,2990, # 4790
7542,2947,2215,3194,2866,7543,4396,2498,2526, 181, 387,1075,3921, 731,2187,3335, # 4806
7544,3265, 310, 313,3435,2299, 770,4134, 54,3034, 189,4397,3082,3769,3922,7545, # 4822
1230,1617,1849, 355,3542,4135,4398,3336, 111,4136,3650,1350,3135,3436,3035,4137, # 4838
2149,3266,3543,7546,2784,3923,3924,2991, 722,2008,7547,1071, 247,1207,2338,2471, # 4854
1378,4399,2009, 864,1437,1214,4400, 373,3770,1142,2216, 667,4401, 442,2753,2555, # 4870
3771,3925,1968,4138,3267,1839, 837, 170,1107, 934,1336,1882,7548,7549,2118,4139, # 4886
2828, 743,1569,7550,4402,4140, 582,2384,1418,3437,7551,1802,7552, 357,1395,1729, # 4902
3651,3268,2418,1564,2237,7553,3083,3772,1633,4403,1114,2085,4141,1532,7554, 482, # 4918
2446,4404,7555,7556,1492, 833,1466,7557,2717,3544,1641,2829,7558,1526,1272,3652, # 4934
4142,1686,1794, 416,2556,1902,1953,1803,7559,3773,2785,3774,1159,2316,7560,2867, # 4950
4405,1610,1584,3036,2419,2754, 443,3269,1163,3136,7561,7562,3926,7563,4143,2499, # 4966
3037,4406,3927,3137,2103,1647,3545,2010,1872,4144,7564,4145, 431,3438,7565, 250, # 4982
97, 81,4146,7566,1648,1850,1558, 160, 848,7567, 866, 740,1694,7568,2201,2830, # 4998
3195,4147,4407,3653,1687, 950,2472, 426, 469,3196,3654,3655,3928,7569,7570,1188, # 5014
424,1995, 861,3546,4148,3775,2202,2685, 168,1235,3547,4149,7571,2086,1674,4408, # 5030
3337,3270, 220,2557,1009,7572,3776, 670,2992, 332,1208, 717,7573,7574,3548,2447, # 5046
3929,3338,7575, 513,7576,1209,2868,3339,3138,4409,1080,7577,7578,7579,7580,2527, # 5062
3656,3549, 815,1587,3930,3931,7581,3550,3439,3777,1254,4410,1328,3038,1390,3932, # 5078
1741,3933,3778,3934,7582, 236,3779,2448,3271,7583,7584,3657,3780,1273,3781,4411, # 5094
7585, 308,7586,4412, 245,4413,1851,2473,1307,2575, 430, 715,2136,2449,7587, 270, # 5110
199,2869,3935,7588,3551,2718,1753, 761,1754, 725,1661,1840,4414,3440,3658,7589, # 5126
7590, 587, 14,3272, 227,2598, 326, 480,2265, 943,2755,3552, 291, 650,1883,7591, # 5142
1702,1226, 102,1547, 62,3441, 904,4415,3442,1164,4150,7592,7593,1224,1548,2756, # 5158
391, 498,1493,7594,1386,1419,7595,2055,1177,4416, 813, 880,1081,2363, 566,1145, # 5174
4417,2286,1001,1035,2558,2599,2238, 394,1286,7596,7597,2068,7598, 86,1494,1730, # 5190
3936, 491,1588, 745, 897,2948, 843,3340,3937,2757,2870,3273,1768, 998,2217,2069, # 5206
397,1826,1195,1969,3659,2993,3341, 284,7599,3782,2500,2137,2119,1903,7600,3938, # 5222
2150,3939,4151,1036,3443,1904, 114,2559,4152, 209,1527,7601,7602,2949,2831,2625, # 5238
2385,2719,3139, 812,2560,7603,3274,7604,1559, 737,1884,3660,1210, 885, 28,2686, # 5254
3553,3783,7605,4153,1004,1779,4418,7606, 346,1981,2218,2687,4419,3784,1742, 797, # 5270
1642,3940,1933,1072,1384,2151, 896,3941,3275,3661,3197,2871,3554,7607,2561,1958, # 5286
4420,2450,1785,7608,7609,7610,3942,4154,1005,1308,3662,4155,2720,4421,4422,1528, # 5302
2600, 161,1178,4156,1982, 987,4423,1101,4157, 631,3943,1157,3198,2420,1343,1241, # 5318
1016,2239,2562, 372, 877,2339,2501,1160, 555,1934, 911,3944,7611, 466,1170, 169, # 5334
1051,2907,2688,3663,2474,2994,1182,2011,2563,1251,2626,7612, 992,2340,3444,1540, # 5350
2721,1201,2070,2401,1996,2475,7613,4424, 528,1922,2188,1503,1873,1570,2364,3342, # 5366
3276,7614, 557,1073,7615,1827,3445,2087,2266,3140,3039,3084, 767,3085,2786,4425, # 5382
1006,4158,4426,2341,1267,2176,3664,3199, 778,3945,3200,2722,1597,2657,7616,4427, # 5398
7617,3446,7618,7619,7620,3277,2689,1433,3278, 131, 95,1504,3946, 723,4159,3141, # 5414
1841,3555,2758,2189,3947,2027,2104,3665,7621,2995,3948,1218,7622,3343,3201,3949, # 5430
4160,2576, 248,1634,3785, 912,7623,2832,3666,3040,3786, 654, 53,7624,2996,7625, # 5446
1688,4428, 777,3447,1032,3950,1425,7626, 191, 820,2120,2833, 971,4429, 931,3202, # 5462
135, 664, 783,3787,1997, 772,2908,1935,3951,3788,4430,2909,3203, 282,2723, 640, # 5478
1372,3448,1127, 922, 325,3344,7627,7628, 711,2044,7629,7630,3952,2219,2787,1936, # 5494
3953,3345,2220,2251,3789,2300,7631,4431,3790,1258,3279,3954,3204,2138,2950,3955, # 5510
3956,7632,2221, 258,3205,4432, 101,1227,7633,3280,1755,7634,1391,3281,7635,2910, # 5526
2056, 893,7636,7637,7638,1402,4161,2342,7639,7640,3206,3556,7641,7642, 878,1325, # 5542
1780,2788,4433, 259,1385,2577, 744,1183,2267,4434,7643,3957,2502,7644, 684,1024, # 5558
4162,7645, 472,3557,3449,1165,3282,3958,3959, 322,2152, 881, 455,1695,1152,1340, # 5574
660, 554,2153,4435,1058,4436,4163, 830,1065,3346,3960,4437,1923,7646,1703,1918, # 5590
7647, 932,2268, 122,7648,4438, 947, 677,7649,3791,2627, 297,1905,1924,2269,4439, # 5606
2317,3283,7650,7651,4164,7652,4165, 84,4166, 112, 989,7653, 547,1059,3961, 701, # 5622
3558,1019,7654,4167,7655,3450, 942, 639, 457,2301,2451, 993,2951, 407, 851, 494, # 5638
4440,3347, 927,7656,1237,7657,2421,3348, 573,4168, 680, 921,2911,1279,1874, 285, # 5654
790,1448,1983, 719,2167,7658,7659,4441,3962,3963,1649,7660,1541, 563,7661,1077, # 5670
7662,3349,3041,3451, 511,2997,3964,3965,3667,3966,1268,2564,3350,3207,4442,4443, # 5686
7663, 535,1048,1276,1189,2912,2028,3142,1438,1373,2834,2952,1134,2012,7664,4169, # 5702
1238,2578,3086,1259,7665, 700,7666,2953,3143,3668,4170,7667,4171,1146,1875,1906, # 5718
4444,2601,3967, 781,2422, 132,1589, 203, 147, 273,2789,2402, 898,1786,2154,3968, # 5734
3969,7668,3792,2790,7669,7670,4445,4446,7671,3208,7672,1635,3793, 965,7673,1804, # 5750
2690,1516,3559,1121,1082,1329,3284,3970,1449,3794, 65,1128,2835,2913,2759,1590, # 5766
3795,7674,7675, 12,2658, 45, 976,2579,3144,4447, 517,2528,1013,1037,3209,7676, # 5782
3796,2836,7677,3797,7678,3452,7679,2602, 614,1998,2318,3798,3087,2724,2628,7680, # 5798
2580,4172, 599,1269,7681,1810,3669,7682,2691,3088, 759,1060, 489,1805,3351,3285, # 5814
1358,7683,7684,2386,1387,1215,2629,2252, 490,7685,7686,4173,1759,2387,2343,7687, # 5830
4448,3799,1907,3971,2630,1806,3210,4449,3453,3286,2760,2344, 874,7688,7689,3454, # 5846
3670,1858, 91,2914,3671,3042,3800,4450,7690,3145,3972,2659,7691,3455,1202,1403, # 5862
3801,2954,2529,1517,2503,4451,3456,2504,7692,4452,7693,2692,1885,1495,1731,3973, # 5878
2365,4453,7694,2029,7695,7696,3974,2693,1216, 237,2581,4174,2319,3975,3802,4454, # 5894
4455,2694,3560,3457, 445,4456,7697,7698,7699,7700,2761, 61,3976,3672,1822,3977, # 5910
7701, 687,2045, 935, 925, 405,2660, 703,1096,1859,2725,4457,3978,1876,1367,2695, # 5926
3352, 918,2105,1781,2476, 334,3287,1611,1093,4458, 564,3146,3458,3673,3353, 945, # 5942
2631,2057,4459,7702,1925, 872,4175,7703,3459,2696,3089, 349,4176,3674,3979,4460, # 5958
3803,4177,3675,2155,3980,4461,4462,4178,4463,2403,2046, 782,3981, 400, 251,4179, # 5974
1624,7704,7705, 277,3676, 299,1265, 476,1191,3804,2121,4180,4181,1109, 205,7706, # 5990
2582,1000,2156,3561,1860,7707,7708,7709,4464,7710,4465,2565, 107,2477,2157,3982, # 6006
3460,3147,7711,1533, 541,1301, 158, 753,4182,2872,3562,7712,1696, 370,1088,4183, # 6022
4466,3563, 579, 327, 440, 162,2240, 269,1937,1374,3461, 968,3043, 56,1396,3090, # 6038
2106,3288,3354,7713,1926,2158,4467,2998,7714,3564,7715,7716,3677,4468,2478,7717, # 6054
2791,7718,1650,4469,7719,2603,7720,7721,3983,2661,3355,1149,3356,3984,3805,3985, # 6070
7722,1076, 49,7723, 951,3211,3289,3290, 450,2837, 920,7724,1811,2792,2366,4184, # 6086
1908,1138,2367,3806,3462,7725,3212,4470,1909,1147,1518,2423,4471,3807,7726,4472, # 6102
2388,2604, 260,1795,3213,7727,7728,3808,3291, 708,7729,3565,1704,7730,3566,1351, # 6118
1618,3357,2999,1886, 944,4185,3358,4186,3044,3359,4187,7731,3678, 422, 413,1714, # 6134
3292, 500,2058,2345,4188,2479,7732,1344,1910, 954,7733,1668,7734,7735,3986,2404, # 6150
4189,3567,3809,4190,7736,2302,1318,2505,3091, 133,3092,2873,4473, 629, 31,2838, # 6166
2697,3810,4474, 850, 949,4475,3987,2955,1732,2088,4191,1496,1852,7737,3988, 620, # 6182
3214, 981,1242,3679,3360,1619,3680,1643,3293,2139,2452,1970,1719,3463,2168,7738, # 6198
3215,7739,7740,3361,1828,7741,1277,4476,1565,2047,7742,1636,3568,3093,7743, 869, # 6214
2839, 655,3811,3812,3094,3989,3000,3813,1310,3569,4477,7744,7745,7746,1733, 558, # 6230
4478,3681, 335,1549,3045,1756,4192,3682,1945,3464,1829,1291,1192, 470,2726,2107, # 6246
2793, 913,1054,3990,7747,1027,7748,3046,3991,4479, 982,2662,3362,3148,3465,3216, # 6262
3217,1946,2794,7749, 571,4480,7750,1830,7751,3570,2583,1523,2424,7752,2089, 984, # 6278
4481,3683,1959,7753,3684, 852, 923,2795,3466,3685, 969,1519, 999,2048,2320,1705, # 6294
7754,3095, 615,1662, 151, 597,3992,2405,2321,1049, 275,4482,3686,4193, 568,3687, # 6310
3571,2480,4194,3688,7755,2425,2270, 409,3218,7756,1566,2874,3467,1002, 769,2840, # 6326
194,2090,3149,3689,2222,3294,4195, 628,1505,7757,7758,1763,2177,3001,3993, 521, # 6342
1161,2584,1787,2203,2406,4483,3994,1625,4196,4197, 412, 42,3096, 464,7759,2632, # 6358
4484,3363,1760,1571,2875,3468,2530,1219,2204,3814,2633,2140,2368,4485,4486,3295, # 6374
1651,3364,3572,7760,7761,3573,2481,3469,7762,3690,7763,7764,2271,2091, 460,7765, # 6390
4487,7766,3002, 962, 588,3574, 289,3219,2634,1116, 52,7767,3047,1796,7768,7769, # 6406
7770,1467,7771,1598,1143,3691,4198,1984,1734,1067,4488,1280,3365, 465,4489,1572, # 6422
510,7772,1927,2241,1812,1644,3575,7773,4490,3692,7774,7775,2663,1573,1534,7776, # 6438
7777,4199, 536,1807,1761,3470,3815,3150,2635,7778,7779,7780,4491,3471,2915,1911, # 6454
2796,7781,3296,1122, 377,3220,7782, 360,7783,7784,4200,1529, 551,7785,2059,3693, # 6470
1769,2426,7786,2916,4201,3297,3097,2322,2108,2030,4492,1404, 136,1468,1479, 672, # 6486
1171,3221,2303, 271,3151,7787,2762,7788,2049, 678,2727, 865,1947,4493,7789,2013, # 6502
3995,2956,7790,2728,2223,1397,3048,3694,4494,4495,1735,2917,3366,3576,7791,3816, # 6518
509,2841,2453,2876,3817,7792,7793,3152,3153,4496,4202,2531,4497,2304,1166,1010, # 6534
552, 681,1887,7794,7795,2957,2958,3996,1287,1596,1861,3154, 358, 453, 736, 175, # 6550
478,1117, 905,1167,1097,7796,1853,1530,7797,1706,7798,2178,3472,2287,3695,3473, # 6566
3577,4203,2092,4204,7799,3367,1193,2482,4205,1458,2190,2205,1862,1888,1421,3298, # 6582
2918,3049,2179,3474, 595,2122,7800,3997,7801,7802,4206,1707,2636, 223,3696,1359, # 6598
751,3098, 183,3475,7803,2797,3003, 419,2369, 633, 704,3818,2389, 241,7804,7805, # 6614
7806, 838,3004,3697,2272,2763,2454,3819,1938,2050,3998,1309,3099,2242,1181,7807, # 6630
1136,2206,3820,2370,1446,4207,2305,4498,7808,7809,4208,1055,2605, 484,3698,7810, # 6646
3999, 625,4209,2273,3368,1499,4210,4000,7811,4001,4211,3222,2274,2275,3476,7812, # 6662
7813,2764, 808,2606,3699,3369,4002,4212,3100,2532, 526,3370,3821,4213, 955,7814, # 6678
1620,4214,2637,2427,7815,1429,3700,1669,1831, 994, 928,7816,3578,1260,7817,7818, # 6694
7819,1948,2288, 741,2919,1626,4215,2729,2455, 867,1184, 362,3371,1392,7820,7821, # 6710
4003,4216,1770,1736,3223,2920,4499,4500,1928,2698,1459,1158,7822,3050,3372,2877, # 6726
1292,1929,2506,2842,3701,1985,1187,2071,2014,2607,4217,7823,2566,2507,2169,3702, # 6742
2483,3299,7824,3703,4501,7825,7826, 666,1003,3005,1022,3579,4218,7827,4502,1813, # 6758
2253, 574,3822,1603, 295,1535, 705,3823,4219, 283, 858, 417,7828,7829,3224,4503, # 6774
4504,3051,1220,1889,1046,2276,2456,4004,1393,1599, 689,2567, 388,4220,7830,2484, # 6790
802,7831,2798,3824,2060,1405,2254,7832,4505,3825,2109,1052,1345,3225,1585,7833, # 6806
809,7834,7835,7836, 575,2730,3477, 956,1552,1469,1144,2323,7837,2324,1560,2457, # 6822
3580,3226,4005, 616,2207,3155,2180,2289,7838,1832,7839,3478,4506,7840,1319,3704, # 6838
3705,1211,3581,1023,3227,1293,2799,7841,7842,7843,3826, 607,2306,3827, 762,2878, # 6854
1439,4221,1360,7844,1485,3052,7845,4507,1038,4222,1450,2061,2638,4223,1379,4508, # 6870
2585,7846,7847,4224,1352,1414,2325,2921,1172,7848,7849,3828,3829,7850,1797,1451, # 6886
7851,7852,7853,7854,2922,4006,4007,2485,2346, 411,4008,4009,3582,3300,3101,4509, # 6902
1561,2664,1452,4010,1375,7855,7856, 47,2959, 316,7857,1406,1591,2923,3156,7858, # 6918
1025,2141,3102,3157, 354,2731, 884,2224,4225,2407, 508,3706, 726,3583, 996,2428, # 6934
3584, 729,7859, 392,2191,1453,4011,4510,3707,7860,7861,2458,3585,2608,1675,2800, # 6950
919,2347,2960,2348,1270,4511,4012, 73,7862,7863, 647,7864,3228,2843,2255,1550, # 6966
1346,3006,7865,1332, 883,3479,7866,7867,7868,7869,3301,2765,7870,1212, 831,1347, # 6982
4226,4512,2326,3830,1863,3053, 720,3831,4513,4514,3832,7871,4227,7872,7873,4515, # 6998
7874,7875,1798,4516,3708,2609,4517,3586,1645,2371,7876,7877,2924, 669,2208,2665, # 7014
2429,7878,2879,7879,7880,1028,3229,7881,4228,2408,7882,2256,1353,7883,7884,4518, # 7030
3158, 518,7885,4013,7886,4229,1960,7887,2142,4230,7888,7889,3007,2349,2350,3833, # 7046
516,1833,1454,4014,2699,4231,4519,2225,2610,1971,1129,3587,7890,2766,7891,2961, # 7062
1422, 577,1470,3008,1524,3373,7892,7893, 432,4232,3054,3480,7894,2586,1455,2508, # 7078
2226,1972,1175,7895,1020,2732,4015,3481,4520,7896,2733,7897,1743,1361,3055,3482, # 7094
2639,4016,4233,4521,2290, 895, 924,4234,2170, 331,2243,3056, 166,1627,3057,1098, # 7110
7898,1232,2880,2227,3374,4522, 657, 403,1196,2372, 542,3709,3375,1600,4235,3483, # 7126
7899,4523,2767,3230, 576, 530,1362,7900,4524,2533,2666,3710,4017,7901, 842,3834, # 7142
7902,2801,2031,1014,4018, 213,2700,3376, 665, 621,4236,7903,3711,2925,2430,7904, # 7158
2431,3302,3588,3377,7905,4237,2534,4238,4525,3589,1682,4239,3484,1380,7906, 724, # 7174
2277, 600,1670,7907,1337,1233,4526,3103,2244,7908,1621,4527,7909, 651,4240,7910, # 7190
1612,4241,2611,7911,2844,7912,2734,2307,3058,7913, 716,2459,3059, 174,1255,2701, # 7206
4019,3590, 548,1320,1398, 728,4020,1574,7914,1890,1197,3060,4021,7915,3061,3062, # 7222
3712,3591,3713, 747,7916, 635,4242,4528,7917,7918,7919,4243,7920,7921,4529,7922, # 7238
3378,4530,2432, 451,7923,3714,2535,2072,4244,2735,4245,4022,7924,1764,4531,7925, # 7254
4246, 350,7926,2278,2390,2486,7927,4247,4023,2245,1434,4024, 488,4532, 458,4248, # 7270
4025,3715, 771,1330,2391,3835,2568,3159,2159,2409,1553,2667,3160,4249,7928,2487, # 7286
2881,2612,1720,2702,4250,3379,4533,7929,2536,4251,7930,3231,4252,2768,7931,2015, # 7302
2736,7932,1155,1017,3716,3836,7933,3303,2308, 201,1864,4253,1430,7934,4026,7935, # 7318
7936,7937,7938,7939,4254,1604,7940, 414,1865, 371,2587,4534,4535,3485,2016,3104, # 7334
4536,1708, 960,4255, 887, 389,2171,1536,1663,1721,7941,2228,4027,2351,2926,1580, # 7350
7942,7943,7944,1744,7945,2537,4537,4538,7946,4539,7947,2073,7948,7949,3592,3380, # 7366
2882,4256,7950,4257,2640,3381,2802, 673,2703,2460, 709,3486,4028,3593,4258,7951, # 7382
1148, 502, 634,7952,7953,1204,4540,3594,1575,4541,2613,3717,7954,3718,3105, 948, # 7398
3232, 121,1745,3837,1110,7955,4259,3063,2509,3009,4029,3719,1151,1771,3838,1488, # 7414
4030,1986,7956,2433,3487,7957,7958,2093,7959,4260,3839,1213,1407,2803, 531,2737, # 7430
2538,3233,1011,1537,7960,2769,4261,3106,1061,7961,3720,3721,1866,2883,7962,2017, # 7446
120,4262,4263,2062,3595,3234,2309,3840,2668,3382,1954,4542,7963,7964,3488,1047, # 7462
2704,1266,7965,1368,4543,2845, 649,3383,3841,2539,2738,1102,2846,2669,7966,7967, # 7478
1999,7968,1111,3596,2962,7969,2488,3842,3597,2804,1854,3384,3722,7970,7971,3385, # 7494
2410,2884,3304,3235,3598,7972,2569,7973,3599,2805,4031,1460, 856,7974,3600,7975, # 7510
2885,2963,7976,2886,3843,7977,4264, 632,2510, 875,3844,1697,3845,2291,7978,7979, # 7526
4544,3010,1239, 580,4545,4265,7980, 914, 936,2074,1190,4032,1039,2123,7981,7982, # 7542
7983,3386,1473,7984,1354,4266,3846,7985,2172,3064,4033, 915,3305,4267,4268,3306, # 7558
1605,1834,7986,2739, 398,3601,4269,3847,4034, 328,1912,2847,4035,3848,1331,4270, # 7574
3011, 937,4271,7987,3602,4036,4037,3387,2160,4546,3388, 524, 742, 538,3065,1012, # 7590
7988,7989,3849,2461,7990, 658,1103, 225,3850,7991,7992,4547,7993,4548,7994,3236, # 7606
1243,7995,4038, 963,2246,4549,7996,2705,3603,3161,7997,7998,2588,2327,7999,4550, # 7622
8000,8001,8002,3489,3307, 957,3389,2540,2032,1930,2927,2462, 870,2018,3604,1746, # 7638
2770,2771,2434,2463,8003,3851,8004,3723,3107,3724,3490,3390,3725,8005,1179,3066, # 7654
8006,3162,2373,4272,3726,2541,3163,3108,2740,4039,8007,3391,1556,2542,2292, 977, # 7670
2887,2033,4040,1205,3392,8008,1765,3393,3164,2124,1271,1689, 714,4551,3491,8009, # 7686
2328,3852, 533,4273,3605,2181, 617,8010,2464,3308,3492,2310,8011,8012,3165,8013, # 7702
8014,3853,1987, 618, 427,2641,3493,3394,8015,8016,1244,1690,8017,2806,4274,4552, # 7718
8018,3494,8019,8020,2279,1576, 473,3606,4275,3395, 972,8021,3607,8022,3067,8023, # 7734
8024,4553,4554,8025,3727,4041,4042,8026, 153,4555, 356,8027,1891,2888,4276,2143, # 7750
408, 803,2352,8028,3854,8029,4277,1646,2570,2511,4556,4557,3855,8030,3856,4278, # 7766
8031,2411,3396, 752,8032,8033,1961,2964,8034, 746,3012,2465,8035,4279,3728, 698, # 7782
4558,1892,4280,3608,2543,4559,3609,3857,8036,3166,3397,8037,1823,1302,4043,2706, # 7798
3858,1973,4281,8038,4282,3167, 823,1303,1288,1236,2848,3495,4044,3398, 774,3859, # 7814
8039,1581,4560,1304,2849,3860,4561,8040,2435,2161,1083,3237,4283,4045,4284, 344, # 7830
1173, 288,2311, 454,1683,8041,8042,1461,4562,4046,2589,8043,8044,4563, 985, 894, # 7846
8045,3399,3168,8046,1913,2928,3729,1988,8047,2110,1974,8048,4047,8049,2571,1194, # 7862
425,8050,4564,3169,1245,3730,4285,8051,8052,2850,8053, 636,4565,1855,3861, 760, # 7878
1799,8054,4286,2209,1508,4566,4048,1893,1684,2293,8055,8056,8057,4287,4288,2210, # 7894
479,8058,8059, 832,8060,4049,2489,8061,2965,2490,3731, 990,3109, 627,1814,2642, # 7910
4289,1582,4290,2125,2111,3496,4567,8062, 799,4291,3170,8063,4568,2112,1737,3013, # 7926
1018, 543, 754,4292,3309,1676,4569,4570,4050,8064,1489,8065,3497,8066,2614,2889, # 7942
4051,8067,8068,2966,8069,8070,8071,8072,3171,4571,4572,2182,1722,8073,3238,3239, # 7958
1842,3610,1715, 481, 365,1975,1856,8074,8075,1962,2491,4573,8076,2126,3611,3240, # 7974
433,1894,2063,2075,8077, 602,2741,8078,8079,8080,8081,8082,3014,1628,3400,8083, # 7990
3172,4574,4052,2890,4575,2512,8084,2544,2772,8085,8086,8087,3310,4576,2891,8088, # 8006
4577,8089,2851,4578,4579,1221,2967,4053,2513,8090,8091,8092,1867,1989,8093,8094, # 8022
8095,1895,8096,8097,4580,1896,4054, 318,8098,2094,4055,4293,8099,8100, 485,8101, # 8038
938,3862, 553,2670, 116,8102,3863,3612,8103,3498,2671,2773,3401,3311,2807,8104, # 8054
3613,2929,4056,1747,2930,2968,8105,8106, 207,8107,8108,2672,4581,2514,8109,3015, # 8070
890,3614,3864,8110,1877,3732,3402,8111,2183,2353,3403,1652,8112,8113,8114, 941, # 8086
2294, 208,3499,4057,2019, 330,4294,3865,2892,2492,3733,4295,8115,8116,8117,8118, # 8102
)

View File

@ -0,0 +1,46 @@
######################## BEGIN LICENSE BLOCK ########################
# The Original Code is mozilla.org code.
#
# The Initial Developer of the Original Code is
# Netscape Communications Corporation.
# Portions created by the Initial Developer are Copyright (C) 1998
# the Initial Developer. All Rights Reserved.
#
# Contributor(s):
# Mark Pilgrim - port to Python
#
# This library is free software; you can redistribute it and/or
# modify it under the terms of the GNU Lesser General Public
# License as published by the Free Software Foundation; either
# version 2.1 of the License, or (at your option) any later version.
#
# This library is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
# Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public
# License along with this library; if not, write to the Free Software
# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
# 02110-1301 USA
######################### END LICENSE BLOCK #########################
from .mbcharsetprober import MultiByteCharSetProber
from .codingstatemachine import CodingStateMachine
from .chardistribution import EUCTWDistributionAnalysis
from .mbcssm import EUCTW_SM_MODEL
class EUCTWProber(MultiByteCharSetProber):
def __init__(self):
super(EUCTWProber, self).__init__()
self.coding_sm = CodingStateMachine(EUCTW_SM_MODEL)
self.distribution_analyzer = EUCTWDistributionAnalysis()
self.reset()
@property
def charset_name(self):
return "EUC-TW"
@property
def language(self):
return "Taiwan"

View File

@ -0,0 +1,283 @@
######################## BEGIN LICENSE BLOCK ########################
# The Original Code is Mozilla Communicator client code.
#
# The Initial Developer of the Original Code is
# Netscape Communications Corporation.
# Portions created by the Initial Developer are Copyright (C) 1998
# the Initial Developer. All Rights Reserved.
#
# Contributor(s):
# Mark Pilgrim - port to Python
#
# This library is free software; you can redistribute it and/or
# modify it under the terms of the GNU Lesser General Public
# License as published by the Free Software Foundation; either
# version 2.1 of the License, or (at your option) any later version.
#
# This library is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
# Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public
# License along with this library; if not, write to the Free Software
# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
# 02110-1301 USA
######################### END LICENSE BLOCK #########################
# GB2312 most frequently used character table
#
# Char to FreqOrder table , from hz6763
# 512 --> 0.79 -- 0.79
# 1024 --> 0.92 -- 0.13
# 2048 --> 0.98 -- 0.06
# 6768 --> 1.00 -- 0.02
#
# Ideal Distribution Ratio = 0.79135/(1-0.79135) = 3.79
# Random Distribution Ration = 512 / (3755 - 512) = 0.157
#
# Typical Distribution Ratio about 25% of Ideal one, still much higher that RDR
GB2312_TYPICAL_DISTRIBUTION_RATIO = 0.9
GB2312_TABLE_SIZE = 3760
GB2312_CHAR_TO_FREQ_ORDER = (
1671, 749,1443,2364,3924,3807,2330,3921,1704,3463,2691,1511,1515, 572,3191,2205,
2361, 224,2558, 479,1711, 963,3162, 440,4060,1905,2966,2947,3580,2647,3961,3842,
2204, 869,4207, 970,2678,5626,2944,2956,1479,4048, 514,3595, 588,1346,2820,3409,
249,4088,1746,1873,2047,1774, 581,1813, 358,1174,3590,1014,1561,4844,2245, 670,
1636,3112, 889,1286, 953, 556,2327,3060,1290,3141, 613, 185,3477,1367, 850,3820,
1715,2428,2642,2303,2732,3041,2562,2648,3566,3946,1349, 388,3098,2091,1360,3585,
152,1687,1539, 738,1559, 59,1232,2925,2267,1388,1249,1741,1679,2960, 151,1566,
1125,1352,4271, 924,4296, 385,3166,4459, 310,1245,2850, 70,3285,2729,3534,3575,
2398,3298,3466,1960,2265, 217,3647, 864,1909,2084,4401,2773,1010,3269,5152, 853,
3051,3121,1244,4251,1895, 364,1499,1540,2313,1180,3655,2268, 562, 715,2417,3061,
544, 336,3768,2380,1752,4075, 950, 280,2425,4382, 183,2759,3272, 333,4297,2155,
1688,2356,1444,1039,4540, 736,1177,3349,2443,2368,2144,2225, 565, 196,1482,3406,
927,1335,4147, 692, 878,1311,1653,3911,3622,1378,4200,1840,2969,3149,2126,1816,
2534,1546,2393,2760, 737,2494, 13, 447, 245,2747, 38,2765,2129,2589,1079, 606,
360, 471,3755,2890, 404, 848, 699,1785,1236, 370,2221,1023,3746,2074,2026,2023,
2388,1581,2119, 812,1141,3091,2536,1519, 804,2053, 406,1596,1090, 784, 548,4414,
1806,2264,2936,1100, 343,4114,5096, 622,3358, 743,3668,1510,1626,5020,3567,2513,
3195,4115,5627,2489,2991, 24,2065,2697,1087,2719, 48,1634, 315, 68, 985,2052,
198,2239,1347,1107,1439, 597,2366,2172, 871,3307, 919,2487,2790,1867, 236,2570,
1413,3794, 906,3365,3381,1701,1982,1818,1524,2924,1205, 616,2586,2072,2004, 575,
253,3099, 32,1365,1182, 197,1714,2454,1201, 554,3388,3224,2748, 756,2587, 250,
2567,1507,1517,3529,1922,2761,2337,3416,1961,1677,2452,2238,3153, 615, 911,1506,
1474,2495,1265,1906,2749,3756,3280,2161, 898,2714,1759,3450,2243,2444, 563, 26,
3286,2266,3769,3344,2707,3677, 611,1402, 531,1028,2871,4548,1375, 261,2948, 835,
1190,4134, 353, 840,2684,1900,3082,1435,2109,1207,1674, 329,1872,2781,4055,2686,
2104, 608,3318,2423,2957,2768,1108,3739,3512,3271,3985,2203,1771,3520,1418,2054,
1681,1153, 225,1627,2929, 162,2050,2511,3687,1954, 124,1859,2431,1684,3032,2894,
585,4805,3969,2869,2704,2088,2032,2095,3656,2635,4362,2209, 256, 518,2042,2105,
3777,3657, 643,2298,1148,1779, 190, 989,3544, 414, 11,2135,2063,2979,1471, 403,
3678, 126, 770,1563, 671,2499,3216,2877, 600,1179, 307,2805,4937,1268,1297,2694,
252,4032,1448,1494,1331,1394, 127,2256, 222,1647,1035,1481,3056,1915,1048, 873,
3651, 210, 33,1608,2516, 200,1520, 415, 102, 0,3389,1287, 817, 91,3299,2940,
836,1814, 549,2197,1396,1669,2987,3582,2297,2848,4528,1070, 687, 20,1819, 121,
1552,1364,1461,1968,2617,3540,2824,2083, 177, 948,4938,2291, 110,4549,2066, 648,
3359,1755,2110,2114,4642,4845,1693,3937,3308,1257,1869,2123, 208,1804,3159,2992,
2531,2549,3361,2418,1350,2347,2800,2568,1291,2036,2680, 72, 842,1990, 212,1233,
1154,1586, 75,2027,3410,4900,1823,1337,2710,2676, 728,2810,1522,3026,4995, 157,
755,1050,4022, 710, 785,1936,2194,2085,1406,2777,2400, 150,1250,4049,1206, 807,
1910, 534, 529,3309,1721,1660, 274, 39,2827, 661,2670,1578, 925,3248,3815,1094,
4278,4901,4252, 41,1150,3747,2572,2227,4501,3658,4902,3813,3357,3617,2884,2258,
887, 538,4187,3199,1294,2439,3042,2329,2343,2497,1255, 107, 543,1527, 521,3478,
3568, 194,5062, 15, 961,3870,1241,1192,2664, 66,5215,3260,2111,1295,1127,2152,
3805,4135, 901,1164,1976, 398,1278, 530,1460, 748, 904,1054,1966,1426, 53,2909,
509, 523,2279,1534, 536,1019, 239,1685, 460,2353, 673,1065,2401,3600,4298,2272,
1272,2363, 284,1753,3679,4064,1695, 81, 815,2677,2757,2731,1386, 859, 500,4221,
2190,2566, 757,1006,2519,2068,1166,1455, 337,2654,3203,1863,1682,1914,3025,1252,
1409,1366, 847, 714,2834,2038,3209, 964,2970,1901, 885,2553,1078,1756,3049, 301,
1572,3326, 688,2130,1996,2429,1805,1648,2930,3421,2750,3652,3088, 262,1158,1254,
389,1641,1812, 526,1719, 923,2073,1073,1902, 468, 489,4625,1140, 857,2375,3070,
3319,2863, 380, 116,1328,2693,1161,2244, 273,1212,1884,2769,3011,1775,1142, 461,
3066,1200,2147,2212, 790, 702,2695,4222,1601,1058, 434,2338,5153,3640, 67,2360,
4099,2502, 618,3472,1329, 416,1132, 830,2782,1807,2653,3211,3510,1662, 192,2124,
296,3979,1739,1611,3684, 23, 118, 324, 446,1239,1225, 293,2520,3814,3795,2535,
3116, 17,1074, 467,2692,2201, 387,2922, 45,1326,3055,1645,3659,2817, 958, 243,
1903,2320,1339,2825,1784,3289, 356, 576, 865,2315,2381,3377,3916,1088,3122,1713,
1655, 935, 628,4689,1034,1327, 441, 800, 720, 894,1979,2183,1528,5289,2702,1071,
4046,3572,2399,1571,3281, 79, 761,1103, 327, 134, 758,1899,1371,1615, 879, 442,
215,2605,2579, 173,2048,2485,1057,2975,3317,1097,2253,3801,4263,1403,1650,2946,
814,4968,3487,1548,2644,1567,1285, 2, 295,2636, 97, 946,3576, 832, 141,4257,
3273, 760,3821,3521,3156,2607, 949,1024,1733,1516,1803,1920,2125,2283,2665,3180,
1501,2064,3560,2171,1592, 803,3518,1416, 732,3897,4258,1363,1362,2458, 119,1427,
602,1525,2608,1605,1639,3175, 694,3064, 10, 465, 76,2000,4846,4208, 444,3781,
1619,3353,2206,1273,3796, 740,2483, 320,1723,2377,3660,2619,1359,1137,1762,1724,
2345,2842,1850,1862, 912, 821,1866, 612,2625,1735,2573,3369,1093, 844, 89, 937,
930,1424,3564,2413,2972,1004,3046,3019,2011, 711,3171,1452,4178, 428, 801,1943,
432, 445,2811, 206,4136,1472, 730, 349, 73, 397,2802,2547, 998,1637,1167, 789,
396,3217, 154,1218, 716,1120,1780,2819,4826,1931,3334,3762,2139,1215,2627, 552,
3664,3628,3232,1405,2383,3111,1356,2652,3577,3320,3101,1703, 640,1045,1370,1246,
4996, 371,1575,2436,1621,2210, 984,4033,1734,2638, 16,4529, 663,2755,3255,1451,
3917,2257,1253,1955,2234,1263,2951, 214,1229, 617, 485, 359,1831,1969, 473,2310,
750,2058, 165, 80,2864,2419, 361,4344,2416,2479,1134, 796,3726,1266,2943, 860,
2715, 938, 390,2734,1313,1384, 248, 202, 877,1064,2854, 522,3907, 279,1602, 297,
2357, 395,3740, 137,2075, 944,4089,2584,1267,3802, 62,1533,2285, 178, 176, 780,
2440, 201,3707, 590, 478,1560,4354,2117,1075, 30, 74,4643,4004,1635,1441,2745,
776,2596, 238,1077,1692,1912,2844, 605, 499,1742,3947, 241,3053, 980,1749, 936,
2640,4511,2582, 515,1543,2162,5322,2892,2993, 890,2148,1924, 665,1827,3581,1032,
968,3163, 339,1044,1896, 270, 583,1791,1720,4367,1194,3488,3669, 43,2523,1657,
163,2167, 290,1209,1622,3378, 550, 634,2508,2510, 695,2634,2384,2512,1476,1414,
220,1469,2341,2138,2852,3183,2900,4939,2865,3502,1211,3680, 854,3227,1299,2976,
3172, 186,2998,1459, 443,1067,3251,1495, 321,1932,3054, 909, 753,1410,1828, 436,
2441,1119,1587,3164,2186,1258, 227, 231,1425,1890,3200,3942, 247, 959, 725,5254,
2741, 577,2158,2079, 929, 120, 174, 838,2813, 591,1115, 417,2024, 40,3240,1536,
1037, 291,4151,2354, 632,1298,2406,2500,3535,1825,1846,3451, 205,1171, 345,4238,
18,1163, 811, 685,2208,1217, 425,1312,1508,1175,4308,2552,1033, 587,1381,3059,
2984,3482, 340,1316,4023,3972, 792,3176, 519, 777,4690, 918, 933,4130,2981,3741,
90,3360,2911,2200,5184,4550, 609,3079,2030, 272,3379,2736, 363,3881,1130,1447,
286, 779, 357,1169,3350,3137,1630,1220,2687,2391, 747,1277,3688,2618,2682,2601,
1156,3196,5290,4034,3102,1689,3596,3128, 874, 219,2783, 798, 508,1843,2461, 269,
1658,1776,1392,1913,2983,3287,2866,2159,2372, 829,4076, 46,4253,2873,1889,1894,
915,1834,1631,2181,2318, 298, 664,2818,3555,2735, 954,3228,3117, 527,3511,2173,
681,2712,3033,2247,2346,3467,1652, 155,2164,3382, 113,1994, 450, 899, 494, 994,
1237,2958,1875,2336,1926,3727, 545,1577,1550, 633,3473, 204,1305,3072,2410,1956,
2471, 707,2134, 841,2195,2196,2663,3843,1026,4940, 990,3252,4997, 368,1092, 437,
3212,3258,1933,1829, 675,2977,2893, 412, 943,3723,4644,3294,3283,2230,2373,5154,
2389,2241,2661,2323,1404,2524, 593, 787, 677,3008,1275,2059, 438,2709,2609,2240,
2269,2246,1446, 36,1568,1373,3892,1574,2301,1456,3962, 693,2276,5216,2035,1143,
2720,1919,1797,1811,2763,4137,2597,1830,1699,1488,1198,2090, 424,1694, 312,3634,
3390,4179,3335,2252,1214, 561,1059,3243,2295,2561, 975,5155,2321,2751,3772, 472,
1537,3282,3398,1047,2077,2348,2878,1323,3340,3076, 690,2906, 51, 369, 170,3541,
1060,2187,2688,3670,2541,1083,1683, 928,3918, 459, 109,4427, 599,3744,4286, 143,
2101,2730,2490, 82,1588,3036,2121, 281,1860, 477,4035,1238,2812,3020,2716,3312,
1530,2188,2055,1317, 843, 636,1808,1173,3495, 649, 181,1002, 147,3641,1159,2414,
3750,2289,2795, 813,3123,2610,1136,4368, 5,3391,4541,2174, 420, 429,1728, 754,
1228,2115,2219, 347,2223,2733, 735,1518,3003,2355,3134,1764,3948,3329,1888,2424,
1001,1234,1972,3321,3363,1672,1021,1450,1584, 226, 765, 655,2526,3404,3244,2302,
3665, 731, 594,2184, 319,1576, 621, 658,2656,4299,2099,3864,1279,2071,2598,2739,
795,3086,3699,3908,1707,2352,2402,1382,3136,2475,1465,4847,3496,3865,1085,3004,
2591,1084, 213,2287,1963,3565,2250, 822, 793,4574,3187,1772,1789,3050, 595,1484,
1959,2770,1080,2650, 456, 422,2996, 940,3322,4328,4345,3092,2742, 965,2784, 739,
4124, 952,1358,2498,2949,2565, 332,2698,2378, 660,2260,2473,4194,3856,2919, 535,
1260,2651,1208,1428,1300,1949,1303,2942, 433,2455,2450,1251,1946, 614,1269, 641,
1306,1810,2737,3078,2912, 564,2365,1419,1415,1497,4460,2367,2185,1379,3005,1307,
3218,2175,1897,3063, 682,1157,4040,4005,1712,1160,1941,1399, 394, 402,2952,1573,
1151,2986,2404, 862, 299,2033,1489,3006, 346, 171,2886,3401,1726,2932, 168,2533,
47,2507,1030,3735,1145,3370,1395,1318,1579,3609,4560,2857,4116,1457,2529,1965,
504,1036,2690,2988,2405, 745,5871, 849,2397,2056,3081, 863,2359,3857,2096, 99,
1397,1769,2300,4428,1643,3455,1978,1757,3718,1440, 35,4879,3742,1296,4228,2280,
160,5063,1599,2013, 166, 520,3479,1646,3345,3012, 490,1937,1545,1264,2182,2505,
1096,1188,1369,1436,2421,1667,2792,2460,1270,2122, 727,3167,2143, 806,1706,1012,
1800,3037, 960,2218,1882, 805, 139,2456,1139,1521, 851,1052,3093,3089, 342,2039,
744,5097,1468,1502,1585,2087, 223, 939, 326,2140,2577, 892,2481,1623,4077, 982,
3708, 135,2131, 87,2503,3114,2326,1106, 876,1616, 547,2997,2831,2093,3441,4530,
4314, 9,3256,4229,4148, 659,1462,1986,1710,2046,2913,2231,4090,4880,5255,3392,
3274,1368,3689,4645,1477, 705,3384,3635,1068,1529,2941,1458,3782,1509, 100,1656,
2548, 718,2339, 408,1590,2780,3548,1838,4117,3719,1345,3530, 717,3442,2778,3220,
2898,1892,4590,3614,3371,2043,1998,1224,3483, 891, 635, 584,2559,3355, 733,1766,
1729,1172,3789,1891,2307, 781,2982,2271,1957,1580,5773,2633,2005,4195,3097,1535,
3213,1189,1934,5693,3262, 586,3118,1324,1598, 517,1564,2217,1868,1893,4445,3728,
2703,3139,1526,1787,1992,3882,2875,1549,1199,1056,2224,1904,2711,5098,4287, 338,
1993,3129,3489,2689,1809,2815,1997, 957,1855,3898,2550,3275,3057,1105,1319, 627,
1505,1911,1883,3526, 698,3629,3456,1833,1431, 746, 77,1261,2017,2296,1977,1885,
125,1334,1600, 525,1798,1109,2222,1470,1945, 559,2236,1186,3443,2476,1929,1411,
2411,3135,1777,3372,2621,1841,1613,3229, 668,1430,1839,2643,2916, 195,1989,2671,
2358,1387, 629,3205,2293,5256,4439, 123,1310, 888,1879,4300,3021,3605,1003,1162,
3192,2910,2010, 140,2395,2859, 55,1082,2012,2901, 662, 419,2081,1438, 680,2774,
4654,3912,1620,1731,1625,5035,4065,2328, 512,1344, 802,5443,2163,2311,2537, 524,
3399, 98,1155,2103,1918,2606,3925,2816,1393,2465,1504,3773,2177,3963,1478,4346,
180,1113,4655,3461,2028,1698, 833,2696,1235,1322,1594,4408,3623,3013,3225,2040,
3022, 541,2881, 607,3632,2029,1665,1219, 639,1385,1686,1099,2803,3231,1938,3188,
2858, 427, 676,2772,1168,2025, 454,3253,2486,3556, 230,1950, 580, 791,1991,1280,
1086,1974,2034, 630, 257,3338,2788,4903,1017, 86,4790, 966,2789,1995,1696,1131,
259,3095,4188,1308, 179,1463,5257, 289,4107,1248, 42,3413,1725,2288, 896,1947,
774,4474,4254, 604,3430,4264, 392,2514,2588, 452, 237,1408,3018, 988,4531,1970,
3034,3310, 540,2370,1562,1288,2990, 502,4765,1147, 4,1853,2708, 207, 294,2814,
4078,2902,2509, 684, 34,3105,3532,2551, 644, 709,2801,2344, 573,1727,3573,3557,
2021,1081,3100,4315,2100,3681, 199,2263,1837,2385, 146,3484,1195,2776,3949, 997,
1939,3973,1008,1091,1202,1962,1847,1149,4209,5444,1076, 493, 117,5400,2521, 972,
1490,2934,1796,4542,2374,1512,2933,2657, 413,2888,1135,2762,2314,2156,1355,2369,
766,2007,2527,2170,3124,2491,2593,2632,4757,2437, 234,3125,3591,1898,1750,1376,
1942,3468,3138, 570,2127,2145,3276,4131, 962, 132,1445,4196, 19, 941,3624,3480,
3366,1973,1374,4461,3431,2629, 283,2415,2275, 808,2887,3620,2112,2563,1353,3610,
955,1089,3103,1053, 96, 88,4097, 823,3808,1583, 399, 292,4091,3313, 421,1128,
642,4006, 903,2539,1877,2082, 596, 29,4066,1790, 722,2157, 130, 995,1569, 769,
1485, 464, 513,2213, 288,1923,1101,2453,4316, 133, 486,2445, 50, 625, 487,2207,
57, 423, 481,2962, 159,3729,1558, 491, 303, 482, 501, 240,2837, 112,3648,2392,
1783, 362, 8,3433,3422, 610,2793,3277,1390,1284,1654, 21,3823, 734, 367, 623,
193, 287, 374,1009,1483, 816, 476, 313,2255,2340,1262,2150,2899,1146,2581, 782,
2116,1659,2018,1880, 255,3586,3314,1110,2867,2137,2564, 986,2767,5185,2006, 650,
158, 926, 762, 881,3157,2717,2362,3587, 306,3690,3245,1542,3077,2427,1691,2478,
2118,2985,3490,2438, 539,2305, 983, 129,1754, 355,4201,2386, 827,2923, 104,1773,
2838,2771, 411,2905,3919, 376, 767, 122,1114, 828,2422,1817,3506, 266,3460,1007,
1609,4998, 945,2612,4429,2274, 726,1247,1964,2914,2199,2070,4002,4108, 657,3323,
1422, 579, 455,2764,4737,1222,2895,1670, 824,1223,1487,2525, 558, 861,3080, 598,
2659,2515,1967, 752,2583,2376,2214,4180, 977, 704,2464,4999,2622,4109,1210,2961,
819,1541, 142,2284, 44, 418, 457,1126,3730,4347,4626,1644,1876,3671,1864, 302,
1063,5694, 624, 723,1984,3745,1314,1676,2488,1610,1449,3558,3569,2166,2098, 409,
1011,2325,3704,2306, 818,1732,1383,1824,1844,3757, 999,2705,3497,1216,1423,2683,
2426,2954,2501,2726,2229,1475,2554,5064,1971,1794,1666,2014,1343, 783, 724, 191,
2434,1354,2220,5065,1763,2752,2472,4152, 131, 175,2885,3434, 92,1466,4920,2616,
3871,3872,3866, 128,1551,1632, 669,1854,3682,4691,4125,1230, 188,2973,3290,1302,
1213, 560,3266, 917, 763,3909,3249,1760, 868,1958, 764,1782,2097, 145,2277,3774,
4462, 64,1491,3062, 971,2132,3606,2442, 221,1226,1617, 218, 323,1185,3207,3147,
571, 619,1473,1005,1744,2281, 449,1887,2396,3685, 275, 375,3816,1743,3844,3731,
845,1983,2350,4210,1377, 773, 967,3499,3052,3743,2725,4007,1697,1022,3943,1464,
3264,2855,2722,1952,1029,2839,2467, 84,4383,2215, 820,1391,2015,2448,3672, 377,
1948,2168, 797,2545,3536,2578,2645, 94,2874,1678, 405,1259,3071, 771, 546,1315,
470,1243,3083, 895,2468, 981, 969,2037, 846,4181, 653,1276,2928, 14,2594, 557,
3007,2474, 156, 902,1338,1740,2574, 537,2518, 973,2282,2216,2433,1928, 138,2903,
1293,2631,1612, 646,3457, 839,2935, 111, 496,2191,2847, 589,3186, 149,3994,2060,
4031,2641,4067,3145,1870, 37,3597,2136,1025,2051,3009,3383,3549,1121,1016,3261,
1301, 251,2446,2599,2153, 872,3246, 637, 334,3705, 831, 884, 921,3065,3140,4092,
2198,1944, 246,2964, 108,2045,1152,1921,2308,1031, 203,3173,4170,1907,3890, 810,
1401,2003,1690, 506, 647,1242,2828,1761,1649,3208,2249,1589,3709,2931,5156,1708,
498, 666,2613, 834,3817,1231, 184,2851,1124, 883,3197,2261,3710,1765,1553,2658,
1178,2639,2351, 93,1193, 942,2538,2141,4402, 235,1821, 870,1591,2192,1709,1871,
3341,1618,4126,2595,2334, 603, 651, 69, 701, 268,2662,3411,2555,1380,1606, 503,
448, 254,2371,2646, 574,1187,2309,1770, 322,2235,1292,1801, 305, 566,1133, 229,
2067,2057, 706, 167, 483,2002,2672,3295,1820,3561,3067, 316, 378,2746,3452,1112,
136,1981, 507,1651,2917,1117, 285,4591, 182,2580,3522,1304, 335,3303,1835,2504,
1795,1792,2248, 674,1018,2106,2449,1857,2292,2845, 976,3047,1781,2600,2727,1389,
1281, 52,3152, 153, 265,3950, 672,3485,3951,4463, 430,1183, 365, 278,2169, 27,
1407,1336,2304, 209,1340,1730,2202,1852,2403,2883, 979,1737,1062, 631,2829,2542,
3876,2592, 825,2086,2226,3048,3625, 352,1417,3724, 542, 991, 431,1351,3938,1861,
2294, 826,1361,2927,3142,3503,1738, 463,2462,2723, 582,1916,1595,2808, 400,3845,
3891,2868,3621,2254, 58,2492,1123, 910,2160,2614,1372,1603,1196,1072,3385,1700,
3267,1980, 696, 480,2430, 920, 799,1570,2920,1951,2041,4047,2540,1321,4223,2469,
3562,2228,1271,2602, 401,2833,3351,2575,5157, 907,2312,1256, 410, 263,3507,1582,
996, 678,1849,2316,1480, 908,3545,2237, 703,2322, 667,1826,2849,1531,2604,2999,
2407,3146,2151,2630,1786,3711, 469,3542, 497,3899,2409, 858, 837,4446,3393,1274,
786, 620,1845,2001,3311, 484, 308,3367,1204,1815,3691,2332,1532,2557,1842,2020,
2724,1927,2333,4440, 567, 22,1673,2728,4475,1987,1858,1144,1597, 101,1832,3601,
12, 974,3783,4391, 951,1412, 1,3720, 453,4608,4041, 528,1041,1027,3230,2628,
1129, 875,1051,3291,1203,2262,1069,2860,2799,2149,2615,3278, 144,1758,3040, 31,
475,1680, 366,2685,3184, 311,1642,4008,2466,5036,1593,1493,2809, 216,1420,1668,
233, 304,2128,3284, 232,1429,1768,1040,2008,3407,2740,2967,2543, 242,2133, 778,
1565,2022,2620, 505,2189,2756,1098,2273, 372,1614, 708, 553,2846,2094,2278, 169,
3626,2835,4161, 228,2674,3165, 809,1454,1309, 466,1705,1095, 900,3423, 880,2667,
3751,5258,2317,3109,2571,4317,2766,1503,1342, 866,4447,1118, 63,2076, 314,1881,
1348,1061, 172, 978,3515,1747, 532, 511,3970, 6, 601, 905,2699,3300,1751, 276,
1467,3725,2668, 65,4239,2544,2779,2556,1604, 578,2451,1802, 992,2331,2624,1320,
3446, 713,1513,1013, 103,2786,2447,1661, 886,1702, 916, 654,3574,2031,1556, 751,
2178,2821,2179,1498,1538,2176, 271, 914,2251,2080,1325, 638,1953,2937,3877,2432,
2754, 95,3265,1716, 260,1227,4083, 775, 106,1357,3254, 426,1607, 555,2480, 772,
1985, 244,2546, 474, 495,1046,2611,1851,2061, 71,2089,1675,2590, 742,3758,2843,
3222,1433, 267,2180,2576,2826,2233,2092,3913,2435, 956,1745,3075, 856,2113,1116,
451, 3,1988,2896,1398, 993,2463,1878,2049,1341,2718,2721,2870,2108, 712,2904,
4363,2753,2324, 277,2872,2349,2649, 384, 987, 435, 691,3000, 922, 164,3939, 652,
1500,1184,4153,2482,3373,2165,4848,2335,3775,3508,3154,2806,2830,1554,2102,1664,
2530,1434,2408, 893,1547,2623,3447,2832,2242,2532,3169,2856,3223,2078, 49,3770,
3469, 462, 318, 656,2259,3250,3069, 679,1629,2758, 344,1138,1104,3120,1836,1283,
3115,2154,1437,4448, 934, 759,1999, 794,2862,1038, 533,2560,1722,2342, 855,2626,
1197,1663,4476,3127, 85,4240,2528, 25,1111,1181,3673, 407,3470,4561,2679,2713,
768,1925,2841,3986,1544,1165, 932, 373,1240,2146,1930,2673, 721,4766, 354,4333,
391,2963, 187, 61,3364,1442,1102, 330,1940,1767, 341,3809,4118, 393,2496,2062,
2211, 105, 331, 300, 439, 913,1332, 626, 379,3304,1557, 328, 689,3952, 309,1555,
931, 317,2517,3027, 325, 569, 686,2107,3084, 60,1042,1333,2794, 264,3177,4014,
1628, 258,3712, 7,4464,1176,1043,1778, 683, 114,1975, 78,1492, 383,1886, 510,
386, 645,5291,2891,2069,3305,4138,3867,2939,2603,2493,1935,1066,1848,3588,1015,
1282,1289,4609, 697,1453,3044,2666,3611,1856,2412, 54, 719,1330, 568,3778,2459,
1748, 788, 492, 551,1191,1000, 488,3394,3763, 282,1799, 348,2016,1523,3155,2390,
1049, 382,2019,1788,1170, 729,2968,3523, 897,3926,2785,2938,3292, 350,2319,3238,
1718,1717,2655,3453,3143,4465, 161,2889,2980,2009,1421, 56,1908,1640,2387,2232,
1917,1874,2477,4921, 148, 83,3438, 592,4245,2882,1822,1055, 741, 115,1496,1624,
381,1638,4592,1020, 516,3214, 458, 947,4575,1432, 211,1514,2926,1865,2142, 189,
852,1221,1400,1486, 882,2299,4036, 351, 28,1122, 700,6479,6480,6481,6482,6483, #last 512
)

View File

@ -0,0 +1,46 @@
######################## BEGIN LICENSE BLOCK ########################
# The Original Code is mozilla.org code.
#
# The Initial Developer of the Original Code is
# Netscape Communications Corporation.
# Portions created by the Initial Developer are Copyright (C) 1998
# the Initial Developer. All Rights Reserved.
#
# Contributor(s):
# Mark Pilgrim - port to Python
#
# This library is free software; you can redistribute it and/or
# modify it under the terms of the GNU Lesser General Public
# License as published by the Free Software Foundation; either
# version 2.1 of the License, or (at your option) any later version.
#
# This library is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
# Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public
# License along with this library; if not, write to the Free Software
# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
# 02110-1301 USA
######################### END LICENSE BLOCK #########################
from .mbcharsetprober import MultiByteCharSetProber
from .codingstatemachine import CodingStateMachine
from .chardistribution import GB2312DistributionAnalysis
from .mbcssm import GB2312_SM_MODEL
class GB2312Prober(MultiByteCharSetProber):
def __init__(self):
super(GB2312Prober, self).__init__()
self.coding_sm = CodingStateMachine(GB2312_SM_MODEL)
self.distribution_analyzer = GB2312DistributionAnalysis()
self.reset()
@property
def charset_name(self):
return "GB2312"
@property
def language(self):
return "Chinese"

View File

@ -0,0 +1,292 @@
######################## BEGIN LICENSE BLOCK ########################
# The Original Code is Mozilla Universal charset detector code.
#
# The Initial Developer of the Original Code is
# Shy Shalom
# Portions created by the Initial Developer are Copyright (C) 2005
# the Initial Developer. All Rights Reserved.
#
# Contributor(s):
# Mark Pilgrim - port to Python
#
# This library is free software; you can redistribute it and/or
# modify it under the terms of the GNU Lesser General Public
# License as published by the Free Software Foundation; either
# version 2.1 of the License, or (at your option) any later version.
#
# This library is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
# Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public
# License along with this library; if not, write to the Free Software
# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
# 02110-1301 USA
######################### END LICENSE BLOCK #########################
from .charsetprober import CharSetProber
from .enums import ProbingState
# This prober doesn't actually recognize a language or a charset.
# It is a helper prober for the use of the Hebrew model probers
### General ideas of the Hebrew charset recognition ###
#
# Four main charsets exist in Hebrew:
# "ISO-8859-8" - Visual Hebrew
# "windows-1255" - Logical Hebrew
# "ISO-8859-8-I" - Logical Hebrew
# "x-mac-hebrew" - ?? Logical Hebrew ??
#
# Both "ISO" charsets use a completely identical set of code points, whereas
# "windows-1255" and "x-mac-hebrew" are two different proper supersets of
# these code points. windows-1255 defines additional characters in the range
# 0x80-0x9F as some misc punctuation marks as well as some Hebrew-specific
# diacritics and additional 'Yiddish' ligature letters in the range 0xc0-0xd6.
# x-mac-hebrew defines similar additional code points but with a different
# mapping.
#
# As far as an average Hebrew text with no diacritics is concerned, all four
# charsets are identical with respect to code points. Meaning that for the
# main Hebrew alphabet, all four map the same values to all 27 Hebrew letters
# (including final letters).
#
# The dominant difference between these charsets is their directionality.
# "Visual" directionality means that the text is ordered as if the renderer is
# not aware of a BIDI rendering algorithm. The renderer sees the text and
# draws it from left to right. The text itself when ordered naturally is read
# backwards. A buffer of Visual Hebrew generally looks like so:
# "[last word of first line spelled backwards] [whole line ordered backwards
# and spelled backwards] [first word of first line spelled backwards]
# [end of line] [last word of second line] ... etc' "
# adding punctuation marks, numbers and English text to visual text is
# naturally also "visual" and from left to right.
#
# "Logical" directionality means the text is ordered "naturally" according to
# the order it is read. It is the responsibility of the renderer to display
# the text from right to left. A BIDI algorithm is used to place general
# punctuation marks, numbers and English text in the text.
#
# Texts in x-mac-hebrew are almost impossible to find on the Internet. From
# what little evidence I could find, it seems that its general directionality
# is Logical.
#
# To sum up all of the above, the Hebrew probing mechanism knows about two
# charsets:
# Visual Hebrew - "ISO-8859-8" - backwards text - Words and sentences are
# backwards while line order is natural. For charset recognition purposes
# the line order is unimportant (In fact, for this implementation, even
# word order is unimportant).
# Logical Hebrew - "windows-1255" - normal, naturally ordered text.
#
# "ISO-8859-8-I" is a subset of windows-1255 and doesn't need to be
# specifically identified.
# "x-mac-hebrew" is also identified as windows-1255. A text in x-mac-hebrew
# that contain special punctuation marks or diacritics is displayed with
# some unconverted characters showing as question marks. This problem might
# be corrected using another model prober for x-mac-hebrew. Due to the fact
# that x-mac-hebrew texts are so rare, writing another model prober isn't
# worth the effort and performance hit.
#
#### The Prober ####
#
# The prober is divided between two SBCharSetProbers and a HebrewProber,
# all of which are managed, created, fed data, inquired and deleted by the
# SBCSGroupProber. The two SBCharSetProbers identify that the text is in
# fact some kind of Hebrew, Logical or Visual. The final decision about which
# one is it is made by the HebrewProber by combining final-letter scores
# with the scores of the two SBCharSetProbers to produce a final answer.
#
# The SBCSGroupProber is responsible for stripping the original text of HTML
# tags, English characters, numbers, low-ASCII punctuation characters, spaces
# and new lines. It reduces any sequence of such characters to a single space.
# The buffer fed to each prober in the SBCS group prober is pure text in
# high-ASCII.
# The two SBCharSetProbers (model probers) share the same language model:
# Win1255Model.
# The first SBCharSetProber uses the model normally as any other
# SBCharSetProber does, to recognize windows-1255, upon which this model was
# built. The second SBCharSetProber is told to make the pair-of-letter
# lookup in the language model backwards. This in practice exactly simulates
# a visual Hebrew model using the windows-1255 logical Hebrew model.
#
# The HebrewProber is not using any language model. All it does is look for
# final-letter evidence suggesting the text is either logical Hebrew or visual
# Hebrew. Disjointed from the model probers, the results of the HebrewProber
# alone are meaningless. HebrewProber always returns 0.00 as confidence
# since it never identifies a charset by itself. Instead, the pointer to the
# HebrewProber is passed to the model probers as a helper "Name Prober".
# When the Group prober receives a positive identification from any prober,
# it asks for the name of the charset identified. If the prober queried is a
# Hebrew model prober, the model prober forwards the call to the
# HebrewProber to make the final decision. In the HebrewProber, the
# decision is made according to the final-letters scores maintained and Both
# model probers scores. The answer is returned in the form of the name of the
# charset identified, either "windows-1255" or "ISO-8859-8".
class HebrewProber(CharSetProber):
# windows-1255 / ISO-8859-8 code points of interest
FINAL_KAF = 0xea
NORMAL_KAF = 0xeb
FINAL_MEM = 0xed
NORMAL_MEM = 0xee
FINAL_NUN = 0xef
NORMAL_NUN = 0xf0
FINAL_PE = 0xf3
NORMAL_PE = 0xf4
FINAL_TSADI = 0xf5
NORMAL_TSADI = 0xf6
# Minimum Visual vs Logical final letter score difference.
# If the difference is below this, don't rely solely on the final letter score
# distance.
MIN_FINAL_CHAR_DISTANCE = 5
# Minimum Visual vs Logical model score difference.
# If the difference is below this, don't rely at all on the model score
# distance.
MIN_MODEL_DISTANCE = 0.01
VISUAL_HEBREW_NAME = "ISO-8859-8"
LOGICAL_HEBREW_NAME = "windows-1255"
def __init__(self):
super(HebrewProber, self).__init__()
self._final_char_logical_score = None
self._final_char_visual_score = None
self._prev = None
self._before_prev = None
self._logical_prober = None
self._visual_prober = None
self.reset()
def reset(self):
self._final_char_logical_score = 0
self._final_char_visual_score = 0
# The two last characters seen in the previous buffer,
# mPrev and mBeforePrev are initialized to space in order to simulate
# a word delimiter at the beginning of the data
self._prev = ' '
self._before_prev = ' '
# These probers are owned by the group prober.
def set_model_probers(self, logicalProber, visualProber):
self._logical_prober = logicalProber
self._visual_prober = visualProber
def is_final(self, c):
return c in [self.FINAL_KAF, self.FINAL_MEM, self.FINAL_NUN,
self.FINAL_PE, self.FINAL_TSADI]
def is_non_final(self, c):
# The normal Tsadi is not a good Non-Final letter due to words like
# 'lechotet' (to chat) containing an apostrophe after the tsadi. This
# apostrophe is converted to a space in FilterWithoutEnglishLetters
# causing the Non-Final tsadi to appear at an end of a word even
# though this is not the case in the original text.
# The letters Pe and Kaf rarely display a related behavior of not being
# a good Non-Final letter. Words like 'Pop', 'Winamp' and 'Mubarak'
# for example legally end with a Non-Final Pe or Kaf. However, the
# benefit of these letters as Non-Final letters outweighs the damage
# since these words are quite rare.
return c in [self.NORMAL_KAF, self.NORMAL_MEM,
self.NORMAL_NUN, self.NORMAL_PE]
def feed(self, byte_str):
# Final letter analysis for logical-visual decision.
# Look for evidence that the received buffer is either logical Hebrew
# or visual Hebrew.
# The following cases are checked:
# 1) A word longer than 1 letter, ending with a final letter. This is
# an indication that the text is laid out "naturally" since the
# final letter really appears at the end. +1 for logical score.
# 2) A word longer than 1 letter, ending with a Non-Final letter. In
# normal Hebrew, words ending with Kaf, Mem, Nun, Pe or Tsadi,
# should not end with the Non-Final form of that letter. Exceptions
# to this rule are mentioned above in isNonFinal(). This is an
# indication that the text is laid out backwards. +1 for visual
# score
# 3) A word longer than 1 letter, starting with a final letter. Final
# letters should not appear at the beginning of a word. This is an
# indication that the text is laid out backwards. +1 for visual
# score.
#
# The visual score and logical score are accumulated throughout the
# text and are finally checked against each other in GetCharSetName().
# No checking for final letters in the middle of words is done since
# that case is not an indication for either Logical or Visual text.
#
# We automatically filter out all 7-bit characters (replace them with
# spaces) so the word boundary detection works properly. [MAP]
if self.state == ProbingState.NOT_ME:
# Both model probers say it's not them. No reason to continue.
return ProbingState.NOT_ME
byte_str = self.filter_high_byte_only(byte_str)
for cur in byte_str:
if cur == ' ':
# We stand on a space - a word just ended
if self._before_prev != ' ':
# next-to-last char was not a space so self._prev is not a
# 1 letter word
if self.is_final(self._prev):
# case (1) [-2:not space][-1:final letter][cur:space]
self._final_char_logical_score += 1
elif self.is_non_final(self._prev):
# case (2) [-2:not space][-1:Non-Final letter][
# cur:space]
self._final_char_visual_score += 1
else:
# Not standing on a space
if ((self._before_prev == ' ') and
(self.is_final(self._prev)) and (cur != ' ')):
# case (3) [-2:space][-1:final letter][cur:not space]
self._final_char_visual_score += 1
self._before_prev = self._prev
self._prev = cur
# Forever detecting, till the end or until both model probers return
# ProbingState.NOT_ME (handled above)
return ProbingState.DETECTING
@property
def charset_name(self):
# Make the decision: is it Logical or Visual?
# If the final letter score distance is dominant enough, rely on it.
finalsub = self._final_char_logical_score - self._final_char_visual_score
if finalsub >= self.MIN_FINAL_CHAR_DISTANCE:
return self.LOGICAL_HEBREW_NAME
if finalsub <= -self.MIN_FINAL_CHAR_DISTANCE:
return self.VISUAL_HEBREW_NAME
# It's not dominant enough, try to rely on the model scores instead.
modelsub = (self._logical_prober.get_confidence()
- self._visual_prober.get_confidence())
if modelsub > self.MIN_MODEL_DISTANCE:
return self.LOGICAL_HEBREW_NAME
if modelsub < -self.MIN_MODEL_DISTANCE:
return self.VISUAL_HEBREW_NAME
# Still no good, back to final letter distance, maybe it'll save the
# day.
if finalsub < 0.0:
return self.VISUAL_HEBREW_NAME
# (finalsub > 0 - Logical) or (don't know what to do) default to
# Logical.
return self.LOGICAL_HEBREW_NAME
@property
def language(self):
return 'Hebrew'
@property
def state(self):
# Remain active as long as any of the model probers are active.
if (self._logical_prober.state == ProbingState.NOT_ME) and \
(self._visual_prober.state == ProbingState.NOT_ME):
return ProbingState.NOT_ME
return ProbingState.DETECTING

View File

@ -0,0 +1,325 @@
######################## BEGIN LICENSE BLOCK ########################
# The Original Code is Mozilla Communicator client code.
#
# The Initial Developer of the Original Code is
# Netscape Communications Corporation.
# Portions created by the Initial Developer are Copyright (C) 1998
# the Initial Developer. All Rights Reserved.
#
# Contributor(s):
# Mark Pilgrim - port to Python
#
# This library is free software; you can redistribute it and/or
# modify it under the terms of the GNU Lesser General Public
# License as published by the Free Software Foundation; either
# version 2.1 of the License, or (at your option) any later version.
#
# This library is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
# Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public
# License along with this library; if not, write to the Free Software
# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
# 02110-1301 USA
######################### END LICENSE BLOCK #########################
# Sampling from about 20M text materials include literature and computer technology
#
# Japanese frequency table, applied to both S-JIS and EUC-JP
# They are sorted in order.
# 128 --> 0.77094
# 256 --> 0.85710
# 512 --> 0.92635
# 1024 --> 0.97130
# 2048 --> 0.99431
#
# Ideal Distribution Ratio = 0.92635 / (1-0.92635) = 12.58
# Random Distribution Ration = 512 / (2965+62+83+86-512) = 0.191
#
# Typical Distribution Ratio, 25% of IDR
JIS_TYPICAL_DISTRIBUTION_RATIO = 3.0
# Char to FreqOrder table ,
JIS_TABLE_SIZE = 4368
JIS_CHAR_TO_FREQ_ORDER = (
40, 1, 6, 182, 152, 180, 295,2127, 285, 381,3295,4304,3068,4606,3165,3510, # 16
3511,1822,2785,4607,1193,2226,5070,4608, 171,2996,1247, 18, 179,5071, 856,1661, # 32
1262,5072, 619, 127,3431,3512,3230,1899,1700, 232, 228,1294,1298, 284, 283,2041, # 48
2042,1061,1062, 48, 49, 44, 45, 433, 434,1040,1041, 996, 787,2997,1255,4305, # 64
2108,4609,1684,1648,5073,5074,5075,5076,5077,5078,3687,5079,4610,5080,3927,3928, # 80
5081,3296,3432, 290,2285,1471,2187,5082,2580,2825,1303,2140,1739,1445,2691,3375, # 96
1691,3297,4306,4307,4611, 452,3376,1182,2713,3688,3069,4308,5083,5084,5085,5086, # 112
5087,5088,5089,5090,5091,5092,5093,5094,5095,5096,5097,5098,5099,5100,5101,5102, # 128
5103,5104,5105,5106,5107,5108,5109,5110,5111,5112,4097,5113,5114,5115,5116,5117, # 144
5118,5119,5120,5121,5122,5123,5124,5125,5126,5127,5128,5129,5130,5131,5132,5133, # 160
5134,5135,5136,5137,5138,5139,5140,5141,5142,5143,5144,5145,5146,5147,5148,5149, # 176
5150,5151,5152,4612,5153,5154,5155,5156,5157,5158,5159,5160,5161,5162,5163,5164, # 192
5165,5166,5167,5168,5169,5170,5171,5172,5173,5174,5175,1472, 598, 618, 820,1205, # 208
1309,1412,1858,1307,1692,5176,5177,5178,5179,5180,5181,5182,1142,1452,1234,1172, # 224
1875,2043,2149,1793,1382,2973, 925,2404,1067,1241, 960,1377,2935,1491, 919,1217, # 240
1865,2030,1406,1499,2749,4098,5183,5184,5185,5186,5187,5188,2561,4099,3117,1804, # 256
2049,3689,4309,3513,1663,5189,3166,3118,3298,1587,1561,3433,5190,3119,1625,2998, # 272
3299,4613,1766,3690,2786,4614,5191,5192,5193,5194,2161, 26,3377, 2,3929, 20, # 288
3691, 47,4100, 50, 17, 16, 35, 268, 27, 243, 42, 155, 24, 154, 29, 184, # 304
4, 91, 14, 92, 53, 396, 33, 289, 9, 37, 64, 620, 21, 39, 321, 5, # 320
12, 11, 52, 13, 3, 208, 138, 0, 7, 60, 526, 141, 151,1069, 181, 275, # 336
1591, 83, 132,1475, 126, 331, 829, 15, 69, 160, 59, 22, 157, 55,1079, 312, # 352
109, 38, 23, 25, 10, 19, 79,5195, 61, 382,1124, 8, 30,5196,5197,5198, # 368
5199,5200,5201,5202,5203,5204,5205,5206, 89, 62, 74, 34,2416, 112, 139, 196, # 384
271, 149, 84, 607, 131, 765, 46, 88, 153, 683, 76, 874, 101, 258, 57, 80, # 400
32, 364, 121,1508, 169,1547, 68, 235, 145,2999, 41, 360,3027, 70, 63, 31, # 416
43, 259, 262,1383, 99, 533, 194, 66, 93, 846, 217, 192, 56, 106, 58, 565, # 432
280, 272, 311, 256, 146, 82, 308, 71, 100, 128, 214, 655, 110, 261, 104,1140, # 448
54, 51, 36, 87, 67,3070, 185,2618,2936,2020, 28,1066,2390,2059,5207,5208, # 464
5209,5210,5211,5212,5213,5214,5215,5216,4615,5217,5218,5219,5220,5221,5222,5223, # 480
5224,5225,5226,5227,5228,5229,5230,5231,5232,5233,5234,5235,5236,3514,5237,5238, # 496
5239,5240,5241,5242,5243,5244,2297,2031,4616,4310,3692,5245,3071,5246,3598,5247, # 512
4617,3231,3515,5248,4101,4311,4618,3808,4312,4102,5249,4103,4104,3599,5250,5251, # 528
5252,5253,5254,5255,5256,5257,5258,5259,5260,5261,5262,5263,5264,5265,5266,5267, # 544
5268,5269,5270,5271,5272,5273,5274,5275,5276,5277,5278,5279,5280,5281,5282,5283, # 560
5284,5285,5286,5287,5288,5289,5290,5291,5292,5293,5294,5295,5296,5297,5298,5299, # 576
5300,5301,5302,5303,5304,5305,5306,5307,5308,5309,5310,5311,5312,5313,5314,5315, # 592
5316,5317,5318,5319,5320,5321,5322,5323,5324,5325,5326,5327,5328,5329,5330,5331, # 608
5332,5333,5334,5335,5336,5337,5338,5339,5340,5341,5342,5343,5344,5345,5346,5347, # 624
5348,5349,5350,5351,5352,5353,5354,5355,5356,5357,5358,5359,5360,5361,5362,5363, # 640
5364,5365,5366,5367,5368,5369,5370,5371,5372,5373,5374,5375,5376,5377,5378,5379, # 656
5380,5381, 363, 642,2787,2878,2788,2789,2316,3232,2317,3434,2011, 165,1942,3930, # 672
3931,3932,3933,5382,4619,5383,4620,5384,5385,5386,5387,5388,5389,5390,5391,5392, # 688
5393,5394,5395,5396,5397,5398,5399,5400,5401,5402,5403,5404,5405,5406,5407,5408, # 704
5409,5410,5411,5412,5413,5414,5415,5416,5417,5418,5419,5420,5421,5422,5423,5424, # 720
5425,5426,5427,5428,5429,5430,5431,5432,5433,5434,5435,5436,5437,5438,5439,5440, # 736
5441,5442,5443,5444,5445,5446,5447,5448,5449,5450,5451,5452,5453,5454,5455,5456, # 752
5457,5458,5459,5460,5461,5462,5463,5464,5465,5466,5467,5468,5469,5470,5471,5472, # 768
5473,5474,5475,5476,5477,5478,5479,5480,5481,5482,5483,5484,5485,5486,5487,5488, # 784
5489,5490,5491,5492,5493,5494,5495,5496,5497,5498,5499,5500,5501,5502,5503,5504, # 800
5505,5506,5507,5508,5509,5510,5511,5512,5513,5514,5515,5516,5517,5518,5519,5520, # 816
5521,5522,5523,5524,5525,5526,5527,5528,5529,5530,5531,5532,5533,5534,5535,5536, # 832
5537,5538,5539,5540,5541,5542,5543,5544,5545,5546,5547,5548,5549,5550,5551,5552, # 848
5553,5554,5555,5556,5557,5558,5559,5560,5561,5562,5563,5564,5565,5566,5567,5568, # 864
5569,5570,5571,5572,5573,5574,5575,5576,5577,5578,5579,5580,5581,5582,5583,5584, # 880
5585,5586,5587,5588,5589,5590,5591,5592,5593,5594,5595,5596,5597,5598,5599,5600, # 896
5601,5602,5603,5604,5605,5606,5607,5608,5609,5610,5611,5612,5613,5614,5615,5616, # 912
5617,5618,5619,5620,5621,5622,5623,5624,5625,5626,5627,5628,5629,5630,5631,5632, # 928
5633,5634,5635,5636,5637,5638,5639,5640,5641,5642,5643,5644,5645,5646,5647,5648, # 944
5649,5650,5651,5652,5653,5654,5655,5656,5657,5658,5659,5660,5661,5662,5663,5664, # 960
5665,5666,5667,5668,5669,5670,5671,5672,5673,5674,5675,5676,5677,5678,5679,5680, # 976
5681,5682,5683,5684,5685,5686,5687,5688,5689,5690,5691,5692,5693,5694,5695,5696, # 992
5697,5698,5699,5700,5701,5702,5703,5704,5705,5706,5707,5708,5709,5710,5711,5712, # 1008
5713,5714,5715,5716,5717,5718,5719,5720,5721,5722,5723,5724,5725,5726,5727,5728, # 1024
5729,5730,5731,5732,5733,5734,5735,5736,5737,5738,5739,5740,5741,5742,5743,5744, # 1040
5745,5746,5747,5748,5749,5750,5751,5752,5753,5754,5755,5756,5757,5758,5759,5760, # 1056
5761,5762,5763,5764,5765,5766,5767,5768,5769,5770,5771,5772,5773,5774,5775,5776, # 1072
5777,5778,5779,5780,5781,5782,5783,5784,5785,5786,5787,5788,5789,5790,5791,5792, # 1088
5793,5794,5795,5796,5797,5798,5799,5800,5801,5802,5803,5804,5805,5806,5807,5808, # 1104
5809,5810,5811,5812,5813,5814,5815,5816,5817,5818,5819,5820,5821,5822,5823,5824, # 1120
5825,5826,5827,5828,5829,5830,5831,5832,5833,5834,5835,5836,5837,5838,5839,5840, # 1136
5841,5842,5843,5844,5845,5846,5847,5848,5849,5850,5851,5852,5853,5854,5855,5856, # 1152
5857,5858,5859,5860,5861,5862,5863,5864,5865,5866,5867,5868,5869,5870,5871,5872, # 1168
5873,5874,5875,5876,5877,5878,5879,5880,5881,5882,5883,5884,5885,5886,5887,5888, # 1184
5889,5890,5891,5892,5893,5894,5895,5896,5897,5898,5899,5900,5901,5902,5903,5904, # 1200
5905,5906,5907,5908,5909,5910,5911,5912,5913,5914,5915,5916,5917,5918,5919,5920, # 1216
5921,5922,5923,5924,5925,5926,5927,5928,5929,5930,5931,5932,5933,5934,5935,5936, # 1232
5937,5938,5939,5940,5941,5942,5943,5944,5945,5946,5947,5948,5949,5950,5951,5952, # 1248
5953,5954,5955,5956,5957,5958,5959,5960,5961,5962,5963,5964,5965,5966,5967,5968, # 1264
5969,5970,5971,5972,5973,5974,5975,5976,5977,5978,5979,5980,5981,5982,5983,5984, # 1280
5985,5986,5987,5988,5989,5990,5991,5992,5993,5994,5995,5996,5997,5998,5999,6000, # 1296
6001,6002,6003,6004,6005,6006,6007,6008,6009,6010,6011,6012,6013,6014,6015,6016, # 1312
6017,6018,6019,6020,6021,6022,6023,6024,6025,6026,6027,6028,6029,6030,6031,6032, # 1328
6033,6034,6035,6036,6037,6038,6039,6040,6041,6042,6043,6044,6045,6046,6047,6048, # 1344
6049,6050,6051,6052,6053,6054,6055,6056,6057,6058,6059,6060,6061,6062,6063,6064, # 1360
6065,6066,6067,6068,6069,6070,6071,6072,6073,6074,6075,6076,6077,6078,6079,6080, # 1376
6081,6082,6083,6084,6085,6086,6087,6088,6089,6090,6091,6092,6093,6094,6095,6096, # 1392
6097,6098,6099,6100,6101,6102,6103,6104,6105,6106,6107,6108,6109,6110,6111,6112, # 1408
6113,6114,2044,2060,4621, 997,1235, 473,1186,4622, 920,3378,6115,6116, 379,1108, # 1424
4313,2657,2735,3934,6117,3809, 636,3233, 573,1026,3693,3435,2974,3300,2298,4105, # 1440
854,2937,2463, 393,2581,2417, 539, 752,1280,2750,2480, 140,1161, 440, 708,1569, # 1456
665,2497,1746,1291,1523,3000, 164,1603, 847,1331, 537,1997, 486, 508,1693,2418, # 1472
1970,2227, 878,1220, 299,1030, 969, 652,2751, 624,1137,3301,2619, 65,3302,2045, # 1488
1761,1859,3120,1930,3694,3516, 663,1767, 852, 835,3695, 269, 767,2826,2339,1305, # 1504
896,1150, 770,1616,6118, 506,1502,2075,1012,2519, 775,2520,2975,2340,2938,4314, # 1520
3028,2086,1224,1943,2286,6119,3072,4315,2240,1273,1987,3935,1557, 175, 597, 985, # 1536
3517,2419,2521,1416,3029, 585, 938,1931,1007,1052,1932,1685,6120,3379,4316,4623, # 1552
804, 599,3121,1333,2128,2539,1159,1554,2032,3810, 687,2033,2904, 952, 675,1467, # 1568
3436,6121,2241,1096,1786,2440,1543,1924, 980,1813,2228, 781,2692,1879, 728,1918, # 1584
3696,4624, 548,1950,4625,1809,1088,1356,3303,2522,1944, 502, 972, 373, 513,2827, # 1600
586,2377,2391,1003,1976,1631,6122,2464,1084, 648,1776,4626,2141, 324, 962,2012, # 1616
2177,2076,1384, 742,2178,1448,1173,1810, 222, 102, 301, 445, 125,2420, 662,2498, # 1632
277, 200,1476,1165,1068, 224,2562,1378,1446, 450,1880, 659, 791, 582,4627,2939, # 1648
3936,1516,1274, 555,2099,3697,1020,1389,1526,3380,1762,1723,1787,2229, 412,2114, # 1664
1900,2392,3518, 512,2597, 427,1925,2341,3122,1653,1686,2465,2499, 697, 330, 273, # 1680
380,2162, 951, 832, 780, 991,1301,3073, 965,2270,3519, 668,2523,2636,1286, 535, # 1696
1407, 518, 671, 957,2658,2378, 267, 611,2197,3030,6123, 248,2299, 967,1799,2356, # 1712
850,1418,3437,1876,1256,1480,2828,1718,6124,6125,1755,1664,2405,6126,4628,2879, # 1728
2829, 499,2179, 676,4629, 557,2329,2214,2090, 325,3234, 464, 811,3001, 992,2342, # 1744
2481,1232,1469, 303,2242, 466,1070,2163, 603,1777,2091,4630,2752,4631,2714, 322, # 1760
2659,1964,1768, 481,2188,1463,2330,2857,3600,2092,3031,2421,4632,2318,2070,1849, # 1776
2598,4633,1302,2254,1668,1701,2422,3811,2905,3032,3123,2046,4106,1763,1694,4634, # 1792
1604, 943,1724,1454, 917, 868,2215,1169,2940, 552,1145,1800,1228,1823,1955, 316, # 1808
1080,2510, 361,1807,2830,4107,2660,3381,1346,1423,1134,4108,6127, 541,1263,1229, # 1824
1148,2540, 545, 465,1833,2880,3438,1901,3074,2482, 816,3937, 713,1788,2500, 122, # 1840
1575, 195,1451,2501,1111,6128, 859, 374,1225,2243,2483,4317, 390,1033,3439,3075, # 1856
2524,1687, 266, 793,1440,2599, 946, 779, 802, 507, 897,1081, 528,2189,1292, 711, # 1872
1866,1725,1167,1640, 753, 398,2661,1053, 246, 348,4318, 137,1024,3440,1600,2077, # 1888
2129, 825,4319, 698, 238, 521, 187,2300,1157,2423,1641,1605,1464,1610,1097,2541, # 1904
1260,1436, 759,2255,1814,2150, 705,3235, 409,2563,3304, 561,3033,2005,2564, 726, # 1920
1956,2343,3698,4109, 949,3812,3813,3520,1669, 653,1379,2525, 881,2198, 632,2256, # 1936
1027, 778,1074, 733,1957, 514,1481,2466, 554,2180, 702,3938,1606,1017,1398,6129, # 1952
1380,3521, 921, 993,1313, 594, 449,1489,1617,1166, 768,1426,1360, 495,1794,3601, # 1968
1177,3602,1170,4320,2344, 476, 425,3167,4635,3168,1424, 401,2662,1171,3382,1998, # 1984
1089,4110, 477,3169, 474,6130,1909, 596,2831,1842, 494, 693,1051,1028,1207,3076, # 2000
606,2115, 727,2790,1473,1115, 743,3522, 630, 805,1532,4321,2021, 366,1057, 838, # 2016
684,1114,2142,4322,2050,1492,1892,1808,2271,3814,2424,1971,1447,1373,3305,1090, # 2032
1536,3939,3523,3306,1455,2199, 336, 369,2331,1035, 584,2393, 902, 718,2600,6131, # 2048
2753, 463,2151,1149,1611,2467, 715,1308,3124,1268, 343,1413,3236,1517,1347,2663, # 2064
2093,3940,2022,1131,1553,2100,2941,1427,3441,2942,1323,2484,6132,1980, 872,2368, # 2080
2441,2943, 320,2369,2116,1082, 679,1933,3941,2791,3815, 625,1143,2023, 422,2200, # 2096
3816,6133, 730,1695, 356,2257,1626,2301,2858,2637,1627,1778, 937, 883,2906,2693, # 2112
3002,1769,1086, 400,1063,1325,3307,2792,4111,3077, 456,2345,1046, 747,6134,1524, # 2128
884,1094,3383,1474,2164,1059, 974,1688,2181,2258,1047, 345,1665,1187, 358, 875, # 2144
3170, 305, 660,3524,2190,1334,1135,3171,1540,1649,2542,1527, 927, 968,2793, 885, # 2160
1972,1850, 482, 500,2638,1218,1109,1085,2543,1654,2034, 876, 78,2287,1482,1277, # 2176
861,1675,1083,1779, 724,2754, 454, 397,1132,1612,2332, 893, 672,1237, 257,2259, # 2192
2370, 135,3384, 337,2244, 547, 352, 340, 709,2485,1400, 788,1138,2511, 540, 772, # 2208
1682,2260,2272,2544,2013,1843,1902,4636,1999,1562,2288,4637,2201,1403,1533, 407, # 2224
576,3308,1254,2071, 978,3385, 170, 136,1201,3125,2664,3172,2394, 213, 912, 873, # 2240
3603,1713,2202, 699,3604,3699, 813,3442, 493, 531,1054, 468,2907,1483, 304, 281, # 2256
4112,1726,1252,2094, 339,2319,2130,2639, 756,1563,2944, 748, 571,2976,1588,2425, # 2272
2715,1851,1460,2426,1528,1392,1973,3237, 288,3309, 685,3386, 296, 892,2716,2216, # 2288
1570,2245, 722,1747,2217, 905,3238,1103,6135,1893,1441,1965, 251,1805,2371,3700, # 2304
2601,1919,1078, 75,2182,1509,1592,1270,2640,4638,2152,6136,3310,3817, 524, 706, # 2320
1075, 292,3818,1756,2602, 317, 98,3173,3605,3525,1844,2218,3819,2502, 814, 567, # 2336
385,2908,1534,6137, 534,1642,3239, 797,6138,1670,1529, 953,4323, 188,1071, 538, # 2352
178, 729,3240,2109,1226,1374,2000,2357,2977, 731,2468,1116,2014,2051,6139,1261, # 2368
1593, 803,2859,2736,3443, 556, 682, 823,1541,6140,1369,2289,1706,2794, 845, 462, # 2384
2603,2665,1361, 387, 162,2358,1740, 739,1770,1720,1304,1401,3241,1049, 627,1571, # 2400
2427,3526,1877,3942,1852,1500, 431,1910,1503, 677, 297,2795, 286,1433,1038,1198, # 2416
2290,1133,1596,4113,4639,2469,1510,1484,3943,6141,2442, 108, 712,4640,2372, 866, # 2432
3701,2755,3242,1348, 834,1945,1408,3527,2395,3243,1811, 824, 994,1179,2110,1548, # 2448
1453, 790,3003, 690,4324,4325,2832,2909,3820,1860,3821, 225,1748, 310, 346,1780, # 2464
2470, 821,1993,2717,2796, 828, 877,3528,2860,2471,1702,2165,2910,2486,1789, 453, # 2480
359,2291,1676, 73,1164,1461,1127,3311, 421, 604, 314,1037, 589, 116,2487, 737, # 2496
837,1180, 111, 244, 735,6142,2261,1861,1362, 986, 523, 418, 581,2666,3822, 103, # 2512
855, 503,1414,1867,2488,1091, 657,1597, 979, 605,1316,4641,1021,2443,2078,2001, # 2528
1209, 96, 587,2166,1032, 260,1072,2153, 173, 94, 226,3244, 819,2006,4642,4114, # 2544
2203, 231,1744, 782, 97,2667, 786,3387, 887, 391, 442,2219,4326,1425,6143,2694, # 2560
633,1544,1202, 483,2015, 592,2052,1958,2472,1655, 419, 129,4327,3444,3312,1714, # 2576
1257,3078,4328,1518,1098, 865,1310,1019,1885,1512,1734, 469,2444, 148, 773, 436, # 2592
1815,1868,1128,1055,4329,1245,2756,3445,2154,1934,1039,4643, 579,1238, 932,2320, # 2608
353, 205, 801, 115,2428, 944,2321,1881, 399,2565,1211, 678, 766,3944, 335,2101, # 2624
1459,1781,1402,3945,2737,2131,1010, 844, 981,1326,1013, 550,1816,1545,2620,1335, # 2640
1008, 371,2881, 936,1419,1613,3529,1456,1395,2273,1834,2604,1317,2738,2503, 416, # 2656
1643,4330, 806,1126, 229, 591,3946,1314,1981,1576,1837,1666, 347,1790, 977,3313, # 2672
764,2861,1853, 688,2429,1920,1462, 77, 595, 415,2002,3034, 798,1192,4115,6144, # 2688
2978,4331,3035,2695,2582,2072,2566, 430,2430,1727, 842,1396,3947,3702, 613, 377, # 2704
278, 236,1417,3388,3314,3174, 757,1869, 107,3530,6145,1194, 623,2262, 207,1253, # 2720
2167,3446,3948, 492,1117,1935, 536,1838,2757,1246,4332, 696,2095,2406,1393,1572, # 2736
3175,1782, 583, 190, 253,1390,2230, 830,3126,3389, 934,3245,1703,1749,2979,1870, # 2752
2545,1656,2204, 869,2346,4116,3176,1817, 496,1764,4644, 942,1504, 404,1903,1122, # 2768
1580,3606,2945,1022, 515, 372,1735, 955,2431,3036,6146,2797,1110,2302,2798, 617, # 2784
6147, 441, 762,1771,3447,3607,3608,1904, 840,3037, 86, 939,1385, 572,1370,2445, # 2800
1336, 114,3703, 898, 294, 203,3315, 703,1583,2274, 429, 961,4333,1854,1951,3390, # 2816
2373,3704,4334,1318,1381, 966,1911,2322,1006,1155, 309, 989, 458,2718,1795,1372, # 2832
1203, 252,1689,1363,3177, 517,1936, 168,1490, 562, 193,3823,1042,4117,1835, 551, # 2848
470,4645, 395, 489,3448,1871,1465,2583,2641, 417,1493, 279,1295, 511,1236,1119, # 2864
72,1231,1982,1812,3004, 871,1564, 984,3449,1667,2696,2096,4646,2347,2833,1673, # 2880
3609, 695,3246,2668, 807,1183,4647, 890, 388,2333,1801,1457,2911,1765,1477,1031, # 2896
3316,3317,1278,3391,2799,2292,2526, 163,3450,4335,2669,1404,1802,6148,2323,2407, # 2912
1584,1728,1494,1824,1269, 298, 909,3318,1034,1632, 375, 776,1683,2061, 291, 210, # 2928
1123, 809,1249,1002,2642,3038, 206,1011,2132, 144, 975, 882,1565, 342, 667, 754, # 2944
1442,2143,1299,2303,2062, 447, 626,2205,1221,2739,2912,1144,1214,2206,2584, 760, # 2960
1715, 614, 950,1281,2670,2621, 810, 577,1287,2546,4648, 242,2168, 250,2643, 691, # 2976
123,2644, 647, 313,1029, 689,1357,2946,1650, 216, 771,1339,1306, 808,2063, 549, # 2992
913,1371,2913,2914,6149,1466,1092,1174,1196,1311,2605,2396,1783,1796,3079, 406, # 3008
2671,2117,3949,4649, 487,1825,2220,6150,2915, 448,2348,1073,6151,2397,1707, 130, # 3024
900,1598, 329, 176,1959,2527,1620,6152,2275,4336,3319,1983,2191,3705,3610,2155, # 3040
3706,1912,1513,1614,6153,1988, 646, 392,2304,1589,3320,3039,1826,1239,1352,1340, # 3056
2916, 505,2567,1709,1437,2408,2547, 906,6154,2672, 384,1458,1594,1100,1329, 710, # 3072
423,3531,2064,2231,2622,1989,2673,1087,1882, 333, 841,3005,1296,2882,2379, 580, # 3088
1937,1827,1293,2585, 601, 574, 249,1772,4118,2079,1120, 645, 901,1176,1690, 795, # 3104
2207, 478,1434, 516,1190,1530, 761,2080, 930,1264, 355, 435,1552, 644,1791, 987, # 3120
220,1364,1163,1121,1538, 306,2169,1327,1222, 546,2645, 218, 241, 610,1704,3321, # 3136
1984,1839,1966,2528, 451,6155,2586,3707,2568, 907,3178, 254,2947, 186,1845,4650, # 3152
745, 432,1757, 428,1633, 888,2246,2221,2489,3611,2118,1258,1265, 956,3127,1784, # 3168
4337,2490, 319, 510, 119, 457,3612, 274,2035,2007,4651,1409,3128, 970,2758, 590, # 3184
2800, 661,2247,4652,2008,3950,1420,1549,3080,3322,3951,1651,1375,2111, 485,2491, # 3200
1429,1156,6156,2548,2183,1495, 831,1840,2529,2446, 501,1657, 307,1894,3247,1341, # 3216
666, 899,2156,1539,2549,1559, 886, 349,2208,3081,2305,1736,3824,2170,2759,1014, # 3232
1913,1386, 542,1397,2948, 490, 368, 716, 362, 159, 282,2569,1129,1658,1288,1750, # 3248
2674, 276, 649,2016, 751,1496, 658,1818,1284,1862,2209,2087,2512,3451, 622,2834, # 3264
376, 117,1060,2053,1208,1721,1101,1443, 247,1250,3179,1792,3952,2760,2398,3953, # 3280
6157,2144,3708, 446,2432,1151,2570,3452,2447,2761,2835,1210,2448,3082, 424,2222, # 3296
1251,2449,2119,2836, 504,1581,4338, 602, 817, 857,3825,2349,2306, 357,3826,1470, # 3312
1883,2883, 255, 958, 929,2917,3248, 302,4653,1050,1271,1751,2307,1952,1430,2697, # 3328
2719,2359, 354,3180, 777, 158,2036,4339,1659,4340,4654,2308,2949,2248,1146,2232, # 3344
3532,2720,1696,2623,3827,6158,3129,1550,2698,1485,1297,1428, 637, 931,2721,2145, # 3360
914,2550,2587, 81,2450, 612, 827,2646,1242,4655,1118,2884, 472,1855,3181,3533, # 3376
3534, 569,1353,2699,1244,1758,2588,4119,2009,2762,2171,3709,1312,1531,6159,1152, # 3392
1938, 134,1830, 471,3710,2276,1112,1535,3323,3453,3535, 982,1337,2950, 488, 826, # 3408
674,1058,1628,4120,2017, 522,2399, 211, 568,1367,3454, 350, 293,1872,1139,3249, # 3424
1399,1946,3006,1300,2360,3324, 588, 736,6160,2606, 744, 669,3536,3828,6161,1358, # 3440
199, 723, 848, 933, 851,1939,1505,1514,1338,1618,1831,4656,1634,3613, 443,2740, # 3456
3829, 717,1947, 491,1914,6162,2551,1542,4121,1025,6163,1099,1223, 198,3040,2722, # 3472
370, 410,1905,2589, 998,1248,3182,2380, 519,1449,4122,1710, 947, 928,1153,4341, # 3488
2277, 344,2624,1511, 615, 105, 161,1212,1076,1960,3130,2054,1926,1175,1906,2473, # 3504
414,1873,2801,6164,2309, 315,1319,3325, 318,2018,2146,2157, 963, 631, 223,4342, # 3520
4343,2675, 479,3711,1197,2625,3712,2676,2361,6165,4344,4123,6166,2451,3183,1886, # 3536
2184,1674,1330,1711,1635,1506, 799, 219,3250,3083,3954,1677,3713,3326,2081,3614, # 3552
1652,2073,4657,1147,3041,1752, 643,1961, 147,1974,3955,6167,1716,2037, 918,3007, # 3568
1994, 120,1537, 118, 609,3184,4345, 740,3455,1219, 332,1615,3830,6168,1621,2980, # 3584
1582, 783, 212, 553,2350,3714,1349,2433,2082,4124, 889,6169,2310,1275,1410, 973, # 3600
166,1320,3456,1797,1215,3185,2885,1846,2590,2763,4658, 629, 822,3008, 763, 940, # 3616
1990,2862, 439,2409,1566,1240,1622, 926,1282,1907,2764, 654,2210,1607, 327,1130, # 3632
3956,1678,1623,6170,2434,2192, 686, 608,3831,3715, 903,3957,3042,6171,2741,1522, # 3648
1915,1105,1555,2552,1359, 323,3251,4346,3457, 738,1354,2553,2311,2334,1828,2003, # 3664
3832,1753,2351,1227,6172,1887,4125,1478,6173,2410,1874,1712,1847, 520,1204,2607, # 3680
264,4659, 836,2677,2102, 600,4660,3833,2278,3084,6174,4347,3615,1342, 640, 532, # 3696
543,2608,1888,2400,2591,1009,4348,1497, 341,1737,3616,2723,1394, 529,3252,1321, # 3712
983,4661,1515,2120, 971,2592, 924, 287,1662,3186,4349,2700,4350,1519, 908,1948, # 3728
2452, 156, 796,1629,1486,2223,2055, 694,4126,1259,1036,3392,1213,2249,2742,1889, # 3744
1230,3958,1015, 910, 408, 559,3617,4662, 746, 725, 935,4663,3959,3009,1289, 563, # 3760
867,4664,3960,1567,2981,2038,2626, 988,2263,2381,4351, 143,2374, 704,1895,6175, # 3776
1188,3716,2088, 673,3085,2362,4352, 484,1608,1921,2765,2918, 215, 904,3618,3537, # 3792
894, 509, 976,3043,2701,3961,4353,2837,2982, 498,6176,6177,1102,3538,1332,3393, # 3808
1487,1636,1637, 233, 245,3962, 383, 650, 995,3044, 460,1520,1206,2352, 749,3327, # 3824
530, 700, 389,1438,1560,1773,3963,2264, 719,2951,2724,3834, 870,1832,1644,1000, # 3840
839,2474,3717, 197,1630,3394, 365,2886,3964,1285,2133, 734, 922, 818,1106, 732, # 3856
480,2083,1774,3458, 923,2279,1350, 221,3086, 85,2233,2234,3835,1585,3010,2147, # 3872
1387,1705,2382,1619,2475, 133, 239,2802,1991,1016,2084,2383, 411,2838,1113, 651, # 3888
1985,1160,3328, 990,1863,3087,1048,1276,2647, 265,2627,1599,3253,2056, 150, 638, # 3904
2019, 656, 853, 326,1479, 680,1439,4354,1001,1759, 413,3459,3395,2492,1431, 459, # 3920
4355,1125,3329,2265,1953,1450,2065,2863, 849, 351,2678,3131,3254,3255,1104,1577, # 3936
227,1351,1645,2453,2193,1421,2887, 812,2121, 634, 95,2435, 201,2312,4665,1646, # 3952
1671,2743,1601,2554,2702,2648,2280,1315,1366,2089,3132,1573,3718,3965,1729,1189, # 3968
328,2679,1077,1940,1136, 558,1283, 964,1195, 621,2074,1199,1743,3460,3619,1896, # 3984
1916,1890,3836,2952,1154,2112,1064, 862, 378,3011,2066,2113,2803,1568,2839,6178, # 4000
3088,2919,1941,1660,2004,1992,2194, 142, 707,1590,1708,1624,1922,1023,1836,1233, # 4016
1004,2313, 789, 741,3620,6179,1609,2411,1200,4127,3719,3720,4666,2057,3721, 593, # 4032
2840, 367,2920,1878,6180,3461,1521, 628,1168, 692,2211,2649, 300, 720,2067,2571, # 4048
2953,3396, 959,2504,3966,3539,3462,1977, 701,6181, 954,1043, 800, 681, 183,3722, # 4064
1803,1730,3540,4128,2103, 815,2314, 174, 467, 230,2454,1093,2134, 755,3541,3397, # 4080
1141,1162,6182,1738,2039, 270,3256,2513,1005,1647,2185,3837, 858,1679,1897,1719, # 4096
2954,2324,1806, 402, 670, 167,4129,1498,2158,2104, 750,6183, 915, 189,1680,1551, # 4112
455,4356,1501,2455, 405,1095,2955, 338,1586,1266,1819, 570, 641,1324, 237,1556, # 4128
2650,1388,3723,6184,1368,2384,1343,1978,3089,2436, 879,3724, 792,1191, 758,3012, # 4144
1411,2135,1322,4357, 240,4667,1848,3725,1574,6185, 420,3045,1546,1391, 714,4358, # 4160
1967, 941,1864, 863, 664, 426, 560,1731,2680,1785,2864,1949,2363, 403,3330,1415, # 4176
1279,2136,1697,2335, 204, 721,2097,3838, 90,6186,2085,2505, 191,3967, 124,2148, # 4192
1376,1798,1178,1107,1898,1405, 860,4359,1243,1272,2375,2983,1558,2456,1638, 113, # 4208
3621, 578,1923,2609, 880, 386,4130, 784,2186,2266,1422,2956,2172,1722, 497, 263, # 4224
2514,1267,2412,2610, 177,2703,3542, 774,1927,1344, 616,1432,1595,1018, 172,4360, # 4240
2325, 911,4361, 438,1468,3622, 794,3968,2024,2173,1681,1829,2957, 945, 895,3090, # 4256
575,2212,2476, 475,2401,2681, 785,2744,1745,2293,2555,1975,3133,2865, 394,4668, # 4272
3839, 635,4131, 639, 202,1507,2195,2766,1345,1435,2572,3726,1908,1184,1181,2457, # 4288
3727,3134,4362, 843,2611, 437, 916,4669, 234, 769,1884,3046,3047,3623, 833,6187, # 4304
1639,2250,2402,1355,1185,2010,2047, 999, 525,1732,1290,1488,2612, 948,1578,3728, # 4320
2413,2477,1216,2725,2159, 334,3840,1328,3624,2921,1525,4132, 564,1056, 891,4363, # 4336
1444,1698,2385,2251,3729,1365,2281,2235,1717,6188, 864,3841,2515, 444, 527,2767, # 4352
2922,3625, 544, 461,6189, 566, 209,2437,3398,2098,1065,2068,3331,3626,3257,2137, # 4368 #last 512
)

View File

@ -0,0 +1,233 @@
######################## BEGIN LICENSE BLOCK ########################
# The Original Code is Mozilla Communicator client code.
#
# The Initial Developer of the Original Code is
# Netscape Communications Corporation.
# Portions created by the Initial Developer are Copyright (C) 1998
# the Initial Developer. All Rights Reserved.
#
# Contributor(s):
# Mark Pilgrim - port to Python
#
# This library is free software; you can redistribute it and/or
# modify it under the terms of the GNU Lesser General Public
# License as published by the Free Software Foundation; either
# version 2.1 of the License, or (at your option) any later version.
#
# This library is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
# Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public
# License along with this library; if not, write to the Free Software
# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
# 02110-1301 USA
######################### END LICENSE BLOCK #########################
# This is hiragana 2-char sequence table, the number in each cell represents its frequency category
jp2CharContext = (
(0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1),
(2,4,0,4,0,3,0,4,0,3,4,4,4,2,4,3,3,4,3,2,3,3,4,2,3,3,3,2,4,1,4,3,3,1,5,4,3,4,3,4,3,5,3,0,3,5,4,2,0,3,1,0,3,3,0,3,3,0,1,1,0,4,3,0,3,3,0,4,0,2,0,3,5,5,5,5,4,0,4,1,0,3,4),
(0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2),
(0,4,0,5,0,5,0,4,0,4,5,4,4,3,5,3,5,1,5,3,4,3,4,4,3,4,3,3,4,3,5,4,4,3,5,5,3,5,5,5,3,5,5,3,4,5,5,3,1,3,2,0,3,4,0,4,2,0,4,2,1,5,3,2,3,5,0,4,0,2,0,5,4,4,5,4,5,0,4,0,0,4,4),
(0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0),
(0,3,0,4,0,3,0,3,0,4,5,4,3,3,3,3,4,3,5,4,4,3,5,4,4,3,4,3,4,4,4,4,5,3,4,4,3,4,5,5,4,5,5,1,4,5,4,3,0,3,3,1,3,3,0,4,4,0,3,3,1,5,3,3,3,5,0,4,0,3,0,4,4,3,4,3,3,0,4,1,1,3,4),
(0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0),
(0,4,0,3,0,3,0,4,0,3,4,4,3,2,2,1,2,1,3,1,3,3,3,3,3,4,3,1,3,3,5,3,3,0,4,3,0,5,4,3,3,5,4,4,3,4,4,5,0,1,2,0,1,2,0,2,2,0,1,0,0,5,2,2,1,4,0,3,0,1,0,4,4,3,5,4,3,0,2,1,0,4,3),
(0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0),
(0,3,0,5,0,4,0,2,1,4,4,2,4,1,4,2,4,2,4,3,3,3,4,3,3,3,3,1,4,2,3,3,3,1,4,4,1,1,1,4,3,3,2,0,2,4,3,2,0,3,3,0,3,1,1,0,0,0,3,3,0,4,2,2,3,4,0,4,0,3,0,4,4,5,3,4,4,0,3,0,0,1,4),
(1,4,0,4,0,4,0,4,0,3,5,4,4,3,4,3,5,4,3,3,4,3,5,4,4,4,4,3,4,2,4,3,3,1,5,4,3,2,4,5,4,5,5,4,4,5,4,4,0,3,2,2,3,3,0,4,3,1,3,2,1,4,3,3,4,5,0,3,0,2,0,4,5,5,4,5,4,0,4,0,0,5,4),
(0,5,0,5,0,4,0,3,0,4,4,3,4,3,3,3,4,0,4,4,4,3,4,3,4,3,3,1,4,2,4,3,4,0,5,4,1,4,5,4,4,5,3,2,4,3,4,3,2,4,1,3,3,3,2,3,2,0,4,3,3,4,3,3,3,4,0,4,0,3,0,4,5,4,4,4,3,0,4,1,0,1,3),
(0,3,1,4,0,3,0,2,0,3,4,4,3,1,4,2,3,3,4,3,4,3,4,3,4,4,3,2,3,1,5,4,4,1,4,4,3,5,4,4,3,5,5,4,3,4,4,3,1,2,3,1,2,2,0,3,2,0,3,1,0,5,3,3,3,4,3,3,3,3,4,4,4,4,5,4,2,0,3,3,2,4,3),
(0,2,0,3,0,1,0,1,0,0,3,2,0,0,2,0,1,0,2,1,3,3,3,1,2,3,1,0,1,0,4,2,1,1,3,3,0,4,3,3,1,4,3,3,0,3,3,2,0,0,0,0,1,0,0,2,0,0,0,0,0,4,1,0,2,3,2,2,2,1,3,3,3,4,4,3,2,0,3,1,0,3,3),
(0,4,0,4,0,3,0,3,0,4,4,4,3,3,3,3,3,3,4,3,4,2,4,3,4,3,3,2,4,3,4,5,4,1,4,5,3,5,4,5,3,5,4,0,3,5,5,3,1,3,3,2,2,3,0,3,4,1,3,3,2,4,3,3,3,4,0,4,0,3,0,4,5,4,4,5,3,0,4,1,0,3,4),
(0,2,0,3,0,3,0,0,0,2,2,2,1,0,1,0,0,0,3,0,3,0,3,0,1,3,1,0,3,1,3,3,3,1,3,3,3,0,1,3,1,3,4,0,0,3,1,1,0,3,2,0,0,0,0,1,3,0,1,0,0,3,3,2,0,3,0,0,0,0,0,3,4,3,4,3,3,0,3,0,0,2,3),
(2,3,0,3,0,2,0,1,0,3,3,4,3,1,3,1,1,1,3,1,4,3,4,3,3,3,0,0,3,1,5,4,3,1,4,3,2,5,5,4,4,4,4,3,3,4,4,4,0,2,1,1,3,2,0,1,2,0,0,1,0,4,1,3,3,3,0,3,0,1,0,4,4,4,5,5,3,0,2,0,0,4,4),
(0,2,0,1,0,3,1,3,0,2,3,3,3,0,3,1,0,0,3,0,3,2,3,1,3,2,1,1,0,0,4,2,1,0,2,3,1,4,3,2,0,4,4,3,1,3,1,3,0,1,0,0,1,0,0,0,1,0,0,0,0,4,1,1,1,2,0,3,0,0,0,3,4,2,4,3,2,0,1,0,0,3,3),
(0,1,0,4,0,5,0,4,0,2,4,4,2,3,3,2,3,3,5,3,3,3,4,3,4,2,3,0,4,3,3,3,4,1,4,3,2,1,5,5,3,4,5,1,3,5,4,2,0,3,3,0,1,3,0,4,2,0,1,3,1,4,3,3,3,3,0,3,0,1,0,3,4,4,4,5,5,0,3,0,1,4,5),
(0,2,0,3,0,3,0,0,0,2,3,1,3,0,4,0,1,1,3,0,3,4,3,2,3,1,0,3,3,2,3,1,3,0,2,3,0,2,1,4,1,2,2,0,0,3,3,0,0,2,0,0,0,1,0,0,0,0,2,2,0,3,2,1,3,3,0,2,0,2,0,0,3,3,1,2,4,0,3,0,2,2,3),
(2,4,0,5,0,4,0,4,0,2,4,4,4,3,4,3,3,3,1,2,4,3,4,3,4,4,5,0,3,3,3,3,2,0,4,3,1,4,3,4,1,4,4,3,3,4,4,3,1,2,3,0,4,2,0,4,1,0,3,3,0,4,3,3,3,4,0,4,0,2,0,3,5,3,4,5,2,0,3,0,0,4,5),
(0,3,0,4,0,1,0,1,0,1,3,2,2,1,3,0,3,0,2,0,2,0,3,0,2,0,0,0,1,0,1,1,0,0,3,1,0,0,0,4,0,3,1,0,2,1,3,0,0,0,0,0,0,3,0,0,0,0,0,0,0,4,2,2,3,1,0,3,0,0,0,1,4,4,4,3,0,0,4,0,0,1,4),
(1,4,1,5,0,3,0,3,0,4,5,4,4,3,5,3,3,4,4,3,4,1,3,3,3,3,2,1,4,1,5,4,3,1,4,4,3,5,4,4,3,5,4,3,3,4,4,4,0,3,3,1,2,3,0,3,1,0,3,3,0,5,4,4,4,4,4,4,3,3,5,4,4,3,3,5,4,0,3,2,0,4,4),
(0,2,0,3,0,1,0,0,0,1,3,3,3,2,4,1,3,0,3,1,3,0,2,2,1,1,0,0,2,0,4,3,1,0,4,3,0,4,4,4,1,4,3,1,1,3,3,1,0,2,0,0,1,3,0,0,0,0,2,0,0,4,3,2,4,3,5,4,3,3,3,4,3,3,4,3,3,0,2,1,0,3,3),
(0,2,0,4,0,3,0,2,0,2,5,5,3,4,4,4,4,1,4,3,3,0,4,3,4,3,1,3,3,2,4,3,0,3,4,3,0,3,4,4,2,4,4,0,4,5,3,3,2,2,1,1,1,2,0,1,5,0,3,3,2,4,3,3,3,4,0,3,0,2,0,4,4,3,5,5,0,0,3,0,2,3,3),
(0,3,0,4,0,3,0,1,0,3,4,3,3,1,3,3,3,0,3,1,3,0,4,3,3,1,1,0,3,0,3,3,0,0,4,4,0,1,5,4,3,3,5,0,3,3,4,3,0,2,0,1,1,1,0,1,3,0,1,2,1,3,3,2,3,3,0,3,0,1,0,1,3,3,4,4,1,0,1,2,2,1,3),
(0,1,0,4,0,4,0,3,0,1,3,3,3,2,3,1,1,0,3,0,3,3,4,3,2,4,2,0,1,0,4,3,2,0,4,3,0,5,3,3,2,4,4,4,3,3,3,4,0,1,3,0,0,1,0,0,1,0,0,0,0,4,2,3,3,3,0,3,0,0,0,4,4,4,5,3,2,0,3,3,0,3,5),
(0,2,0,3,0,0,0,3,0,1,3,0,2,0,0,0,1,0,3,1,1,3,3,0,0,3,0,0,3,0,2,3,1,0,3,1,0,3,3,2,0,4,2,2,0,2,0,0,0,4,0,0,0,0,0,0,0,0,0,0,0,2,1,2,0,1,0,1,0,0,0,1,3,1,2,0,0,0,1,0,0,1,4),
(0,3,0,3,0,5,0,1,0,2,4,3,1,3,3,2,1,1,5,2,1,0,5,1,2,0,0,0,3,3,2,2,3,2,4,3,0,0,3,3,1,3,3,0,2,5,3,4,0,3,3,0,1,2,0,2,2,0,3,2,0,2,2,3,3,3,0,2,0,1,0,3,4,4,2,5,4,0,3,0,0,3,5),
(0,3,0,3,0,3,0,1,0,3,3,3,3,0,3,0,2,0,2,1,1,0,2,0,1,0,0,0,2,1,0,0,1,0,3,2,0,0,3,3,1,2,3,1,0,3,3,0,0,1,0,0,0,0,0,2,0,0,0,0,0,2,3,1,2,3,0,3,0,1,0,3,2,1,0,4,3,0,1,1,0,3,3),
(0,4,0,5,0,3,0,3,0,4,5,5,4,3,5,3,4,3,5,3,3,2,5,3,4,4,4,3,4,3,4,5,5,3,4,4,3,4,4,5,4,4,4,3,4,5,5,4,2,3,4,2,3,4,0,3,3,1,4,3,2,4,3,3,5,5,0,3,0,3,0,5,5,5,5,4,4,0,4,0,1,4,4),
(0,4,0,4,0,3,0,3,0,3,5,4,4,2,3,2,5,1,3,2,5,1,4,2,3,2,3,3,4,3,3,3,3,2,5,4,1,3,3,5,3,4,4,0,4,4,3,1,1,3,1,0,2,3,0,2,3,0,3,0,0,4,3,1,3,4,0,3,0,2,0,4,4,4,3,4,5,0,4,0,0,3,4),
(0,3,0,3,0,3,1,2,0,3,4,4,3,3,3,0,2,2,4,3,3,1,3,3,3,1,1,0,3,1,4,3,2,3,4,4,2,4,4,4,3,4,4,3,2,4,4,3,1,3,3,1,3,3,0,4,1,0,2,2,1,4,3,2,3,3,5,4,3,3,5,4,4,3,3,0,4,0,3,2,2,4,4),
(0,2,0,1,0,0,0,0,0,1,2,1,3,0,0,0,0,0,2,0,1,2,1,0,0,1,0,0,0,0,3,0,0,1,0,1,1,3,1,0,0,0,1,1,0,1,1,0,0,0,0,0,2,0,0,0,0,0,0,0,0,1,1,2,2,0,3,4,0,0,0,1,1,0,0,1,0,0,0,0,0,1,1),
(0,1,0,0,0,1,0,0,0,0,4,0,4,1,4,0,3,0,4,0,3,0,4,0,3,0,3,0,4,1,5,1,4,0,0,3,0,5,0,5,2,0,1,0,0,0,2,1,4,0,1,3,0,0,3,0,0,3,1,1,4,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0),
(1,4,0,5,0,3,0,2,0,3,5,4,4,3,4,3,5,3,4,3,3,0,4,3,3,3,3,3,3,2,4,4,3,1,3,4,4,5,4,4,3,4,4,1,3,5,4,3,3,3,1,2,2,3,3,1,3,1,3,3,3,5,3,3,4,5,0,3,0,3,0,3,4,3,4,4,3,0,3,0,2,4,3),
(0,1,0,4,0,0,0,0,0,1,4,0,4,1,4,2,4,0,3,0,1,0,1,0,0,0,0,0,2,0,3,1,1,1,0,3,0,0,0,1,2,1,0,0,1,1,1,1,0,1,0,0,0,1,0,0,3,0,0,0,0,3,2,0,2,2,0,1,0,0,0,2,3,2,3,3,0,0,0,0,2,1,0),
(0,5,1,5,0,3,0,3,0,5,4,4,5,1,5,3,3,0,4,3,4,3,5,3,4,3,3,2,4,3,4,3,3,0,3,3,1,4,4,3,4,4,4,3,4,5,5,3,2,3,1,1,3,3,1,3,1,1,3,3,2,4,5,3,3,5,0,4,0,3,0,4,4,3,5,3,3,0,3,4,0,4,3),
(0,5,0,5,0,3,0,2,0,4,4,3,5,2,4,3,3,3,4,4,4,3,5,3,5,3,3,1,4,0,4,3,3,0,3,3,0,4,4,4,4,5,4,3,3,5,5,3,2,3,1,2,3,2,0,1,0,0,3,2,2,4,4,3,1,5,0,4,0,3,0,4,3,1,3,2,1,0,3,3,0,3,3),
(0,4,0,5,0,5,0,4,0,4,5,5,5,3,4,3,3,2,5,4,4,3,5,3,5,3,4,0,4,3,4,4,3,2,4,4,3,4,5,4,4,5,5,0,3,5,5,4,1,3,3,2,3,3,1,3,1,0,4,3,1,4,4,3,4,5,0,4,0,2,0,4,3,4,4,3,3,0,4,0,0,5,5),
(0,4,0,4,0,5,0,1,1,3,3,4,4,3,4,1,3,0,5,1,3,0,3,1,3,1,1,0,3,0,3,3,4,0,4,3,0,4,4,4,3,4,4,0,3,5,4,1,0,3,0,0,2,3,0,3,1,0,3,1,0,3,2,1,3,5,0,3,0,1,0,3,2,3,3,4,4,0,2,2,0,4,4),
(2,4,0,5,0,4,0,3,0,4,5,5,4,3,5,3,5,3,5,3,5,2,5,3,4,3,3,4,3,4,5,3,2,1,5,4,3,2,3,4,5,3,4,1,2,5,4,3,0,3,3,0,3,2,0,2,3,0,4,1,0,3,4,3,3,5,0,3,0,1,0,4,5,5,5,4,3,0,4,2,0,3,5),
(0,5,0,4,0,4,0,2,0,5,4,3,4,3,4,3,3,3,4,3,4,2,5,3,5,3,4,1,4,3,4,4,4,0,3,5,0,4,4,4,4,5,3,1,3,4,5,3,3,3,3,3,3,3,0,2,2,0,3,3,2,4,3,3,3,5,3,4,1,3,3,5,3,2,0,0,0,0,4,3,1,3,3),
(0,1,0,3,0,3,0,1,0,1,3,3,3,2,3,3,3,0,3,0,0,0,3,1,3,0,0,0,2,2,2,3,0,0,3,2,0,1,2,4,1,3,3,0,0,3,3,3,0,1,0,0,2,1,0,0,3,0,3,1,0,3,0,0,1,3,0,2,0,1,0,3,3,1,3,3,0,0,1,1,0,3,3),
(0,2,0,3,0,2,1,4,0,2,2,3,1,1,3,1,1,0,2,0,3,1,2,3,1,3,0,0,1,0,4,3,2,3,3,3,1,4,2,3,3,3,3,1,0,3,1,4,0,1,1,0,1,2,0,1,1,0,1,1,0,3,1,3,2,2,0,1,0,0,0,2,3,3,3,1,0,0,0,0,0,2,3),
(0,5,0,4,0,5,0,2,0,4,5,5,3,3,4,3,3,1,5,4,4,2,4,4,4,3,4,2,4,3,5,5,4,3,3,4,3,3,5,5,4,5,5,1,3,4,5,3,1,4,3,1,3,3,0,3,3,1,4,3,1,4,5,3,3,5,0,4,0,3,0,5,3,3,1,4,3,0,4,0,1,5,3),
(0,5,0,5,0,4,0,2,0,4,4,3,4,3,3,3,3,3,5,4,4,4,4,4,4,5,3,3,5,2,4,4,4,3,4,4,3,3,4,4,5,5,3,3,4,3,4,3,3,4,3,3,3,3,1,2,2,1,4,3,3,5,4,4,3,4,0,4,0,3,0,4,4,4,4,4,1,0,4,2,0,2,4),
(0,4,0,4,0,3,0,1,0,3,5,2,3,0,3,0,2,1,4,2,3,3,4,1,4,3,3,2,4,1,3,3,3,0,3,3,0,0,3,3,3,5,3,3,3,3,3,2,0,2,0,0,2,0,0,2,0,0,1,0,0,3,1,2,2,3,0,3,0,2,0,4,4,3,3,4,1,0,3,0,0,2,4),
(0,0,0,4,0,0,0,0,0,0,1,0,1,0,2,0,0,0,0,0,1,0,2,0,1,0,0,0,0,0,3,1,3,0,3,2,0,0,0,1,0,3,2,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,3,4,0,2,0,0,0,0,0,0,2),
(0,2,1,3,0,2,0,2,0,3,3,3,3,1,3,1,3,3,3,3,3,3,4,2,2,1,2,1,4,0,4,3,1,3,3,3,2,4,3,5,4,3,3,3,3,3,3,3,0,1,3,0,2,0,0,1,0,0,1,0,0,4,2,0,2,3,0,3,3,0,3,3,4,2,3,1,4,0,1,2,0,2,3),
(0,3,0,3,0,1,0,3,0,2,3,3,3,0,3,1,2,0,3,3,2,3,3,2,3,2,3,1,3,0,4,3,2,0,3,3,1,4,3,3,2,3,4,3,1,3,3,1,1,0,1,1,0,1,0,1,0,1,0,0,0,4,1,1,0,3,0,3,1,0,2,3,3,3,3,3,1,0,0,2,0,3,3),
(0,0,0,0,0,0,0,0,0,0,3,0,2,0,3,0,0,0,0,0,0,0,3,0,0,0,0,0,0,0,3,0,3,0,3,1,0,1,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,3,0,2,0,2,3,0,0,0,0,0,0,0,0,3),
(0,2,0,3,1,3,0,3,0,2,3,3,3,1,3,1,3,1,3,1,3,3,3,1,3,0,2,3,1,1,4,3,3,2,3,3,1,2,2,4,1,3,3,0,1,4,2,3,0,1,3,0,3,0,0,1,3,0,2,0,0,3,3,2,1,3,0,3,0,2,0,3,4,4,4,3,1,0,3,0,0,3,3),
(0,2,0,1,0,2,0,0,0,1,3,2,2,1,3,0,1,1,3,0,3,2,3,1,2,0,2,0,1,1,3,3,3,0,3,3,1,1,2,3,2,3,3,1,2,3,2,0,0,1,0,0,0,0,0,0,3,0,1,0,0,2,1,2,1,3,0,3,0,0,0,3,4,4,4,3,2,0,2,0,0,2,4),
(0,0,0,1,0,1,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,1,1,1,0,0,0,0,0,0,0,0,0,2,2,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,3,1,0,0,0,0,0,0,0,3),
(0,3,0,3,0,2,0,3,0,3,3,3,2,3,2,2,2,0,3,1,3,3,3,2,3,3,0,0,3,0,3,2,2,0,2,3,1,4,3,4,3,3,2,3,1,5,4,4,0,3,1,2,1,3,0,3,1,1,2,0,2,3,1,3,1,3,0,3,0,1,0,3,3,4,4,2,1,0,2,1,0,2,4),
(0,1,0,3,0,1,0,2,0,1,4,2,5,1,4,0,2,0,2,1,3,1,4,0,2,1,0,0,2,1,4,1,1,0,3,3,0,5,1,3,2,3,3,1,0,3,2,3,0,1,0,0,0,0,0,0,1,0,0,0,0,4,0,1,0,3,0,2,0,1,0,3,3,3,4,3,3,0,0,0,0,2,3),
(0,0,0,1,0,0,0,0,0,0,2,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,3,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,1,0,0,1,0,0,0,0,0,3),
(0,1,0,3,0,4,0,3,0,2,4,3,1,0,3,2,2,1,3,1,2,2,3,1,1,1,2,1,3,0,1,2,0,1,3,2,1,3,0,5,5,1,0,0,1,3,2,1,0,3,0,0,1,0,0,0,0,0,3,4,0,1,1,1,3,2,0,2,0,1,0,2,3,3,1,2,3,0,1,0,1,0,4),
(0,0,0,1,0,3,0,3,0,2,2,1,0,0,4,0,3,0,3,1,3,0,3,0,3,0,1,0,3,0,3,1,3,0,3,3,0,0,1,2,1,1,1,0,1,2,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,2,2,1,2,0,0,2,0,0,0,0,2,3,3,3,3,0,0,0,0,1,4),
(0,0,0,3,0,3,0,0,0,0,3,1,1,0,3,0,1,0,2,0,1,0,0,0,0,0,0,0,1,0,3,0,2,0,2,3,0,0,2,2,3,1,2,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,3,0,0,2,0,0,0,0,2,3),
(2,4,0,5,0,5,0,4,0,3,4,3,3,3,4,3,3,3,4,3,4,4,5,4,5,5,5,2,3,0,5,5,4,1,5,4,3,1,5,4,3,4,4,3,3,4,3,3,0,3,2,0,2,3,0,3,0,0,3,3,0,5,3,2,3,3,0,3,0,3,0,3,4,5,4,5,3,0,4,3,0,3,4),
(0,3,0,3,0,3,0,3,0,3,3,4,3,2,3,2,3,0,4,3,3,3,3,3,3,3,3,0,3,2,4,3,3,1,3,4,3,4,4,4,3,4,4,3,2,4,4,1,0,2,0,0,1,1,0,2,0,0,3,1,0,5,3,2,1,3,0,3,0,1,2,4,3,2,4,3,3,0,3,2,0,4,4),
(0,3,0,3,0,1,0,0,0,1,4,3,3,2,3,1,3,1,4,2,3,2,4,2,3,4,3,0,2,2,3,3,3,0,3,3,3,0,3,4,1,3,3,0,3,4,3,3,0,1,1,0,1,0,0,0,4,0,3,0,0,3,1,2,1,3,0,4,0,1,0,4,3,3,4,3,3,0,2,0,0,3,3),
(0,3,0,4,0,1,0,3,0,3,4,3,3,0,3,3,3,1,3,1,3,3,4,3,3,3,0,0,3,1,5,3,3,1,3,3,2,5,4,3,3,4,5,3,2,5,3,4,0,1,0,0,0,0,0,2,0,0,1,1,0,4,2,2,1,3,0,3,0,2,0,4,4,3,5,3,2,0,1,1,0,3,4),
(0,5,0,4,0,5,0,2,0,4,4,3,3,2,3,3,3,1,4,3,4,1,5,3,4,3,4,0,4,2,4,3,4,1,5,4,0,4,4,4,4,5,4,1,3,5,4,2,1,4,1,1,3,2,0,3,1,0,3,2,1,4,3,3,3,4,0,4,0,3,0,4,4,4,3,3,3,0,4,2,0,3,4),
(1,4,0,4,0,3,0,1,0,3,3,3,1,1,3,3,2,2,3,3,1,0,3,2,2,1,2,0,3,1,2,1,2,0,3,2,0,2,2,3,3,4,3,0,3,3,1,2,0,1,1,3,1,2,0,0,3,0,1,1,0,3,2,2,3,3,0,3,0,0,0,2,3,3,4,3,3,0,1,0,0,1,4),
(0,4,0,4,0,4,0,0,0,3,4,4,3,1,4,2,3,2,3,3,3,1,4,3,4,0,3,0,4,2,3,3,2,2,5,4,2,1,3,4,3,4,3,1,3,3,4,2,0,2,1,0,3,3,0,0,2,0,3,1,0,4,4,3,4,3,0,4,0,1,0,2,4,4,4,4,4,0,3,2,0,3,3),
(0,0,0,1,0,4,0,0,0,0,0,0,1,1,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,3,2,0,0,1,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,2),
(0,2,0,3,0,4,0,4,0,1,3,3,3,0,4,0,2,1,2,1,1,1,2,0,3,1,1,0,1,0,3,1,0,0,3,3,2,0,1,1,0,0,0,0,0,1,0,2,0,2,2,0,3,1,0,0,1,0,1,1,0,1,2,0,3,0,0,0,0,1,0,0,3,3,4,3,1,0,1,0,3,0,2),
(0,0,0,3,0,5,0,0,0,0,1,0,2,0,3,1,0,1,3,0,0,0,2,0,0,0,1,0,0,0,1,1,0,0,4,0,0,0,2,3,0,1,4,1,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,3,0,0,0,0,0,1,0,0,0,0,0,0,0,2,0,0,3,0,0,0,0,0,3),
(0,2,0,5,0,5,0,1,0,2,4,3,3,2,5,1,3,2,3,3,3,0,4,1,2,0,3,0,4,0,2,2,1,1,5,3,0,0,1,4,2,3,2,0,3,3,3,2,0,2,4,1,1,2,0,1,1,0,3,1,0,1,3,1,2,3,0,2,0,0,0,1,3,5,4,4,4,0,3,0,0,1,3),
(0,4,0,5,0,4,0,4,0,4,5,4,3,3,4,3,3,3,4,3,4,4,5,3,4,5,4,2,4,2,3,4,3,1,4,4,1,3,5,4,4,5,5,4,4,5,5,5,2,3,3,1,4,3,1,3,3,0,3,3,1,4,3,4,4,4,0,3,0,4,0,3,3,4,4,5,0,0,4,3,0,4,5),
(0,4,0,4,0,3,0,3,0,3,4,4,4,3,3,2,4,3,4,3,4,3,5,3,4,3,2,1,4,2,4,4,3,1,3,4,2,4,5,5,3,4,5,4,1,5,4,3,0,3,2,2,3,2,1,3,1,0,3,3,3,5,3,3,3,5,4,4,2,3,3,4,3,3,3,2,1,0,3,2,1,4,3),
(0,4,0,5,0,4,0,3,0,3,5,5,3,2,4,3,4,0,5,4,4,1,4,4,4,3,3,3,4,3,5,5,2,3,3,4,1,2,5,5,3,5,5,2,3,5,5,4,0,3,2,0,3,3,1,1,5,1,4,1,0,4,3,2,3,5,0,4,0,3,0,5,4,3,4,3,0,0,4,1,0,4,4),
(1,3,0,4,0,2,0,2,0,2,5,5,3,3,3,3,3,0,4,2,3,4,4,4,3,4,0,0,3,4,5,4,3,3,3,3,2,5,5,4,5,5,5,4,3,5,5,5,1,3,1,0,1,0,0,3,2,0,4,2,0,5,2,3,2,4,1,3,0,3,0,4,5,4,5,4,3,0,4,2,0,5,4),
(0,3,0,4,0,5,0,3,0,3,4,4,3,2,3,2,3,3,3,3,3,2,4,3,3,2,2,0,3,3,3,3,3,1,3,3,3,0,4,4,3,4,4,1,1,4,4,2,0,3,1,0,1,1,0,4,1,0,2,3,1,3,3,1,3,4,0,3,0,1,0,3,1,3,0,0,1,0,2,0,0,4,4),
(0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0),
(0,3,0,3,0,2,0,3,0,1,5,4,3,3,3,1,4,2,1,2,3,4,4,2,4,4,5,0,3,1,4,3,4,0,4,3,3,3,2,3,2,5,3,4,3,2,2,3,0,0,3,0,2,1,0,1,2,0,0,0,0,2,1,1,3,1,0,2,0,4,0,3,4,4,4,5,2,0,2,0,0,1,3),
(0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,1,1,0,0,1,1,0,0,0,4,2,1,1,0,1,0,3,2,0,0,3,1,1,1,2,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,3,0,1,0,0,0,2,0,0,0,1,4,0,4,2,1,0,0,0,0,0,1),
(0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,1,0,0,0,0,0,0,1,0,1,0,0,0,0,3,1,0,0,0,2,0,2,1,0,0,1,2,1,0,1,1,0,0,3,0,0,0,0,0,0,0,0,0,0,0,1,3,1,0,0,0,0,0,1,0,0,2,1,0,0,0,0,0,0,0,0,2),
(0,4,0,4,0,4,0,3,0,4,4,3,4,2,4,3,2,0,4,4,4,3,5,3,5,3,3,2,4,2,4,3,4,3,1,4,0,2,3,4,4,4,3,3,3,4,4,4,3,4,1,3,4,3,2,1,2,1,3,3,3,4,4,3,3,5,0,4,0,3,0,4,3,3,3,2,1,0,3,0,0,3,3),
(0,4,0,3,0,3,0,3,0,3,5,5,3,3,3,3,4,3,4,3,3,3,4,4,4,3,3,3,3,4,3,5,3,3,1,3,2,4,5,5,5,5,4,3,4,5,5,3,2,2,3,3,3,3,2,3,3,1,2,3,2,4,3,3,3,4,0,4,0,2,0,4,3,2,2,1,2,0,3,0,0,4,1),
)
class JapaneseContextAnalysis(object):
NUM_OF_CATEGORY = 6
DONT_KNOW = -1
ENOUGH_REL_THRESHOLD = 100
MAX_REL_THRESHOLD = 1000
MINIMUM_DATA_THRESHOLD = 4
def __init__(self):
self._total_rel = None
self._rel_sample = None
self._need_to_skip_char_num = None
self._last_char_order = None
self._done = None
self.reset()
def reset(self):
self._total_rel = 0 # total sequence received
# category counters, each integer counts sequence in its category
self._rel_sample = [0] * self.NUM_OF_CATEGORY
# if last byte in current buffer is not the last byte of a character,
# we need to know how many bytes to skip in next buffer
self._need_to_skip_char_num = 0
self._last_char_order = -1 # The order of previous char
# If this flag is set to True, detection is done and conclusion has
# been made
self._done = False
def feed(self, byte_str, num_bytes):
if self._done:
return
# The buffer we got is byte oriented, and a character may span in more than one
# buffers. In case the last one or two byte in last buffer is not
# complete, we record how many byte needed to complete that character
# and skip these bytes here. We can choose to record those bytes as
# well and analyse the character once it is complete, but since a
# character will not make much difference, by simply skipping
# this character will simply our logic and improve performance.
i = self._need_to_skip_char_num
while i < num_bytes:
order, char_len = self.get_order(byte_str[i:i + 2])
i += char_len
if i > num_bytes:
self._need_to_skip_char_num = i - num_bytes
self._last_char_order = -1
else:
if (order != -1) and (self._last_char_order != -1):
self._total_rel += 1
if self._total_rel > self.MAX_REL_THRESHOLD:
self._done = True
break
self._rel_sample[jp2CharContext[self._last_char_order][order]] += 1
self._last_char_order = order
def got_enough_data(self):
return self._total_rel > self.ENOUGH_REL_THRESHOLD
def get_confidence(self):
# This is just one way to calculate confidence. It works well for me.
if self._total_rel > self.MINIMUM_DATA_THRESHOLD:
return (self._total_rel - self._rel_sample[0]) / self._total_rel
else:
return self.DONT_KNOW
def get_order(self, byte_str):
return -1, 1
class SJISContextAnalysis(JapaneseContextAnalysis):
def __init__(self):
super(SJISContextAnalysis, self).__init__()
self._charset_name = "SHIFT_JIS"
@property
def charset_name(self):
return self._charset_name
def get_order(self, byte_str):
if not byte_str:
return -1, 1
# find out current char's byte length
first_char = byte_str[0]
if (0x81 <= first_char <= 0x9F) or (0xE0 <= first_char <= 0xFC):
char_len = 2
if (first_char == 0x87) or (0xFA <= first_char <= 0xFC):
self._charset_name = "CP932"
else:
char_len = 1
# return its order if it is hiragana
if len(byte_str) > 1:
second_char = byte_str[1]
if (first_char == 202) and (0x9F <= second_char <= 0xF1):
return second_char - 0x9F, char_len
return -1, char_len
class EUCJPContextAnalysis(JapaneseContextAnalysis):
def get_order(self, byte_str):
if not byte_str:
return -1, 1
# find out current char's byte length
first_char = byte_str[0]
if (first_char == 0x8E) or (0xA1 <= first_char <= 0xFE):
char_len = 2
elif first_char == 0x8F:
char_len = 3
else:
char_len = 1
# return its order if it is hiragana
if len(byte_str) > 1:
second_char = byte_str[1]
if (first_char == 0xA4) and (0xA1 <= second_char <= 0xF3):
return second_char - 0xA1, char_len
return -1, char_len

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,145 @@
######################## BEGIN LICENSE BLOCK ########################
# The Original Code is Mozilla Universal charset detector code.
#
# The Initial Developer of the Original Code is
# Netscape Communications Corporation.
# Portions created by the Initial Developer are Copyright (C) 2001
# the Initial Developer. All Rights Reserved.
#
# Contributor(s):
# Mark Pilgrim - port to Python
# Shy Shalom - original C code
#
# This library is free software; you can redistribute it and/or
# modify it under the terms of the GNU Lesser General Public
# License as published by the Free Software Foundation; either
# version 2.1 of the License, or (at your option) any later version.
#
# This library is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
# Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public
# License along with this library; if not, write to the Free Software
# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
# 02110-1301 USA
######################### END LICENSE BLOCK #########################
from .charsetprober import CharSetProber
from .enums import ProbingState
FREQ_CAT_NUM = 4
UDF = 0 # undefined
OTH = 1 # other
ASC = 2 # ascii capital letter
ASS = 3 # ascii small letter
ACV = 4 # accent capital vowel
ACO = 5 # accent capital other
ASV = 6 # accent small vowel
ASO = 7 # accent small other
CLASS_NUM = 8 # total classes
Latin1_CharToClass = (
OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, # 00 - 07
OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, # 08 - 0F
OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, # 10 - 17
OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, # 18 - 1F
OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, # 20 - 27
OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, # 28 - 2F
OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, # 30 - 37
OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, # 38 - 3F
OTH, ASC, ASC, ASC, ASC, ASC, ASC, ASC, # 40 - 47
ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC, # 48 - 4F
ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC, # 50 - 57
ASC, ASC, ASC, OTH, OTH, OTH, OTH, OTH, # 58 - 5F
OTH, ASS, ASS, ASS, ASS, ASS, ASS, ASS, # 60 - 67
ASS, ASS, ASS, ASS, ASS, ASS, ASS, ASS, # 68 - 6F
ASS, ASS, ASS, ASS, ASS, ASS, ASS, ASS, # 70 - 77
ASS, ASS, ASS, OTH, OTH, OTH, OTH, OTH, # 78 - 7F
OTH, UDF, OTH, ASO, OTH, OTH, OTH, OTH, # 80 - 87
OTH, OTH, ACO, OTH, ACO, UDF, ACO, UDF, # 88 - 8F
UDF, OTH, OTH, OTH, OTH, OTH, OTH, OTH, # 90 - 97
OTH, OTH, ASO, OTH, ASO, UDF, ASO, ACO, # 98 - 9F
OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, # A0 - A7
OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, # A8 - AF
OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, # B0 - B7
OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, # B8 - BF
ACV, ACV, ACV, ACV, ACV, ACV, ACO, ACO, # C0 - C7
ACV, ACV, ACV, ACV, ACV, ACV, ACV, ACV, # C8 - CF
ACO, ACO, ACV, ACV, ACV, ACV, ACV, OTH, # D0 - D7
ACV, ACV, ACV, ACV, ACV, ACO, ACO, ACO, # D8 - DF
ASV, ASV, ASV, ASV, ASV, ASV, ASO, ASO, # E0 - E7
ASV, ASV, ASV, ASV, ASV, ASV, ASV, ASV, # E8 - EF
ASO, ASO, ASV, ASV, ASV, ASV, ASV, OTH, # F0 - F7
ASV, ASV, ASV, ASV, ASV, ASO, ASO, ASO, # F8 - FF
)
# 0 : illegal
# 1 : very unlikely
# 2 : normal
# 3 : very likely
Latin1ClassModel = (
# UDF OTH ASC ASS ACV ACO ASV ASO
0, 0, 0, 0, 0, 0, 0, 0, # UDF
0, 3, 3, 3, 3, 3, 3, 3, # OTH
0, 3, 3, 3, 3, 3, 3, 3, # ASC
0, 3, 3, 3, 1, 1, 3, 3, # ASS
0, 3, 3, 3, 1, 2, 1, 2, # ACV
0, 3, 3, 3, 3, 3, 3, 3, # ACO
0, 3, 1, 3, 1, 1, 1, 3, # ASV
0, 3, 1, 3, 1, 1, 3, 3, # ASO
)
class Latin1Prober(CharSetProber):
def __init__(self):
super(Latin1Prober, self).__init__()
self._last_char_class = None
self._freq_counter = None
self.reset()
def reset(self):
self._last_char_class = OTH
self._freq_counter = [0] * FREQ_CAT_NUM
CharSetProber.reset(self)
@property
def charset_name(self):
return "ISO-8859-1"
@property
def language(self):
return ""
def feed(self, byte_str):
byte_str = self.filter_with_english_letters(byte_str)
for c in byte_str:
char_class = Latin1_CharToClass[c]
freq = Latin1ClassModel[(self._last_char_class * CLASS_NUM)
+ char_class]
if freq == 0:
self._state = ProbingState.NOT_ME
break
self._freq_counter[freq] += 1
self._last_char_class = char_class
return self.state
def get_confidence(self):
if self.state == ProbingState.NOT_ME:
return 0.01
total = sum(self._freq_counter)
if total < 0.01:
confidence = 0.0
else:
confidence = ((self._freq_counter[3] - self._freq_counter[1] * 20.0)
/ total)
if confidence < 0.0:
confidence = 0.0
# lower the confidence of latin1 so that other more accurate
# detector can take priority.
confidence = confidence * 0.73
return confidence

View File

@ -0,0 +1,91 @@
######################## BEGIN LICENSE BLOCK ########################
# The Original Code is Mozilla Universal charset detector code.
#
# The Initial Developer of the Original Code is
# Netscape Communications Corporation.
# Portions created by the Initial Developer are Copyright (C) 2001
# the Initial Developer. All Rights Reserved.
#
# Contributor(s):
# Mark Pilgrim - port to Python
# Shy Shalom - original C code
# Proofpoint, Inc.
#
# This library is free software; you can redistribute it and/or
# modify it under the terms of the GNU Lesser General Public
# License as published by the Free Software Foundation; either
# version 2.1 of the License, or (at your option) any later version.
#
# This library is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
# Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public
# License along with this library; if not, write to the Free Software
# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
# 02110-1301 USA
######################### END LICENSE BLOCK #########################
from .charsetprober import CharSetProber
from .enums import ProbingState, MachineState
class MultiByteCharSetProber(CharSetProber):
"""
MultiByteCharSetProber
"""
def __init__(self, lang_filter=None):
super(MultiByteCharSetProber, self).__init__(lang_filter=lang_filter)
self.distribution_analyzer = None
self.coding_sm = None
self._last_char = [0, 0]
def reset(self):
super(MultiByteCharSetProber, self).reset()
if self.coding_sm:
self.coding_sm.reset()
if self.distribution_analyzer:
self.distribution_analyzer.reset()
self._last_char = [0, 0]
@property
def charset_name(self):
raise NotImplementedError
@property
def language(self):
raise NotImplementedError
def feed(self, byte_str):
for i in range(len(byte_str)):
coding_state = self.coding_sm.next_state(byte_str[i])
if coding_state == MachineState.ERROR:
self.logger.debug('%s %s prober hit error at byte %s',
self.charset_name, self.language, i)
self._state = ProbingState.NOT_ME
break
elif coding_state == MachineState.ITS_ME:
self._state = ProbingState.FOUND_IT
break
elif coding_state == MachineState.START:
char_len = self.coding_sm.get_current_charlen()
if i == 0:
self._last_char[1] = byte_str[0]
self.distribution_analyzer.feed(self._last_char, char_len)
else:
self.distribution_analyzer.feed(byte_str[i - 1:i + 1],
char_len)
self._last_char[0] = byte_str[-1]
if self.state == ProbingState.DETECTING:
if (self.distribution_analyzer.got_enough_data() and
(self.get_confidence() > self.SHORTCUT_THRESHOLD)):
self._state = ProbingState.FOUND_IT
return self.state
def get_confidence(self):
return self.distribution_analyzer.get_confidence()

View File

@ -0,0 +1,54 @@
######################## BEGIN LICENSE BLOCK ########################
# The Original Code is Mozilla Universal charset detector code.
#
# The Initial Developer of the Original Code is
# Netscape Communications Corporation.
# Portions created by the Initial Developer are Copyright (C) 2001
# the Initial Developer. All Rights Reserved.
#
# Contributor(s):
# Mark Pilgrim - port to Python
# Shy Shalom - original C code
# Proofpoint, Inc.
#
# This library is free software; you can redistribute it and/or
# modify it under the terms of the GNU Lesser General Public
# License as published by the Free Software Foundation; either
# version 2.1 of the License, or (at your option) any later version.
#
# This library is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
# Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public
# License along with this library; if not, write to the Free Software
# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
# 02110-1301 USA
######################### END LICENSE BLOCK #########################
from .charsetgroupprober import CharSetGroupProber
from .utf8prober import UTF8Prober
from .sjisprober import SJISProber
from .eucjpprober import EUCJPProber
from .gb2312prober import GB2312Prober
from .euckrprober import EUCKRProber
from .cp949prober import CP949Prober
from .big5prober import Big5Prober
from .euctwprober import EUCTWProber
class MBCSGroupProber(CharSetGroupProber):
def __init__(self, lang_filter=None):
super(MBCSGroupProber, self).__init__(lang_filter=lang_filter)
self.probers = [
UTF8Prober(),
SJISProber(),
EUCJPProber(),
GB2312Prober(),
EUCKRProber(),
CP949Prober(),
Big5Prober(),
EUCTWProber()
]
self.reset()

View File

@ -0,0 +1,572 @@
######################## BEGIN LICENSE BLOCK ########################
# The Original Code is mozilla.org code.
#
# The Initial Developer of the Original Code is
# Netscape Communications Corporation.
# Portions created by the Initial Developer are Copyright (C) 1998
# the Initial Developer. All Rights Reserved.
#
# Contributor(s):
# Mark Pilgrim - port to Python
#
# This library is free software; you can redistribute it and/or
# modify it under the terms of the GNU Lesser General Public
# License as published by the Free Software Foundation; either
# version 2.1 of the License, or (at your option) any later version.
#
# This library is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
# Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public
# License along with this library; if not, write to the Free Software
# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
# 02110-1301 USA
######################### END LICENSE BLOCK #########################
from .enums import MachineState
# BIG5
BIG5_CLS = (
1,1,1,1,1,1,1,1, # 00 - 07 #allow 0x00 as legal value
1,1,1,1,1,1,0,0, # 08 - 0f
1,1,1,1,1,1,1,1, # 10 - 17
1,1,1,0,1,1,1,1, # 18 - 1f
1,1,1,1,1,1,1,1, # 20 - 27
1,1,1,1,1,1,1,1, # 28 - 2f
1,1,1,1,1,1,1,1, # 30 - 37
1,1,1,1,1,1,1,1, # 38 - 3f
2,2,2,2,2,2,2,2, # 40 - 47
2,2,2,2,2,2,2,2, # 48 - 4f
2,2,2,2,2,2,2,2, # 50 - 57
2,2,2,2,2,2,2,2, # 58 - 5f
2,2,2,2,2,2,2,2, # 60 - 67
2,2,2,2,2,2,2,2, # 68 - 6f
2,2,2,2,2,2,2,2, # 70 - 77
2,2,2,2,2,2,2,1, # 78 - 7f
4,4,4,4,4,4,4,4, # 80 - 87
4,4,4,4,4,4,4,4, # 88 - 8f
4,4,4,4,4,4,4,4, # 90 - 97
4,4,4,4,4,4,4,4, # 98 - 9f
4,3,3,3,3,3,3,3, # a0 - a7
3,3,3,3,3,3,3,3, # a8 - af
3,3,3,3,3,3,3,3, # b0 - b7
3,3,3,3,3,3,3,3, # b8 - bf
3,3,3,3,3,3,3,3, # c0 - c7
3,3,3,3,3,3,3,3, # c8 - cf
3,3,3,3,3,3,3,3, # d0 - d7
3,3,3,3,3,3,3,3, # d8 - df
3,3,3,3,3,3,3,3, # e0 - e7
3,3,3,3,3,3,3,3, # e8 - ef
3,3,3,3,3,3,3,3, # f0 - f7
3,3,3,3,3,3,3,0 # f8 - ff
)
BIG5_ST = (
MachineState.ERROR,MachineState.START,MachineState.START, 3,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,#00-07
MachineState.ERROR,MachineState.ERROR,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ERROR,#08-0f
MachineState.ERROR,MachineState.START,MachineState.START,MachineState.START,MachineState.START,MachineState.START,MachineState.START,MachineState.START#10-17
)
BIG5_CHAR_LEN_TABLE = (0, 1, 1, 2, 0)
BIG5_SM_MODEL = {'class_table': BIG5_CLS,
'class_factor': 5,
'state_table': BIG5_ST,
'char_len_table': BIG5_CHAR_LEN_TABLE,
'name': 'Big5'}
# CP949
CP949_CLS = (
1,1,1,1,1,1,1,1, 1,1,1,1,1,1,0,0, # 00 - 0f
1,1,1,1,1,1,1,1, 1,1,1,0,1,1,1,1, # 10 - 1f
1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, # 20 - 2f
1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, # 30 - 3f
1,4,4,4,4,4,4,4, 4,4,4,4,4,4,4,4, # 40 - 4f
4,4,5,5,5,5,5,5, 5,5,5,1,1,1,1,1, # 50 - 5f
1,5,5,5,5,5,5,5, 5,5,5,5,5,5,5,5, # 60 - 6f
5,5,5,5,5,5,5,5, 5,5,5,1,1,1,1,1, # 70 - 7f
0,6,6,6,6,6,6,6, 6,6,6,6,6,6,6,6, # 80 - 8f
6,6,6,6,6,6,6,6, 6,6,6,6,6,6,6,6, # 90 - 9f
6,7,7,7,7,7,7,7, 7,7,7,7,7,8,8,8, # a0 - af
7,7,7,7,7,7,7,7, 7,7,7,7,7,7,7,7, # b0 - bf
7,7,7,7,7,7,9,2, 2,3,2,2,2,2,2,2, # c0 - cf
2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2, # d0 - df
2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2, # e0 - ef
2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,0, # f0 - ff
)
CP949_ST = (
#cls= 0 1 2 3 4 5 6 7 8 9 # previous state =
MachineState.ERROR,MachineState.START, 3,MachineState.ERROR,MachineState.START,MachineState.START, 4, 5,MachineState.ERROR, 6, # MachineState.START
MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR, # MachineState.ERROR
MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME, # MachineState.ITS_ME
MachineState.ERROR,MachineState.ERROR,MachineState.START,MachineState.START,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.START,MachineState.START,MachineState.START, # 3
MachineState.ERROR,MachineState.ERROR,MachineState.START,MachineState.START,MachineState.START,MachineState.START,MachineState.START,MachineState.START,MachineState.START,MachineState.START, # 4
MachineState.ERROR,MachineState.START,MachineState.START,MachineState.START,MachineState.START,MachineState.START,MachineState.START,MachineState.START,MachineState.START,MachineState.START, # 5
MachineState.ERROR,MachineState.START,MachineState.START,MachineState.START,MachineState.START,MachineState.ERROR,MachineState.ERROR,MachineState.START,MachineState.START,MachineState.START, # 6
)
CP949_CHAR_LEN_TABLE = (0, 1, 2, 0, 1, 1, 2, 2, 0, 2)
CP949_SM_MODEL = {'class_table': CP949_CLS,
'class_factor': 10,
'state_table': CP949_ST,
'char_len_table': CP949_CHAR_LEN_TABLE,
'name': 'CP949'}
# EUC-JP
EUCJP_CLS = (
4,4,4,4,4,4,4,4, # 00 - 07
4,4,4,4,4,4,5,5, # 08 - 0f
4,4,4,4,4,4,4,4, # 10 - 17
4,4,4,5,4,4,4,4, # 18 - 1f
4,4,4,4,4,4,4,4, # 20 - 27
4,4,4,4,4,4,4,4, # 28 - 2f
4,4,4,4,4,4,4,4, # 30 - 37
4,4,4,4,4,4,4,4, # 38 - 3f
4,4,4,4,4,4,4,4, # 40 - 47
4,4,4,4,4,4,4,4, # 48 - 4f
4,4,4,4,4,4,4,4, # 50 - 57
4,4,4,4,4,4,4,4, # 58 - 5f
4,4,4,4,4,4,4,4, # 60 - 67
4,4,4,4,4,4,4,4, # 68 - 6f
4,4,4,4,4,4,4,4, # 70 - 77
4,4,4,4,4,4,4,4, # 78 - 7f
5,5,5,5,5,5,5,5, # 80 - 87
5,5,5,5,5,5,1,3, # 88 - 8f
5,5,5,5,5,5,5,5, # 90 - 97
5,5,5,5,5,5,5,5, # 98 - 9f
5,2,2,2,2,2,2,2, # a0 - a7
2,2,2,2,2,2,2,2, # a8 - af
2,2,2,2,2,2,2,2, # b0 - b7
2,2,2,2,2,2,2,2, # b8 - bf
2,2,2,2,2,2,2,2, # c0 - c7
2,2,2,2,2,2,2,2, # c8 - cf
2,2,2,2,2,2,2,2, # d0 - d7
2,2,2,2,2,2,2,2, # d8 - df
0,0,0,0,0,0,0,0, # e0 - e7
0,0,0,0,0,0,0,0, # e8 - ef
0,0,0,0,0,0,0,0, # f0 - f7
0,0,0,0,0,0,0,5 # f8 - ff
)
EUCJP_ST = (
3, 4, 3, 5,MachineState.START,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,#00-07
MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,#08-0f
MachineState.ITS_ME,MachineState.ITS_ME,MachineState.START,MachineState.ERROR,MachineState.START,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,#10-17
MachineState.ERROR,MachineState.ERROR,MachineState.START,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR, 3,MachineState.ERROR,#18-1f
3,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.START,MachineState.START,MachineState.START,MachineState.START#20-27
)
EUCJP_CHAR_LEN_TABLE = (2, 2, 2, 3, 1, 0)
EUCJP_SM_MODEL = {'class_table': EUCJP_CLS,
'class_factor': 6,
'state_table': EUCJP_ST,
'char_len_table': EUCJP_CHAR_LEN_TABLE,
'name': 'EUC-JP'}
# EUC-KR
EUCKR_CLS = (
1,1,1,1,1,1,1,1, # 00 - 07
1,1,1,1,1,1,0,0, # 08 - 0f
1,1,1,1,1,1,1,1, # 10 - 17
1,1,1,0,1,1,1,1, # 18 - 1f
1,1,1,1,1,1,1,1, # 20 - 27
1,1,1,1,1,1,1,1, # 28 - 2f
1,1,1,1,1,1,1,1, # 30 - 37
1,1,1,1,1,1,1,1, # 38 - 3f
1,1,1,1,1,1,1,1, # 40 - 47
1,1,1,1,1,1,1,1, # 48 - 4f
1,1,1,1,1,1,1,1, # 50 - 57
1,1,1,1,1,1,1,1, # 58 - 5f
1,1,1,1,1,1,1,1, # 60 - 67
1,1,1,1,1,1,1,1, # 68 - 6f
1,1,1,1,1,1,1,1, # 70 - 77
1,1,1,1,1,1,1,1, # 78 - 7f
0,0,0,0,0,0,0,0, # 80 - 87
0,0,0,0,0,0,0,0, # 88 - 8f
0,0,0,0,0,0,0,0, # 90 - 97
0,0,0,0,0,0,0,0, # 98 - 9f
0,2,2,2,2,2,2,2, # a0 - a7
2,2,2,2,2,3,3,3, # a8 - af
2,2,2,2,2,2,2,2, # b0 - b7
2,2,2,2,2,2,2,2, # b8 - bf
2,2,2,2,2,2,2,2, # c0 - c7
2,3,2,2,2,2,2,2, # c8 - cf
2,2,2,2,2,2,2,2, # d0 - d7
2,2,2,2,2,2,2,2, # d8 - df
2,2,2,2,2,2,2,2, # e0 - e7
2,2,2,2,2,2,2,2, # e8 - ef
2,2,2,2,2,2,2,2, # f0 - f7
2,2,2,2,2,2,2,0 # f8 - ff
)
EUCKR_ST = (
MachineState.ERROR,MachineState.START, 3,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,#00-07
MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ERROR,MachineState.ERROR,MachineState.START,MachineState.START #08-0f
)
EUCKR_CHAR_LEN_TABLE = (0, 1, 2, 0)
EUCKR_SM_MODEL = {'class_table': EUCKR_CLS,
'class_factor': 4,
'state_table': EUCKR_ST,
'char_len_table': EUCKR_CHAR_LEN_TABLE,
'name': 'EUC-KR'}
# EUC-TW
EUCTW_CLS = (
2,2,2,2,2,2,2,2, # 00 - 07
2,2,2,2,2,2,0,0, # 08 - 0f
2,2,2,2,2,2,2,2, # 10 - 17
2,2,2,0,2,2,2,2, # 18 - 1f
2,2,2,2,2,2,2,2, # 20 - 27
2,2,2,2,2,2,2,2, # 28 - 2f
2,2,2,2,2,2,2,2, # 30 - 37
2,2,2,2,2,2,2,2, # 38 - 3f
2,2,2,2,2,2,2,2, # 40 - 47
2,2,2,2,2,2,2,2, # 48 - 4f
2,2,2,2,2,2,2,2, # 50 - 57
2,2,2,2,2,2,2,2, # 58 - 5f
2,2,2,2,2,2,2,2, # 60 - 67
2,2,2,2,2,2,2,2, # 68 - 6f
2,2,2,2,2,2,2,2, # 70 - 77
2,2,2,2,2,2,2,2, # 78 - 7f
0,0,0,0,0,0,0,0, # 80 - 87
0,0,0,0,0,0,6,0, # 88 - 8f
0,0,0,0,0,0,0,0, # 90 - 97
0,0,0,0,0,0,0,0, # 98 - 9f
0,3,4,4,4,4,4,4, # a0 - a7
5,5,1,1,1,1,1,1, # a8 - af
1,1,1,1,1,1,1,1, # b0 - b7
1,1,1,1,1,1,1,1, # b8 - bf
1,1,3,1,3,3,3,3, # c0 - c7
3,3,3,3,3,3,3,3, # c8 - cf
3,3,3,3,3,3,3,3, # d0 - d7
3,3,3,3,3,3,3,3, # d8 - df
3,3,3,3,3,3,3,3, # e0 - e7
3,3,3,3,3,3,3,3, # e8 - ef
3,3,3,3,3,3,3,3, # f0 - f7
3,3,3,3,3,3,3,0 # f8 - ff
)
EUCTW_ST = (
MachineState.ERROR,MachineState.ERROR,MachineState.START, 3, 3, 3, 4,MachineState.ERROR,#00-07
MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ITS_ME,MachineState.ITS_ME,#08-0f
MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ERROR,MachineState.START,MachineState.ERROR,#10-17
MachineState.START,MachineState.START,MachineState.START,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,#18-1f
5,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.START,MachineState.ERROR,MachineState.START,MachineState.START,#20-27
MachineState.START,MachineState.ERROR,MachineState.START,MachineState.START,MachineState.START,MachineState.START,MachineState.START,MachineState.START #28-2f
)
EUCTW_CHAR_LEN_TABLE = (0, 0, 1, 2, 2, 2, 3)
EUCTW_SM_MODEL = {'class_table': EUCTW_CLS,
'class_factor': 7,
'state_table': EUCTW_ST,
'char_len_table': EUCTW_CHAR_LEN_TABLE,
'name': 'x-euc-tw'}
# GB2312
GB2312_CLS = (
1,1,1,1,1,1,1,1, # 00 - 07
1,1,1,1,1,1,0,0, # 08 - 0f
1,1,1,1,1,1,1,1, # 10 - 17
1,1,1,0,1,1,1,1, # 18 - 1f
1,1,1,1,1,1,1,1, # 20 - 27
1,1,1,1,1,1,1,1, # 28 - 2f
3,3,3,3,3,3,3,3, # 30 - 37
3,3,1,1,1,1,1,1, # 38 - 3f
2,2,2,2,2,2,2,2, # 40 - 47
2,2,2,2,2,2,2,2, # 48 - 4f
2,2,2,2,2,2,2,2, # 50 - 57
2,2,2,2,2,2,2,2, # 58 - 5f
2,2,2,2,2,2,2,2, # 60 - 67
2,2,2,2,2,2,2,2, # 68 - 6f
2,2,2,2,2,2,2,2, # 70 - 77
2,2,2,2,2,2,2,4, # 78 - 7f
5,6,6,6,6,6,6,6, # 80 - 87
6,6,6,6,6,6,6,6, # 88 - 8f
6,6,6,6,6,6,6,6, # 90 - 97
6,6,6,6,6,6,6,6, # 98 - 9f
6,6,6,6,6,6,6,6, # a0 - a7
6,6,6,6,6,6,6,6, # a8 - af
6,6,6,6,6,6,6,6, # b0 - b7
6,6,6,6,6,6,6,6, # b8 - bf
6,6,6,6,6,6,6,6, # c0 - c7
6,6,6,6,6,6,6,6, # c8 - cf
6,6,6,6,6,6,6,6, # d0 - d7
6,6,6,6,6,6,6,6, # d8 - df
6,6,6,6,6,6,6,6, # e0 - e7
6,6,6,6,6,6,6,6, # e8 - ef
6,6,6,6,6,6,6,6, # f0 - f7
6,6,6,6,6,6,6,0 # f8 - ff
)
GB2312_ST = (
MachineState.ERROR,MachineState.START,MachineState.START,MachineState.START,MachineState.START,MachineState.START, 3,MachineState.ERROR,#00-07
MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ITS_ME,MachineState.ITS_ME,#08-0f
MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ERROR,MachineState.ERROR,MachineState.START,#10-17
4,MachineState.ERROR,MachineState.START,MachineState.START,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,#18-1f
MachineState.ERROR,MachineState.ERROR, 5,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ITS_ME,MachineState.ERROR,#20-27
MachineState.ERROR,MachineState.ERROR,MachineState.START,MachineState.START,MachineState.START,MachineState.START,MachineState.START,MachineState.START #28-2f
)
# To be accurate, the length of class 6 can be either 2 or 4.
# But it is not necessary to discriminate between the two since
# it is used for frequency analysis only, and we are validating
# each code range there as well. So it is safe to set it to be
# 2 here.
GB2312_CHAR_LEN_TABLE = (0, 1, 1, 1, 1, 1, 2)
GB2312_SM_MODEL = {'class_table': GB2312_CLS,
'class_factor': 7,
'state_table': GB2312_ST,
'char_len_table': GB2312_CHAR_LEN_TABLE,
'name': 'GB2312'}
# Shift_JIS
SJIS_CLS = (
1,1,1,1,1,1,1,1, # 00 - 07
1,1,1,1,1,1,0,0, # 08 - 0f
1,1,1,1,1,1,1,1, # 10 - 17
1,1,1,0,1,1,1,1, # 18 - 1f
1,1,1,1,1,1,1,1, # 20 - 27
1,1,1,1,1,1,1,1, # 28 - 2f
1,1,1,1,1,1,1,1, # 30 - 37
1,1,1,1,1,1,1,1, # 38 - 3f
2,2,2,2,2,2,2,2, # 40 - 47
2,2,2,2,2,2,2,2, # 48 - 4f
2,2,2,2,2,2,2,2, # 50 - 57
2,2,2,2,2,2,2,2, # 58 - 5f
2,2,2,2,2,2,2,2, # 60 - 67
2,2,2,2,2,2,2,2, # 68 - 6f
2,2,2,2,2,2,2,2, # 70 - 77
2,2,2,2,2,2,2,1, # 78 - 7f
3,3,3,3,3,2,2,3, # 80 - 87
3,3,3,3,3,3,3,3, # 88 - 8f
3,3,3,3,3,3,3,3, # 90 - 97
3,3,3,3,3,3,3,3, # 98 - 9f
#0xa0 is illegal in sjis encoding, but some pages does
#contain such byte. We need to be more error forgiven.
2,2,2,2,2,2,2,2, # a0 - a7
2,2,2,2,2,2,2,2, # a8 - af
2,2,2,2,2,2,2,2, # b0 - b7
2,2,2,2,2,2,2,2, # b8 - bf
2,2,2,2,2,2,2,2, # c0 - c7
2,2,2,2,2,2,2,2, # c8 - cf
2,2,2,2,2,2,2,2, # d0 - d7
2,2,2,2,2,2,2,2, # d8 - df
3,3,3,3,3,3,3,3, # e0 - e7
3,3,3,3,3,4,4,4, # e8 - ef
3,3,3,3,3,3,3,3, # f0 - f7
3,3,3,3,3,0,0,0) # f8 - ff
SJIS_ST = (
MachineState.ERROR,MachineState.START,MachineState.START, 3,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,#00-07
MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,#08-0f
MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ERROR,MachineState.ERROR,MachineState.START,MachineState.START,MachineState.START,MachineState.START #10-17
)
SJIS_CHAR_LEN_TABLE = (0, 1, 1, 2, 0, 0)
SJIS_SM_MODEL = {'class_table': SJIS_CLS,
'class_factor': 6,
'state_table': SJIS_ST,
'char_len_table': SJIS_CHAR_LEN_TABLE,
'name': 'Shift_JIS'}
# UCS2-BE
UCS2BE_CLS = (
0,0,0,0,0,0,0,0, # 00 - 07
0,0,1,0,0,2,0,0, # 08 - 0f
0,0,0,0,0,0,0,0, # 10 - 17
0,0,0,3,0,0,0,0, # 18 - 1f
0,0,0,0,0,0,0,0, # 20 - 27
0,3,3,3,3,3,0,0, # 28 - 2f
0,0,0,0,0,0,0,0, # 30 - 37
0,0,0,0,0,0,0,0, # 38 - 3f
0,0,0,0,0,0,0,0, # 40 - 47
0,0,0,0,0,0,0,0, # 48 - 4f
0,0,0,0,0,0,0,0, # 50 - 57
0,0,0,0,0,0,0,0, # 58 - 5f
0,0,0,0,0,0,0,0, # 60 - 67
0,0,0,0,0,0,0,0, # 68 - 6f
0,0,0,0,0,0,0,0, # 70 - 77
0,0,0,0,0,0,0,0, # 78 - 7f
0,0,0,0,0,0,0,0, # 80 - 87
0,0,0,0,0,0,0,0, # 88 - 8f
0,0,0,0,0,0,0,0, # 90 - 97
0,0,0,0,0,0,0,0, # 98 - 9f
0,0,0,0,0,0,0,0, # a0 - a7
0,0,0,0,0,0,0,0, # a8 - af
0,0,0,0,0,0,0,0, # b0 - b7
0,0,0,0,0,0,0,0, # b8 - bf
0,0,0,0,0,0,0,0, # c0 - c7
0,0,0,0,0,0,0,0, # c8 - cf
0,0,0,0,0,0,0,0, # d0 - d7
0,0,0,0,0,0,0,0, # d8 - df
0,0,0,0,0,0,0,0, # e0 - e7
0,0,0,0,0,0,0,0, # e8 - ef
0,0,0,0,0,0,0,0, # f0 - f7
0,0,0,0,0,0,4,5 # f8 - ff
)
UCS2BE_ST = (
5, 7, 7,MachineState.ERROR, 4, 3,MachineState.ERROR,MachineState.ERROR,#00-07
MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,#08-0f
MachineState.ITS_ME,MachineState.ITS_ME, 6, 6, 6, 6,MachineState.ERROR,MachineState.ERROR,#10-17
6, 6, 6, 6, 6,MachineState.ITS_ME, 6, 6,#18-1f
6, 6, 6, 6, 5, 7, 7,MachineState.ERROR,#20-27
5, 8, 6, 6,MachineState.ERROR, 6, 6, 6,#28-2f
6, 6, 6, 6,MachineState.ERROR,MachineState.ERROR,MachineState.START,MachineState.START #30-37
)
UCS2BE_CHAR_LEN_TABLE = (2, 2, 2, 0, 2, 2)
UCS2BE_SM_MODEL = {'class_table': UCS2BE_CLS,
'class_factor': 6,
'state_table': UCS2BE_ST,
'char_len_table': UCS2BE_CHAR_LEN_TABLE,
'name': 'UTF-16BE'}
# UCS2-LE
UCS2LE_CLS = (
0,0,0,0,0,0,0,0, # 00 - 07
0,0,1,0,0,2,0,0, # 08 - 0f
0,0,0,0,0,0,0,0, # 10 - 17
0,0,0,3,0,0,0,0, # 18 - 1f
0,0,0,0,0,0,0,0, # 20 - 27
0,3,3,3,3,3,0,0, # 28 - 2f
0,0,0,0,0,0,0,0, # 30 - 37
0,0,0,0,0,0,0,0, # 38 - 3f
0,0,0,0,0,0,0,0, # 40 - 47
0,0,0,0,0,0,0,0, # 48 - 4f
0,0,0,0,0,0,0,0, # 50 - 57
0,0,0,0,0,0,0,0, # 58 - 5f
0,0,0,0,0,0,0,0, # 60 - 67
0,0,0,0,0,0,0,0, # 68 - 6f
0,0,0,0,0,0,0,0, # 70 - 77
0,0,0,0,0,0,0,0, # 78 - 7f
0,0,0,0,0,0,0,0, # 80 - 87
0,0,0,0,0,0,0,0, # 88 - 8f
0,0,0,0,0,0,0,0, # 90 - 97
0,0,0,0,0,0,0,0, # 98 - 9f
0,0,0,0,0,0,0,0, # a0 - a7
0,0,0,0,0,0,0,0, # a8 - af
0,0,0,0,0,0,0,0, # b0 - b7
0,0,0,0,0,0,0,0, # b8 - bf
0,0,0,0,0,0,0,0, # c0 - c7
0,0,0,0,0,0,0,0, # c8 - cf
0,0,0,0,0,0,0,0, # d0 - d7
0,0,0,0,0,0,0,0, # d8 - df
0,0,0,0,0,0,0,0, # e0 - e7
0,0,0,0,0,0,0,0, # e8 - ef
0,0,0,0,0,0,0,0, # f0 - f7
0,0,0,0,0,0,4,5 # f8 - ff
)
UCS2LE_ST = (
6, 6, 7, 6, 4, 3,MachineState.ERROR,MachineState.ERROR,#00-07
MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,#08-0f
MachineState.ITS_ME,MachineState.ITS_ME, 5, 5, 5,MachineState.ERROR,MachineState.ITS_ME,MachineState.ERROR,#10-17
5, 5, 5,MachineState.ERROR, 5,MachineState.ERROR, 6, 6,#18-1f
7, 6, 8, 8, 5, 5, 5,MachineState.ERROR,#20-27
5, 5, 5,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR, 5, 5,#28-2f
5, 5, 5,MachineState.ERROR, 5,MachineState.ERROR,MachineState.START,MachineState.START #30-37
)
UCS2LE_CHAR_LEN_TABLE = (2, 2, 2, 2, 2, 2)
UCS2LE_SM_MODEL = {'class_table': UCS2LE_CLS,
'class_factor': 6,
'state_table': UCS2LE_ST,
'char_len_table': UCS2LE_CHAR_LEN_TABLE,
'name': 'UTF-16LE'}
# UTF-8
UTF8_CLS = (
1,1,1,1,1,1,1,1, # 00 - 07 #allow 0x00 as a legal value
1,1,1,1,1,1,0,0, # 08 - 0f
1,1,1,1,1,1,1,1, # 10 - 17
1,1,1,0,1,1,1,1, # 18 - 1f
1,1,1,1,1,1,1,1, # 20 - 27
1,1,1,1,1,1,1,1, # 28 - 2f
1,1,1,1,1,1,1,1, # 30 - 37
1,1,1,1,1,1,1,1, # 38 - 3f
1,1,1,1,1,1,1,1, # 40 - 47
1,1,1,1,1,1,1,1, # 48 - 4f
1,1,1,1,1,1,1,1, # 50 - 57
1,1,1,1,1,1,1,1, # 58 - 5f
1,1,1,1,1,1,1,1, # 60 - 67
1,1,1,1,1,1,1,1, # 68 - 6f
1,1,1,1,1,1,1,1, # 70 - 77
1,1,1,1,1,1,1,1, # 78 - 7f
2,2,2,2,3,3,3,3, # 80 - 87
4,4,4,4,4,4,4,4, # 88 - 8f
4,4,4,4,4,4,4,4, # 90 - 97
4,4,4,4,4,4,4,4, # 98 - 9f
5,5,5,5,5,5,5,5, # a0 - a7
5,5,5,5,5,5,5,5, # a8 - af
5,5,5,5,5,5,5,5, # b0 - b7
5,5,5,5,5,5,5,5, # b8 - bf
0,0,6,6,6,6,6,6, # c0 - c7
6,6,6,6,6,6,6,6, # c8 - cf
6,6,6,6,6,6,6,6, # d0 - d7
6,6,6,6,6,6,6,6, # d8 - df
7,8,8,8,8,8,8,8, # e0 - e7
8,8,8,8,8,9,8,8, # e8 - ef
10,11,11,11,11,11,11,11, # f0 - f7
12,13,13,13,14,15,0,0 # f8 - ff
)
UTF8_ST = (
MachineState.ERROR,MachineState.START,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR, 12, 10,#00-07
9, 11, 8, 7, 6, 5, 4, 3,#08-0f
MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,#10-17
MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,#18-1f
MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,#20-27
MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,#28-2f
MachineState.ERROR,MachineState.ERROR, 5, 5, 5, 5,MachineState.ERROR,MachineState.ERROR,#30-37
MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,#38-3f
MachineState.ERROR,MachineState.ERROR,MachineState.ERROR, 5, 5, 5,MachineState.ERROR,MachineState.ERROR,#40-47
MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,#48-4f
MachineState.ERROR,MachineState.ERROR, 7, 7, 7, 7,MachineState.ERROR,MachineState.ERROR,#50-57
MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,#58-5f
MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR, 7, 7,MachineState.ERROR,MachineState.ERROR,#60-67
MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,#68-6f
MachineState.ERROR,MachineState.ERROR, 9, 9, 9, 9,MachineState.ERROR,MachineState.ERROR,#70-77
MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,#78-7f
MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR, 9,MachineState.ERROR,MachineState.ERROR,#80-87
MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,#88-8f
MachineState.ERROR,MachineState.ERROR, 12, 12, 12, 12,MachineState.ERROR,MachineState.ERROR,#90-97
MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,#98-9f
MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR, 12,MachineState.ERROR,MachineState.ERROR,#a0-a7
MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,#a8-af
MachineState.ERROR,MachineState.ERROR, 12, 12, 12,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,#b0-b7
MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,#b8-bf
MachineState.ERROR,MachineState.ERROR,MachineState.START,MachineState.START,MachineState.START,MachineState.START,MachineState.ERROR,MachineState.ERROR,#c0-c7
MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR #c8-cf
)
UTF8_CHAR_LEN_TABLE = (0, 1, 0, 0, 0, 0, 2, 3, 3, 3, 4, 4, 5, 5, 6, 6)
UTF8_SM_MODEL = {'class_table': UTF8_CLS,
'class_factor': 16,
'state_table': UTF8_ST,
'char_len_table': UTF8_CHAR_LEN_TABLE,
'name': 'UTF-8'}

View File

@ -0,0 +1,310 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
Metadata about languages used by our model training code for our
SingleByteCharSetProbers. Could be used for other things in the future.
This code is based on the language metadata from the uchardet project.
"""
from __future__ import absolute_import, print_function
from string import ascii_letters
# TODO: Add Ukranian (KOI8-U)
class Language(object):
"""Metadata about a language useful for training models
:ivar name: The human name for the language, in English.
:type name: str
:ivar iso_code: 2-letter ISO 639-1 if possible, 3-letter ISO code otherwise,
or use another catalog as a last resort.
:type iso_code: str
:ivar use_ascii: Whether or not ASCII letters should be included in trained
models.
:type use_ascii: bool
:ivar charsets: The charsets we want to support and create data for.
:type charsets: list of str
:ivar alphabet: The characters in the language's alphabet. If `use_ascii` is
`True`, you only need to add those not in the ASCII set.
:type alphabet: str
:ivar wiki_start_pages: The Wikipedia pages to start from if we're crawling
Wikipedia for training data.
:type wiki_start_pages: list of str
"""
def __init__(self, name=None, iso_code=None, use_ascii=True, charsets=None,
alphabet=None, wiki_start_pages=None):
super(Language, self).__init__()
self.name = name
self.iso_code = iso_code
self.use_ascii = use_ascii
self.charsets = charsets
if self.use_ascii:
if alphabet:
alphabet += ascii_letters
else:
alphabet = ascii_letters
elif not alphabet:
raise ValueError('Must supply alphabet if use_ascii is False')
self.alphabet = ''.join(sorted(set(alphabet))) if alphabet else None
self.wiki_start_pages = wiki_start_pages
def __repr__(self):
return '{}({})'.format(self.__class__.__name__,
', '.join('{}={!r}'.format(k, v)
for k, v in self.__dict__.items()
if not k.startswith('_')))
LANGUAGES = {'Arabic': Language(name='Arabic',
iso_code='ar',
use_ascii=False,
# We only support encodings that use isolated
# forms, because the current recommendation is
# that the rendering system handles presentation
# forms. This means we purposefully skip IBM864.
charsets=['ISO-8859-6', 'WINDOWS-1256',
'CP720', 'CP864'],
alphabet=u'ءآأؤإئابةتثجحخدذرزسشصضطظعغػؼؽؾؿـفقكلمنهوىيًٌٍَُِّ',
wiki_start_pages=[u'الصفحة_الرئيسية']),
'Belarusian': Language(name='Belarusian',
iso_code='be',
use_ascii=False,
charsets=['ISO-8859-5', 'WINDOWS-1251',
'IBM866', 'MacCyrillic'],
alphabet=(u'АБВГДЕЁЖЗІЙКЛМНОПРСТУЎФХЦЧШЫЬЭЮЯ'
u'абвгдеёжзійклмнопрстуўфхцчшыьэюяʼ'),
wiki_start_pages=[u'Галоўная_старонка']),
'Bulgarian': Language(name='Bulgarian',
iso_code='bg',
use_ascii=False,
charsets=['ISO-8859-5', 'WINDOWS-1251',
'IBM855'],
alphabet=(u'АБВГДЕЖЗИЙКЛМНОПРСТУФХЦЧШЩЪЬЮЯ'
u'абвгдежзийклмнопрстуфхцчшщъьюя'),
wiki_start_pages=[u'Начална_страница']),
'Czech': Language(name='Czech',
iso_code='cz',
use_ascii=True,
charsets=['ISO-8859-2', 'WINDOWS-1250'],
alphabet=u'áčďéěíňóřšťúůýžÁČĎÉĚÍŇÓŘŠŤÚŮÝŽ',
wiki_start_pages=[u'Hlavní_strana']),
'Danish': Language(name='Danish',
iso_code='da',
use_ascii=True,
charsets=['ISO-8859-1', 'ISO-8859-15',
'WINDOWS-1252'],
alphabet=u'æøåÆØÅ',
wiki_start_pages=[u'Forside']),
'German': Language(name='German',
iso_code='de',
use_ascii=True,
charsets=['ISO-8859-1', 'WINDOWS-1252'],
alphabet=u'äöüßÄÖÜ',
wiki_start_pages=[u'Wikipedia:Hauptseite']),
'Greek': Language(name='Greek',
iso_code='el',
use_ascii=False,
charsets=['ISO-8859-7', 'WINDOWS-1253'],
alphabet=(u'αβγδεζηθικλμνξοπρσςτυφχψωάέήίόύώ'
u'ΑΒΓΔΕΖΗΘΙΚΛΜΝΞΟΠΡΣΣΤΥΦΧΨΩΆΈΉΊΌΎΏ'),
wiki_start_pages=[u'Πύλη:Κύρια']),
'English': Language(name='English',
iso_code='en',
use_ascii=True,
charsets=['ISO-8859-1', 'WINDOWS-1252'],
wiki_start_pages=[u'Main_Page']),
'Esperanto': Language(name='Esperanto',
iso_code='eo',
# Q, W, X, and Y not used at all
use_ascii=False,
charsets=['ISO-8859-3'],
alphabet=(u'abcĉdefgĝhĥijĵklmnoprsŝtuŭvz'
u'ABCĈDEFGĜHĤIJĴKLMNOPRSŜTUŬVZ'),
wiki_start_pages=[u'Vikipedio:Ĉefpaĝo']),
'Spanish': Language(name='Spanish',
iso_code='es',
use_ascii=True,
charsets=['ISO-8859-1', 'ISO-8859-15',
'WINDOWS-1252'],
alphabet=u'ñáéíóúüÑÁÉÍÓÚÜ',
wiki_start_pages=[u'Wikipedia:Portada']),
'Estonian': Language(name='Estonian',
iso_code='et',
use_ascii=False,
charsets=['ISO-8859-4', 'ISO-8859-13',
'WINDOWS-1257'],
# C, F, Š, Q, W, X, Y, Z, Ž are only for
# loanwords
alphabet=(u'ABDEGHIJKLMNOPRSTUVÕÄÖÜ'
u'abdeghijklmnoprstuvõäöü'),
wiki_start_pages=[u'Esileht']),
'Finnish': Language(name='Finnish',
iso_code='fi',
use_ascii=True,
charsets=['ISO-8859-1', 'ISO-8859-15',
'WINDOWS-1252'],
alphabet=u'ÅÄÖŠŽåäöšž',
wiki_start_pages=[u'Wikipedia:Etusivu']),
'French': Language(name='French',
iso_code='fr',
use_ascii=True,
charsets=['ISO-8859-1', 'ISO-8859-15',
'WINDOWS-1252'],
alphabet=u'œàâçèéîïùûêŒÀÂÇÈÉÎÏÙÛÊ',
wiki_start_pages=[u'Wikipédia:Accueil_principal',
u'Bœuf (animal)']),
'Hebrew': Language(name='Hebrew',
iso_code='he',
use_ascii=False,
charsets=['ISO-8859-8', 'WINDOWS-1255'],
alphabet=u'אבגדהוזחטיךכלםמןנסעףפץצקרשתװױײ',
wiki_start_pages=[u'עמוד_ראשי']),
'Croatian': Language(name='Croatian',
iso_code='hr',
# Q, W, X, Y are only used for foreign words.
use_ascii=False,
charsets=['ISO-8859-2', 'WINDOWS-1250'],
alphabet=(u'abcčćdđefghijklmnoprsštuvzž'
u'ABCČĆDĐEFGHIJKLMNOPRSŠTUVZŽ'),
wiki_start_pages=[u'Glavna_stranica']),
'Hungarian': Language(name='Hungarian',
iso_code='hu',
# Q, W, X, Y are only used for foreign words.
use_ascii=False,
charsets=['ISO-8859-2', 'WINDOWS-1250'],
alphabet=(u'abcdefghijklmnoprstuvzáéíóöőúüű'
u'ABCDEFGHIJKLMNOPRSTUVZÁÉÍÓÖŐÚÜŰ'),
wiki_start_pages=[u'Kezdőlap']),
'Italian': Language(name='Italian',
iso_code='it',
use_ascii=True,
charsets=['ISO-8859-1', 'ISO-8859-15',
'WINDOWS-1252'],
alphabet=u'ÀÈÉÌÒÓÙàèéìòóù',
wiki_start_pages=[u'Pagina_principale']),
'Lithuanian': Language(name='Lithuanian',
iso_code='lt',
use_ascii=False,
charsets=['ISO-8859-13', 'WINDOWS-1257',
'ISO-8859-4'],
# Q, W, and X not used at all
alphabet=(u'AĄBCČDEĘĖFGHIĮYJKLMNOPRSŠTUŲŪVZŽ'
u'aąbcčdeęėfghiįyjklmnoprsštuųūvzž'),
wiki_start_pages=[u'Pagrindinis_puslapis']),
'Latvian': Language(name='Latvian',
iso_code='lv',
use_ascii=False,
charsets=['ISO-8859-13', 'WINDOWS-1257',
'ISO-8859-4'],
# Q, W, X, Y are only for loanwords
alphabet=(u'AĀBCČDEĒFGĢHIĪJKĶLĻMNŅOPRSŠTUŪVZŽ'
u'aābcčdeēfgģhiījkķlļmnņoprsštuūvzž'),
wiki_start_pages=[u'Sākumlapa']),
'Macedonian': Language(name='Macedonian',
iso_code='mk',
use_ascii=False,
charsets=['ISO-8859-5', 'WINDOWS-1251',
'MacCyrillic', 'IBM855'],
alphabet=(u'АБВГДЃЕЖЗЅИЈКЛЉМНЊОПРСТЌУФХЦЧЏШ'
u'абвгдѓежзѕијклљмнњопрстќуфхцчџш'),
wiki_start_pages=[u'Главна_страница']),
'Dutch': Language(name='Dutch',
iso_code='nl',
use_ascii=True,
charsets=['ISO-8859-1', 'WINDOWS-1252'],
wiki_start_pages=[u'Hoofdpagina']),
'Polish': Language(name='Polish',
iso_code='pl',
# Q and X are only used for foreign words.
use_ascii=False,
charsets=['ISO-8859-2', 'WINDOWS-1250'],
alphabet=(u'AĄBCĆDEĘFGHIJKLŁMNŃOÓPRSŚTUWYZŹŻ'
u'aąbcćdeęfghijklłmnńoóprsśtuwyzźż'),
wiki_start_pages=[u'Wikipedia:Strona_główna']),
'Portuguese': Language(name='Portuguese',
iso_code='pt',
use_ascii=True,
charsets=['ISO-8859-1', 'ISO-8859-15',
'WINDOWS-1252'],
alphabet=u'ÁÂÃÀÇÉÊÍÓÔÕÚáâãàçéêíóôõú',
wiki_start_pages=[u'Wikipédia:Página_principal']),
'Romanian': Language(name='Romanian',
iso_code='ro',
use_ascii=True,
charsets=['ISO-8859-2', 'WINDOWS-1250'],
alphabet=u'ăâîșțĂÂÎȘȚ',
wiki_start_pages=[u'Pagina_principală']),
'Russian': Language(name='Russian',
iso_code='ru',
use_ascii=False,
charsets=['ISO-8859-5', 'WINDOWS-1251',
'KOI8-R', 'MacCyrillic', 'IBM866',
'IBM855'],
alphabet=(u'абвгдеёжзийклмнопрстуфхцчшщъыьэюя'
u'АБВГДЕЁЖЗИЙКЛМНОПРСТУФХЦЧШЩЪЫЬЭЮЯ'),
wiki_start_pages=[u'Заглавная_страница']),
'Slovak': Language(name='Slovak',
iso_code='sk',
use_ascii=True,
charsets=['ISO-8859-2', 'WINDOWS-1250'],
alphabet=u'áäčďéíĺľňóôŕšťúýžÁÄČĎÉÍĹĽŇÓÔŔŠŤÚÝŽ',
wiki_start_pages=[u'Hlavná_stránka']),
'Slovene': Language(name='Slovene',
iso_code='sl',
# Q, W, X, Y are only used for foreign words.
use_ascii=False,
charsets=['ISO-8859-2', 'WINDOWS-1250'],
alphabet=(u'abcčdefghijklmnoprsštuvzž'
u'ABCČDEFGHIJKLMNOPRSŠTUVZŽ'),
wiki_start_pages=[u'Glavna_stran']),
# Serbian can be written in both Latin and Cyrillic, but there's no
# simple way to get the Latin alphabet pages from Wikipedia through
# the API, so for now we just support Cyrillic.
'Serbian': Language(name='Serbian',
iso_code='sr',
alphabet=(u'АБВГДЂЕЖЗИЈКЛЉМНЊОПРСТЋУФХЦЧЏШ'
u'абвгдђежзијклљмнњопрстћуфхцчџш'),
charsets=['ISO-8859-5', 'WINDOWS-1251',
'MacCyrillic', 'IBM855'],
wiki_start_pages=[u'Главна_страна']),
'Thai': Language(name='Thai',
iso_code='th',
use_ascii=False,
charsets=['ISO-8859-11', 'TIS-620', 'CP874'],
alphabet=u'กขฃคฅฆงจฉชซฌญฎฏฐฑฒณดตถทธนบปผฝพฟภมยรฤลฦวศษสหฬอฮฯะัาำิีึืฺุู฿เแโใไๅๆ็่้๊๋์ํ๎๏๐๑๒๓๔๕๖๗๘๙๚๛',
wiki_start_pages=[u'หน้าหลัก']),
'Turkish': Language(name='Turkish',
iso_code='tr',
# Q, W, and X are not used by Turkish
use_ascii=False,
charsets=['ISO-8859-3', 'ISO-8859-9',
'WINDOWS-1254'],
alphabet=(u'abcçdefgğhıijklmnoöprsştuüvyzâîû'
u'ABCÇDEFGĞHIİJKLMNOÖPRSŞTUÜVYZÂÎÛ'),
wiki_start_pages=[u'Ana_Sayfa']),
'Vietnamese': Language(name='Vietnamese',
iso_code='vi',
use_ascii=False,
# Windows-1258 is the only common 8-bit
# Vietnamese encoding supported by Python.
# From Wikipedia:
# For systems that lack support for Unicode,
# dozens of 8-bit Vietnamese code pages are
# available.[1] The most common are VISCII
# (TCVN 5712:1993), VPS, and Windows-1258.[3]
# Where ASCII is required, such as when
# ensuring readability in plain text e-mail,
# Vietnamese letters are often encoded
# according to Vietnamese Quoted-Readable
# (VIQR) or VSCII Mnemonic (VSCII-MNEM),[4]
# though usage of either variable-width
# scheme has declined dramatically following
# the adoption of Unicode on the World Wide
# Web.
charsets=['WINDOWS-1258'],
alphabet=(u'aăâbcdđeêghiklmnoôơpqrstuưvxy'
u'AĂÂBCDĐEÊGHIKLMNOÔƠPQRSTUƯVXY'),
wiki_start_pages=[u'Chữ_Quốc_ngữ']),
}

View File

@ -0,0 +1,145 @@
######################## BEGIN LICENSE BLOCK ########################
# The Original Code is Mozilla Universal charset detector code.
#
# The Initial Developer of the Original Code is
# Netscape Communications Corporation.
# Portions created by the Initial Developer are Copyright (C) 2001
# the Initial Developer. All Rights Reserved.
#
# Contributor(s):
# Mark Pilgrim - port to Python
# Shy Shalom - original C code
#
# This library is free software; you can redistribute it and/or
# modify it under the terms of the GNU Lesser General Public
# License as published by the Free Software Foundation; either
# version 2.1 of the License, or (at your option) any later version.
#
# This library is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
# Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public
# License along with this library; if not, write to the Free Software
# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
# 02110-1301 USA
######################### END LICENSE BLOCK #########################
from collections import namedtuple
from .charsetprober import CharSetProber
from .enums import CharacterCategory, ProbingState, SequenceLikelihood
SingleByteCharSetModel = namedtuple('SingleByteCharSetModel',
['charset_name',
'language',
'char_to_order_map',
'language_model',
'typical_positive_ratio',
'keep_ascii_letters',
'alphabet'])
class SingleByteCharSetProber(CharSetProber):
SAMPLE_SIZE = 64
SB_ENOUGH_REL_THRESHOLD = 1024 # 0.25 * SAMPLE_SIZE^2
POSITIVE_SHORTCUT_THRESHOLD = 0.95
NEGATIVE_SHORTCUT_THRESHOLD = 0.05
def __init__(self, model, reversed=False, name_prober=None):
super(SingleByteCharSetProber, self).__init__()
self._model = model
# TRUE if we need to reverse every pair in the model lookup
self._reversed = reversed
# Optional auxiliary prober for name decision
self._name_prober = name_prober
self._last_order = None
self._seq_counters = None
self._total_seqs = None
self._total_char = None
self._freq_char = None
self.reset()
def reset(self):
super(SingleByteCharSetProber, self).reset()
# char order of last character
self._last_order = 255
self._seq_counters = [0] * SequenceLikelihood.get_num_categories()
self._total_seqs = 0
self._total_char = 0
# characters that fall in our sampling range
self._freq_char = 0
@property
def charset_name(self):
if self._name_prober:
return self._name_prober.charset_name
else:
return self._model.charset_name
@property
def language(self):
if self._name_prober:
return self._name_prober.language
else:
return self._model.language
def feed(self, byte_str):
# TODO: Make filter_international_words keep things in self.alphabet
if not self._model.keep_ascii_letters:
byte_str = self.filter_international_words(byte_str)
if not byte_str:
return self.state
char_to_order_map = self._model.char_to_order_map
language_model = self._model.language_model
for char in byte_str:
order = char_to_order_map.get(char, CharacterCategory.UNDEFINED)
# XXX: This was SYMBOL_CAT_ORDER before, with a value of 250, but
# CharacterCategory.SYMBOL is actually 253, so we use CONTROL
# to make it closer to the original intent. The only difference
# is whether or not we count digits and control characters for
# _total_char purposes.
if order < CharacterCategory.CONTROL:
self._total_char += 1
# TODO: Follow uchardet's lead and discount confidence for frequent
# control characters.
# See https://github.com/BYVoid/uchardet/commit/55b4f23971db61
if order < self.SAMPLE_SIZE:
self._freq_char += 1
if self._last_order < self.SAMPLE_SIZE:
self._total_seqs += 1
if not self._reversed:
lm_cat = language_model[self._last_order][order]
else:
lm_cat = language_model[order][self._last_order]
self._seq_counters[lm_cat] += 1
self._last_order = order
charset_name = self._model.charset_name
if self.state == ProbingState.DETECTING:
if self._total_seqs > self.SB_ENOUGH_REL_THRESHOLD:
confidence = self.get_confidence()
if confidence > self.POSITIVE_SHORTCUT_THRESHOLD:
self.logger.debug('%s confidence = %s, we have a winner',
charset_name, confidence)
self._state = ProbingState.FOUND_IT
elif confidence < self.NEGATIVE_SHORTCUT_THRESHOLD:
self.logger.debug('%s confidence = %s, below negative '
'shortcut threshhold %s', charset_name,
confidence,
self.NEGATIVE_SHORTCUT_THRESHOLD)
self._state = ProbingState.NOT_ME
return self.state
def get_confidence(self):
r = 0.01
if self._total_seqs > 0:
r = ((1.0 * self._seq_counters[SequenceLikelihood.POSITIVE]) /
self._total_seqs / self._model.typical_positive_ratio)
r = r * self._freq_char / self._total_char
if r >= 1.0:
r = 0.99
return r

View File

@ -0,0 +1,83 @@
######################## BEGIN LICENSE BLOCK ########################
# The Original Code is Mozilla Universal charset detector code.
#
# The Initial Developer of the Original Code is
# Netscape Communications Corporation.
# Portions created by the Initial Developer are Copyright (C) 2001
# the Initial Developer. All Rights Reserved.
#
# Contributor(s):
# Mark Pilgrim - port to Python
# Shy Shalom - original C code
#
# This library is free software; you can redistribute it and/or
# modify it under the terms of the GNU Lesser General Public
# License as published by the Free Software Foundation; either
# version 2.1 of the License, or (at your option) any later version.
#
# This library is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
# Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public
# License along with this library; if not, write to the Free Software
# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
# 02110-1301 USA
######################### END LICENSE BLOCK #########################
from .charsetgroupprober import CharSetGroupProber
from .hebrewprober import HebrewProber
from .langbulgarianmodel import (ISO_8859_5_BULGARIAN_MODEL,
WINDOWS_1251_BULGARIAN_MODEL)
from .langgreekmodel import ISO_8859_7_GREEK_MODEL, WINDOWS_1253_GREEK_MODEL
from .langhebrewmodel import WINDOWS_1255_HEBREW_MODEL
# from .langhungarianmodel import (ISO_8859_2_HUNGARIAN_MODEL,
# WINDOWS_1250_HUNGARIAN_MODEL)
from .langrussianmodel import (IBM855_RUSSIAN_MODEL, IBM866_RUSSIAN_MODEL,
ISO_8859_5_RUSSIAN_MODEL, KOI8_R_RUSSIAN_MODEL,
MACCYRILLIC_RUSSIAN_MODEL,
WINDOWS_1251_RUSSIAN_MODEL)
from .langthaimodel import TIS_620_THAI_MODEL
from .langturkishmodel import ISO_8859_9_TURKISH_MODEL
from .sbcharsetprober import SingleByteCharSetProber
class SBCSGroupProber(CharSetGroupProber):
def __init__(self):
super(SBCSGroupProber, self).__init__()
hebrew_prober = HebrewProber()
logical_hebrew_prober = SingleByteCharSetProber(WINDOWS_1255_HEBREW_MODEL,
False, hebrew_prober)
# TODO: See if using ISO-8859-8 Hebrew model works better here, since
# it's actually the visual one
visual_hebrew_prober = SingleByteCharSetProber(WINDOWS_1255_HEBREW_MODEL,
True, hebrew_prober)
hebrew_prober.set_model_probers(logical_hebrew_prober,
visual_hebrew_prober)
# TODO: ORDER MATTERS HERE. I changed the order vs what was in master
# and several tests failed that did not before. Some thought
# should be put into the ordering, and we should consider making
# order not matter here, because that is very counter-intuitive.
self.probers = [
SingleByteCharSetProber(WINDOWS_1251_RUSSIAN_MODEL),
SingleByteCharSetProber(KOI8_R_RUSSIAN_MODEL),
SingleByteCharSetProber(ISO_8859_5_RUSSIAN_MODEL),
SingleByteCharSetProber(MACCYRILLIC_RUSSIAN_MODEL),
SingleByteCharSetProber(IBM866_RUSSIAN_MODEL),
SingleByteCharSetProber(IBM855_RUSSIAN_MODEL),
SingleByteCharSetProber(ISO_8859_7_GREEK_MODEL),
SingleByteCharSetProber(WINDOWS_1253_GREEK_MODEL),
SingleByteCharSetProber(ISO_8859_5_BULGARIAN_MODEL),
SingleByteCharSetProber(WINDOWS_1251_BULGARIAN_MODEL),
# TODO: Restore Hungarian encodings (iso-8859-2 and windows-1250)
# after we retrain model.
# SingleByteCharSetProber(ISO_8859_2_HUNGARIAN_MODEL),
# SingleByteCharSetProber(WINDOWS_1250_HUNGARIAN_MODEL),
SingleByteCharSetProber(TIS_620_THAI_MODEL),
SingleByteCharSetProber(ISO_8859_9_TURKISH_MODEL),
hebrew_prober,
logical_hebrew_prober,
visual_hebrew_prober,
]
self.reset()

View File

@ -0,0 +1,92 @@
######################## BEGIN LICENSE BLOCK ########################
# The Original Code is mozilla.org code.
#
# The Initial Developer of the Original Code is
# Netscape Communications Corporation.
# Portions created by the Initial Developer are Copyright (C) 1998
# the Initial Developer. All Rights Reserved.
#
# Contributor(s):
# Mark Pilgrim - port to Python
#
# This library is free software; you can redistribute it and/or
# modify it under the terms of the GNU Lesser General Public
# License as published by the Free Software Foundation; either
# version 2.1 of the License, or (at your option) any later version.
#
# This library is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
# Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public
# License along with this library; if not, write to the Free Software
# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
# 02110-1301 USA
######################### END LICENSE BLOCK #########################
from .mbcharsetprober import MultiByteCharSetProber
from .codingstatemachine import CodingStateMachine
from .chardistribution import SJISDistributionAnalysis
from .jpcntx import SJISContextAnalysis
from .mbcssm import SJIS_SM_MODEL
from .enums import ProbingState, MachineState
class SJISProber(MultiByteCharSetProber):
def __init__(self):
super(SJISProber, self).__init__()
self.coding_sm = CodingStateMachine(SJIS_SM_MODEL)
self.distribution_analyzer = SJISDistributionAnalysis()
self.context_analyzer = SJISContextAnalysis()
self.reset()
def reset(self):
super(SJISProber, self).reset()
self.context_analyzer.reset()
@property
def charset_name(self):
return self.context_analyzer.charset_name
@property
def language(self):
return "Japanese"
def feed(self, byte_str):
for i in range(len(byte_str)):
coding_state = self.coding_sm.next_state(byte_str[i])
if coding_state == MachineState.ERROR:
self.logger.debug('%s %s prober hit error at byte %s',
self.charset_name, self.language, i)
self._state = ProbingState.NOT_ME
break
elif coding_state == MachineState.ITS_ME:
self._state = ProbingState.FOUND_IT
break
elif coding_state == MachineState.START:
char_len = self.coding_sm.get_current_charlen()
if i == 0:
self._last_char[1] = byte_str[0]
self.context_analyzer.feed(self._last_char[2 - char_len:],
char_len)
self.distribution_analyzer.feed(self._last_char, char_len)
else:
self.context_analyzer.feed(byte_str[i + 1 - char_len:i + 3
- char_len], char_len)
self.distribution_analyzer.feed(byte_str[i - 1:i + 1],
char_len)
self._last_char[0] = byte_str[-1]
if self.state == ProbingState.DETECTING:
if (self.context_analyzer.got_enough_data() and
(self.get_confidence() > self.SHORTCUT_THRESHOLD)):
self._state = ProbingState.FOUND_IT
return self.state
def get_confidence(self):
context_conf = self.context_analyzer.get_confidence()
distrib_conf = self.distribution_analyzer.get_confidence()
return max(context_conf, distrib_conf)

View File

@ -0,0 +1,286 @@
######################## BEGIN LICENSE BLOCK ########################
# The Original Code is Mozilla Universal charset detector code.
#
# The Initial Developer of the Original Code is
# Netscape Communications Corporation.
# Portions created by the Initial Developer are Copyright (C) 2001
# the Initial Developer. All Rights Reserved.
#
# Contributor(s):
# Mark Pilgrim - port to Python
# Shy Shalom - original C code
#
# This library is free software; you can redistribute it and/or
# modify it under the terms of the GNU Lesser General Public
# License as published by the Free Software Foundation; either
# version 2.1 of the License, or (at your option) any later version.
#
# This library is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
# Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public
# License along with this library; if not, write to the Free Software
# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
# 02110-1301 USA
######################### END LICENSE BLOCK #########################
"""
Module containing the UniversalDetector detector class, which is the primary
class a user of ``chardet`` should use.
:author: Mark Pilgrim (initial port to Python)
:author: Shy Shalom (original C code)
:author: Dan Blanchard (major refactoring for 3.0)
:author: Ian Cordasco
"""
import codecs
import logging
import re
from .charsetgroupprober import CharSetGroupProber
from .enums import InputState, LanguageFilter, ProbingState
from .escprober import EscCharSetProber
from .latin1prober import Latin1Prober
from .mbcsgroupprober import MBCSGroupProber
from .sbcsgroupprober import SBCSGroupProber
class UniversalDetector(object):
"""
The ``UniversalDetector`` class underlies the ``chardet.detect`` function
and coordinates all of the different charset probers.
To get a ``dict`` containing an encoding and its confidence, you can simply
run:
.. code::
u = UniversalDetector()
u.feed(some_bytes)
u.close()
detected = u.result
"""
MINIMUM_THRESHOLD = 0.20
HIGH_BYTE_DETECTOR = re.compile(b'[\x80-\xFF]')
ESC_DETECTOR = re.compile(b'(\033|~{)')
WIN_BYTE_DETECTOR = re.compile(b'[\x80-\x9F]')
ISO_WIN_MAP = {'iso-8859-1': 'Windows-1252',
'iso-8859-2': 'Windows-1250',
'iso-8859-5': 'Windows-1251',
'iso-8859-6': 'Windows-1256',
'iso-8859-7': 'Windows-1253',
'iso-8859-8': 'Windows-1255',
'iso-8859-9': 'Windows-1254',
'iso-8859-13': 'Windows-1257'}
def __init__(self, lang_filter=LanguageFilter.ALL):
self._esc_charset_prober = None
self._charset_probers = []
self.result = None
self.done = None
self._got_data = None
self._input_state = None
self._last_char = None
self.lang_filter = lang_filter
self.logger = logging.getLogger(__name__)
self._has_win_bytes = None
self.reset()
def reset(self):
"""
Reset the UniversalDetector and all of its probers back to their
initial states. This is called by ``__init__``, so you only need to
call this directly in between analyses of different documents.
"""
self.result = {'encoding': None, 'confidence': 0.0, 'language': None}
self.done = False
self._got_data = False
self._has_win_bytes = False
self._input_state = InputState.PURE_ASCII
self._last_char = b''
if self._esc_charset_prober:
self._esc_charset_prober.reset()
for prober in self._charset_probers:
prober.reset()
def feed(self, byte_str):
"""
Takes a chunk of a document and feeds it through all of the relevant
charset probers.
After calling ``feed``, you can check the value of the ``done``
attribute to see if you need to continue feeding the
``UniversalDetector`` more data, or if it has made a prediction
(in the ``result`` attribute).
.. note::
You should always call ``close`` when you're done feeding in your
document if ``done`` is not already ``True``.
"""
if self.done:
return
if not len(byte_str):
return
if not isinstance(byte_str, bytearray):
byte_str = bytearray(byte_str)
# First check for known BOMs, since these are guaranteed to be correct
if not self._got_data:
# If the data starts with BOM, we know it is UTF
if byte_str.startswith(codecs.BOM_UTF8):
# EF BB BF UTF-8 with BOM
self.result = {'encoding': "UTF-8-SIG",
'confidence': 1.0,
'language': ''}
elif byte_str.startswith((codecs.BOM_UTF32_LE,
codecs.BOM_UTF32_BE)):
# FF FE 00 00 UTF-32, little-endian BOM
# 00 00 FE FF UTF-32, big-endian BOM
self.result = {'encoding': "UTF-32",
'confidence': 1.0,
'language': ''}
elif byte_str.startswith(b'\xFE\xFF\x00\x00'):
# FE FF 00 00 UCS-4, unusual octet order BOM (3412)
self.result = {'encoding': "X-ISO-10646-UCS-4-3412",
'confidence': 1.0,
'language': ''}
elif byte_str.startswith(b'\x00\x00\xFF\xFE'):
# 00 00 FF FE UCS-4, unusual octet order BOM (2143)
self.result = {'encoding': "X-ISO-10646-UCS-4-2143",
'confidence': 1.0,
'language': ''}
elif byte_str.startswith((codecs.BOM_LE, codecs.BOM_BE)):
# FF FE UTF-16, little endian BOM
# FE FF UTF-16, big endian BOM
self.result = {'encoding': "UTF-16",
'confidence': 1.0,
'language': ''}
self._got_data = True
if self.result['encoding'] is not None:
self.done = True
return
# If none of those matched and we've only see ASCII so far, check
# for high bytes and escape sequences
if self._input_state == InputState.PURE_ASCII:
if self.HIGH_BYTE_DETECTOR.search(byte_str):
self._input_state = InputState.HIGH_BYTE
elif self._input_state == InputState.PURE_ASCII and \
self.ESC_DETECTOR.search(self._last_char + byte_str):
self._input_state = InputState.ESC_ASCII
self._last_char = byte_str[-1:]
# If we've seen escape sequences, use the EscCharSetProber, which
# uses a simple state machine to check for known escape sequences in
# HZ and ISO-2022 encodings, since those are the only encodings that
# use such sequences.
if self._input_state == InputState.ESC_ASCII:
if not self._esc_charset_prober:
self._esc_charset_prober = EscCharSetProber(self.lang_filter)
if self._esc_charset_prober.feed(byte_str) == ProbingState.FOUND_IT:
self.result = {'encoding':
self._esc_charset_prober.charset_name,
'confidence':
self._esc_charset_prober.get_confidence(),
'language':
self._esc_charset_prober.language}
self.done = True
# If we've seen high bytes (i.e., those with values greater than 127),
# we need to do more complicated checks using all our multi-byte and
# single-byte probers that are left. The single-byte probers
# use character bigram distributions to determine the encoding, whereas
# the multi-byte probers use a combination of character unigram and
# bigram distributions.
elif self._input_state == InputState.HIGH_BYTE:
if not self._charset_probers:
self._charset_probers = [MBCSGroupProber(self.lang_filter)]
# If we're checking non-CJK encodings, use single-byte prober
if self.lang_filter & LanguageFilter.NON_CJK:
self._charset_probers.append(SBCSGroupProber())
self._charset_probers.append(Latin1Prober())
for prober in self._charset_probers:
if prober.feed(byte_str) == ProbingState.FOUND_IT:
self.result = {'encoding': prober.charset_name,
'confidence': prober.get_confidence(),
'language': prober.language}
self.done = True
break
if self.WIN_BYTE_DETECTOR.search(byte_str):
self._has_win_bytes = True
def close(self):
"""
Stop analyzing the current document and come up with a final
prediction.
:returns: The ``result`` attribute, a ``dict`` with the keys
`encoding`, `confidence`, and `language`.
"""
# Don't bother with checks if we're already done
if self.done:
return self.result
self.done = True
if not self._got_data:
self.logger.debug('no data received!')
# Default to ASCII if it is all we've seen so far
elif self._input_state == InputState.PURE_ASCII:
self.result = {'encoding': 'ascii',
'confidence': 1.0,
'language': ''}
# If we have seen non-ASCII, return the best that met MINIMUM_THRESHOLD
elif self._input_state == InputState.HIGH_BYTE:
prober_confidence = None
max_prober_confidence = 0.0
max_prober = None
for prober in self._charset_probers:
if not prober:
continue
prober_confidence = prober.get_confidence()
if prober_confidence > max_prober_confidence:
max_prober_confidence = prober_confidence
max_prober = prober
if max_prober and (max_prober_confidence > self.MINIMUM_THRESHOLD):
charset_name = max_prober.charset_name
lower_charset_name = max_prober.charset_name.lower()
confidence = max_prober.get_confidence()
# Use Windows encoding name instead of ISO-8859 if we saw any
# extra Windows-specific bytes
if lower_charset_name.startswith('iso-8859'):
if self._has_win_bytes:
charset_name = self.ISO_WIN_MAP.get(lower_charset_name,
charset_name)
self.result = {'encoding': charset_name,
'confidence': confidence,
'language': max_prober.language}
# Log all prober confidences if none met MINIMUM_THRESHOLD
if self.logger.getEffectiveLevel() <= logging.DEBUG:
if self.result['encoding'] is None:
self.logger.debug('no probers hit minimum threshold')
for group_prober in self._charset_probers:
if not group_prober:
continue
if isinstance(group_prober, CharSetGroupProber):
for prober in group_prober.probers:
self.logger.debug('%s %s confidence = %s',
prober.charset_name,
prober.language,
prober.get_confidence())
else:
self.logger.debug('%s %s confidence = %s',
group_prober.charset_name,
group_prober.language,
group_prober.get_confidence())
return self.result

View File

@ -0,0 +1,82 @@
######################## BEGIN LICENSE BLOCK ########################
# The Original Code is mozilla.org code.
#
# The Initial Developer of the Original Code is
# Netscape Communications Corporation.
# Portions created by the Initial Developer are Copyright (C) 1998
# the Initial Developer. All Rights Reserved.
#
# Contributor(s):
# Mark Pilgrim - port to Python
#
# This library is free software; you can redistribute it and/or
# modify it under the terms of the GNU Lesser General Public
# License as published by the Free Software Foundation; either
# version 2.1 of the License, or (at your option) any later version.
#
# This library is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
# Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public
# License along with this library; if not, write to the Free Software
# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
# 02110-1301 USA
######################### END LICENSE BLOCK #########################
from .charsetprober import CharSetProber
from .enums import ProbingState, MachineState
from .codingstatemachine import CodingStateMachine
from .mbcssm import UTF8_SM_MODEL
class UTF8Prober(CharSetProber):
ONE_CHAR_PROB = 0.5
def __init__(self):
super(UTF8Prober, self).__init__()
self.coding_sm = CodingStateMachine(UTF8_SM_MODEL)
self._num_mb_chars = None
self.reset()
def reset(self):
super(UTF8Prober, self).reset()
self.coding_sm.reset()
self._num_mb_chars = 0
@property
def charset_name(self):
return "utf-8"
@property
def language(self):
return ""
def feed(self, byte_str):
for c in byte_str:
coding_state = self.coding_sm.next_state(c)
if coding_state == MachineState.ERROR:
self._state = ProbingState.NOT_ME
break
elif coding_state == MachineState.ITS_ME:
self._state = ProbingState.FOUND_IT
break
elif coding_state == MachineState.START:
if self.coding_sm.get_current_charlen() >= 2:
self._num_mb_chars += 1
if self.state == ProbingState.DETECTING:
if self.get_confidence() > self.SHORTCUT_THRESHOLD:
self._state = ProbingState.FOUND_IT
return self.state
def get_confidence(self):
unlike = 0.99
if self._num_mb_chars < 6:
unlike *= self.ONE_CHAR_PROB ** self._num_mb_chars
return 1.0 - unlike
else:
return unlike

View File

@ -0,0 +1,9 @@
"""
This module exists only to simplify retrieving the version number of chardet
from within setup.py and from chardet subpackages.
:author: Dan Blanchard (dan.blanchard@gmail.com)
"""
__version__ = "4.0.0"
VERSION = __version__.split('.')

View File

@ -0,0 +1,6 @@
# Copyright Jonathan Hartley 2013. BSD 3-Clause license, see LICENSE file.
from .initialise import init, deinit, reinit, colorama_text
from .ansi import Fore, Back, Style, Cursor
from .ansitowin32 import AnsiToWin32
__version__ = '0.4.4'

View File

@ -0,0 +1,102 @@
# Copyright Jonathan Hartley 2013. BSD 3-Clause license, see LICENSE file.
'''
This module generates ANSI character codes to printing colors to terminals.
See: http://en.wikipedia.org/wiki/ANSI_escape_code
'''
CSI = '\033['
OSC = '\033]'
BEL = '\a'
def code_to_chars(code):
return CSI + str(code) + 'm'
def set_title(title):
return OSC + '2;' + title + BEL
def clear_screen(mode=2):
return CSI + str(mode) + 'J'
def clear_line(mode=2):
return CSI + str(mode) + 'K'
class AnsiCodes(object):
def __init__(self):
# the subclasses declare class attributes which are numbers.
# Upon instantiation we define instance attributes, which are the same
# as the class attributes but wrapped with the ANSI escape sequence
for name in dir(self):
if not name.startswith('_'):
value = getattr(self, name)
setattr(self, name, code_to_chars(value))
class AnsiCursor(object):
def UP(self, n=1):
return CSI + str(n) + 'A'
def DOWN(self, n=1):
return CSI + str(n) + 'B'
def FORWARD(self, n=1):
return CSI + str(n) + 'C'
def BACK(self, n=1):
return CSI + str(n) + 'D'
def POS(self, x=1, y=1):
return CSI + str(y) + ';' + str(x) + 'H'
class AnsiFore(AnsiCodes):
BLACK = 30
RED = 31
GREEN = 32
YELLOW = 33
BLUE = 34
MAGENTA = 35
CYAN = 36
WHITE = 37
RESET = 39
# These are fairly well supported, but not part of the standard.
LIGHTBLACK_EX = 90
LIGHTRED_EX = 91
LIGHTGREEN_EX = 92
LIGHTYELLOW_EX = 93
LIGHTBLUE_EX = 94
LIGHTMAGENTA_EX = 95
LIGHTCYAN_EX = 96
LIGHTWHITE_EX = 97
class AnsiBack(AnsiCodes):
BLACK = 40
RED = 41
GREEN = 42
YELLOW = 43
BLUE = 44
MAGENTA = 45
CYAN = 46
WHITE = 47
RESET = 49
# These are fairly well supported, but not part of the standard.
LIGHTBLACK_EX = 100
LIGHTRED_EX = 101
LIGHTGREEN_EX = 102
LIGHTYELLOW_EX = 103
LIGHTBLUE_EX = 104
LIGHTMAGENTA_EX = 105
LIGHTCYAN_EX = 106
LIGHTWHITE_EX = 107
class AnsiStyle(AnsiCodes):
BRIGHT = 1
DIM = 2
NORMAL = 22
RESET_ALL = 0
Fore = AnsiFore()
Back = AnsiBack()
Style = AnsiStyle()
Cursor = AnsiCursor()

View File

@ -0,0 +1,258 @@
# Copyright Jonathan Hartley 2013. BSD 3-Clause license, see LICENSE file.
import re
import sys
import os
from .ansi import AnsiFore, AnsiBack, AnsiStyle, Style, BEL
from .winterm import WinTerm, WinColor, WinStyle
from .win32 import windll, winapi_test
winterm = None
if windll is not None:
winterm = WinTerm()
class StreamWrapper(object):
'''
Wraps a stream (such as stdout), acting as a transparent proxy for all
attribute access apart from method 'write()', which is delegated to our
Converter instance.
'''
def __init__(self, wrapped, converter):
# double-underscore everything to prevent clashes with names of
# attributes on the wrapped stream object.
self.__wrapped = wrapped
self.__convertor = converter
def __getattr__(self, name):
return getattr(self.__wrapped, name)
def __enter__(self, *args, **kwargs):
# special method lookup bypasses __getattr__/__getattribute__, see
# https://stackoverflow.com/questions/12632894/why-doesnt-getattr-work-with-exit
# thus, contextlib magic methods are not proxied via __getattr__
return self.__wrapped.__enter__(*args, **kwargs)
def __exit__(self, *args, **kwargs):
return self.__wrapped.__exit__(*args, **kwargs)
def write(self, text):
self.__convertor.write(text)
def isatty(self):
stream = self.__wrapped
if 'PYCHARM_HOSTED' in os.environ:
if stream is not None and (stream is sys.__stdout__ or stream is sys.__stderr__):
return True
try:
stream_isatty = stream.isatty
except AttributeError:
return False
else:
return stream_isatty()
@property
def closed(self):
stream = self.__wrapped
try:
return stream.closed
except AttributeError:
return True
class AnsiToWin32(object):
'''
Implements a 'write()' method which, on Windows, will strip ANSI character
sequences from the text, and if outputting to a tty, will convert them into
win32 function calls.
'''
ANSI_CSI_RE = re.compile('\001?\033\\[((?:\\d|;)*)([a-zA-Z])\002?') # Control Sequence Introducer
ANSI_OSC_RE = re.compile('\001?\033\\]([^\a]*)(\a)\002?') # Operating System Command
def __init__(self, wrapped, convert=None, strip=None, autoreset=False):
# The wrapped stream (normally sys.stdout or sys.stderr)
self.wrapped = wrapped
# should we reset colors to defaults after every .write()
self.autoreset = autoreset
# create the proxy wrapping our output stream
self.stream = StreamWrapper(wrapped, self)
on_windows = os.name == 'nt'
# We test if the WinAPI works, because even if we are on Windows
# we may be using a terminal that doesn't support the WinAPI
# (e.g. Cygwin Terminal). In this case it's up to the terminal
# to support the ANSI codes.
conversion_supported = on_windows and winapi_test()
# should we strip ANSI sequences from our output?
if strip is None:
strip = conversion_supported or (not self.stream.closed and not self.stream.isatty())
self.strip = strip
# should we should convert ANSI sequences into win32 calls?
if convert is None:
convert = conversion_supported and not self.stream.closed and self.stream.isatty()
self.convert = convert
# dict of ansi codes to win32 functions and parameters
self.win32_calls = self.get_win32_calls()
# are we wrapping stderr?
self.on_stderr = self.wrapped is sys.stderr
def should_wrap(self):
'''
True if this class is actually needed. If false, then the output
stream will not be affected, nor will win32 calls be issued, so
wrapping stdout is not actually required. This will generally be
False on non-Windows platforms, unless optional functionality like
autoreset has been requested using kwargs to init()
'''
return self.convert or self.strip or self.autoreset
def get_win32_calls(self):
if self.convert and winterm:
return {
AnsiStyle.RESET_ALL: (winterm.reset_all, ),
AnsiStyle.BRIGHT: (winterm.style, WinStyle.BRIGHT),
AnsiStyle.DIM: (winterm.style, WinStyle.NORMAL),
AnsiStyle.NORMAL: (winterm.style, WinStyle.NORMAL),
AnsiFore.BLACK: (winterm.fore, WinColor.BLACK),
AnsiFore.RED: (winterm.fore, WinColor.RED),
AnsiFore.GREEN: (winterm.fore, WinColor.GREEN),
AnsiFore.YELLOW: (winterm.fore, WinColor.YELLOW),
AnsiFore.BLUE: (winterm.fore, WinColor.BLUE),
AnsiFore.MAGENTA: (winterm.fore, WinColor.MAGENTA),
AnsiFore.CYAN: (winterm.fore, WinColor.CYAN),
AnsiFore.WHITE: (winterm.fore, WinColor.GREY),
AnsiFore.RESET: (winterm.fore, ),
AnsiFore.LIGHTBLACK_EX: (winterm.fore, WinColor.BLACK, True),
AnsiFore.LIGHTRED_EX: (winterm.fore, WinColor.RED, True),
AnsiFore.LIGHTGREEN_EX: (winterm.fore, WinColor.GREEN, True),
AnsiFore.LIGHTYELLOW_EX: (winterm.fore, WinColor.YELLOW, True),
AnsiFore.LIGHTBLUE_EX: (winterm.fore, WinColor.BLUE, True),
AnsiFore.LIGHTMAGENTA_EX: (winterm.fore, WinColor.MAGENTA, True),
AnsiFore.LIGHTCYAN_EX: (winterm.fore, WinColor.CYAN, True),
AnsiFore.LIGHTWHITE_EX: (winterm.fore, WinColor.GREY, True),
AnsiBack.BLACK: (winterm.back, WinColor.BLACK),
AnsiBack.RED: (winterm.back, WinColor.RED),
AnsiBack.GREEN: (winterm.back, WinColor.GREEN),
AnsiBack.YELLOW: (winterm.back, WinColor.YELLOW),
AnsiBack.BLUE: (winterm.back, WinColor.BLUE),
AnsiBack.MAGENTA: (winterm.back, WinColor.MAGENTA),
AnsiBack.CYAN: (winterm.back, WinColor.CYAN),
AnsiBack.WHITE: (winterm.back, WinColor.GREY),
AnsiBack.RESET: (winterm.back, ),
AnsiBack.LIGHTBLACK_EX: (winterm.back, WinColor.BLACK, True),
AnsiBack.LIGHTRED_EX: (winterm.back, WinColor.RED, True),
AnsiBack.LIGHTGREEN_EX: (winterm.back, WinColor.GREEN, True),
AnsiBack.LIGHTYELLOW_EX: (winterm.back, WinColor.YELLOW, True),
AnsiBack.LIGHTBLUE_EX: (winterm.back, WinColor.BLUE, True),
AnsiBack.LIGHTMAGENTA_EX: (winterm.back, WinColor.MAGENTA, True),
AnsiBack.LIGHTCYAN_EX: (winterm.back, WinColor.CYAN, True),
AnsiBack.LIGHTWHITE_EX: (winterm.back, WinColor.GREY, True),
}
return dict()
def write(self, text):
if self.strip or self.convert:
self.write_and_convert(text)
else:
self.wrapped.write(text)
self.wrapped.flush()
if self.autoreset:
self.reset_all()
def reset_all(self):
if self.convert:
self.call_win32('m', (0,))
elif not self.strip and not self.stream.closed:
self.wrapped.write(Style.RESET_ALL)
def write_and_convert(self, text):
'''
Write the given text to our wrapped stream, stripping any ANSI
sequences from the text, and optionally converting them into win32
calls.
'''
cursor = 0
text = self.convert_osc(text)
for match in self.ANSI_CSI_RE.finditer(text):
start, end = match.span()
self.write_plain_text(text, cursor, start)
self.convert_ansi(*match.groups())
cursor = end
self.write_plain_text(text, cursor, len(text))
def write_plain_text(self, text, start, end):
if start < end:
self.wrapped.write(text[start:end])
self.wrapped.flush()
def convert_ansi(self, paramstring, command):
if self.convert:
params = self.extract_params(command, paramstring)
self.call_win32(command, params)
def extract_params(self, command, paramstring):
if command in 'Hf':
params = tuple(int(p) if len(p) != 0 else 1 for p in paramstring.split(';'))
while len(params) < 2:
# defaults:
params = params + (1,)
else:
params = tuple(int(p) for p in paramstring.split(';') if len(p) != 0)
if len(params) == 0:
# defaults:
if command in 'JKm':
params = (0,)
elif command in 'ABCD':
params = (1,)
return params
def call_win32(self, command, params):
if command == 'm':
for param in params:
if param in self.win32_calls:
func_args = self.win32_calls[param]
func = func_args[0]
args = func_args[1:]
kwargs = dict(on_stderr=self.on_stderr)
func(*args, **kwargs)
elif command in 'J':
winterm.erase_screen(params[0], on_stderr=self.on_stderr)
elif command in 'K':
winterm.erase_line(params[0], on_stderr=self.on_stderr)
elif command in 'Hf': # cursor position - absolute
winterm.set_cursor_position(params, on_stderr=self.on_stderr)
elif command in 'ABCD': # cursor position - relative
n = params[0]
# A - up, B - down, C - forward, D - back
x, y = {'A': (0, -n), 'B': (0, n), 'C': (n, 0), 'D': (-n, 0)}[command]
winterm.cursor_adjust(x, y, on_stderr=self.on_stderr)
def convert_osc(self, text):
for match in self.ANSI_OSC_RE.finditer(text):
start, end = match.span()
text = text[:start] + text[end:]
paramstring, command = match.groups()
if command == BEL:
if paramstring.count(";") == 1:
params = paramstring.split(";")
# 0 - change title and icon (we will only change title)
# 1 - change icon (we don't support this)
# 2 - change title
if params[0] in '02':
winterm.set_title(params[1])
return text

View File

@ -0,0 +1,80 @@
# Copyright Jonathan Hartley 2013. BSD 3-Clause license, see LICENSE file.
import atexit
import contextlib
import sys
from .ansitowin32 import AnsiToWin32
orig_stdout = None
orig_stderr = None
wrapped_stdout = None
wrapped_stderr = None
atexit_done = False
def reset_all():
if AnsiToWin32 is not None: # Issue #74: objects might become None at exit
AnsiToWin32(orig_stdout).reset_all()
def init(autoreset=False, convert=None, strip=None, wrap=True):
if not wrap and any([autoreset, convert, strip]):
raise ValueError('wrap=False conflicts with any other arg=True')
global wrapped_stdout, wrapped_stderr
global orig_stdout, orig_stderr
orig_stdout = sys.stdout
orig_stderr = sys.stderr
if sys.stdout is None:
wrapped_stdout = None
else:
sys.stdout = wrapped_stdout = \
wrap_stream(orig_stdout, convert, strip, autoreset, wrap)
if sys.stderr is None:
wrapped_stderr = None
else:
sys.stderr = wrapped_stderr = \
wrap_stream(orig_stderr, convert, strip, autoreset, wrap)
global atexit_done
if not atexit_done:
atexit.register(reset_all)
atexit_done = True
def deinit():
if orig_stdout is not None:
sys.stdout = orig_stdout
if orig_stderr is not None:
sys.stderr = orig_stderr
@contextlib.contextmanager
def colorama_text(*args, **kwargs):
init(*args, **kwargs)
try:
yield
finally:
deinit()
def reinit():
if wrapped_stdout is not None:
sys.stdout = wrapped_stdout
if wrapped_stderr is not None:
sys.stderr = wrapped_stderr
def wrap_stream(stream, convert, strip, autoreset, wrap):
if wrap:
wrapper = AnsiToWin32(stream,
convert=convert, strip=strip, autoreset=autoreset)
if wrapper.should_wrap():
stream = wrapper.stream
return stream

View File

@ -0,0 +1,152 @@
# Copyright Jonathan Hartley 2013. BSD 3-Clause license, see LICENSE file.
# from winbase.h
STDOUT = -11
STDERR = -12
try:
import ctypes
from ctypes import LibraryLoader
windll = LibraryLoader(ctypes.WinDLL)
from ctypes import wintypes
except (AttributeError, ImportError):
windll = None
SetConsoleTextAttribute = lambda *_: None
winapi_test = lambda *_: None
else:
from ctypes import byref, Structure, c_char, POINTER
COORD = wintypes._COORD
class CONSOLE_SCREEN_BUFFER_INFO(Structure):
"""struct in wincon.h."""
_fields_ = [
("dwSize", COORD),
("dwCursorPosition", COORD),
("wAttributes", wintypes.WORD),
("srWindow", wintypes.SMALL_RECT),
("dwMaximumWindowSize", COORD),
]
def __str__(self):
return '(%d,%d,%d,%d,%d,%d,%d,%d,%d,%d,%d)' % (
self.dwSize.Y, self.dwSize.X
, self.dwCursorPosition.Y, self.dwCursorPosition.X
, self.wAttributes
, self.srWindow.Top, self.srWindow.Left, self.srWindow.Bottom, self.srWindow.Right
, self.dwMaximumWindowSize.Y, self.dwMaximumWindowSize.X
)
_GetStdHandle = windll.kernel32.GetStdHandle
_GetStdHandle.argtypes = [
wintypes.DWORD,
]
_GetStdHandle.restype = wintypes.HANDLE
_GetConsoleScreenBufferInfo = windll.kernel32.GetConsoleScreenBufferInfo
_GetConsoleScreenBufferInfo.argtypes = [
wintypes.HANDLE,
POINTER(CONSOLE_SCREEN_BUFFER_INFO),
]
_GetConsoleScreenBufferInfo.restype = wintypes.BOOL
_SetConsoleTextAttribute = windll.kernel32.SetConsoleTextAttribute
_SetConsoleTextAttribute.argtypes = [
wintypes.HANDLE,
wintypes.WORD,
]
_SetConsoleTextAttribute.restype = wintypes.BOOL
_SetConsoleCursorPosition = windll.kernel32.SetConsoleCursorPosition
_SetConsoleCursorPosition.argtypes = [
wintypes.HANDLE,
COORD,
]
_SetConsoleCursorPosition.restype = wintypes.BOOL
_FillConsoleOutputCharacterA = windll.kernel32.FillConsoleOutputCharacterA
_FillConsoleOutputCharacterA.argtypes = [
wintypes.HANDLE,
c_char,
wintypes.DWORD,
COORD,
POINTER(wintypes.DWORD),
]
_FillConsoleOutputCharacterA.restype = wintypes.BOOL
_FillConsoleOutputAttribute = windll.kernel32.FillConsoleOutputAttribute
_FillConsoleOutputAttribute.argtypes = [
wintypes.HANDLE,
wintypes.WORD,
wintypes.DWORD,
COORD,
POINTER(wintypes.DWORD),
]
_FillConsoleOutputAttribute.restype = wintypes.BOOL
_SetConsoleTitleW = windll.kernel32.SetConsoleTitleW
_SetConsoleTitleW.argtypes = [
wintypes.LPCWSTR
]
_SetConsoleTitleW.restype = wintypes.BOOL
def _winapi_test(handle):
csbi = CONSOLE_SCREEN_BUFFER_INFO()
success = _GetConsoleScreenBufferInfo(
handle, byref(csbi))
return bool(success)
def winapi_test():
return any(_winapi_test(h) for h in
(_GetStdHandle(STDOUT), _GetStdHandle(STDERR)))
def GetConsoleScreenBufferInfo(stream_id=STDOUT):
handle = _GetStdHandle(stream_id)
csbi = CONSOLE_SCREEN_BUFFER_INFO()
success = _GetConsoleScreenBufferInfo(
handle, byref(csbi))
return csbi
def SetConsoleTextAttribute(stream_id, attrs):
handle = _GetStdHandle(stream_id)
return _SetConsoleTextAttribute(handle, attrs)
def SetConsoleCursorPosition(stream_id, position, adjust=True):
position = COORD(*position)
# If the position is out of range, do nothing.
if position.Y <= 0 or position.X <= 0:
return
# Adjust for Windows' SetConsoleCursorPosition:
# 1. being 0-based, while ANSI is 1-based.
# 2. expecting (x,y), while ANSI uses (y,x).
adjusted_position = COORD(position.Y - 1, position.X - 1)
if adjust:
# Adjust for viewport's scroll position
sr = GetConsoleScreenBufferInfo(STDOUT).srWindow
adjusted_position.Y += sr.Top
adjusted_position.X += sr.Left
# Resume normal processing
handle = _GetStdHandle(stream_id)
return _SetConsoleCursorPosition(handle, adjusted_position)
def FillConsoleOutputCharacter(stream_id, char, length, start):
handle = _GetStdHandle(stream_id)
char = c_char(char.encode())
length = wintypes.DWORD(length)
num_written = wintypes.DWORD(0)
# Note that this is hard-coded for ANSI (vs wide) bytes.
success = _FillConsoleOutputCharacterA(
handle, char, length, start, byref(num_written))
return num_written.value
def FillConsoleOutputAttribute(stream_id, attr, length, start):
''' FillConsoleOutputAttribute( hConsole, csbi.wAttributes, dwConSize, coordScreen, &cCharsWritten )'''
handle = _GetStdHandle(stream_id)
attribute = wintypes.WORD(attr)
length = wintypes.DWORD(length)
num_written = wintypes.DWORD(0)
# Note that this is hard-coded for ANSI (vs wide) bytes.
return _FillConsoleOutputAttribute(
handle, attribute, length, start, byref(num_written))
def SetConsoleTitle(title):
return _SetConsoleTitleW(title)

View File

@ -0,0 +1,169 @@
# Copyright Jonathan Hartley 2013. BSD 3-Clause license, see LICENSE file.
from . import win32
# from wincon.h
class WinColor(object):
BLACK = 0
BLUE = 1
GREEN = 2
CYAN = 3
RED = 4
MAGENTA = 5
YELLOW = 6
GREY = 7
# from wincon.h
class WinStyle(object):
NORMAL = 0x00 # dim text, dim background
BRIGHT = 0x08 # bright text, dim background
BRIGHT_BACKGROUND = 0x80 # dim text, bright background
class WinTerm(object):
def __init__(self):
self._default = win32.GetConsoleScreenBufferInfo(win32.STDOUT).wAttributes
self.set_attrs(self._default)
self._default_fore = self._fore
self._default_back = self._back
self._default_style = self._style
# In order to emulate LIGHT_EX in windows, we borrow the BRIGHT style.
# So that LIGHT_EX colors and BRIGHT style do not clobber each other,
# we track them separately, since LIGHT_EX is overwritten by Fore/Back
# and BRIGHT is overwritten by Style codes.
self._light = 0
def get_attrs(self):
return self._fore + self._back * 16 + (self._style | self._light)
def set_attrs(self, value):
self._fore = value & 7
self._back = (value >> 4) & 7
self._style = value & (WinStyle.BRIGHT | WinStyle.BRIGHT_BACKGROUND)
def reset_all(self, on_stderr=None):
self.set_attrs(self._default)
self.set_console(attrs=self._default)
self._light = 0
def fore(self, fore=None, light=False, on_stderr=False):
if fore is None:
fore = self._default_fore
self._fore = fore
# Emulate LIGHT_EX with BRIGHT Style
if light:
self._light |= WinStyle.BRIGHT
else:
self._light &= ~WinStyle.BRIGHT
self.set_console(on_stderr=on_stderr)
def back(self, back=None, light=False, on_stderr=False):
if back is None:
back = self._default_back
self._back = back
# Emulate LIGHT_EX with BRIGHT_BACKGROUND Style
if light:
self._light |= WinStyle.BRIGHT_BACKGROUND
else:
self._light &= ~WinStyle.BRIGHT_BACKGROUND
self.set_console(on_stderr=on_stderr)
def style(self, style=None, on_stderr=False):
if style is None:
style = self._default_style
self._style = style
self.set_console(on_stderr=on_stderr)
def set_console(self, attrs=None, on_stderr=False):
if attrs is None:
attrs = self.get_attrs()
handle = win32.STDOUT
if on_stderr:
handle = win32.STDERR
win32.SetConsoleTextAttribute(handle, attrs)
def get_position(self, handle):
position = win32.GetConsoleScreenBufferInfo(handle).dwCursorPosition
# Because Windows coordinates are 0-based,
# and win32.SetConsoleCursorPosition expects 1-based.
position.X += 1
position.Y += 1
return position
def set_cursor_position(self, position=None, on_stderr=False):
if position is None:
# I'm not currently tracking the position, so there is no default.
# position = self.get_position()
return
handle = win32.STDOUT
if on_stderr:
handle = win32.STDERR
win32.SetConsoleCursorPosition(handle, position)
def cursor_adjust(self, x, y, on_stderr=False):
handle = win32.STDOUT
if on_stderr:
handle = win32.STDERR
position = self.get_position(handle)
adjusted_position = (position.Y + y, position.X + x)
win32.SetConsoleCursorPosition(handle, adjusted_position, adjust=False)
def erase_screen(self, mode=0, on_stderr=False):
# 0 should clear from the cursor to the end of the screen.
# 1 should clear from the cursor to the beginning of the screen.
# 2 should clear the entire screen, and move cursor to (1,1)
handle = win32.STDOUT
if on_stderr:
handle = win32.STDERR
csbi = win32.GetConsoleScreenBufferInfo(handle)
# get the number of character cells in the current buffer
cells_in_screen = csbi.dwSize.X * csbi.dwSize.Y
# get number of character cells before current cursor position
cells_before_cursor = csbi.dwSize.X * csbi.dwCursorPosition.Y + csbi.dwCursorPosition.X
if mode == 0:
from_coord = csbi.dwCursorPosition
cells_to_erase = cells_in_screen - cells_before_cursor
elif mode == 1:
from_coord = win32.COORD(0, 0)
cells_to_erase = cells_before_cursor
elif mode == 2:
from_coord = win32.COORD(0, 0)
cells_to_erase = cells_in_screen
else:
# invalid mode
return
# fill the entire screen with blanks
win32.FillConsoleOutputCharacter(handle, ' ', cells_to_erase, from_coord)
# now set the buffer's attributes accordingly
win32.FillConsoleOutputAttribute(handle, self.get_attrs(), cells_to_erase, from_coord)
if mode == 2:
# put the cursor where needed
win32.SetConsoleCursorPosition(handle, (1, 1))
def erase_line(self, mode=0, on_stderr=False):
# 0 should clear from the cursor to the end of the line.
# 1 should clear from the cursor to the beginning of the line.
# 2 should clear the entire line.
handle = win32.STDOUT
if on_stderr:
handle = win32.STDERR
csbi = win32.GetConsoleScreenBufferInfo(handle)
if mode == 0:
from_coord = csbi.dwCursorPosition
cells_to_erase = csbi.dwSize.X - csbi.dwCursorPosition.X
elif mode == 1:
from_coord = win32.COORD(0, csbi.dwCursorPosition.Y)
cells_to_erase = csbi.dwCursorPosition.X
elif mode == 2:
from_coord = win32.COORD(0, csbi.dwCursorPosition.Y)
cells_to_erase = csbi.dwSize.X
else:
# invalid mode
return
# fill the entire screen with blanks
win32.FillConsoleOutputCharacter(handle, ' ', cells_to_erase, from_coord)
# now set the buffer's attributes accordingly
win32.FillConsoleOutputAttribute(handle, self.get_attrs(), cells_to_erase, from_coord)
def set_title(self, title):
win32.SetConsoleTitle(title)

View File

@ -0,0 +1,23 @@
# -*- coding: utf-8 -*-
#
# Copyright (C) 2012-2019 Vinay Sajip.
# Licensed to the Python Software Foundation under a contributor agreement.
# See LICENSE.txt and CONTRIBUTORS.txt.
#
import logging
__version__ = '0.3.3'
class DistlibException(Exception):
pass
try:
from logging import NullHandler
except ImportError: # pragma: no cover
class NullHandler(logging.Handler):
def handle(self, record): pass
def emit(self, record): pass
def createLock(self): self.lock = None
logger = logging.getLogger(__name__)
logger.addHandler(NullHandler())

View File

@ -0,0 +1,6 @@
"""Modules copied from Python 3 standard libraries, for internal use only.
Individual classes and functions are found in d2._backport.misc. Intended
usage is to always import things missing from 3.1 from that module: the
built-in/stdlib objects will be used if found.
"""

View File

@ -0,0 +1,41 @@
# -*- coding: utf-8 -*-
#
# Copyright (C) 2012 The Python Software Foundation.
# See LICENSE.txt and CONTRIBUTORS.txt.
#
"""Backports for individual classes and functions."""
import os
import sys
__all__ = ['cache_from_source', 'callable', 'fsencode']
try:
from imp import cache_from_source
except ImportError:
def cache_from_source(py_file, debug=__debug__):
ext = debug and 'c' or 'o'
return py_file + ext
try:
callable = callable
except NameError:
from collections import Callable
def callable(obj):
return isinstance(obj, Callable)
try:
fsencode = os.fsencode
except AttributeError:
def fsencode(filename):
if isinstance(filename, bytes):
return filename
elif isinstance(filename, str):
return filename.encode(sys.getfilesystemencoding())
else:
raise TypeError("expect bytes or str, not %s" %
type(filename).__name__)

View File

@ -0,0 +1,764 @@
# -*- coding: utf-8 -*-
#
# Copyright (C) 2012 The Python Software Foundation.
# See LICENSE.txt and CONTRIBUTORS.txt.
#
"""Utility functions for copying and archiving files and directory trees.
XXX The functions here don't copy the resource fork or other metadata on Mac.
"""
import os
import sys
import stat
from os.path import abspath
import fnmatch
try:
from collections.abc import Callable
except ImportError:
from collections import Callable
import errno
from . import tarfile
try:
import bz2
_BZ2_SUPPORTED = True
except ImportError:
_BZ2_SUPPORTED = False
try:
from pwd import getpwnam
except ImportError:
getpwnam = None
try:
from grp import getgrnam
except ImportError:
getgrnam = None
__all__ = ["copyfileobj", "copyfile", "copymode", "copystat", "copy", "copy2",
"copytree", "move", "rmtree", "Error", "SpecialFileError",
"ExecError", "make_archive", "get_archive_formats",
"register_archive_format", "unregister_archive_format",
"get_unpack_formats", "register_unpack_format",
"unregister_unpack_format", "unpack_archive", "ignore_patterns"]
class Error(EnvironmentError):
pass
class SpecialFileError(EnvironmentError):
"""Raised when trying to do a kind of operation (e.g. copying) which is
not supported on a special file (e.g. a named pipe)"""
class ExecError(EnvironmentError):
"""Raised when a command could not be executed"""
class ReadError(EnvironmentError):
"""Raised when an archive cannot be read"""
class RegistryError(Exception):
"""Raised when a registry operation with the archiving
and unpacking registries fails"""
try:
WindowsError
except NameError:
WindowsError = None
def copyfileobj(fsrc, fdst, length=16*1024):
"""copy data from file-like object fsrc to file-like object fdst"""
while 1:
buf = fsrc.read(length)
if not buf:
break
fdst.write(buf)
def _samefile(src, dst):
# Macintosh, Unix.
if hasattr(os.path, 'samefile'):
try:
return os.path.samefile(src, dst)
except OSError:
return False
# All other platforms: check for same pathname.
return (os.path.normcase(os.path.abspath(src)) ==
os.path.normcase(os.path.abspath(dst)))
def copyfile(src, dst):
"""Copy data from src to dst"""
if _samefile(src, dst):
raise Error("`%s` and `%s` are the same file" % (src, dst))
for fn in [src, dst]:
try:
st = os.stat(fn)
except OSError:
# File most likely does not exist
pass
else:
# XXX What about other special files? (sockets, devices...)
if stat.S_ISFIFO(st.st_mode):
raise SpecialFileError("`%s` is a named pipe" % fn)
with open(src, 'rb') as fsrc:
with open(dst, 'wb') as fdst:
copyfileobj(fsrc, fdst)
def copymode(src, dst):
"""Copy mode bits from src to dst"""
if hasattr(os, 'chmod'):
st = os.stat(src)
mode = stat.S_IMODE(st.st_mode)
os.chmod(dst, mode)
def copystat(src, dst):
"""Copy all stat info (mode bits, atime, mtime, flags) from src to dst"""
st = os.stat(src)
mode = stat.S_IMODE(st.st_mode)
if hasattr(os, 'utime'):
os.utime(dst, (st.st_atime, st.st_mtime))
if hasattr(os, 'chmod'):
os.chmod(dst, mode)
if hasattr(os, 'chflags') and hasattr(st, 'st_flags'):
try:
os.chflags(dst, st.st_flags)
except OSError as why:
if (not hasattr(errno, 'EOPNOTSUPP') or
why.errno != errno.EOPNOTSUPP):
raise
def copy(src, dst):
"""Copy data and mode bits ("cp src dst").
The destination may be a directory.
"""
if os.path.isdir(dst):
dst = os.path.join(dst, os.path.basename(src))
copyfile(src, dst)
copymode(src, dst)
def copy2(src, dst):
"""Copy data and all stat info ("cp -p src dst").
The destination may be a directory.
"""
if os.path.isdir(dst):
dst = os.path.join(dst, os.path.basename(src))
copyfile(src, dst)
copystat(src, dst)
def ignore_patterns(*patterns):
"""Function that can be used as copytree() ignore parameter.
Patterns is a sequence of glob-style patterns
that are used to exclude files"""
def _ignore_patterns(path, names):
ignored_names = []
for pattern in patterns:
ignored_names.extend(fnmatch.filter(names, pattern))
return set(ignored_names)
return _ignore_patterns
def copytree(src, dst, symlinks=False, ignore=None, copy_function=copy2,
ignore_dangling_symlinks=False):
"""Recursively copy a directory tree.
The destination directory must not already exist.
If exception(s) occur, an Error is raised with a list of reasons.
If the optional symlinks flag is true, symbolic links in the
source tree result in symbolic links in the destination tree; if
it is false, the contents of the files pointed to by symbolic
links are copied. If the file pointed by the symlink doesn't
exist, an exception will be added in the list of errors raised in
an Error exception at the end of the copy process.
You can set the optional ignore_dangling_symlinks flag to true if you
want to silence this exception. Notice that this has no effect on
platforms that don't support os.symlink.
The optional ignore argument is a callable. If given, it
is called with the `src` parameter, which is the directory
being visited by copytree(), and `names` which is the list of
`src` contents, as returned by os.listdir():
callable(src, names) -> ignored_names
Since copytree() is called recursively, the callable will be
called once for each directory that is copied. It returns a
list of names relative to the `src` directory that should
not be copied.
The optional copy_function argument is a callable that will be used
to copy each file. It will be called with the source path and the
destination path as arguments. By default, copy2() is used, but any
function that supports the same signature (like copy()) can be used.
"""
names = os.listdir(src)
if ignore is not None:
ignored_names = ignore(src, names)
else:
ignored_names = set()
os.makedirs(dst)
errors = []
for name in names:
if name in ignored_names:
continue
srcname = os.path.join(src, name)
dstname = os.path.join(dst, name)
try:
if os.path.islink(srcname):
linkto = os.readlink(srcname)
if symlinks:
os.symlink(linkto, dstname)
else:
# ignore dangling symlink if the flag is on
if not os.path.exists(linkto) and ignore_dangling_symlinks:
continue
# otherwise let the copy occurs. copy2 will raise an error
copy_function(srcname, dstname)
elif os.path.isdir(srcname):
copytree(srcname, dstname, symlinks, ignore, copy_function)
else:
# Will raise a SpecialFileError for unsupported file types
copy_function(srcname, dstname)
# catch the Error from the recursive copytree so that we can
# continue with other files
except Error as err:
errors.extend(err.args[0])
except EnvironmentError as why:
errors.append((srcname, dstname, str(why)))
try:
copystat(src, dst)
except OSError as why:
if WindowsError is not None and isinstance(why, WindowsError):
# Copying file access times may fail on Windows
pass
else:
errors.extend((src, dst, str(why)))
if errors:
raise Error(errors)
def rmtree(path, ignore_errors=False, onerror=None):
"""Recursively delete a directory tree.
If ignore_errors is set, errors are ignored; otherwise, if onerror
is set, it is called to handle the error with arguments (func,
path, exc_info) where func is os.listdir, os.remove, or os.rmdir;
path is the argument to that function that caused it to fail; and
exc_info is a tuple returned by sys.exc_info(). If ignore_errors
is false and onerror is None, an exception is raised.
"""
if ignore_errors:
def onerror(*args):
pass
elif onerror is None:
def onerror(*args):
raise
try:
if os.path.islink(path):
# symlinks to directories are forbidden, see bug #1669
raise OSError("Cannot call rmtree on a symbolic link")
except OSError:
onerror(os.path.islink, path, sys.exc_info())
# can't continue even if onerror hook returns
return
names = []
try:
names = os.listdir(path)
except os.error:
onerror(os.listdir, path, sys.exc_info())
for name in names:
fullname = os.path.join(path, name)
try:
mode = os.lstat(fullname).st_mode
except os.error:
mode = 0
if stat.S_ISDIR(mode):
rmtree(fullname, ignore_errors, onerror)
else:
try:
os.remove(fullname)
except os.error:
onerror(os.remove, fullname, sys.exc_info())
try:
os.rmdir(path)
except os.error:
onerror(os.rmdir, path, sys.exc_info())
def _basename(path):
# A basename() variant which first strips the trailing slash, if present.
# Thus we always get the last component of the path, even for directories.
return os.path.basename(path.rstrip(os.path.sep))
def move(src, dst):
"""Recursively move a file or directory to another location. This is
similar to the Unix "mv" command.
If the destination is a directory or a symlink to a directory, the source
is moved inside the directory. The destination path must not already
exist.
If the destination already exists but is not a directory, it may be
overwritten depending on os.rename() semantics.
If the destination is on our current filesystem, then rename() is used.
Otherwise, src is copied to the destination and then removed.
A lot more could be done here... A look at a mv.c shows a lot of
the issues this implementation glosses over.
"""
real_dst = dst
if os.path.isdir(dst):
if _samefile(src, dst):
# We might be on a case insensitive filesystem,
# perform the rename anyway.
os.rename(src, dst)
return
real_dst = os.path.join(dst, _basename(src))
if os.path.exists(real_dst):
raise Error("Destination path '%s' already exists" % real_dst)
try:
os.rename(src, real_dst)
except OSError:
if os.path.isdir(src):
if _destinsrc(src, dst):
raise Error("Cannot move a directory '%s' into itself '%s'." % (src, dst))
copytree(src, real_dst, symlinks=True)
rmtree(src)
else:
copy2(src, real_dst)
os.unlink(src)
def _destinsrc(src, dst):
src = abspath(src)
dst = abspath(dst)
if not src.endswith(os.path.sep):
src += os.path.sep
if not dst.endswith(os.path.sep):
dst += os.path.sep
return dst.startswith(src)
def _get_gid(name):
"""Returns a gid, given a group name."""
if getgrnam is None or name is None:
return None
try:
result = getgrnam(name)
except KeyError:
result = None
if result is not None:
return result[2]
return None
def _get_uid(name):
"""Returns an uid, given a user name."""
if getpwnam is None or name is None:
return None
try:
result = getpwnam(name)
except KeyError:
result = None
if result is not None:
return result[2]
return None
def _make_tarball(base_name, base_dir, compress="gzip", verbose=0, dry_run=0,
owner=None, group=None, logger=None):
"""Create a (possibly compressed) tar file from all the files under
'base_dir'.
'compress' must be "gzip" (the default), "bzip2", or None.
'owner' and 'group' can be used to define an owner and a group for the
archive that is being built. If not provided, the current owner and group
will be used.
The output tar file will be named 'base_name' + ".tar", possibly plus
the appropriate compression extension (".gz", or ".bz2").
Returns the output filename.
"""
tar_compression = {'gzip': 'gz', None: ''}
compress_ext = {'gzip': '.gz'}
if _BZ2_SUPPORTED:
tar_compression['bzip2'] = 'bz2'
compress_ext['bzip2'] = '.bz2'
# flags for compression program, each element of list will be an argument
if compress is not None and compress not in compress_ext:
raise ValueError("bad value for 'compress', or compression format not "
"supported : {0}".format(compress))
archive_name = base_name + '.tar' + compress_ext.get(compress, '')
archive_dir = os.path.dirname(archive_name)
if not os.path.exists(archive_dir):
if logger is not None:
logger.info("creating %s", archive_dir)
if not dry_run:
os.makedirs(archive_dir)
# creating the tarball
if logger is not None:
logger.info('Creating tar archive')
uid = _get_uid(owner)
gid = _get_gid(group)
def _set_uid_gid(tarinfo):
if gid is not None:
tarinfo.gid = gid
tarinfo.gname = group
if uid is not None:
tarinfo.uid = uid
tarinfo.uname = owner
return tarinfo
if not dry_run:
tar = tarfile.open(archive_name, 'w|%s' % tar_compression[compress])
try:
tar.add(base_dir, filter=_set_uid_gid)
finally:
tar.close()
return archive_name
def _call_external_zip(base_dir, zip_filename, verbose=False, dry_run=False):
# XXX see if we want to keep an external call here
if verbose:
zipoptions = "-r"
else:
zipoptions = "-rq"
from distutils.errors import DistutilsExecError
from distutils.spawn import spawn
try:
spawn(["zip", zipoptions, zip_filename, base_dir], dry_run=dry_run)
except DistutilsExecError:
# XXX really should distinguish between "couldn't find
# external 'zip' command" and "zip failed".
raise ExecError("unable to create zip file '%s': "
"could neither import the 'zipfile' module nor "
"find a standalone zip utility") % zip_filename
def _make_zipfile(base_name, base_dir, verbose=0, dry_run=0, logger=None):
"""Create a zip file from all the files under 'base_dir'.
The output zip file will be named 'base_name' + ".zip". Uses either the
"zipfile" Python module (if available) or the InfoZIP "zip" utility
(if installed and found on the default search path). If neither tool is
available, raises ExecError. Returns the name of the output zip
file.
"""
zip_filename = base_name + ".zip"
archive_dir = os.path.dirname(base_name)
if not os.path.exists(archive_dir):
if logger is not None:
logger.info("creating %s", archive_dir)
if not dry_run:
os.makedirs(archive_dir)
# If zipfile module is not available, try spawning an external 'zip'
# command.
try:
import zipfile
except ImportError:
zipfile = None
if zipfile is None:
_call_external_zip(base_dir, zip_filename, verbose, dry_run)
else:
if logger is not None:
logger.info("creating '%s' and adding '%s' to it",
zip_filename, base_dir)
if not dry_run:
zip = zipfile.ZipFile(zip_filename, "w",
compression=zipfile.ZIP_DEFLATED)
for dirpath, dirnames, filenames in os.walk(base_dir):
for name in filenames:
path = os.path.normpath(os.path.join(dirpath, name))
if os.path.isfile(path):
zip.write(path, path)
if logger is not None:
logger.info("adding '%s'", path)
zip.close()
return zip_filename
_ARCHIVE_FORMATS = {
'gztar': (_make_tarball, [('compress', 'gzip')], "gzip'ed tar-file"),
'bztar': (_make_tarball, [('compress', 'bzip2')], "bzip2'ed tar-file"),
'tar': (_make_tarball, [('compress', None)], "uncompressed tar file"),
'zip': (_make_zipfile, [], "ZIP file"),
}
if _BZ2_SUPPORTED:
_ARCHIVE_FORMATS['bztar'] = (_make_tarball, [('compress', 'bzip2')],
"bzip2'ed tar-file")
def get_archive_formats():
"""Returns a list of supported formats for archiving and unarchiving.
Each element of the returned sequence is a tuple (name, description)
"""
formats = [(name, registry[2]) for name, registry in
_ARCHIVE_FORMATS.items()]
formats.sort()
return formats
def register_archive_format(name, function, extra_args=None, description=''):
"""Registers an archive format.
name is the name of the format. function is the callable that will be
used to create archives. If provided, extra_args is a sequence of
(name, value) tuples that will be passed as arguments to the callable.
description can be provided to describe the format, and will be returned
by the get_archive_formats() function.
"""
if extra_args is None:
extra_args = []
if not isinstance(function, Callable):
raise TypeError('The %s object is not callable' % function)
if not isinstance(extra_args, (tuple, list)):
raise TypeError('extra_args needs to be a sequence')
for element in extra_args:
if not isinstance(element, (tuple, list)) or len(element) !=2:
raise TypeError('extra_args elements are : (arg_name, value)')
_ARCHIVE_FORMATS[name] = (function, extra_args, description)
def unregister_archive_format(name):
del _ARCHIVE_FORMATS[name]
def make_archive(base_name, format, root_dir=None, base_dir=None, verbose=0,
dry_run=0, owner=None, group=None, logger=None):
"""Create an archive file (eg. zip or tar).
'base_name' is the name of the file to create, minus any format-specific
extension; 'format' is the archive format: one of "zip", "tar", "bztar"
or "gztar".
'root_dir' is a directory that will be the root directory of the
archive; ie. we typically chdir into 'root_dir' before creating the
archive. 'base_dir' is the directory where we start archiving from;
ie. 'base_dir' will be the common prefix of all files and
directories in the archive. 'root_dir' and 'base_dir' both default
to the current directory. Returns the name of the archive file.
'owner' and 'group' are used when creating a tar archive. By default,
uses the current owner and group.
"""
save_cwd = os.getcwd()
if root_dir is not None:
if logger is not None:
logger.debug("changing into '%s'", root_dir)
base_name = os.path.abspath(base_name)
if not dry_run:
os.chdir(root_dir)
if base_dir is None:
base_dir = os.curdir
kwargs = {'dry_run': dry_run, 'logger': logger}
try:
format_info = _ARCHIVE_FORMATS[format]
except KeyError:
raise ValueError("unknown archive format '%s'" % format)
func = format_info[0]
for arg, val in format_info[1]:
kwargs[arg] = val
if format != 'zip':
kwargs['owner'] = owner
kwargs['group'] = group
try:
filename = func(base_name, base_dir, **kwargs)
finally:
if root_dir is not None:
if logger is not None:
logger.debug("changing back to '%s'", save_cwd)
os.chdir(save_cwd)
return filename
def get_unpack_formats():
"""Returns a list of supported formats for unpacking.
Each element of the returned sequence is a tuple
(name, extensions, description)
"""
formats = [(name, info[0], info[3]) for name, info in
_UNPACK_FORMATS.items()]
formats.sort()
return formats
def _check_unpack_options(extensions, function, extra_args):
"""Checks what gets registered as an unpacker."""
# first make sure no other unpacker is registered for this extension
existing_extensions = {}
for name, info in _UNPACK_FORMATS.items():
for ext in info[0]:
existing_extensions[ext] = name
for extension in extensions:
if extension in existing_extensions:
msg = '%s is already registered for "%s"'
raise RegistryError(msg % (extension,
existing_extensions[extension]))
if not isinstance(function, Callable):
raise TypeError('The registered function must be a callable')
def register_unpack_format(name, extensions, function, extra_args=None,
description=''):
"""Registers an unpack format.
`name` is the name of the format. `extensions` is a list of extensions
corresponding to the format.
`function` is the callable that will be
used to unpack archives. The callable will receive archives to unpack.
If it's unable to handle an archive, it needs to raise a ReadError
exception.
If provided, `extra_args` is a sequence of
(name, value) tuples that will be passed as arguments to the callable.
description can be provided to describe the format, and will be returned
by the get_unpack_formats() function.
"""
if extra_args is None:
extra_args = []
_check_unpack_options(extensions, function, extra_args)
_UNPACK_FORMATS[name] = extensions, function, extra_args, description
def unregister_unpack_format(name):
"""Removes the pack format from the registry."""
del _UNPACK_FORMATS[name]
def _ensure_directory(path):
"""Ensure that the parent directory of `path` exists"""
dirname = os.path.dirname(path)
if not os.path.isdir(dirname):
os.makedirs(dirname)
def _unpack_zipfile(filename, extract_dir):
"""Unpack zip `filename` to `extract_dir`
"""
try:
import zipfile
except ImportError:
raise ReadError('zlib not supported, cannot unpack this archive.')
if not zipfile.is_zipfile(filename):
raise ReadError("%s is not a zip file" % filename)
zip = zipfile.ZipFile(filename)
try:
for info in zip.infolist():
name = info.filename
# don't extract absolute paths or ones with .. in them
if name.startswith('/') or '..' in name:
continue
target = os.path.join(extract_dir, *name.split('/'))
if not target:
continue
_ensure_directory(target)
if not name.endswith('/'):
# file
data = zip.read(info.filename)
f = open(target, 'wb')
try:
f.write(data)
finally:
f.close()
del data
finally:
zip.close()
def _unpack_tarfile(filename, extract_dir):
"""Unpack tar/tar.gz/tar.bz2 `filename` to `extract_dir`
"""
try:
tarobj = tarfile.open(filename)
except tarfile.TarError:
raise ReadError(
"%s is not a compressed or uncompressed tar file" % filename)
try:
tarobj.extractall(extract_dir)
finally:
tarobj.close()
_UNPACK_FORMATS = {
'gztar': (['.tar.gz', '.tgz'], _unpack_tarfile, [], "gzip'ed tar-file"),
'tar': (['.tar'], _unpack_tarfile, [], "uncompressed tar file"),
'zip': (['.zip'], _unpack_zipfile, [], "ZIP file")
}
if _BZ2_SUPPORTED:
_UNPACK_FORMATS['bztar'] = (['.bz2'], _unpack_tarfile, [],
"bzip2'ed tar-file")
def _find_unpack_format(filename):
for name, info in _UNPACK_FORMATS.items():
for extension in info[0]:
if filename.endswith(extension):
return name
return None
def unpack_archive(filename, extract_dir=None, format=None):
"""Unpack an archive.
`filename` is the name of the archive.
`extract_dir` is the name of the target directory, where the archive
is unpacked. If not provided, the current working directory is used.
`format` is the archive format: one of "zip", "tar", or "gztar". Or any
other registered format. If not provided, unpack_archive will use the
filename extension and see if an unpacker was registered for that
extension.
In case none is found, a ValueError is raised.
"""
if extract_dir is None:
extract_dir = os.getcwd()
if format is not None:
try:
format_info = _UNPACK_FORMATS[format]
except KeyError:
raise ValueError("Unknown unpack format '{0}'".format(format))
func = format_info[1]
func(filename, extract_dir, **dict(format_info[2]))
else:
# we need to look at the registered unpackers supported extensions
format = _find_unpack_format(filename)
if format is None:
raise ReadError("Unknown archive format '{0}'".format(filename))
func = _UNPACK_FORMATS[format][1]
kwargs = dict(_UNPACK_FORMATS[format][2])
func(filename, extract_dir, **kwargs)

View File

@ -0,0 +1,84 @@
[posix_prefix]
# Configuration directories. Some of these come straight out of the
# configure script. They are for implementing the other variables, not to
# be used directly in [resource_locations].
confdir = /etc
datadir = /usr/share
libdir = /usr/lib
statedir = /var
# User resource directory
local = ~/.local/{distribution.name}
stdlib = {base}/lib/python{py_version_short}
platstdlib = {platbase}/lib/python{py_version_short}
purelib = {base}/lib/python{py_version_short}/site-packages
platlib = {platbase}/lib/python{py_version_short}/site-packages
include = {base}/include/python{py_version_short}{abiflags}
platinclude = {platbase}/include/python{py_version_short}{abiflags}
data = {base}
[posix_home]
stdlib = {base}/lib/python
platstdlib = {base}/lib/python
purelib = {base}/lib/python
platlib = {base}/lib/python
include = {base}/include/python
platinclude = {base}/include/python
scripts = {base}/bin
data = {base}
[nt]
stdlib = {base}/Lib
platstdlib = {base}/Lib
purelib = {base}/Lib/site-packages
platlib = {base}/Lib/site-packages
include = {base}/Include
platinclude = {base}/Include
scripts = {base}/Scripts
data = {base}
[os2]
stdlib = {base}/Lib
platstdlib = {base}/Lib
purelib = {base}/Lib/site-packages
platlib = {base}/Lib/site-packages
include = {base}/Include
platinclude = {base}/Include
scripts = {base}/Scripts
data = {base}
[os2_home]
stdlib = {userbase}/lib/python{py_version_short}
platstdlib = {userbase}/lib/python{py_version_short}
purelib = {userbase}/lib/python{py_version_short}/site-packages
platlib = {userbase}/lib/python{py_version_short}/site-packages
include = {userbase}/include/python{py_version_short}
scripts = {userbase}/bin
data = {userbase}
[nt_user]
stdlib = {userbase}/Python{py_version_nodot}
platstdlib = {userbase}/Python{py_version_nodot}
purelib = {userbase}/Python{py_version_nodot}/site-packages
platlib = {userbase}/Python{py_version_nodot}/site-packages
include = {userbase}/Python{py_version_nodot}/Include
scripts = {userbase}/Scripts
data = {userbase}
[posix_user]
stdlib = {userbase}/lib/python{py_version_short}
platstdlib = {userbase}/lib/python{py_version_short}
purelib = {userbase}/lib/python{py_version_short}/site-packages
platlib = {userbase}/lib/python{py_version_short}/site-packages
include = {userbase}/include/python{py_version_short}
scripts = {userbase}/bin
data = {userbase}
[osx_framework_user]
stdlib = {userbase}/lib/python
platstdlib = {userbase}/lib/python
purelib = {userbase}/lib/python/site-packages
platlib = {userbase}/lib/python/site-packages
include = {userbase}/include
scripts = {userbase}/bin
data = {userbase}

View File

@ -0,0 +1,786 @@
# -*- coding: utf-8 -*-
#
# Copyright (C) 2012 The Python Software Foundation.
# See LICENSE.txt and CONTRIBUTORS.txt.
#
"""Access to Python's configuration information."""
import codecs
import os
import re
import sys
from os.path import pardir, realpath
try:
import configparser
except ImportError:
import ConfigParser as configparser
__all__ = [
'get_config_h_filename',
'get_config_var',
'get_config_vars',
'get_makefile_filename',
'get_path',
'get_path_names',
'get_paths',
'get_platform',
'get_python_version',
'get_scheme_names',
'parse_config_h',
]
def _safe_realpath(path):
try:
return realpath(path)
except OSError:
return path
if sys.executable:
_PROJECT_BASE = os.path.dirname(_safe_realpath(sys.executable))
else:
# sys.executable can be empty if argv[0] has been changed and Python is
# unable to retrieve the real program name
_PROJECT_BASE = _safe_realpath(os.getcwd())
if os.name == "nt" and "pcbuild" in _PROJECT_BASE[-8:].lower():
_PROJECT_BASE = _safe_realpath(os.path.join(_PROJECT_BASE, pardir))
# PC/VS7.1
if os.name == "nt" and "\\pc\\v" in _PROJECT_BASE[-10:].lower():
_PROJECT_BASE = _safe_realpath(os.path.join(_PROJECT_BASE, pardir, pardir))
# PC/AMD64
if os.name == "nt" and "\\pcbuild\\amd64" in _PROJECT_BASE[-14:].lower():
_PROJECT_BASE = _safe_realpath(os.path.join(_PROJECT_BASE, pardir, pardir))
def is_python_build():
for fn in ("Setup.dist", "Setup.local"):
if os.path.isfile(os.path.join(_PROJECT_BASE, "Modules", fn)):
return True
return False
_PYTHON_BUILD = is_python_build()
_cfg_read = False
def _ensure_cfg_read():
global _cfg_read
if not _cfg_read:
from ..resources import finder
backport_package = __name__.rsplit('.', 1)[0]
_finder = finder(backport_package)
_cfgfile = _finder.find('sysconfig.cfg')
assert _cfgfile, 'sysconfig.cfg exists'
with _cfgfile.as_stream() as s:
_SCHEMES.readfp(s)
if _PYTHON_BUILD:
for scheme in ('posix_prefix', 'posix_home'):
_SCHEMES.set(scheme, 'include', '{srcdir}/Include')
_SCHEMES.set(scheme, 'platinclude', '{projectbase}/.')
_cfg_read = True
_SCHEMES = configparser.RawConfigParser()
_VAR_REPL = re.compile(r'\{([^{]*?)\}')
def _expand_globals(config):
_ensure_cfg_read()
if config.has_section('globals'):
globals = config.items('globals')
else:
globals = tuple()
sections = config.sections()
for section in sections:
if section == 'globals':
continue
for option, value in globals:
if config.has_option(section, option):
continue
config.set(section, option, value)
config.remove_section('globals')
# now expanding local variables defined in the cfg file
#
for section in config.sections():
variables = dict(config.items(section))
def _replacer(matchobj):
name = matchobj.group(1)
if name in variables:
return variables[name]
return matchobj.group(0)
for option, value in config.items(section):
config.set(section, option, _VAR_REPL.sub(_replacer, value))
#_expand_globals(_SCHEMES)
_PY_VERSION = '%s.%s.%s' % sys.version_info[:3]
_PY_VERSION_SHORT = '%s.%s' % sys.version_info[:2]
_PY_VERSION_SHORT_NO_DOT = '%s%s' % sys.version_info[:2]
_PREFIX = os.path.normpath(sys.prefix)
_EXEC_PREFIX = os.path.normpath(sys.exec_prefix)
_CONFIG_VARS = None
_USER_BASE = None
def _subst_vars(path, local_vars):
"""In the string `path`, replace tokens like {some.thing} with the
corresponding value from the map `local_vars`.
If there is no corresponding value, leave the token unchanged.
"""
def _replacer(matchobj):
name = matchobj.group(1)
if name in local_vars:
return local_vars[name]
elif name in os.environ:
return os.environ[name]
return matchobj.group(0)
return _VAR_REPL.sub(_replacer, path)
def _extend_dict(target_dict, other_dict):
target_keys = target_dict.keys()
for key, value in other_dict.items():
if key in target_keys:
continue
target_dict[key] = value
def _expand_vars(scheme, vars):
res = {}
if vars is None:
vars = {}
_extend_dict(vars, get_config_vars())
for key, value in _SCHEMES.items(scheme):
if os.name in ('posix', 'nt'):
value = os.path.expanduser(value)
res[key] = os.path.normpath(_subst_vars(value, vars))
return res
def format_value(value, vars):
def _replacer(matchobj):
name = matchobj.group(1)
if name in vars:
return vars[name]
return matchobj.group(0)
return _VAR_REPL.sub(_replacer, value)
def _get_default_scheme():
if os.name == 'posix':
# the default scheme for posix is posix_prefix
return 'posix_prefix'
return os.name
def _getuserbase():
env_base = os.environ.get("PYTHONUSERBASE", None)
def joinuser(*args):
return os.path.expanduser(os.path.join(*args))
# what about 'os2emx', 'riscos' ?
if os.name == "nt":
base = os.environ.get("APPDATA") or "~"
if env_base:
return env_base
else:
return joinuser(base, "Python")
if sys.platform == "darwin":
framework = get_config_var("PYTHONFRAMEWORK")
if framework:
if env_base:
return env_base
else:
return joinuser("~", "Library", framework, "%d.%d" %
sys.version_info[:2])
if env_base:
return env_base
else:
return joinuser("~", ".local")
def _parse_makefile(filename, vars=None):
"""Parse a Makefile-style file.
A dictionary containing name/value pairs is returned. If an
optional dictionary is passed in as the second argument, it is
used instead of a new dictionary.
"""
# Regexes needed for parsing Makefile (and similar syntaxes,
# like old-style Setup files).
_variable_rx = re.compile(r"([a-zA-Z][a-zA-Z0-9_]+)\s*=\s*(.*)")
_findvar1_rx = re.compile(r"\$\(([A-Za-z][A-Za-z0-9_]*)\)")
_findvar2_rx = re.compile(r"\${([A-Za-z][A-Za-z0-9_]*)}")
if vars is None:
vars = {}
done = {}
notdone = {}
with codecs.open(filename, encoding='utf-8', errors="surrogateescape") as f:
lines = f.readlines()
for line in lines:
if line.startswith('#') or line.strip() == '':
continue
m = _variable_rx.match(line)
if m:
n, v = m.group(1, 2)
v = v.strip()
# `$$' is a literal `$' in make
tmpv = v.replace('$$', '')
if "$" in tmpv:
notdone[n] = v
else:
try:
v = int(v)
except ValueError:
# insert literal `$'
done[n] = v.replace('$$', '$')
else:
done[n] = v
# do variable interpolation here
variables = list(notdone.keys())
# Variables with a 'PY_' prefix in the makefile. These need to
# be made available without that prefix through sysconfig.
# Special care is needed to ensure that variable expansion works, even
# if the expansion uses the name without a prefix.
renamed_variables = ('CFLAGS', 'LDFLAGS', 'CPPFLAGS')
while len(variables) > 0:
for name in tuple(variables):
value = notdone[name]
m = _findvar1_rx.search(value) or _findvar2_rx.search(value)
if m is not None:
n = m.group(1)
found = True
if n in done:
item = str(done[n])
elif n in notdone:
# get it on a subsequent round
found = False
elif n in os.environ:
# do it like make: fall back to environment
item = os.environ[n]
elif n in renamed_variables:
if (name.startswith('PY_') and
name[3:] in renamed_variables):
item = ""
elif 'PY_' + n in notdone:
found = False
else:
item = str(done['PY_' + n])
else:
done[n] = item = ""
if found:
after = value[m.end():]
value = value[:m.start()] + item + after
if "$" in after:
notdone[name] = value
else:
try:
value = int(value)
except ValueError:
done[name] = value.strip()
else:
done[name] = value
variables.remove(name)
if (name.startswith('PY_') and
name[3:] in renamed_variables):
name = name[3:]
if name not in done:
done[name] = value
else:
# bogus variable reference (e.g. "prefix=$/opt/python");
# just drop it since we can't deal
done[name] = value
variables.remove(name)
# strip spurious spaces
for k, v in done.items():
if isinstance(v, str):
done[k] = v.strip()
# save the results in the global dictionary
vars.update(done)
return vars
def get_makefile_filename():
"""Return the path of the Makefile."""
if _PYTHON_BUILD:
return os.path.join(_PROJECT_BASE, "Makefile")
if hasattr(sys, 'abiflags'):
config_dir_name = 'config-%s%s' % (_PY_VERSION_SHORT, sys.abiflags)
else:
config_dir_name = 'config'
return os.path.join(get_path('stdlib'), config_dir_name, 'Makefile')
def _init_posix(vars):
"""Initialize the module as appropriate for POSIX systems."""
# load the installed Makefile:
makefile = get_makefile_filename()
try:
_parse_makefile(makefile, vars)
except IOError as e:
msg = "invalid Python installation: unable to open %s" % makefile
if hasattr(e, "strerror"):
msg = msg + " (%s)" % e.strerror
raise IOError(msg)
# load the installed pyconfig.h:
config_h = get_config_h_filename()
try:
with open(config_h) as f:
parse_config_h(f, vars)
except IOError as e:
msg = "invalid Python installation: unable to open %s" % config_h
if hasattr(e, "strerror"):
msg = msg + " (%s)" % e.strerror
raise IOError(msg)
# On AIX, there are wrong paths to the linker scripts in the Makefile
# -- these paths are relative to the Python source, but when installed
# the scripts are in another directory.
if _PYTHON_BUILD:
vars['LDSHARED'] = vars['BLDSHARED']
def _init_non_posix(vars):
"""Initialize the module as appropriate for NT"""
# set basic install directories
vars['LIBDEST'] = get_path('stdlib')
vars['BINLIBDEST'] = get_path('platstdlib')
vars['INCLUDEPY'] = get_path('include')
vars['SO'] = '.pyd'
vars['EXE'] = '.exe'
vars['VERSION'] = _PY_VERSION_SHORT_NO_DOT
vars['BINDIR'] = os.path.dirname(_safe_realpath(sys.executable))
#
# public APIs
#
def parse_config_h(fp, vars=None):
"""Parse a config.h-style file.
A dictionary containing name/value pairs is returned. If an
optional dictionary is passed in as the second argument, it is
used instead of a new dictionary.
"""
if vars is None:
vars = {}
define_rx = re.compile("#define ([A-Z][A-Za-z0-9_]+) (.*)\n")
undef_rx = re.compile("/[*] #undef ([A-Z][A-Za-z0-9_]+) [*]/\n")
while True:
line = fp.readline()
if not line:
break
m = define_rx.match(line)
if m:
n, v = m.group(1, 2)
try:
v = int(v)
except ValueError:
pass
vars[n] = v
else:
m = undef_rx.match(line)
if m:
vars[m.group(1)] = 0
return vars
def get_config_h_filename():
"""Return the path of pyconfig.h."""
if _PYTHON_BUILD:
if os.name == "nt":
inc_dir = os.path.join(_PROJECT_BASE, "PC")
else:
inc_dir = _PROJECT_BASE
else:
inc_dir = get_path('platinclude')
return os.path.join(inc_dir, 'pyconfig.h')
def get_scheme_names():
"""Return a tuple containing the schemes names."""
return tuple(sorted(_SCHEMES.sections()))
def get_path_names():
"""Return a tuple containing the paths names."""
# xxx see if we want a static list
return _SCHEMES.options('posix_prefix')
def get_paths(scheme=_get_default_scheme(), vars=None, expand=True):
"""Return a mapping containing an install scheme.
``scheme`` is the install scheme name. If not provided, it will
return the default scheme for the current platform.
"""
_ensure_cfg_read()
if expand:
return _expand_vars(scheme, vars)
else:
return dict(_SCHEMES.items(scheme))
def get_path(name, scheme=_get_default_scheme(), vars=None, expand=True):
"""Return a path corresponding to the scheme.
``scheme`` is the install scheme name.
"""
return get_paths(scheme, vars, expand)[name]
def get_config_vars(*args):
"""With no arguments, return a dictionary of all configuration
variables relevant for the current platform.
On Unix, this means every variable defined in Python's installed Makefile;
On Windows and Mac OS it's a much smaller set.
With arguments, return a list of values that result from looking up
each argument in the configuration variable dictionary.
"""
global _CONFIG_VARS
if _CONFIG_VARS is None:
_CONFIG_VARS = {}
# Normalized versions of prefix and exec_prefix are handy to have;
# in fact, these are the standard versions used most places in the
# distutils2 module.
_CONFIG_VARS['prefix'] = _PREFIX
_CONFIG_VARS['exec_prefix'] = _EXEC_PREFIX
_CONFIG_VARS['py_version'] = _PY_VERSION
_CONFIG_VARS['py_version_short'] = _PY_VERSION_SHORT
_CONFIG_VARS['py_version_nodot'] = _PY_VERSION[0] + _PY_VERSION[2]
_CONFIG_VARS['base'] = _PREFIX
_CONFIG_VARS['platbase'] = _EXEC_PREFIX
_CONFIG_VARS['projectbase'] = _PROJECT_BASE
try:
_CONFIG_VARS['abiflags'] = sys.abiflags
except AttributeError:
# sys.abiflags may not be defined on all platforms.
_CONFIG_VARS['abiflags'] = ''
if os.name in ('nt', 'os2'):
_init_non_posix(_CONFIG_VARS)
if os.name == 'posix':
_init_posix(_CONFIG_VARS)
# Setting 'userbase' is done below the call to the
# init function to enable using 'get_config_var' in
# the init-function.
if sys.version >= '2.6':
_CONFIG_VARS['userbase'] = _getuserbase()
if 'srcdir' not in _CONFIG_VARS:
_CONFIG_VARS['srcdir'] = _PROJECT_BASE
else:
_CONFIG_VARS['srcdir'] = _safe_realpath(_CONFIG_VARS['srcdir'])
# Convert srcdir into an absolute path if it appears necessary.
# Normally it is relative to the build directory. However, during
# testing, for example, we might be running a non-installed python
# from a different directory.
if _PYTHON_BUILD and os.name == "posix":
base = _PROJECT_BASE
try:
cwd = os.getcwd()
except OSError:
cwd = None
if (not os.path.isabs(_CONFIG_VARS['srcdir']) and
base != cwd):
# srcdir is relative and we are not in the same directory
# as the executable. Assume executable is in the build
# directory and make srcdir absolute.
srcdir = os.path.join(base, _CONFIG_VARS['srcdir'])
_CONFIG_VARS['srcdir'] = os.path.normpath(srcdir)
if sys.platform == 'darwin':
kernel_version = os.uname()[2] # Kernel version (8.4.3)
major_version = int(kernel_version.split('.')[0])
if major_version < 8:
# On Mac OS X before 10.4, check if -arch and -isysroot
# are in CFLAGS or LDFLAGS and remove them if they are.
# This is needed when building extensions on a 10.3 system
# using a universal build of python.
for key in ('LDFLAGS', 'BASECFLAGS',
# a number of derived variables. These need to be
# patched up as well.
'CFLAGS', 'PY_CFLAGS', 'BLDSHARED'):
flags = _CONFIG_VARS[key]
flags = re.sub(r'-arch\s+\w+\s', ' ', flags)
flags = re.sub('-isysroot [^ \t]*', ' ', flags)
_CONFIG_VARS[key] = flags
else:
# Allow the user to override the architecture flags using
# an environment variable.
# NOTE: This name was introduced by Apple in OSX 10.5 and
# is used by several scripting languages distributed with
# that OS release.
if 'ARCHFLAGS' in os.environ:
arch = os.environ['ARCHFLAGS']
for key in ('LDFLAGS', 'BASECFLAGS',
# a number of derived variables. These need to be
# patched up as well.
'CFLAGS', 'PY_CFLAGS', 'BLDSHARED'):
flags = _CONFIG_VARS[key]
flags = re.sub(r'-arch\s+\w+\s', ' ', flags)
flags = flags + ' ' + arch
_CONFIG_VARS[key] = flags
# If we're on OSX 10.5 or later and the user tries to
# compiles an extension using an SDK that is not present
# on the current machine it is better to not use an SDK
# than to fail.
#
# The major usecase for this is users using a Python.org
# binary installer on OSX 10.6: that installer uses
# the 10.4u SDK, but that SDK is not installed by default
# when you install Xcode.
#
CFLAGS = _CONFIG_VARS.get('CFLAGS', '')
m = re.search(r'-isysroot\s+(\S+)', CFLAGS)
if m is not None:
sdk = m.group(1)
if not os.path.exists(sdk):
for key in ('LDFLAGS', 'BASECFLAGS',
# a number of derived variables. These need to be
# patched up as well.
'CFLAGS', 'PY_CFLAGS', 'BLDSHARED'):
flags = _CONFIG_VARS[key]
flags = re.sub(r'-isysroot\s+\S+(\s|$)', ' ', flags)
_CONFIG_VARS[key] = flags
if args:
vals = []
for name in args:
vals.append(_CONFIG_VARS.get(name))
return vals
else:
return _CONFIG_VARS
def get_config_var(name):
"""Return the value of a single variable using the dictionary returned by
'get_config_vars()'.
Equivalent to get_config_vars().get(name)
"""
return get_config_vars().get(name)
def get_platform():
"""Return a string that identifies the current platform.
This is used mainly to distinguish platform-specific build directories and
platform-specific built distributions. Typically includes the OS name
and version and the architecture (as supplied by 'os.uname()'),
although the exact information included depends on the OS; eg. for IRIX
the architecture isn't particularly important (IRIX only runs on SGI
hardware), but for Linux the kernel version isn't particularly
important.
Examples of returned values:
linux-i586
linux-alpha (?)
solaris-2.6-sun4u
irix-5.3
irix64-6.2
Windows will return one of:
win-amd64 (64bit Windows on AMD64 (aka x86_64, Intel64, EM64T, etc)
win-ia64 (64bit Windows on Itanium)
win32 (all others - specifically, sys.platform is returned)
For other non-POSIX platforms, currently just returns 'sys.platform'.
"""
if os.name == 'nt':
# sniff sys.version for architecture.
prefix = " bit ("
i = sys.version.find(prefix)
if i == -1:
return sys.platform
j = sys.version.find(")", i)
look = sys.version[i+len(prefix):j].lower()
if look == 'amd64':
return 'win-amd64'
if look == 'itanium':
return 'win-ia64'
return sys.platform
if os.name != "posix" or not hasattr(os, 'uname'):
# XXX what about the architecture? NT is Intel or Alpha,
# Mac OS is M68k or PPC, etc.
return sys.platform
# Try to distinguish various flavours of Unix
osname, host, release, version, machine = os.uname()
# Convert the OS name to lowercase, remove '/' characters
# (to accommodate BSD/OS), and translate spaces (for "Power Macintosh")
osname = osname.lower().replace('/', '')
machine = machine.replace(' ', '_')
machine = machine.replace('/', '-')
if osname[:5] == "linux":
# At least on Linux/Intel, 'machine' is the processor --
# i386, etc.
# XXX what about Alpha, SPARC, etc?
return "%s-%s" % (osname, machine)
elif osname[:5] == "sunos":
if release[0] >= "5": # SunOS 5 == Solaris 2
osname = "solaris"
release = "%d.%s" % (int(release[0]) - 3, release[2:])
# fall through to standard osname-release-machine representation
elif osname[:4] == "irix": # could be "irix64"!
return "%s-%s" % (osname, release)
elif osname[:3] == "aix":
return "%s-%s.%s" % (osname, version, release)
elif osname[:6] == "cygwin":
osname = "cygwin"
rel_re = re.compile(r'[\d.]+')
m = rel_re.match(release)
if m:
release = m.group()
elif osname[:6] == "darwin":
#
# For our purposes, we'll assume that the system version from
# distutils' perspective is what MACOSX_DEPLOYMENT_TARGET is set
# to. This makes the compatibility story a bit more sane because the
# machine is going to compile and link as if it were
# MACOSX_DEPLOYMENT_TARGET.
cfgvars = get_config_vars()
macver = cfgvars.get('MACOSX_DEPLOYMENT_TARGET')
if True:
# Always calculate the release of the running machine,
# needed to determine if we can build fat binaries or not.
macrelease = macver
# Get the system version. Reading this plist is a documented
# way to get the system version (see the documentation for
# the Gestalt Manager)
try:
f = open('/System/Library/CoreServices/SystemVersion.plist')
except IOError:
# We're on a plain darwin box, fall back to the default
# behaviour.
pass
else:
try:
m = re.search(r'<key>ProductUserVisibleVersion</key>\s*'
r'<string>(.*?)</string>', f.read())
finally:
f.close()
if m is not None:
macrelease = '.'.join(m.group(1).split('.')[:2])
# else: fall back to the default behaviour
if not macver:
macver = macrelease
if macver:
release = macver
osname = "macosx"
if ((macrelease + '.') >= '10.4.' and
'-arch' in get_config_vars().get('CFLAGS', '').strip()):
# The universal build will build fat binaries, but not on
# systems before 10.4
#
# Try to detect 4-way universal builds, those have machine-type
# 'universal' instead of 'fat'.
machine = 'fat'
cflags = get_config_vars().get('CFLAGS')
archs = re.findall(r'-arch\s+(\S+)', cflags)
archs = tuple(sorted(set(archs)))
if len(archs) == 1:
machine = archs[0]
elif archs == ('i386', 'ppc'):
machine = 'fat'
elif archs == ('i386', 'x86_64'):
machine = 'intel'
elif archs == ('i386', 'ppc', 'x86_64'):
machine = 'fat3'
elif archs == ('ppc64', 'x86_64'):
machine = 'fat64'
elif archs == ('i386', 'ppc', 'ppc64', 'x86_64'):
machine = 'universal'
else:
raise ValueError(
"Don't know machine value for archs=%r" % (archs,))
elif machine == 'i386':
# On OSX the machine type returned by uname is always the
# 32-bit variant, even if the executable architecture is
# the 64-bit variant
if sys.maxsize >= 2**32:
machine = 'x86_64'
elif machine in ('PowerPC', 'Power_Macintosh'):
# Pick a sane name for the PPC architecture.
# See 'i386' case
if sys.maxsize >= 2**32:
machine = 'ppc64'
else:
machine = 'ppc'
return "%s-%s-%s" % (osname, release, machine)
def get_python_version():
return _PY_VERSION_SHORT
def _print_dict(title, data):
for index, (key, value) in enumerate(sorted(data.items())):
if index == 0:
print('%s: ' % (title))
print('\t%s = "%s"' % (key, value))
def _main():
"""Display all information sysconfig detains."""
print('Platform: "%s"' % get_platform())
print('Python version: "%s"' % get_python_version())
print('Current installation scheme: "%s"' % _get_default_scheme())
print()
_print_dict('Paths', get_paths())
print()
_print_dict('Variables', get_config_vars())
if __name__ == '__main__':
_main()

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,509 @@
# -*- coding: utf-8 -*-
#
# Copyright (C) 2013 Vinay Sajip.
# Licensed to the Python Software Foundation under a contributor agreement.
# See LICENSE.txt and CONTRIBUTORS.txt.
#
import hashlib
import logging
import os
import shutil
import subprocess
import tempfile
try:
from threading import Thread
except ImportError:
from dummy_threading import Thread
from . import DistlibException
from .compat import (HTTPBasicAuthHandler, Request, HTTPPasswordMgr,
urlparse, build_opener, string_types)
from .util import zip_dir, ServerProxy
logger = logging.getLogger(__name__)
DEFAULT_INDEX = 'https://pypi.org/pypi'
DEFAULT_REALM = 'pypi'
class PackageIndex(object):
"""
This class represents a package index compatible with PyPI, the Python
Package Index.
"""
boundary = b'----------ThIs_Is_tHe_distlib_index_bouNdaRY_$'
def __init__(self, url=None):
"""
Initialise an instance.
:param url: The URL of the index. If not specified, the URL for PyPI is
used.
"""
self.url = url or DEFAULT_INDEX
self.read_configuration()
scheme, netloc, path, params, query, frag = urlparse(self.url)
if params or query or frag or scheme not in ('http', 'https'):
raise DistlibException('invalid repository: %s' % self.url)
self.password_handler = None
self.ssl_verifier = None
self.gpg = None
self.gpg_home = None
with open(os.devnull, 'w') as sink:
# Use gpg by default rather than gpg2, as gpg2 insists on
# prompting for passwords
for s in ('gpg', 'gpg2'):
try:
rc = subprocess.check_call([s, '--version'], stdout=sink,
stderr=sink)
if rc == 0:
self.gpg = s
break
except OSError:
pass
def _get_pypirc_command(self):
"""
Get the distutils command for interacting with PyPI configurations.
:return: the command.
"""
from .util import _get_pypirc_command as cmd
return cmd()
def read_configuration(self):
"""
Read the PyPI access configuration as supported by distutils. This populates
``username``, ``password``, ``realm`` and ``url`` attributes from the
configuration.
"""
from .util import _load_pypirc
cfg = _load_pypirc(self)
self.username = cfg.get('username')
self.password = cfg.get('password')
self.realm = cfg.get('realm', 'pypi')
self.url = cfg.get('repository', self.url)
def save_configuration(self):
"""
Save the PyPI access configuration. You must have set ``username`` and
``password`` attributes before calling this method.
"""
self.check_credentials()
from .util import _store_pypirc
_store_pypirc(self)
def check_credentials(self):
"""
Check that ``username`` and ``password`` have been set, and raise an
exception if not.
"""
if self.username is None or self.password is None:
raise DistlibException('username and password must be set')
pm = HTTPPasswordMgr()
_, netloc, _, _, _, _ = urlparse(self.url)
pm.add_password(self.realm, netloc, self.username, self.password)
self.password_handler = HTTPBasicAuthHandler(pm)
def register(self, metadata):
"""
Register a distribution on PyPI, using the provided metadata.
:param metadata: A :class:`Metadata` instance defining at least a name
and version number for the distribution to be
registered.
:return: The HTTP response received from PyPI upon submission of the
request.
"""
self.check_credentials()
metadata.validate()
d = metadata.todict()
d[':action'] = 'verify'
request = self.encode_request(d.items(), [])
response = self.send_request(request)
d[':action'] = 'submit'
request = self.encode_request(d.items(), [])
return self.send_request(request)
def _reader(self, name, stream, outbuf):
"""
Thread runner for reading lines of from a subprocess into a buffer.
:param name: The logical name of the stream (used for logging only).
:param stream: The stream to read from. This will typically a pipe
connected to the output stream of a subprocess.
:param outbuf: The list to append the read lines to.
"""
while True:
s = stream.readline()
if not s:
break
s = s.decode('utf-8').rstrip()
outbuf.append(s)
logger.debug('%s: %s' % (name, s))
stream.close()
def get_sign_command(self, filename, signer, sign_password,
keystore=None):
"""
Return a suitable command for signing a file.
:param filename: The pathname to the file to be signed.
:param signer: The identifier of the signer of the file.
:param sign_password: The passphrase for the signer's
private key used for signing.
:param keystore: The path to a directory which contains the keys
used in verification. If not specified, the
instance's ``gpg_home`` attribute is used instead.
:return: The signing command as a list suitable to be
passed to :class:`subprocess.Popen`.
"""
cmd = [self.gpg, '--status-fd', '2', '--no-tty']
if keystore is None:
keystore = self.gpg_home
if keystore:
cmd.extend(['--homedir', keystore])
if sign_password is not None:
cmd.extend(['--batch', '--passphrase-fd', '0'])
td = tempfile.mkdtemp()
sf = os.path.join(td, os.path.basename(filename) + '.asc')
cmd.extend(['--detach-sign', '--armor', '--local-user',
signer, '--output', sf, filename])
logger.debug('invoking: %s', ' '.join(cmd))
return cmd, sf
def run_command(self, cmd, input_data=None):
"""
Run a command in a child process , passing it any input data specified.
:param cmd: The command to run.
:param input_data: If specified, this must be a byte string containing
data to be sent to the child process.
:return: A tuple consisting of the subprocess' exit code, a list of
lines read from the subprocess' ``stdout``, and a list of
lines read from the subprocess' ``stderr``.
"""
kwargs = {
'stdout': subprocess.PIPE,
'stderr': subprocess.PIPE,
}
if input_data is not None:
kwargs['stdin'] = subprocess.PIPE
stdout = []
stderr = []
p = subprocess.Popen(cmd, **kwargs)
# We don't use communicate() here because we may need to
# get clever with interacting with the command
t1 = Thread(target=self._reader, args=('stdout', p.stdout, stdout))
t1.start()
t2 = Thread(target=self._reader, args=('stderr', p.stderr, stderr))
t2.start()
if input_data is not None:
p.stdin.write(input_data)
p.stdin.close()
p.wait()
t1.join()
t2.join()
return p.returncode, stdout, stderr
def sign_file(self, filename, signer, sign_password, keystore=None):
"""
Sign a file.
:param filename: The pathname to the file to be signed.
:param signer: The identifier of the signer of the file.
:param sign_password: The passphrase for the signer's
private key used for signing.
:param keystore: The path to a directory which contains the keys
used in signing. If not specified, the instance's
``gpg_home`` attribute is used instead.
:return: The absolute pathname of the file where the signature is
stored.
"""
cmd, sig_file = self.get_sign_command(filename, signer, sign_password,
keystore)
rc, stdout, stderr = self.run_command(cmd,
sign_password.encode('utf-8'))
if rc != 0:
raise DistlibException('sign command failed with error '
'code %s' % rc)
return sig_file
def upload_file(self, metadata, filename, signer=None, sign_password=None,
filetype='sdist', pyversion='source', keystore=None):
"""
Upload a release file to the index.
:param metadata: A :class:`Metadata` instance defining at least a name
and version number for the file to be uploaded.
:param filename: The pathname of the file to be uploaded.
:param signer: The identifier of the signer of the file.
:param sign_password: The passphrase for the signer's
private key used for signing.
:param filetype: The type of the file being uploaded. This is the
distutils command which produced that file, e.g.
``sdist`` or ``bdist_wheel``.
:param pyversion: The version of Python which the release relates
to. For code compatible with any Python, this would
be ``source``, otherwise it would be e.g. ``3.2``.
:param keystore: The path to a directory which contains the keys
used in signing. If not specified, the instance's
``gpg_home`` attribute is used instead.
:return: The HTTP response received from PyPI upon submission of the
request.
"""
self.check_credentials()
if not os.path.exists(filename):
raise DistlibException('not found: %s' % filename)
metadata.validate()
d = metadata.todict()
sig_file = None
if signer:
if not self.gpg:
logger.warning('no signing program available - not signed')
else:
sig_file = self.sign_file(filename, signer, sign_password,
keystore)
with open(filename, 'rb') as f:
file_data = f.read()
md5_digest = hashlib.md5(file_data).hexdigest()
sha256_digest = hashlib.sha256(file_data).hexdigest()
d.update({
':action': 'file_upload',
'protocol_version': '1',
'filetype': filetype,
'pyversion': pyversion,
'md5_digest': md5_digest,
'sha256_digest': sha256_digest,
})
files = [('content', os.path.basename(filename), file_data)]
if sig_file:
with open(sig_file, 'rb') as f:
sig_data = f.read()
files.append(('gpg_signature', os.path.basename(sig_file),
sig_data))
shutil.rmtree(os.path.dirname(sig_file))
request = self.encode_request(d.items(), files)
return self.send_request(request)
def upload_documentation(self, metadata, doc_dir):
"""
Upload documentation to the index.
:param metadata: A :class:`Metadata` instance defining at least a name
and version number for the documentation to be
uploaded.
:param doc_dir: The pathname of the directory which contains the
documentation. This should be the directory that
contains the ``index.html`` for the documentation.
:return: The HTTP response received from PyPI upon submission of the
request.
"""
self.check_credentials()
if not os.path.isdir(doc_dir):
raise DistlibException('not a directory: %r' % doc_dir)
fn = os.path.join(doc_dir, 'index.html')
if not os.path.exists(fn):
raise DistlibException('not found: %r' % fn)
metadata.validate()
name, version = metadata.name, metadata.version
zip_data = zip_dir(doc_dir).getvalue()
fields = [(':action', 'doc_upload'),
('name', name), ('version', version)]
files = [('content', name, zip_data)]
request = self.encode_request(fields, files)
return self.send_request(request)
def get_verify_command(self, signature_filename, data_filename,
keystore=None):
"""
Return a suitable command for verifying a file.
:param signature_filename: The pathname to the file containing the
signature.
:param data_filename: The pathname to the file containing the
signed data.
:param keystore: The path to a directory which contains the keys
used in verification. If not specified, the
instance's ``gpg_home`` attribute is used instead.
:return: The verifying command as a list suitable to be
passed to :class:`subprocess.Popen`.
"""
cmd = [self.gpg, '--status-fd', '2', '--no-tty']
if keystore is None:
keystore = self.gpg_home
if keystore:
cmd.extend(['--homedir', keystore])
cmd.extend(['--verify', signature_filename, data_filename])
logger.debug('invoking: %s', ' '.join(cmd))
return cmd
def verify_signature(self, signature_filename, data_filename,
keystore=None):
"""
Verify a signature for a file.
:param signature_filename: The pathname to the file containing the
signature.
:param data_filename: The pathname to the file containing the
signed data.
:param keystore: The path to a directory which contains the keys
used in verification. If not specified, the
instance's ``gpg_home`` attribute is used instead.
:return: True if the signature was verified, else False.
"""
if not self.gpg:
raise DistlibException('verification unavailable because gpg '
'unavailable')
cmd = self.get_verify_command(signature_filename, data_filename,
keystore)
rc, stdout, stderr = self.run_command(cmd)
if rc not in (0, 1):
raise DistlibException('verify command failed with error '
'code %s' % rc)
return rc == 0
def download_file(self, url, destfile, digest=None, reporthook=None):
"""
This is a convenience method for downloading a file from an URL.
Normally, this will be a file from the index, though currently
no check is made for this (i.e. a file can be downloaded from
anywhere).
The method is just like the :func:`urlretrieve` function in the
standard library, except that it allows digest computation to be
done during download and checking that the downloaded data
matched any expected value.
:param url: The URL of the file to be downloaded (assumed to be
available via an HTTP GET request).
:param destfile: The pathname where the downloaded file is to be
saved.
:param digest: If specified, this must be a (hasher, value)
tuple, where hasher is the algorithm used (e.g.
``'md5'``) and ``value`` is the expected value.
:param reporthook: The same as for :func:`urlretrieve` in the
standard library.
"""
if digest is None:
digester = None
logger.debug('No digest specified')
else:
if isinstance(digest, (list, tuple)):
hasher, digest = digest
else:
hasher = 'md5'
digester = getattr(hashlib, hasher)()
logger.debug('Digest specified: %s' % digest)
# The following code is equivalent to urlretrieve.
# We need to do it this way so that we can compute the
# digest of the file as we go.
with open(destfile, 'wb') as dfp:
# addinfourl is not a context manager on 2.x
# so we have to use try/finally
sfp = self.send_request(Request(url))
try:
headers = sfp.info()
blocksize = 8192
size = -1
read = 0
blocknum = 0
if "content-length" in headers:
size = int(headers["Content-Length"])
if reporthook:
reporthook(blocknum, blocksize, size)
while True:
block = sfp.read(blocksize)
if not block:
break
read += len(block)
dfp.write(block)
if digester:
digester.update(block)
blocknum += 1
if reporthook:
reporthook(blocknum, blocksize, size)
finally:
sfp.close()
# check that we got the whole file, if we can
if size >= 0 and read < size:
raise DistlibException(
'retrieval incomplete: got only %d out of %d bytes'
% (read, size))
# if we have a digest, it must match.
if digester:
actual = digester.hexdigest()
if digest != actual:
raise DistlibException('%s digest mismatch for %s: expected '
'%s, got %s' % (hasher, destfile,
digest, actual))
logger.debug('Digest verified: %s', digest)
def send_request(self, req):
"""
Send a standard library :class:`Request` to PyPI and return its
response.
:param req: The request to send.
:return: The HTTP response from PyPI (a standard library HTTPResponse).
"""
handlers = []
if self.password_handler:
handlers.append(self.password_handler)
if self.ssl_verifier:
handlers.append(self.ssl_verifier)
opener = build_opener(*handlers)
return opener.open(req)
def encode_request(self, fields, files):
"""
Encode fields and files for posting to an HTTP server.
:param fields: The fields to send as a list of (fieldname, value)
tuples.
:param files: The files to send as a list of (fieldname, filename,
file_bytes) tuple.
"""
# Adapted from packaging, which in turn was adapted from
# http://code.activestate.com/recipes/146306
parts = []
boundary = self.boundary
for k, values in fields:
if not isinstance(values, (list, tuple)):
values = [values]
for v in values:
parts.extend((
b'--' + boundary,
('Content-Disposition: form-data; name="%s"' %
k).encode('utf-8'),
b'',
v.encode('utf-8')))
for key, filename, value in files:
parts.extend((
b'--' + boundary,
('Content-Disposition: form-data; name="%s"; filename="%s"' %
(key, filename)).encode('utf-8'),
b'',
value))
parts.extend((b'--' + boundary + b'--', b''))
body = b'\r\n'.join(parts)
ct = b'multipart/form-data; boundary=' + boundary
headers = {
'Content-type': ct,
'Content-length': str(len(body))
}
return Request(self.url, body, headers)
def search(self, terms, operator=None):
if isinstance(terms, string_types):
terms = {'name': terms}
rpc_proxy = ServerProxy(self.url, timeout=3.0)
try:
return rpc_proxy.search(terms, operator or 'and')
finally:
rpc_proxy('close')()

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,393 @@
# -*- coding: utf-8 -*-
#
# Copyright (C) 2012-2013 Python Software Foundation.
# See LICENSE.txt and CONTRIBUTORS.txt.
#
"""
Class representing the list of files in a distribution.
Equivalent to distutils.filelist, but fixes some problems.
"""
import fnmatch
import logging
import os
import re
import sys
from . import DistlibException
from .compat import fsdecode
from .util import convert_path
__all__ = ['Manifest']
logger = logging.getLogger(__name__)
# a \ followed by some spaces + EOL
_COLLAPSE_PATTERN = re.compile('\\\\w*\n', re.M)
_COMMENTED_LINE = re.compile('#.*?(?=\n)|\n(?=$)', re.M | re.S)
#
# Due to the different results returned by fnmatch.translate, we need
# to do slightly different processing for Python 2.7 and 3.2 ... this needed
# to be brought in for Python 3.6 onwards.
#
_PYTHON_VERSION = sys.version_info[:2]
class Manifest(object):
"""A list of files built by on exploring the filesystem and filtered by
applying various patterns to what we find there.
"""
def __init__(self, base=None):
"""
Initialise an instance.
:param base: The base directory to explore under.
"""
self.base = os.path.abspath(os.path.normpath(base or os.getcwd()))
self.prefix = self.base + os.sep
self.allfiles = None
self.files = set()
#
# Public API
#
def findall(self):
"""Find all files under the base and set ``allfiles`` to the absolute
pathnames of files found.
"""
from stat import S_ISREG, S_ISDIR, S_ISLNK
self.allfiles = allfiles = []
root = self.base
stack = [root]
pop = stack.pop
push = stack.append
while stack:
root = pop()
names = os.listdir(root)
for name in names:
fullname = os.path.join(root, name)
# Avoid excess stat calls -- just one will do, thank you!
stat = os.stat(fullname)
mode = stat.st_mode
if S_ISREG(mode):
allfiles.append(fsdecode(fullname))
elif S_ISDIR(mode) and not S_ISLNK(mode):
push(fullname)
def add(self, item):
"""
Add a file to the manifest.
:param item: The pathname to add. This can be relative to the base.
"""
if not item.startswith(self.prefix):
item = os.path.join(self.base, item)
self.files.add(os.path.normpath(item))
def add_many(self, items):
"""
Add a list of files to the manifest.
:param items: The pathnames to add. These can be relative to the base.
"""
for item in items:
self.add(item)
def sorted(self, wantdirs=False):
"""
Return sorted files in directory order
"""
def add_dir(dirs, d):
dirs.add(d)
logger.debug('add_dir added %s', d)
if d != self.base:
parent, _ = os.path.split(d)
assert parent not in ('', '/')
add_dir(dirs, parent)
result = set(self.files) # make a copy!
if wantdirs:
dirs = set()
for f in result:
add_dir(dirs, os.path.dirname(f))
result |= dirs
return [os.path.join(*path_tuple) for path_tuple in
sorted(os.path.split(path) for path in result)]
def clear(self):
"""Clear all collected files."""
self.files = set()
self.allfiles = []
def process_directive(self, directive):
"""
Process a directive which either adds some files from ``allfiles`` to
``files``, or removes some files from ``files``.
:param directive: The directive to process. This should be in a format
compatible with distutils ``MANIFEST.in`` files:
http://docs.python.org/distutils/sourcedist.html#commands
"""
# Parse the line: split it up, make sure the right number of words
# is there, and return the relevant words. 'action' is always
# defined: it's the first word of the line. Which of the other
# three are defined depends on the action; it'll be either
# patterns, (dir and patterns), or (dirpattern).
action, patterns, thedir, dirpattern = self._parse_directive(directive)
# OK, now we know that the action is valid and we have the
# right number of words on the line for that action -- so we
# can proceed with minimal error-checking.
if action == 'include':
for pattern in patterns:
if not self._include_pattern(pattern, anchor=True):
logger.warning('no files found matching %r', pattern)
elif action == 'exclude':
for pattern in patterns:
found = self._exclude_pattern(pattern, anchor=True)
#if not found:
# logger.warning('no previously-included files '
# 'found matching %r', pattern)
elif action == 'global-include':
for pattern in patterns:
if not self._include_pattern(pattern, anchor=False):
logger.warning('no files found matching %r '
'anywhere in distribution', pattern)
elif action == 'global-exclude':
for pattern in patterns:
found = self._exclude_pattern(pattern, anchor=False)
#if not found:
# logger.warning('no previously-included files '
# 'matching %r found anywhere in '
# 'distribution', pattern)
elif action == 'recursive-include':
for pattern in patterns:
if not self._include_pattern(pattern, prefix=thedir):
logger.warning('no files found matching %r '
'under directory %r', pattern, thedir)
elif action == 'recursive-exclude':
for pattern in patterns:
found = self._exclude_pattern(pattern, prefix=thedir)
#if not found:
# logger.warning('no previously-included files '
# 'matching %r found under directory %r',
# pattern, thedir)
elif action == 'graft':
if not self._include_pattern(None, prefix=dirpattern):
logger.warning('no directories found matching %r',
dirpattern)
elif action == 'prune':
if not self._exclude_pattern(None, prefix=dirpattern):
logger.warning('no previously-included directories found '
'matching %r', dirpattern)
else: # pragma: no cover
# This should never happen, as it should be caught in
# _parse_template_line
raise DistlibException(
'invalid action %r' % action)
#
# Private API
#
def _parse_directive(self, directive):
"""
Validate a directive.
:param directive: The directive to validate.
:return: A tuple of action, patterns, thedir, dir_patterns
"""
words = directive.split()
if len(words) == 1 and words[0] not in ('include', 'exclude',
'global-include',
'global-exclude',
'recursive-include',
'recursive-exclude',
'graft', 'prune'):
# no action given, let's use the default 'include'
words.insert(0, 'include')
action = words[0]
patterns = thedir = dir_pattern = None
if action in ('include', 'exclude',
'global-include', 'global-exclude'):
if len(words) < 2:
raise DistlibException(
'%r expects <pattern1> <pattern2> ...' % action)
patterns = [convert_path(word) for word in words[1:]]
elif action in ('recursive-include', 'recursive-exclude'):
if len(words) < 3:
raise DistlibException(
'%r expects <dir> <pattern1> <pattern2> ...' % action)
thedir = convert_path(words[1])
patterns = [convert_path(word) for word in words[2:]]
elif action in ('graft', 'prune'):
if len(words) != 2:
raise DistlibException(
'%r expects a single <dir_pattern>' % action)
dir_pattern = convert_path(words[1])
else:
raise DistlibException('unknown action %r' % action)
return action, patterns, thedir, dir_pattern
def _include_pattern(self, pattern, anchor=True, prefix=None,
is_regex=False):
"""Select strings (presumably filenames) from 'self.files' that
match 'pattern', a Unix-style wildcard (glob) pattern.
Patterns are not quite the same as implemented by the 'fnmatch'
module: '*' and '?' match non-special characters, where "special"
is platform-dependent: slash on Unix; colon, slash, and backslash on
DOS/Windows; and colon on Mac OS.
If 'anchor' is true (the default), then the pattern match is more
stringent: "*.py" will match "foo.py" but not "foo/bar.py". If
'anchor' is false, both of these will match.
If 'prefix' is supplied, then only filenames starting with 'prefix'
(itself a pattern) and ending with 'pattern', with anything in between
them, will match. 'anchor' is ignored in this case.
If 'is_regex' is true, 'anchor' and 'prefix' are ignored, and
'pattern' is assumed to be either a string containing a regex or a
regex object -- no translation is done, the regex is just compiled
and used as-is.
Selected strings will be added to self.files.
Return True if files are found.
"""
# XXX docstring lying about what the special chars are?
found = False
pattern_re = self._translate_pattern(pattern, anchor, prefix, is_regex)
# delayed loading of allfiles list
if self.allfiles is None:
self.findall()
for name in self.allfiles:
if pattern_re.search(name):
self.files.add(name)
found = True
return found
def _exclude_pattern(self, pattern, anchor=True, prefix=None,
is_regex=False):
"""Remove strings (presumably filenames) from 'files' that match
'pattern'.
Other parameters are the same as for 'include_pattern()', above.
The list 'self.files' is modified in place. Return True if files are
found.
This API is public to allow e.g. exclusion of SCM subdirs, e.g. when
packaging source distributions
"""
found = False
pattern_re = self._translate_pattern(pattern, anchor, prefix, is_regex)
for f in list(self.files):
if pattern_re.search(f):
self.files.remove(f)
found = True
return found
def _translate_pattern(self, pattern, anchor=True, prefix=None,
is_regex=False):
"""Translate a shell-like wildcard pattern to a compiled regular
expression.
Return the compiled regex. If 'is_regex' true,
then 'pattern' is directly compiled to a regex (if it's a string)
or just returned as-is (assumes it's a regex object).
"""
if is_regex:
if isinstance(pattern, str):
return re.compile(pattern)
else:
return pattern
if _PYTHON_VERSION > (3, 2):
# ditch start and end characters
start, _, end = self._glob_to_re('_').partition('_')
if pattern:
pattern_re = self._glob_to_re(pattern)
if _PYTHON_VERSION > (3, 2):
assert pattern_re.startswith(start) and pattern_re.endswith(end)
else:
pattern_re = ''
base = re.escape(os.path.join(self.base, ''))
if prefix is not None:
# ditch end of pattern character
if _PYTHON_VERSION <= (3, 2):
empty_pattern = self._glob_to_re('')
prefix_re = self._glob_to_re(prefix)[:-len(empty_pattern)]
else:
prefix_re = self._glob_to_re(prefix)
assert prefix_re.startswith(start) and prefix_re.endswith(end)
prefix_re = prefix_re[len(start): len(prefix_re) - len(end)]
sep = os.sep
if os.sep == '\\':
sep = r'\\'
if _PYTHON_VERSION <= (3, 2):
pattern_re = '^' + base + sep.join((prefix_re,
'.*' + pattern_re))
else:
pattern_re = pattern_re[len(start): len(pattern_re) - len(end)]
pattern_re = r'%s%s%s%s.*%s%s' % (start, base, prefix_re, sep,
pattern_re, end)
else: # no prefix -- respect anchor flag
if anchor:
if _PYTHON_VERSION <= (3, 2):
pattern_re = '^' + base + pattern_re
else:
pattern_re = r'%s%s%s' % (start, base, pattern_re[len(start):])
return re.compile(pattern_re)
def _glob_to_re(self, pattern):
"""Translate a shell-like glob pattern to a regular expression.
Return a string containing the regex. Differs from
'fnmatch.translate()' in that '*' does not match "special characters"
(which are platform-specific).
"""
pattern_re = fnmatch.translate(pattern)
# '?' and '*' in the glob pattern become '.' and '.*' in the RE, which
# IMHO is wrong -- '?' and '*' aren't supposed to match slash in Unix,
# and by extension they shouldn't match such "special characters" under
# any OS. So change all non-escaped dots in the RE to match any
# character except the special characters (currently: just os.sep).
sep = os.sep
if os.sep == '\\':
# we're using a regex to manipulate a regex, so we need
# to escape the backslash twice
sep = r'\\\\'
escaped = r'\1[^%s]' % sep
pattern_re = re.sub(r'((?<!\\)(\\\\)*)\.', escaped, pattern_re)
return pattern_re

View File

@ -0,0 +1,147 @@
# -*- coding: utf-8 -*-
#
# Copyright (C) 2012-2017 Vinay Sajip.
# Licensed to the Python Software Foundation under a contributor agreement.
# See LICENSE.txt and CONTRIBUTORS.txt.
#
"""
Parser for the environment markers micro-language defined in PEP 508.
"""
# Note: In PEP 345, the micro-language was Python compatible, so the ast
# module could be used to parse it. However, PEP 508 introduced operators such
# as ~= and === which aren't in Python, necessitating a different approach.
import os
import re
import sys
import platform
from .compat import string_types
from .util import in_venv, parse_marker
from .version import NormalizedVersion as NV
__all__ = ['interpret']
_VERSION_PATTERN = re.compile(r'((\d+(\.\d+)*\w*)|\'(\d+(\.\d+)*\w*)\'|\"(\d+(\.\d+)*\w*)\")')
def _is_literal(o):
if not isinstance(o, string_types) or not o:
return False
return o[0] in '\'"'
def _get_versions(s):
result = []
for m in _VERSION_PATTERN.finditer(s):
result.append(NV(m.groups()[0]))
return set(result)
class Evaluator(object):
"""
This class is used to evaluate marker expessions.
"""
operations = {
'==': lambda x, y: x == y,
'===': lambda x, y: x == y,
'~=': lambda x, y: x == y or x > y,
'!=': lambda x, y: x != y,
'<': lambda x, y: x < y,
'<=': lambda x, y: x == y or x < y,
'>': lambda x, y: x > y,
'>=': lambda x, y: x == y or x > y,
'and': lambda x, y: x and y,
'or': lambda x, y: x or y,
'in': lambda x, y: x in y,
'not in': lambda x, y: x not in y,
}
def evaluate(self, expr, context):
"""
Evaluate a marker expression returned by the :func:`parse_requirement`
function in the specified context.
"""
if isinstance(expr, string_types):
if expr[0] in '\'"':
result = expr[1:-1]
else:
if expr not in context:
raise SyntaxError('unknown variable: %s' % expr)
result = context[expr]
else:
assert isinstance(expr, dict)
op = expr['op']
if op not in self.operations:
raise NotImplementedError('op not implemented: %s' % op)
elhs = expr['lhs']
erhs = expr['rhs']
if _is_literal(expr['lhs']) and _is_literal(expr['rhs']):
raise SyntaxError('invalid comparison: %s %s %s' % (elhs, op, erhs))
lhs = self.evaluate(elhs, context)
rhs = self.evaluate(erhs, context)
if ((elhs == 'python_version' or erhs == 'python_version') and
op in ('<', '<=', '>', '>=', '===', '==', '!=', '~=')):
lhs = NV(lhs)
rhs = NV(rhs)
elif elhs == 'python_version' and op in ('in', 'not in'):
lhs = NV(lhs)
rhs = _get_versions(rhs)
result = self.operations[op](lhs, rhs)
return result
def default_context():
def format_full_version(info):
version = '%s.%s.%s' % (info.major, info.minor, info.micro)
kind = info.releaselevel
if kind != 'final':
version += kind[0] + str(info.serial)
return version
if hasattr(sys, 'implementation'):
implementation_version = format_full_version(sys.implementation.version)
implementation_name = sys.implementation.name
else:
implementation_version = '0'
implementation_name = ''
result = {
'implementation_name': implementation_name,
'implementation_version': implementation_version,
'os_name': os.name,
'platform_machine': platform.machine(),
'platform_python_implementation': platform.python_implementation(),
'platform_release': platform.release(),
'platform_system': platform.system(),
'platform_version': platform.version(),
'platform_in_venv': str(in_venv()),
'python_full_version': platform.python_version(),
'python_version': platform.python_version()[:3],
'sys_platform': sys.platform,
}
return result
DEFAULT_CONTEXT = default_context()
del default_context
evaluator = Evaluator()
def interpret(marker, execution_context=None):
"""
Interpret a marker and return a result depending on environment.
:param marker: The marker to interpret.
:type marker: str
:param execution_context: The context used for name lookup.
:type execution_context: mapping
"""
try:
expr, rest = parse_marker(marker)
except Exception as e:
raise SyntaxError('Unable to interpret marker syntax: %s: %s' % (marker, e))
if rest and rest[0] != '#':
raise SyntaxError('unexpected trailing data in marker: %s: %s' % (marker, rest))
context = dict(DEFAULT_CONTEXT)
if execution_context:
context.update(execution_context)
return evaluator.evaluate(expr, context)

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,358 @@
# -*- coding: utf-8 -*-
#
# Copyright (C) 2013-2017 Vinay Sajip.
# Licensed to the Python Software Foundation under a contributor agreement.
# See LICENSE.txt and CONTRIBUTORS.txt.
#
from __future__ import unicode_literals
import bisect
import io
import logging
import os
import pkgutil
import sys
import types
import zipimport
from . import DistlibException
from .util import cached_property, get_cache_base, Cache
logger = logging.getLogger(__name__)
cache = None # created when needed
class ResourceCache(Cache):
def __init__(self, base=None):
if base is None:
# Use native string to avoid issues on 2.x: see Python #20140.
base = os.path.join(get_cache_base(), str('resource-cache'))
super(ResourceCache, self).__init__(base)
def is_stale(self, resource, path):
"""
Is the cache stale for the given resource?
:param resource: The :class:`Resource` being cached.
:param path: The path of the resource in the cache.
:return: True if the cache is stale.
"""
# Cache invalidation is a hard problem :-)
return True
def get(self, resource):
"""
Get a resource into the cache,
:param resource: A :class:`Resource` instance.
:return: The pathname of the resource in the cache.
"""
prefix, path = resource.finder.get_cache_info(resource)
if prefix is None:
result = path
else:
result = os.path.join(self.base, self.prefix_to_dir(prefix), path)
dirname = os.path.dirname(result)
if not os.path.isdir(dirname):
os.makedirs(dirname)
if not os.path.exists(result):
stale = True
else:
stale = self.is_stale(resource, path)
if stale:
# write the bytes of the resource to the cache location
with open(result, 'wb') as f:
f.write(resource.bytes)
return result
class ResourceBase(object):
def __init__(self, finder, name):
self.finder = finder
self.name = name
class Resource(ResourceBase):
"""
A class representing an in-package resource, such as a data file. This is
not normally instantiated by user code, but rather by a
:class:`ResourceFinder` which manages the resource.
"""
is_container = False # Backwards compatibility
def as_stream(self):
"""
Get the resource as a stream.
This is not a property to make it obvious that it returns a new stream
each time.
"""
return self.finder.get_stream(self)
@cached_property
def file_path(self):
global cache
if cache is None:
cache = ResourceCache()
return cache.get(self)
@cached_property
def bytes(self):
return self.finder.get_bytes(self)
@cached_property
def size(self):
return self.finder.get_size(self)
class ResourceContainer(ResourceBase):
is_container = True # Backwards compatibility
@cached_property
def resources(self):
return self.finder.get_resources(self)
class ResourceFinder(object):
"""
Resource finder for file system resources.
"""
if sys.platform.startswith('java'):
skipped_extensions = ('.pyc', '.pyo', '.class')
else:
skipped_extensions = ('.pyc', '.pyo')
def __init__(self, module):
self.module = module
self.loader = getattr(module, '__loader__', None)
self.base = os.path.dirname(getattr(module, '__file__', ''))
def _adjust_path(self, path):
return os.path.realpath(path)
def _make_path(self, resource_name):
# Issue #50: need to preserve type of path on Python 2.x
# like os.path._get_sep
if isinstance(resource_name, bytes): # should only happen on 2.x
sep = b'/'
else:
sep = '/'
parts = resource_name.split(sep)
parts.insert(0, self.base)
result = os.path.join(*parts)
return self._adjust_path(result)
def _find(self, path):
return os.path.exists(path)
def get_cache_info(self, resource):
return None, resource.path
def find(self, resource_name):
path = self._make_path(resource_name)
if not self._find(path):
result = None
else:
if self._is_directory(path):
result = ResourceContainer(self, resource_name)
else:
result = Resource(self, resource_name)
result.path = path
return result
def get_stream(self, resource):
return open(resource.path, 'rb')
def get_bytes(self, resource):
with open(resource.path, 'rb') as f:
return f.read()
def get_size(self, resource):
return os.path.getsize(resource.path)
def get_resources(self, resource):
def allowed(f):
return (f != '__pycache__' and not
f.endswith(self.skipped_extensions))
return set([f for f in os.listdir(resource.path) if allowed(f)])
def is_container(self, resource):
return self._is_directory(resource.path)
_is_directory = staticmethod(os.path.isdir)
def iterator(self, resource_name):
resource = self.find(resource_name)
if resource is not None:
todo = [resource]
while todo:
resource = todo.pop(0)
yield resource
if resource.is_container:
rname = resource.name
for name in resource.resources:
if not rname:
new_name = name
else:
new_name = '/'.join([rname, name])
child = self.find(new_name)
if child.is_container:
todo.append(child)
else:
yield child
class ZipResourceFinder(ResourceFinder):
"""
Resource finder for resources in .zip files.
"""
def __init__(self, module):
super(ZipResourceFinder, self).__init__(module)
archive = self.loader.archive
self.prefix_len = 1 + len(archive)
# PyPy doesn't have a _files attr on zipimporter, and you can't set one
if hasattr(self.loader, '_files'):
self._files = self.loader._files
else:
self._files = zipimport._zip_directory_cache[archive]
self.index = sorted(self._files)
def _adjust_path(self, path):
return path
def _find(self, path):
path = path[self.prefix_len:]
if path in self._files:
result = True
else:
if path and path[-1] != os.sep:
path = path + os.sep
i = bisect.bisect(self.index, path)
try:
result = self.index[i].startswith(path)
except IndexError:
result = False
if not result:
logger.debug('_find failed: %r %r', path, self.loader.prefix)
else:
logger.debug('_find worked: %r %r', path, self.loader.prefix)
return result
def get_cache_info(self, resource):
prefix = self.loader.archive
path = resource.path[1 + len(prefix):]
return prefix, path
def get_bytes(self, resource):
return self.loader.get_data(resource.path)
def get_stream(self, resource):
return io.BytesIO(self.get_bytes(resource))
def get_size(self, resource):
path = resource.path[self.prefix_len:]
return self._files[path][3]
def get_resources(self, resource):
path = resource.path[self.prefix_len:]
if path and path[-1] != os.sep:
path += os.sep
plen = len(path)
result = set()
i = bisect.bisect(self.index, path)
while i < len(self.index):
if not self.index[i].startswith(path):
break
s = self.index[i][plen:]
result.add(s.split(os.sep, 1)[0]) # only immediate children
i += 1
return result
def _is_directory(self, path):
path = path[self.prefix_len:]
if path and path[-1] != os.sep:
path += os.sep
i = bisect.bisect(self.index, path)
try:
result = self.index[i].startswith(path)
except IndexError:
result = False
return result
_finder_registry = {
type(None): ResourceFinder,
zipimport.zipimporter: ZipResourceFinder
}
try:
# In Python 3.6, _frozen_importlib -> _frozen_importlib_external
try:
import _frozen_importlib_external as _fi
except ImportError:
import _frozen_importlib as _fi
_finder_registry[_fi.SourceFileLoader] = ResourceFinder
_finder_registry[_fi.FileFinder] = ResourceFinder
# See issue #146
_finder_registry[_fi.SourcelessFileLoader] = ResourceFinder
del _fi
except (ImportError, AttributeError):
pass
def register_finder(loader, finder_maker):
_finder_registry[type(loader)] = finder_maker
_finder_cache = {}
def finder(package):
"""
Return a resource finder for a package.
:param package: The name of the package.
:return: A :class:`ResourceFinder` instance for the package.
"""
if package in _finder_cache:
result = _finder_cache[package]
else:
if package not in sys.modules:
__import__(package)
module = sys.modules[package]
path = getattr(module, '__path__', None)
if path is None:
raise DistlibException('You cannot get a finder for a module, '
'only for a package')
loader = getattr(module, '__loader__', None)
finder_maker = _finder_registry.get(type(loader))
if finder_maker is None:
raise DistlibException('Unable to locate finder for %r' % package)
result = finder_maker(module)
_finder_cache[package] = result
return result
_dummy_module = types.ModuleType(str('__dummy__'))
def finder_for_path(path):
"""
Return a resource finder for a path, which should represent a container.
:param path: The path.
:return: A :class:`ResourceFinder` instance for the path.
"""
result = None
# calls any path hooks, gets importer into cache
pkgutil.get_importer(path)
loader = sys.path_importer_cache.get(path)
finder = _finder_registry.get(type(loader))
if finder:
module = _dummy_module
module.__file__ = os.path.join(path, '')
module.__loader__ = loader
result = finder(module)
return result

View File

@ -0,0 +1,429 @@
# -*- coding: utf-8 -*-
#
# Copyright (C) 2013-2015 Vinay Sajip.
# Licensed to the Python Software Foundation under a contributor agreement.
# See LICENSE.txt and CONTRIBUTORS.txt.
#
from io import BytesIO
import logging
import os
import re
import struct
import sys
from .compat import sysconfig, detect_encoding, ZipFile
from .resources import finder
from .util import (FileOperator, get_export_entry, convert_path,
get_executable, get_platform, in_venv)
logger = logging.getLogger(__name__)
_DEFAULT_MANIFEST = '''
<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
<assembly xmlns="urn:schemas-microsoft-com:asm.v1" manifestVersion="1.0">
<assemblyIdentity version="1.0.0.0"
processorArchitecture="X86"
name="%s"
type="win32"/>
<!-- Identify the application security requirements. -->
<trustInfo xmlns="urn:schemas-microsoft-com:asm.v3">
<security>
<requestedPrivileges>
<requestedExecutionLevel level="asInvoker" uiAccess="false"/>
</requestedPrivileges>
</security>
</trustInfo>
</assembly>'''.strip()
# check if Python is called on the first line with this expression
FIRST_LINE_RE = re.compile(b'^#!.*pythonw?[0-9.]*([ \t].*)?$')
SCRIPT_TEMPLATE = r'''# -*- coding: utf-8 -*-
import re
import sys
from %(module)s import %(import_name)s
if __name__ == '__main__':
sys.argv[0] = re.sub(r'(-script\.pyw|\.exe)?$', '', sys.argv[0])
sys.exit(%(func)s())
'''
def enquote_executable(executable):
if ' ' in executable:
# make sure we quote only the executable in case of env
# for example /usr/bin/env "/dir with spaces/bin/jython"
# instead of "/usr/bin/env /dir with spaces/bin/jython"
# otherwise whole
if executable.startswith('/usr/bin/env '):
env, _executable = executable.split(' ', 1)
if ' ' in _executable and not _executable.startswith('"'):
executable = '%s "%s"' % (env, _executable)
else:
if not executable.startswith('"'):
executable = '"%s"' % executable
return executable
# Keep the old name around (for now), as there is at least one project using it!
_enquote_executable = enquote_executable
class ScriptMaker(object):
"""
A class to copy or create scripts from source scripts or callable
specifications.
"""
script_template = SCRIPT_TEMPLATE
executable = None # for shebangs
def __init__(self, source_dir, target_dir, add_launchers=True,
dry_run=False, fileop=None):
self.source_dir = source_dir
self.target_dir = target_dir
self.add_launchers = add_launchers
self.force = False
self.clobber = False
# It only makes sense to set mode bits on POSIX.
self.set_mode = (os.name == 'posix') or (os.name == 'java' and
os._name == 'posix')
self.variants = set(('', 'X.Y'))
self._fileop = fileop or FileOperator(dry_run)
self._is_nt = os.name == 'nt' or (
os.name == 'java' and os._name == 'nt')
self.version_info = sys.version_info
def _get_alternate_executable(self, executable, options):
if options.get('gui', False) and self._is_nt: # pragma: no cover
dn, fn = os.path.split(executable)
fn = fn.replace('python', 'pythonw')
executable = os.path.join(dn, fn)
return executable
if sys.platform.startswith('java'): # pragma: no cover
def _is_shell(self, executable):
"""
Determine if the specified executable is a script
(contains a #! line)
"""
try:
with open(executable) as fp:
return fp.read(2) == '#!'
except (OSError, IOError):
logger.warning('Failed to open %s', executable)
return False
def _fix_jython_executable(self, executable):
if self._is_shell(executable):
# Workaround for Jython is not needed on Linux systems.
import java
if java.lang.System.getProperty('os.name') == 'Linux':
return executable
elif executable.lower().endswith('jython.exe'):
# Use wrapper exe for Jython on Windows
return executable
return '/usr/bin/env %s' % executable
def _build_shebang(self, executable, post_interp):
"""
Build a shebang line. In the simple case (on Windows, or a shebang line
which is not too long or contains spaces) use a simple formulation for
the shebang. Otherwise, use /bin/sh as the executable, with a contrived
shebang which allows the script to run either under Python or sh, using
suitable quoting. Thanks to Harald Nordgren for his input.
See also: http://www.in-ulm.de/~mascheck/various/shebang/#length
https://hg.mozilla.org/mozilla-central/file/tip/mach
"""
if os.name != 'posix':
simple_shebang = True
else:
# Add 3 for '#!' prefix and newline suffix.
shebang_length = len(executable) + len(post_interp) + 3
if sys.platform == 'darwin':
max_shebang_length = 512
else:
max_shebang_length = 127
simple_shebang = ((b' ' not in executable) and
(shebang_length <= max_shebang_length))
if simple_shebang:
result = b'#!' + executable + post_interp + b'\n'
else:
result = b'#!/bin/sh\n'
result += b"'''exec' " + executable + post_interp + b' "$0" "$@"\n'
result += b"' '''"
return result
def _get_shebang(self, encoding, post_interp=b'', options=None):
enquote = True
if self.executable:
executable = self.executable
enquote = False # assume this will be taken care of
elif not sysconfig.is_python_build():
executable = get_executable()
elif in_venv(): # pragma: no cover
executable = os.path.join(sysconfig.get_path('scripts'),
'python%s' % sysconfig.get_config_var('EXE'))
else: # pragma: no cover
executable = os.path.join(
sysconfig.get_config_var('BINDIR'),
'python%s%s' % (sysconfig.get_config_var('VERSION'),
sysconfig.get_config_var('EXE')))
if not os.path.isfile(executable):
# for Python builds from source on Windows, no Python executables with
# a version suffix are created, so we use python.exe
executable = os.path.join(sysconfig.get_config_var('BINDIR'),
'python%s' % (sysconfig.get_config_var('EXE')))
if options:
executable = self._get_alternate_executable(executable, options)
if sys.platform.startswith('java'): # pragma: no cover
executable = self._fix_jython_executable(executable)
# Normalise case for Windows - COMMENTED OUT
# executable = os.path.normcase(executable)
# N.B. The normalising operation above has been commented out: See
# issue #124. Although paths in Windows are generally case-insensitive,
# they aren't always. For example, a path containing a ẞ (which is a
# LATIN CAPITAL LETTER SHARP S - U+1E9E) is normcased to ß (which is a
# LATIN SMALL LETTER SHARP S' - U+00DF). The two are not considered by
# Windows as equivalent in path names.
# If the user didn't specify an executable, it may be necessary to
# cater for executable paths with spaces (not uncommon on Windows)
if enquote:
executable = enquote_executable(executable)
# Issue #51: don't use fsencode, since we later try to
# check that the shebang is decodable using utf-8.
executable = executable.encode('utf-8')
# in case of IronPython, play safe and enable frames support
if (sys.platform == 'cli' and '-X:Frames' not in post_interp
and '-X:FullFrames' not in post_interp): # pragma: no cover
post_interp += b' -X:Frames'
shebang = self._build_shebang(executable, post_interp)
# Python parser starts to read a script using UTF-8 until
# it gets a #coding:xxx cookie. The shebang has to be the
# first line of a file, the #coding:xxx cookie cannot be
# written before. So the shebang has to be decodable from
# UTF-8.
try:
shebang.decode('utf-8')
except UnicodeDecodeError: # pragma: no cover
raise ValueError(
'The shebang (%r) is not decodable from utf-8' % shebang)
# If the script is encoded to a custom encoding (use a
# #coding:xxx cookie), the shebang has to be decodable from
# the script encoding too.
if encoding != 'utf-8':
try:
shebang.decode(encoding)
except UnicodeDecodeError: # pragma: no cover
raise ValueError(
'The shebang (%r) is not decodable '
'from the script encoding (%r)' % (shebang, encoding))
return shebang
def _get_script_text(self, entry):
return self.script_template % dict(module=entry.prefix,
import_name=entry.suffix.split('.')[0],
func=entry.suffix)
manifest = _DEFAULT_MANIFEST
def get_manifest(self, exename):
base = os.path.basename(exename)
return self.manifest % base
def _write_script(self, names, shebang, script_bytes, filenames, ext):
use_launcher = self.add_launchers and self._is_nt
linesep = os.linesep.encode('utf-8')
if not shebang.endswith(linesep):
shebang += linesep
if not use_launcher:
script_bytes = shebang + script_bytes
else: # pragma: no cover
if ext == 'py':
launcher = self._get_launcher('t')
else:
launcher = self._get_launcher('w')
stream = BytesIO()
with ZipFile(stream, 'w') as zf:
zf.writestr('__main__.py', script_bytes)
zip_data = stream.getvalue()
script_bytes = launcher + shebang + zip_data
for name in names:
outname = os.path.join(self.target_dir, name)
if use_launcher: # pragma: no cover
n, e = os.path.splitext(outname)
if e.startswith('.py'):
outname = n
outname = '%s.exe' % outname
try:
self._fileop.write_binary_file(outname, script_bytes)
except Exception:
# Failed writing an executable - it might be in use.
logger.warning('Failed to write executable - trying to '
'use .deleteme logic')
dfname = '%s.deleteme' % outname
if os.path.exists(dfname):
os.remove(dfname) # Not allowed to fail here
os.rename(outname, dfname) # nor here
self._fileop.write_binary_file(outname, script_bytes)
logger.debug('Able to replace executable using '
'.deleteme logic')
try:
os.remove(dfname)
except Exception:
pass # still in use - ignore error
else:
if self._is_nt and not outname.endswith('.' + ext): # pragma: no cover
outname = '%s.%s' % (outname, ext)
if os.path.exists(outname) and not self.clobber:
logger.warning('Skipping existing file %s', outname)
continue
self._fileop.write_binary_file(outname, script_bytes)
if self.set_mode:
self._fileop.set_executable_mode([outname])
filenames.append(outname)
variant_separator = '-'
def get_script_filenames(self, name):
result = set()
if '' in self.variants:
result.add(name)
if 'X' in self.variants:
result.add('%s%s' % (name, self.version_info[0]))
if 'X.Y' in self.variants:
result.add('%s%s%s.%s' % (name, self.variant_separator,
self.version_info[0], self.version_info[1]))
return result
def _make_script(self, entry, filenames, options=None):
post_interp = b''
if options:
args = options.get('interpreter_args', [])
if args:
args = ' %s' % ' '.join(args)
post_interp = args.encode('utf-8')
shebang = self._get_shebang('utf-8', post_interp, options=options)
script = self._get_script_text(entry).encode('utf-8')
scriptnames = self.get_script_filenames(entry.name)
if options and options.get('gui', False):
ext = 'pyw'
else:
ext = 'py'
self._write_script(scriptnames, shebang, script, filenames, ext)
def _copy_script(self, script, filenames):
adjust = False
script = os.path.join(self.source_dir, convert_path(script))
outname = os.path.join(self.target_dir, os.path.basename(script))
if not self.force and not self._fileop.newer(script, outname):
logger.debug('not copying %s (up-to-date)', script)
return
# Always open the file, but ignore failures in dry-run mode --
# that way, we'll get accurate feedback if we can read the
# script.
try:
f = open(script, 'rb')
except IOError: # pragma: no cover
if not self.dry_run:
raise
f = None
else:
first_line = f.readline()
if not first_line: # pragma: no cover
logger.warning('%s is an empty file (skipping)', script)
return
match = FIRST_LINE_RE.match(first_line.replace(b'\r\n', b'\n'))
if match:
adjust = True
post_interp = match.group(1) or b''
if not adjust:
if f:
f.close()
self._fileop.copy_file(script, outname)
if self.set_mode:
self._fileop.set_executable_mode([outname])
filenames.append(outname)
else:
logger.info('copying and adjusting %s -> %s', script,
self.target_dir)
if not self._fileop.dry_run:
encoding, lines = detect_encoding(f.readline)
f.seek(0)
shebang = self._get_shebang(encoding, post_interp)
if b'pythonw' in first_line: # pragma: no cover
ext = 'pyw'
else:
ext = 'py'
n = os.path.basename(outname)
self._write_script([n], shebang, f.read(), filenames, ext)
if f:
f.close()
@property
def dry_run(self):
return self._fileop.dry_run
@dry_run.setter
def dry_run(self, value):
self._fileop.dry_run = value
if os.name == 'nt' or (os.name == 'java' and os._name == 'nt'): # pragma: no cover
# Executable launcher support.
# Launchers are from https://bitbucket.org/vinay.sajip/simple_launcher/
def _get_launcher(self, kind):
if struct.calcsize('P') == 8: # 64-bit
bits = '64'
else:
bits = '32'
platform_suffix = '-arm' if get_platform() == 'win-arm64' else ''
name = '%s%s%s.exe' % (kind, bits, platform_suffix)
# Issue 31: don't hardcode an absolute package name, but
# determine it relative to the current package
distlib_package = __name__.rsplit('.', 1)[0]
resource = finder(distlib_package).find(name)
if not resource:
msg = ('Unable to find resource %s in package %s' % (name,
distlib_package))
raise ValueError(msg)
return resource.bytes
# Public API follows
def make(self, specification, options=None):
"""
Make a script.
:param specification: The specification, which is either a valid export
entry specification (to make a script from a
callable) or a filename (to make a script by
copying from a source location).
:param options: A dictionary of options controlling script generation.
:return: A list of all absolute pathnames written to.
"""
filenames = []
entry = get_export_entry(specification)
if entry is None:
self._copy_script(specification, filenames)
else:
self._make_script(entry, filenames, options=options)
return filenames
def make_multiple(self, specifications, options=None):
"""
Take a list of specifications and make scripts from them,
:param specifications: A list of specifications.
:return: A list of all absolute pathnames written to,
"""
filenames = []
for specification in specifications:
filenames.extend(self.make(specification, options))
return filenames

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,739 @@
# -*- coding: utf-8 -*-
#
# Copyright (C) 2012-2017 The Python Software Foundation.
# See LICENSE.txt and CONTRIBUTORS.txt.
#
"""
Implementation of a flexible versioning scheme providing support for PEP-440,
setuptools-compatible and semantic versioning.
"""
import logging
import re
from .compat import string_types
from .util import parse_requirement
__all__ = ['NormalizedVersion', 'NormalizedMatcher',
'LegacyVersion', 'LegacyMatcher',
'SemanticVersion', 'SemanticMatcher',
'UnsupportedVersionError', 'get_scheme']
logger = logging.getLogger(__name__)
class UnsupportedVersionError(ValueError):
"""This is an unsupported version."""
pass
class Version(object):
def __init__(self, s):
self._string = s = s.strip()
self._parts = parts = self.parse(s)
assert isinstance(parts, tuple)
assert len(parts) > 0
def parse(self, s):
raise NotImplementedError('please implement in a subclass')
def _check_compatible(self, other):
if type(self) != type(other):
raise TypeError('cannot compare %r and %r' % (self, other))
def __eq__(self, other):
self._check_compatible(other)
return self._parts == other._parts
def __ne__(self, other):
return not self.__eq__(other)
def __lt__(self, other):
self._check_compatible(other)
return self._parts < other._parts
def __gt__(self, other):
return not (self.__lt__(other) or self.__eq__(other))
def __le__(self, other):
return self.__lt__(other) or self.__eq__(other)
def __ge__(self, other):
return self.__gt__(other) or self.__eq__(other)
# See http://docs.python.org/reference/datamodel#object.__hash__
def __hash__(self):
return hash(self._parts)
def __repr__(self):
return "%s('%s')" % (self.__class__.__name__, self._string)
def __str__(self):
return self._string
@property
def is_prerelease(self):
raise NotImplementedError('Please implement in subclasses.')
class Matcher(object):
version_class = None
# value is either a callable or the name of a method
_operators = {
'<': lambda v, c, p: v < c,
'>': lambda v, c, p: v > c,
'<=': lambda v, c, p: v == c or v < c,
'>=': lambda v, c, p: v == c or v > c,
'==': lambda v, c, p: v == c,
'===': lambda v, c, p: v == c,
# by default, compatible => >=.
'~=': lambda v, c, p: v == c or v > c,
'!=': lambda v, c, p: v != c,
}
# this is a method only to support alternative implementations
# via overriding
def parse_requirement(self, s):
return parse_requirement(s)
def __init__(self, s):
if self.version_class is None:
raise ValueError('Please specify a version class')
self._string = s = s.strip()
r = self.parse_requirement(s)
if not r:
raise ValueError('Not valid: %r' % s)
self.name = r.name
self.key = self.name.lower() # for case-insensitive comparisons
clist = []
if r.constraints:
# import pdb; pdb.set_trace()
for op, s in r.constraints:
if s.endswith('.*'):
if op not in ('==', '!='):
raise ValueError('\'.*\' not allowed for '
'%r constraints' % op)
# Could be a partial version (e.g. for '2.*') which
# won't parse as a version, so keep it as a string
vn, prefix = s[:-2], True
# Just to check that vn is a valid version
self.version_class(vn)
else:
# Should parse as a version, so we can create an
# instance for the comparison
vn, prefix = self.version_class(s), False
clist.append((op, vn, prefix))
self._parts = tuple(clist)
def match(self, version):
"""
Check if the provided version matches the constraints.
:param version: The version to match against this instance.
:type version: String or :class:`Version` instance.
"""
if isinstance(version, string_types):
version = self.version_class(version)
for operator, constraint, prefix in self._parts:
f = self._operators.get(operator)
if isinstance(f, string_types):
f = getattr(self, f)
if not f:
msg = ('%r not implemented '
'for %s' % (operator, self.__class__.__name__))
raise NotImplementedError(msg)
if not f(version, constraint, prefix):
return False
return True
@property
def exact_version(self):
result = None
if len(self._parts) == 1 and self._parts[0][0] in ('==', '==='):
result = self._parts[0][1]
return result
def _check_compatible(self, other):
if type(self) != type(other) or self.name != other.name:
raise TypeError('cannot compare %s and %s' % (self, other))
def __eq__(self, other):
self._check_compatible(other)
return self.key == other.key and self._parts == other._parts
def __ne__(self, other):
return not self.__eq__(other)
# See http://docs.python.org/reference/datamodel#object.__hash__
def __hash__(self):
return hash(self.key) + hash(self._parts)
def __repr__(self):
return "%s(%r)" % (self.__class__.__name__, self._string)
def __str__(self):
return self._string
PEP440_VERSION_RE = re.compile(r'^v?(\d+!)?(\d+(\.\d+)*)((a|b|c|rc)(\d+))?'
r'(\.(post)(\d+))?(\.(dev)(\d+))?'
r'(\+([a-zA-Z\d]+(\.[a-zA-Z\d]+)?))?$')
def _pep_440_key(s):
s = s.strip()
m = PEP440_VERSION_RE.match(s)
if not m:
raise UnsupportedVersionError('Not a valid version: %s' % s)
groups = m.groups()
nums = tuple(int(v) for v in groups[1].split('.'))
while len(nums) > 1 and nums[-1] == 0:
nums = nums[:-1]
if not groups[0]:
epoch = 0
else:
epoch = int(groups[0][:-1])
pre = groups[4:6]
post = groups[7:9]
dev = groups[10:12]
local = groups[13]
if pre == (None, None):
pre = ()
else:
pre = pre[0], int(pre[1])
if post == (None, None):
post = ()
else:
post = post[0], int(post[1])
if dev == (None, None):
dev = ()
else:
dev = dev[0], int(dev[1])
if local is None:
local = ()
else:
parts = []
for part in local.split('.'):
# to ensure that numeric compares as > lexicographic, avoid
# comparing them directly, but encode a tuple which ensures
# correct sorting
if part.isdigit():
part = (1, int(part))
else:
part = (0, part)
parts.append(part)
local = tuple(parts)
if not pre:
# either before pre-release, or final release and after
if not post and dev:
# before pre-release
pre = ('a', -1) # to sort before a0
else:
pre = ('z',) # to sort after all pre-releases
# now look at the state of post and dev.
if not post:
post = ('_',) # sort before 'a'
if not dev:
dev = ('final',)
#print('%s -> %s' % (s, m.groups()))
return epoch, nums, pre, post, dev, local
_normalized_key = _pep_440_key
class NormalizedVersion(Version):
"""A rational version.
Good:
1.2 # equivalent to "1.2.0"
1.2.0
1.2a1
1.2.3a2
1.2.3b1
1.2.3c1
1.2.3.4
TODO: fill this out
Bad:
1 # minimum two numbers
1.2a # release level must have a release serial
1.2.3b
"""
def parse(self, s):
result = _normalized_key(s)
# _normalized_key loses trailing zeroes in the release
# clause, since that's needed to ensure that X.Y == X.Y.0 == X.Y.0.0
# However, PEP 440 prefix matching needs it: for example,
# (~= 1.4.5.0) matches differently to (~= 1.4.5.0.0).
m = PEP440_VERSION_RE.match(s) # must succeed
groups = m.groups()
self._release_clause = tuple(int(v) for v in groups[1].split('.'))
return result
PREREL_TAGS = set(['a', 'b', 'c', 'rc', 'dev'])
@property
def is_prerelease(self):
return any(t[0] in self.PREREL_TAGS for t in self._parts if t)
def _match_prefix(x, y):
x = str(x)
y = str(y)
if x == y:
return True
if not x.startswith(y):
return False
n = len(y)
return x[n] == '.'
class NormalizedMatcher(Matcher):
version_class = NormalizedVersion
# value is either a callable or the name of a method
_operators = {
'~=': '_match_compatible',
'<': '_match_lt',
'>': '_match_gt',
'<=': '_match_le',
'>=': '_match_ge',
'==': '_match_eq',
'===': '_match_arbitrary',
'!=': '_match_ne',
}
def _adjust_local(self, version, constraint, prefix):
if prefix:
strip_local = '+' not in constraint and version._parts[-1]
else:
# both constraint and version are
# NormalizedVersion instances.
# If constraint does not have a local component,
# ensure the version doesn't, either.
strip_local = not constraint._parts[-1] and version._parts[-1]
if strip_local:
s = version._string.split('+', 1)[0]
version = self.version_class(s)
return version, constraint
def _match_lt(self, version, constraint, prefix):
version, constraint = self._adjust_local(version, constraint, prefix)
if version >= constraint:
return False
release_clause = constraint._release_clause
pfx = '.'.join([str(i) for i in release_clause])
return not _match_prefix(version, pfx)
def _match_gt(self, version, constraint, prefix):
version, constraint = self._adjust_local(version, constraint, prefix)
if version <= constraint:
return False
release_clause = constraint._release_clause
pfx = '.'.join([str(i) for i in release_clause])
return not _match_prefix(version, pfx)
def _match_le(self, version, constraint, prefix):
version, constraint = self._adjust_local(version, constraint, prefix)
return version <= constraint
def _match_ge(self, version, constraint, prefix):
version, constraint = self._adjust_local(version, constraint, prefix)
return version >= constraint
def _match_eq(self, version, constraint, prefix):
version, constraint = self._adjust_local(version, constraint, prefix)
if not prefix:
result = (version == constraint)
else:
result = _match_prefix(version, constraint)
return result
def _match_arbitrary(self, version, constraint, prefix):
return str(version) == str(constraint)
def _match_ne(self, version, constraint, prefix):
version, constraint = self._adjust_local(version, constraint, prefix)
if not prefix:
result = (version != constraint)
else:
result = not _match_prefix(version, constraint)
return result
def _match_compatible(self, version, constraint, prefix):
version, constraint = self._adjust_local(version, constraint, prefix)
if version == constraint:
return True
if version < constraint:
return False
# if not prefix:
# return True
release_clause = constraint._release_clause
if len(release_clause) > 1:
release_clause = release_clause[:-1]
pfx = '.'.join([str(i) for i in release_clause])
return _match_prefix(version, pfx)
_REPLACEMENTS = (
(re.compile('[.+-]$'), ''), # remove trailing puncts
(re.compile(r'^[.](\d)'), r'0.\1'), # .N -> 0.N at start
(re.compile('^[.-]'), ''), # remove leading puncts
(re.compile(r'^\((.*)\)$'), r'\1'), # remove parentheses
(re.compile(r'^v(ersion)?\s*(\d+)'), r'\2'), # remove leading v(ersion)
(re.compile(r'^r(ev)?\s*(\d+)'), r'\2'), # remove leading v(ersion)
(re.compile('[.]{2,}'), '.'), # multiple runs of '.'
(re.compile(r'\b(alfa|apha)\b'), 'alpha'), # misspelt alpha
(re.compile(r'\b(pre-alpha|prealpha)\b'),
'pre.alpha'), # standardise
(re.compile(r'\(beta\)$'), 'beta'), # remove parentheses
)
_SUFFIX_REPLACEMENTS = (
(re.compile('^[:~._+-]+'), ''), # remove leading puncts
(re.compile('[,*")([\\]]'), ''), # remove unwanted chars
(re.compile('[~:+_ -]'), '.'), # replace illegal chars
(re.compile('[.]{2,}'), '.'), # multiple runs of '.'
(re.compile(r'\.$'), ''), # trailing '.'
)
_NUMERIC_PREFIX = re.compile(r'(\d+(\.\d+)*)')
def _suggest_semantic_version(s):
"""
Try to suggest a semantic form for a version for which
_suggest_normalized_version couldn't come up with anything.
"""
result = s.strip().lower()
for pat, repl in _REPLACEMENTS:
result = pat.sub(repl, result)
if not result:
result = '0.0.0'
# Now look for numeric prefix, and separate it out from
# the rest.
#import pdb; pdb.set_trace()
m = _NUMERIC_PREFIX.match(result)
if not m:
prefix = '0.0.0'
suffix = result
else:
prefix = m.groups()[0].split('.')
prefix = [int(i) for i in prefix]
while len(prefix) < 3:
prefix.append(0)
if len(prefix) == 3:
suffix = result[m.end():]
else:
suffix = '.'.join([str(i) for i in prefix[3:]]) + result[m.end():]
prefix = prefix[:3]
prefix = '.'.join([str(i) for i in prefix])
suffix = suffix.strip()
if suffix:
#import pdb; pdb.set_trace()
# massage the suffix.
for pat, repl in _SUFFIX_REPLACEMENTS:
suffix = pat.sub(repl, suffix)
if not suffix:
result = prefix
else:
sep = '-' if 'dev' in suffix else '+'
result = prefix + sep + suffix
if not is_semver(result):
result = None
return result
def _suggest_normalized_version(s):
"""Suggest a normalized version close to the given version string.
If you have a version string that isn't rational (i.e. NormalizedVersion
doesn't like it) then you might be able to get an equivalent (or close)
rational version from this function.
This does a number of simple normalizations to the given string, based
on observation of versions currently in use on PyPI. Given a dump of
those version during PyCon 2009, 4287 of them:
- 2312 (53.93%) match NormalizedVersion without change
with the automatic suggestion
- 3474 (81.04%) match when using this suggestion method
@param s {str} An irrational version string.
@returns A rational version string, or None, if couldn't determine one.
"""
try:
_normalized_key(s)
return s # already rational
except UnsupportedVersionError:
pass
rs = s.lower()
# part of this could use maketrans
for orig, repl in (('-alpha', 'a'), ('-beta', 'b'), ('alpha', 'a'),
('beta', 'b'), ('rc', 'c'), ('-final', ''),
('-pre', 'c'),
('-release', ''), ('.release', ''), ('-stable', ''),
('+', '.'), ('_', '.'), (' ', ''), ('.final', ''),
('final', '')):
rs = rs.replace(orig, repl)
# if something ends with dev or pre, we add a 0
rs = re.sub(r"pre$", r"pre0", rs)
rs = re.sub(r"dev$", r"dev0", rs)
# if we have something like "b-2" or "a.2" at the end of the
# version, that is probably beta, alpha, etc
# let's remove the dash or dot
rs = re.sub(r"([abc]|rc)[\-\.](\d+)$", r"\1\2", rs)
# 1.0-dev-r371 -> 1.0.dev371
# 0.1-dev-r79 -> 0.1.dev79
rs = re.sub(r"[\-\.](dev)[\-\.]?r?(\d+)$", r".\1\2", rs)
# Clean: 2.0.a.3, 2.0.b1, 0.9.0~c1
rs = re.sub(r"[.~]?([abc])\.?", r"\1", rs)
# Clean: v0.3, v1.0
if rs.startswith('v'):
rs = rs[1:]
# Clean leading '0's on numbers.
#TODO: unintended side-effect on, e.g., "2003.05.09"
# PyPI stats: 77 (~2%) better
rs = re.sub(r"\b0+(\d+)(?!\d)", r"\1", rs)
# Clean a/b/c with no version. E.g. "1.0a" -> "1.0a0". Setuptools infers
# zero.
# PyPI stats: 245 (7.56%) better
rs = re.sub(r"(\d+[abc])$", r"\g<1>0", rs)
# the 'dev-rNNN' tag is a dev tag
rs = re.sub(r"\.?(dev-r|dev\.r)\.?(\d+)$", r".dev\2", rs)
# clean the - when used as a pre delimiter
rs = re.sub(r"-(a|b|c)(\d+)$", r"\1\2", rs)
# a terminal "dev" or "devel" can be changed into ".dev0"
rs = re.sub(r"[\.\-](dev|devel)$", r".dev0", rs)
# a terminal "dev" can be changed into ".dev0"
rs = re.sub(r"(?![\.\-])dev$", r".dev0", rs)
# a terminal "final" or "stable" can be removed
rs = re.sub(r"(final|stable)$", "", rs)
# The 'r' and the '-' tags are post release tags
# 0.4a1.r10 -> 0.4a1.post10
# 0.9.33-17222 -> 0.9.33.post17222
# 0.9.33-r17222 -> 0.9.33.post17222
rs = re.sub(r"\.?(r|-|-r)\.?(\d+)$", r".post\2", rs)
# Clean 'r' instead of 'dev' usage:
# 0.9.33+r17222 -> 0.9.33.dev17222
# 1.0dev123 -> 1.0.dev123
# 1.0.git123 -> 1.0.dev123
# 1.0.bzr123 -> 1.0.dev123
# 0.1a0dev.123 -> 0.1a0.dev123
# PyPI stats: ~150 (~4%) better
rs = re.sub(r"\.?(dev|git|bzr)\.?(\d+)$", r".dev\2", rs)
# Clean '.pre' (normalized from '-pre' above) instead of 'c' usage:
# 0.2.pre1 -> 0.2c1
# 0.2-c1 -> 0.2c1
# 1.0preview123 -> 1.0c123
# PyPI stats: ~21 (0.62%) better
rs = re.sub(r"\.?(pre|preview|-c)(\d+)$", r"c\g<2>", rs)
# Tcl/Tk uses "px" for their post release markers
rs = re.sub(r"p(\d+)$", r".post\1", rs)
try:
_normalized_key(rs)
except UnsupportedVersionError:
rs = None
return rs
#
# Legacy version processing (distribute-compatible)
#
_VERSION_PART = re.compile(r'([a-z]+|\d+|[\.-])', re.I)
_VERSION_REPLACE = {
'pre': 'c',
'preview': 'c',
'-': 'final-',
'rc': 'c',
'dev': '@',
'': None,
'.': None,
}
def _legacy_key(s):
def get_parts(s):
result = []
for p in _VERSION_PART.split(s.lower()):
p = _VERSION_REPLACE.get(p, p)
if p:
if '0' <= p[:1] <= '9':
p = p.zfill(8)
else:
p = '*' + p
result.append(p)
result.append('*final')
return result
result = []
for p in get_parts(s):
if p.startswith('*'):
if p < '*final':
while result and result[-1] == '*final-':
result.pop()
while result and result[-1] == '00000000':
result.pop()
result.append(p)
return tuple(result)
class LegacyVersion(Version):
def parse(self, s):
return _legacy_key(s)
@property
def is_prerelease(self):
result = False
for x in self._parts:
if (isinstance(x, string_types) and x.startswith('*') and
x < '*final'):
result = True
break
return result
class LegacyMatcher(Matcher):
version_class = LegacyVersion
_operators = dict(Matcher._operators)
_operators['~='] = '_match_compatible'
numeric_re = re.compile(r'^(\d+(\.\d+)*)')
def _match_compatible(self, version, constraint, prefix):
if version < constraint:
return False
m = self.numeric_re.match(str(constraint))
if not m:
logger.warning('Cannot compute compatible match for version %s '
' and constraint %s', version, constraint)
return True
s = m.groups()[0]
if '.' in s:
s = s.rsplit('.', 1)[0]
return _match_prefix(version, s)
#
# Semantic versioning
#
_SEMVER_RE = re.compile(r'^(\d+)\.(\d+)\.(\d+)'
r'(-[a-z0-9]+(\.[a-z0-9-]+)*)?'
r'(\+[a-z0-9]+(\.[a-z0-9-]+)*)?$', re.I)
def is_semver(s):
return _SEMVER_RE.match(s)
def _semantic_key(s):
def make_tuple(s, absent):
if s is None:
result = (absent,)
else:
parts = s[1:].split('.')
# We can't compare ints and strings on Python 3, so fudge it
# by zero-filling numeric values so simulate a numeric comparison
result = tuple([p.zfill(8) if p.isdigit() else p for p in parts])
return result
m = is_semver(s)
if not m:
raise UnsupportedVersionError(s)
groups = m.groups()
major, minor, patch = [int(i) for i in groups[:3]]
# choose the '|' and '*' so that versions sort correctly
pre, build = make_tuple(groups[3], '|'), make_tuple(groups[5], '*')
return (major, minor, patch), pre, build
class SemanticVersion(Version):
def parse(self, s):
return _semantic_key(s)
@property
def is_prerelease(self):
return self._parts[1][0] != '|'
class SemanticMatcher(Matcher):
version_class = SemanticVersion
class VersionScheme(object):
def __init__(self, key, matcher, suggester=None):
self.key = key
self.matcher = matcher
self.suggester = suggester
def is_valid_version(self, s):
try:
self.matcher.version_class(s)
result = True
except UnsupportedVersionError:
result = False
return result
def is_valid_matcher(self, s):
try:
self.matcher(s)
result = True
except UnsupportedVersionError:
result = False
return result
def is_valid_constraint_list(self, s):
"""
Used for processing some metadata fields
"""
# See issue #140. Be tolerant of a single trailing comma.
if s.endswith(','):
s = s[:-1]
return self.is_valid_matcher('dummy_name (%s)' % s)
def suggest(self, s):
if self.suggester is None:
result = None
else:
result = self.suggester(s)
return result
_SCHEMES = {
'normalized': VersionScheme(_normalized_key, NormalizedMatcher,
_suggest_normalized_version),
'legacy': VersionScheme(_legacy_key, LegacyMatcher, lambda self, s: s),
'semantic': VersionScheme(_semantic_key, SemanticMatcher,
_suggest_semantic_version),
}
_SCHEMES['default'] = _SCHEMES['normalized']
def get_scheme(name):
if name not in _SCHEMES:
raise ValueError('unknown scheme name: %r' % name)
return _SCHEMES[name]

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,35 @@
"""
HTML parsing library based on the `WHATWG HTML specification
<https://whatwg.org/html>`_. The parser is designed to be compatible with
existing HTML found in the wild and implements well-defined error recovery that
is largely compatible with modern desktop web browsers.
Example usage::
from pip._vendor import html5lib
with open("my_document.html", "rb") as f:
tree = html5lib.parse(f)
For convenience, this module re-exports the following names:
* :func:`~.html5parser.parse`
* :func:`~.html5parser.parseFragment`
* :class:`~.html5parser.HTMLParser`
* :func:`~.treebuilders.getTreeBuilder`
* :func:`~.treewalkers.getTreeWalker`
* :func:`~.serializer.serialize`
"""
from __future__ import absolute_import, division, unicode_literals
from .html5parser import HTMLParser, parse, parseFragment
from .treebuilders import getTreeBuilder
from .treewalkers import getTreeWalker
from .serializer import serialize
__all__ = ["HTMLParser", "parse", "parseFragment", "getTreeBuilder",
"getTreeWalker", "serialize"]
# this has to be at the top level, see how setup.py parses this
#: Distribution version number.
__version__ = "1.1"

View File

@ -0,0 +1,289 @@
from __future__ import absolute_import, division, unicode_literals
import re
import warnings
from .constants import DataLossWarning
baseChar = """
[#x0041-#x005A] | [#x0061-#x007A] | [#x00C0-#x00D6] | [#x00D8-#x00F6] |
[#x00F8-#x00FF] | [#x0100-#x0131] | [#x0134-#x013E] | [#x0141-#x0148] |
[#x014A-#x017E] | [#x0180-#x01C3] | [#x01CD-#x01F0] | [#x01F4-#x01F5] |
[#x01FA-#x0217] | [#x0250-#x02A8] | [#x02BB-#x02C1] | #x0386 |
[#x0388-#x038A] | #x038C | [#x038E-#x03A1] | [#x03A3-#x03CE] |
[#x03D0-#x03D6] | #x03DA | #x03DC | #x03DE | #x03E0 | [#x03E2-#x03F3] |
[#x0401-#x040C] | [#x040E-#x044F] | [#x0451-#x045C] | [#x045E-#x0481] |
[#x0490-#x04C4] | [#x04C7-#x04C8] | [#x04CB-#x04CC] | [#x04D0-#x04EB] |
[#x04EE-#x04F5] | [#x04F8-#x04F9] | [#x0531-#x0556] | #x0559 |
[#x0561-#x0586] | [#x05D0-#x05EA] | [#x05F0-#x05F2] | [#x0621-#x063A] |
[#x0641-#x064A] | [#x0671-#x06B7] | [#x06BA-#x06BE] | [#x06C0-#x06CE] |
[#x06D0-#x06D3] | #x06D5 | [#x06E5-#x06E6] | [#x0905-#x0939] | #x093D |
[#x0958-#x0961] | [#x0985-#x098C] | [#x098F-#x0990] | [#x0993-#x09A8] |
[#x09AA-#x09B0] | #x09B2 | [#x09B6-#x09B9] | [#x09DC-#x09DD] |
[#x09DF-#x09E1] | [#x09F0-#x09F1] | [#x0A05-#x0A0A] | [#x0A0F-#x0A10] |
[#x0A13-#x0A28] | [#x0A2A-#x0A30] | [#x0A32-#x0A33] | [#x0A35-#x0A36] |
[#x0A38-#x0A39] | [#x0A59-#x0A5C] | #x0A5E | [#x0A72-#x0A74] |
[#x0A85-#x0A8B] | #x0A8D | [#x0A8F-#x0A91] | [#x0A93-#x0AA8] |
[#x0AAA-#x0AB0] | [#x0AB2-#x0AB3] | [#x0AB5-#x0AB9] | #x0ABD | #x0AE0 |
[#x0B05-#x0B0C] | [#x0B0F-#x0B10] | [#x0B13-#x0B28] | [#x0B2A-#x0B30] |
[#x0B32-#x0B33] | [#x0B36-#x0B39] | #x0B3D | [#x0B5C-#x0B5D] |
[#x0B5F-#x0B61] | [#x0B85-#x0B8A] | [#x0B8E-#x0B90] | [#x0B92-#x0B95] |
[#x0B99-#x0B9A] | #x0B9C | [#x0B9E-#x0B9F] | [#x0BA3-#x0BA4] |
[#x0BA8-#x0BAA] | [#x0BAE-#x0BB5] | [#x0BB7-#x0BB9] | [#x0C05-#x0C0C] |
[#x0C0E-#x0C10] | [#x0C12-#x0C28] | [#x0C2A-#x0C33] | [#x0C35-#x0C39] |
[#x0C60-#x0C61] | [#x0C85-#x0C8C] | [#x0C8E-#x0C90] | [#x0C92-#x0CA8] |
[#x0CAA-#x0CB3] | [#x0CB5-#x0CB9] | #x0CDE | [#x0CE0-#x0CE1] |
[#x0D05-#x0D0C] | [#x0D0E-#x0D10] | [#x0D12-#x0D28] | [#x0D2A-#x0D39] |
[#x0D60-#x0D61] | [#x0E01-#x0E2E] | #x0E30 | [#x0E32-#x0E33] |
[#x0E40-#x0E45] | [#x0E81-#x0E82] | #x0E84 | [#x0E87-#x0E88] | #x0E8A |
#x0E8D | [#x0E94-#x0E97] | [#x0E99-#x0E9F] | [#x0EA1-#x0EA3] | #x0EA5 |
#x0EA7 | [#x0EAA-#x0EAB] | [#x0EAD-#x0EAE] | #x0EB0 | [#x0EB2-#x0EB3] |
#x0EBD | [#x0EC0-#x0EC4] | [#x0F40-#x0F47] | [#x0F49-#x0F69] |
[#x10A0-#x10C5] | [#x10D0-#x10F6] | #x1100 | [#x1102-#x1103] |
[#x1105-#x1107] | #x1109 | [#x110B-#x110C] | [#x110E-#x1112] | #x113C |
#x113E | #x1140 | #x114C | #x114E | #x1150 | [#x1154-#x1155] | #x1159 |
[#x115F-#x1161] | #x1163 | #x1165 | #x1167 | #x1169 | [#x116D-#x116E] |
[#x1172-#x1173] | #x1175 | #x119E | #x11A8 | #x11AB | [#x11AE-#x11AF] |
[#x11B7-#x11B8] | #x11BA | [#x11BC-#x11C2] | #x11EB | #x11F0 | #x11F9 |
[#x1E00-#x1E9B] | [#x1EA0-#x1EF9] | [#x1F00-#x1F15] | [#x1F18-#x1F1D] |
[#x1F20-#x1F45] | [#x1F48-#x1F4D] | [#x1F50-#x1F57] | #x1F59 | #x1F5B |
#x1F5D | [#x1F5F-#x1F7D] | [#x1F80-#x1FB4] | [#x1FB6-#x1FBC] | #x1FBE |
[#x1FC2-#x1FC4] | [#x1FC6-#x1FCC] | [#x1FD0-#x1FD3] | [#x1FD6-#x1FDB] |
[#x1FE0-#x1FEC] | [#x1FF2-#x1FF4] | [#x1FF6-#x1FFC] | #x2126 |
[#x212A-#x212B] | #x212E | [#x2180-#x2182] | [#x3041-#x3094] |
[#x30A1-#x30FA] | [#x3105-#x312C] | [#xAC00-#xD7A3]"""
ideographic = """[#x4E00-#x9FA5] | #x3007 | [#x3021-#x3029]"""
combiningCharacter = """
[#x0300-#x0345] | [#x0360-#x0361] | [#x0483-#x0486] | [#x0591-#x05A1] |
[#x05A3-#x05B9] | [#x05BB-#x05BD] | #x05BF | [#x05C1-#x05C2] | #x05C4 |
[#x064B-#x0652] | #x0670 | [#x06D6-#x06DC] | [#x06DD-#x06DF] |
[#x06E0-#x06E4] | [#x06E7-#x06E8] | [#x06EA-#x06ED] | [#x0901-#x0903] |
#x093C | [#x093E-#x094C] | #x094D | [#x0951-#x0954] | [#x0962-#x0963] |
[#x0981-#x0983] | #x09BC | #x09BE | #x09BF | [#x09C0-#x09C4] |
[#x09C7-#x09C8] | [#x09CB-#x09CD] | #x09D7 | [#x09E2-#x09E3] | #x0A02 |
#x0A3C | #x0A3E | #x0A3F | [#x0A40-#x0A42] | [#x0A47-#x0A48] |
[#x0A4B-#x0A4D] | [#x0A70-#x0A71] | [#x0A81-#x0A83] | #x0ABC |
[#x0ABE-#x0AC5] | [#x0AC7-#x0AC9] | [#x0ACB-#x0ACD] | [#x0B01-#x0B03] |
#x0B3C | [#x0B3E-#x0B43] | [#x0B47-#x0B48] | [#x0B4B-#x0B4D] |
[#x0B56-#x0B57] | [#x0B82-#x0B83] | [#x0BBE-#x0BC2] | [#x0BC6-#x0BC8] |
[#x0BCA-#x0BCD] | #x0BD7 | [#x0C01-#x0C03] | [#x0C3E-#x0C44] |
[#x0C46-#x0C48] | [#x0C4A-#x0C4D] | [#x0C55-#x0C56] | [#x0C82-#x0C83] |
[#x0CBE-#x0CC4] | [#x0CC6-#x0CC8] | [#x0CCA-#x0CCD] | [#x0CD5-#x0CD6] |
[#x0D02-#x0D03] | [#x0D3E-#x0D43] | [#x0D46-#x0D48] | [#x0D4A-#x0D4D] |
#x0D57 | #x0E31 | [#x0E34-#x0E3A] | [#x0E47-#x0E4E] | #x0EB1 |
[#x0EB4-#x0EB9] | [#x0EBB-#x0EBC] | [#x0EC8-#x0ECD] | [#x0F18-#x0F19] |
#x0F35 | #x0F37 | #x0F39 | #x0F3E | #x0F3F | [#x0F71-#x0F84] |
[#x0F86-#x0F8B] | [#x0F90-#x0F95] | #x0F97 | [#x0F99-#x0FAD] |
[#x0FB1-#x0FB7] | #x0FB9 | [#x20D0-#x20DC] | #x20E1 | [#x302A-#x302F] |
#x3099 | #x309A"""
digit = """
[#x0030-#x0039] | [#x0660-#x0669] | [#x06F0-#x06F9] | [#x0966-#x096F] |
[#x09E6-#x09EF] | [#x0A66-#x0A6F] | [#x0AE6-#x0AEF] | [#x0B66-#x0B6F] |
[#x0BE7-#x0BEF] | [#x0C66-#x0C6F] | [#x0CE6-#x0CEF] | [#x0D66-#x0D6F] |
[#x0E50-#x0E59] | [#x0ED0-#x0ED9] | [#x0F20-#x0F29]"""
extender = """
#x00B7 | #x02D0 | #x02D1 | #x0387 | #x0640 | #x0E46 | #x0EC6 | #x3005 |
#[#x3031-#x3035] | [#x309D-#x309E] | [#x30FC-#x30FE]"""
letter = " | ".join([baseChar, ideographic])
# Without the
name = " | ".join([letter, digit, ".", "-", "_", combiningCharacter,
extender])
nameFirst = " | ".join([letter, "_"])
reChar = re.compile(r"#x([\d|A-F]{4,4})")
reCharRange = re.compile(r"\[#x([\d|A-F]{4,4})-#x([\d|A-F]{4,4})\]")
def charStringToList(chars):
charRanges = [item.strip() for item in chars.split(" | ")]
rv = []
for item in charRanges:
foundMatch = False
for regexp in (reChar, reCharRange):
match = regexp.match(item)
if match is not None:
rv.append([hexToInt(item) for item in match.groups()])
if len(rv[-1]) == 1:
rv[-1] = rv[-1] * 2
foundMatch = True
break
if not foundMatch:
assert len(item) == 1
rv.append([ord(item)] * 2)
rv = normaliseCharList(rv)
return rv
def normaliseCharList(charList):
charList = sorted(charList)
for item in charList:
assert item[1] >= item[0]
rv = []
i = 0
while i < len(charList):
j = 1
rv.append(charList[i])
while i + j < len(charList) and charList[i + j][0] <= rv[-1][1] + 1:
rv[-1][1] = charList[i + j][1]
j += 1
i += j
return rv
# We don't really support characters above the BMP :(
max_unicode = int("FFFF", 16)
def missingRanges(charList):
rv = []
if charList[0] != 0:
rv.append([0, charList[0][0] - 1])
for i, item in enumerate(charList[:-1]):
rv.append([item[1] + 1, charList[i + 1][0] - 1])
if charList[-1][1] != max_unicode:
rv.append([charList[-1][1] + 1, max_unicode])
return rv
def listToRegexpStr(charList):
rv = []
for item in charList:
if item[0] == item[1]:
rv.append(escapeRegexp(chr(item[0])))
else:
rv.append(escapeRegexp(chr(item[0])) + "-" +
escapeRegexp(chr(item[1])))
return "[%s]" % "".join(rv)
def hexToInt(hex_str):
return int(hex_str, 16)
def escapeRegexp(string):
specialCharacters = (".", "^", "$", "*", "+", "?", "{", "}",
"[", "]", "|", "(", ")", "-")
for char in specialCharacters:
string = string.replace(char, "\\" + char)
return string
# output from the above
nonXmlNameBMPRegexp = re.compile('[\x00-,/:-@\\[-\\^`\\{-\xb6\xb8-\xbf\xd7\xf7\u0132-\u0133\u013f-\u0140\u0149\u017f\u01c4-\u01cc\u01f1-\u01f3\u01f6-\u01f9\u0218-\u024f\u02a9-\u02ba\u02c2-\u02cf\u02d2-\u02ff\u0346-\u035f\u0362-\u0385\u038b\u038d\u03a2\u03cf\u03d7-\u03d9\u03db\u03dd\u03df\u03e1\u03f4-\u0400\u040d\u0450\u045d\u0482\u0487-\u048f\u04c5-\u04c6\u04c9-\u04ca\u04cd-\u04cf\u04ec-\u04ed\u04f6-\u04f7\u04fa-\u0530\u0557-\u0558\u055a-\u0560\u0587-\u0590\u05a2\u05ba\u05be\u05c0\u05c3\u05c5-\u05cf\u05eb-\u05ef\u05f3-\u0620\u063b-\u063f\u0653-\u065f\u066a-\u066f\u06b8-\u06b9\u06bf\u06cf\u06d4\u06e9\u06ee-\u06ef\u06fa-\u0900\u0904\u093a-\u093b\u094e-\u0950\u0955-\u0957\u0964-\u0965\u0970-\u0980\u0984\u098d-\u098e\u0991-\u0992\u09a9\u09b1\u09b3-\u09b5\u09ba-\u09bb\u09bd\u09c5-\u09c6\u09c9-\u09ca\u09ce-\u09d6\u09d8-\u09db\u09de\u09e4-\u09e5\u09f2-\u0a01\u0a03-\u0a04\u0a0b-\u0a0e\u0a11-\u0a12\u0a29\u0a31\u0a34\u0a37\u0a3a-\u0a3b\u0a3d\u0a43-\u0a46\u0a49-\u0a4a\u0a4e-\u0a58\u0a5d\u0a5f-\u0a65\u0a75-\u0a80\u0a84\u0a8c\u0a8e\u0a92\u0aa9\u0ab1\u0ab4\u0aba-\u0abb\u0ac6\u0aca\u0ace-\u0adf\u0ae1-\u0ae5\u0af0-\u0b00\u0b04\u0b0d-\u0b0e\u0b11-\u0b12\u0b29\u0b31\u0b34-\u0b35\u0b3a-\u0b3b\u0b44-\u0b46\u0b49-\u0b4a\u0b4e-\u0b55\u0b58-\u0b5b\u0b5e\u0b62-\u0b65\u0b70-\u0b81\u0b84\u0b8b-\u0b8d\u0b91\u0b96-\u0b98\u0b9b\u0b9d\u0ba0-\u0ba2\u0ba5-\u0ba7\u0bab-\u0bad\u0bb6\u0bba-\u0bbd\u0bc3-\u0bc5\u0bc9\u0bce-\u0bd6\u0bd8-\u0be6\u0bf0-\u0c00\u0c04\u0c0d\u0c11\u0c29\u0c34\u0c3a-\u0c3d\u0c45\u0c49\u0c4e-\u0c54\u0c57-\u0c5f\u0c62-\u0c65\u0c70-\u0c81\u0c84\u0c8d\u0c91\u0ca9\u0cb4\u0cba-\u0cbd\u0cc5\u0cc9\u0cce-\u0cd4\u0cd7-\u0cdd\u0cdf\u0ce2-\u0ce5\u0cf0-\u0d01\u0d04\u0d0d\u0d11\u0d29\u0d3a-\u0d3d\u0d44-\u0d45\u0d49\u0d4e-\u0d56\u0d58-\u0d5f\u0d62-\u0d65\u0d70-\u0e00\u0e2f\u0e3b-\u0e3f\u0e4f\u0e5a-\u0e80\u0e83\u0e85-\u0e86\u0e89\u0e8b-\u0e8c\u0e8e-\u0e93\u0e98\u0ea0\u0ea4\u0ea6\u0ea8-\u0ea9\u0eac\u0eaf\u0eba\u0ebe-\u0ebf\u0ec5\u0ec7\u0ece-\u0ecf\u0eda-\u0f17\u0f1a-\u0f1f\u0f2a-\u0f34\u0f36\u0f38\u0f3a-\u0f3d\u0f48\u0f6a-\u0f70\u0f85\u0f8c-\u0f8f\u0f96\u0f98\u0fae-\u0fb0\u0fb8\u0fba-\u109f\u10c6-\u10cf\u10f7-\u10ff\u1101\u1104\u1108\u110a\u110d\u1113-\u113b\u113d\u113f\u1141-\u114b\u114d\u114f\u1151-\u1153\u1156-\u1158\u115a-\u115e\u1162\u1164\u1166\u1168\u116a-\u116c\u116f-\u1171\u1174\u1176-\u119d\u119f-\u11a7\u11a9-\u11aa\u11ac-\u11ad\u11b0-\u11b6\u11b9\u11bb\u11c3-\u11ea\u11ec-\u11ef\u11f1-\u11f8\u11fa-\u1dff\u1e9c-\u1e9f\u1efa-\u1eff\u1f16-\u1f17\u1f1e-\u1f1f\u1f46-\u1f47\u1f4e-\u1f4f\u1f58\u1f5a\u1f5c\u1f5e\u1f7e-\u1f7f\u1fb5\u1fbd\u1fbf-\u1fc1\u1fc5\u1fcd-\u1fcf\u1fd4-\u1fd5\u1fdc-\u1fdf\u1fed-\u1ff1\u1ff5\u1ffd-\u20cf\u20dd-\u20e0\u20e2-\u2125\u2127-\u2129\u212c-\u212d\u212f-\u217f\u2183-\u3004\u3006\u3008-\u3020\u3030\u3036-\u3040\u3095-\u3098\u309b-\u309c\u309f-\u30a0\u30fb\u30ff-\u3104\u312d-\u4dff\u9fa6-\uabff\ud7a4-\uffff]') # noqa
nonXmlNameFirstBMPRegexp = re.compile('[\x00-@\\[-\\^`\\{-\xbf\xd7\xf7\u0132-\u0133\u013f-\u0140\u0149\u017f\u01c4-\u01cc\u01f1-\u01f3\u01f6-\u01f9\u0218-\u024f\u02a9-\u02ba\u02c2-\u0385\u0387\u038b\u038d\u03a2\u03cf\u03d7-\u03d9\u03db\u03dd\u03df\u03e1\u03f4-\u0400\u040d\u0450\u045d\u0482-\u048f\u04c5-\u04c6\u04c9-\u04ca\u04cd-\u04cf\u04ec-\u04ed\u04f6-\u04f7\u04fa-\u0530\u0557-\u0558\u055a-\u0560\u0587-\u05cf\u05eb-\u05ef\u05f3-\u0620\u063b-\u0640\u064b-\u0670\u06b8-\u06b9\u06bf\u06cf\u06d4\u06d6-\u06e4\u06e7-\u0904\u093a-\u093c\u093e-\u0957\u0962-\u0984\u098d-\u098e\u0991-\u0992\u09a9\u09b1\u09b3-\u09b5\u09ba-\u09db\u09de\u09e2-\u09ef\u09f2-\u0a04\u0a0b-\u0a0e\u0a11-\u0a12\u0a29\u0a31\u0a34\u0a37\u0a3a-\u0a58\u0a5d\u0a5f-\u0a71\u0a75-\u0a84\u0a8c\u0a8e\u0a92\u0aa9\u0ab1\u0ab4\u0aba-\u0abc\u0abe-\u0adf\u0ae1-\u0b04\u0b0d-\u0b0e\u0b11-\u0b12\u0b29\u0b31\u0b34-\u0b35\u0b3a-\u0b3c\u0b3e-\u0b5b\u0b5e\u0b62-\u0b84\u0b8b-\u0b8d\u0b91\u0b96-\u0b98\u0b9b\u0b9d\u0ba0-\u0ba2\u0ba5-\u0ba7\u0bab-\u0bad\u0bb6\u0bba-\u0c04\u0c0d\u0c11\u0c29\u0c34\u0c3a-\u0c5f\u0c62-\u0c84\u0c8d\u0c91\u0ca9\u0cb4\u0cba-\u0cdd\u0cdf\u0ce2-\u0d04\u0d0d\u0d11\u0d29\u0d3a-\u0d5f\u0d62-\u0e00\u0e2f\u0e31\u0e34-\u0e3f\u0e46-\u0e80\u0e83\u0e85-\u0e86\u0e89\u0e8b-\u0e8c\u0e8e-\u0e93\u0e98\u0ea0\u0ea4\u0ea6\u0ea8-\u0ea9\u0eac\u0eaf\u0eb1\u0eb4-\u0ebc\u0ebe-\u0ebf\u0ec5-\u0f3f\u0f48\u0f6a-\u109f\u10c6-\u10cf\u10f7-\u10ff\u1101\u1104\u1108\u110a\u110d\u1113-\u113b\u113d\u113f\u1141-\u114b\u114d\u114f\u1151-\u1153\u1156-\u1158\u115a-\u115e\u1162\u1164\u1166\u1168\u116a-\u116c\u116f-\u1171\u1174\u1176-\u119d\u119f-\u11a7\u11a9-\u11aa\u11ac-\u11ad\u11b0-\u11b6\u11b9\u11bb\u11c3-\u11ea\u11ec-\u11ef\u11f1-\u11f8\u11fa-\u1dff\u1e9c-\u1e9f\u1efa-\u1eff\u1f16-\u1f17\u1f1e-\u1f1f\u1f46-\u1f47\u1f4e-\u1f4f\u1f58\u1f5a\u1f5c\u1f5e\u1f7e-\u1f7f\u1fb5\u1fbd\u1fbf-\u1fc1\u1fc5\u1fcd-\u1fcf\u1fd4-\u1fd5\u1fdc-\u1fdf\u1fed-\u1ff1\u1ff5\u1ffd-\u2125\u2127-\u2129\u212c-\u212d\u212f-\u217f\u2183-\u3006\u3008-\u3020\u302a-\u3040\u3095-\u30a0\u30fb-\u3104\u312d-\u4dff\u9fa6-\uabff\ud7a4-\uffff]') # noqa
# Simpler things
nonPubidCharRegexp = re.compile("[^\x20\x0D\x0Aa-zA-Z0-9\\-'()+,./:=?;!*#@$_%]")
class InfosetFilter(object):
replacementRegexp = re.compile(r"U[\dA-F]{5,5}")
def __init__(self,
dropXmlnsLocalName=False,
dropXmlnsAttrNs=False,
preventDoubleDashComments=False,
preventDashAtCommentEnd=False,
replaceFormFeedCharacters=True,
preventSingleQuotePubid=False):
self.dropXmlnsLocalName = dropXmlnsLocalName
self.dropXmlnsAttrNs = dropXmlnsAttrNs
self.preventDoubleDashComments = preventDoubleDashComments
self.preventDashAtCommentEnd = preventDashAtCommentEnd
self.replaceFormFeedCharacters = replaceFormFeedCharacters
self.preventSingleQuotePubid = preventSingleQuotePubid
self.replaceCache = {}
def coerceAttribute(self, name, namespace=None):
if self.dropXmlnsLocalName and name.startswith("xmlns:"):
warnings.warn("Attributes cannot begin with xmlns", DataLossWarning)
return None
elif (self.dropXmlnsAttrNs and
namespace == "http://www.w3.org/2000/xmlns/"):
warnings.warn("Attributes cannot be in the xml namespace", DataLossWarning)
return None
else:
return self.toXmlName(name)
def coerceElement(self, name):
return self.toXmlName(name)
def coerceComment(self, data):
if self.preventDoubleDashComments:
while "--" in data:
warnings.warn("Comments cannot contain adjacent dashes", DataLossWarning)
data = data.replace("--", "- -")
if data.endswith("-"):
warnings.warn("Comments cannot end in a dash", DataLossWarning)
data += " "
return data
def coerceCharacters(self, data):
if self.replaceFormFeedCharacters:
for _ in range(data.count("\x0C")):
warnings.warn("Text cannot contain U+000C", DataLossWarning)
data = data.replace("\x0C", " ")
# Other non-xml characters
return data
def coercePubid(self, data):
dataOutput = data
for char in nonPubidCharRegexp.findall(data):
warnings.warn("Coercing non-XML pubid", DataLossWarning)
replacement = self.getReplacementCharacter(char)
dataOutput = dataOutput.replace(char, replacement)
if self.preventSingleQuotePubid and dataOutput.find("'") >= 0:
warnings.warn("Pubid cannot contain single quote", DataLossWarning)
dataOutput = dataOutput.replace("'", self.getReplacementCharacter("'"))
return dataOutput
def toXmlName(self, name):
nameFirst = name[0]
nameRest = name[1:]
m = nonXmlNameFirstBMPRegexp.match(nameFirst)
if m:
warnings.warn("Coercing non-XML name: %s" % name, DataLossWarning)
nameFirstOutput = self.getReplacementCharacter(nameFirst)
else:
nameFirstOutput = nameFirst
nameRestOutput = nameRest
replaceChars = set(nonXmlNameBMPRegexp.findall(nameRest))
for char in replaceChars:
warnings.warn("Coercing non-XML name: %s" % name, DataLossWarning)
replacement = self.getReplacementCharacter(char)
nameRestOutput = nameRestOutput.replace(char, replacement)
return nameFirstOutput + nameRestOutput
def getReplacementCharacter(self, char):
if char in self.replaceCache:
replacement = self.replaceCache[char]
else:
replacement = self.escapeChar(char)
return replacement
def fromXmlName(self, name):
for item in set(self.replacementRegexp.findall(name)):
name = name.replace(item, self.unescapeChar(item))
return name
def escapeChar(self, char):
replacement = "U%05X" % ord(char)
self.replaceCache[char] = replacement
return replacement
def unescapeChar(self, charcode):
return chr(int(charcode[1:], 16))

View File

@ -0,0 +1,918 @@
from __future__ import absolute_import, division, unicode_literals
from pip._vendor.six import text_type
from pip._vendor.six.moves import http_client, urllib
import codecs
import re
from io import BytesIO, StringIO
from pip._vendor import webencodings
from .constants import EOF, spaceCharacters, asciiLetters, asciiUppercase
from .constants import _ReparseException
from . import _utils
# Non-unicode versions of constants for use in the pre-parser
spaceCharactersBytes = frozenset([item.encode("ascii") for item in spaceCharacters])
asciiLettersBytes = frozenset([item.encode("ascii") for item in asciiLetters])
asciiUppercaseBytes = frozenset([item.encode("ascii") for item in asciiUppercase])
spacesAngleBrackets = spaceCharactersBytes | frozenset([b">", b"<"])
invalid_unicode_no_surrogate = "[\u0001-\u0008\u000B\u000E-\u001F\u007F-\u009F\uFDD0-\uFDEF\uFFFE\uFFFF\U0001FFFE\U0001FFFF\U0002FFFE\U0002FFFF\U0003FFFE\U0003FFFF\U0004FFFE\U0004FFFF\U0005FFFE\U0005FFFF\U0006FFFE\U0006FFFF\U0007FFFE\U0007FFFF\U0008FFFE\U0008FFFF\U0009FFFE\U0009FFFF\U000AFFFE\U000AFFFF\U000BFFFE\U000BFFFF\U000CFFFE\U000CFFFF\U000DFFFE\U000DFFFF\U000EFFFE\U000EFFFF\U000FFFFE\U000FFFFF\U0010FFFE\U0010FFFF]" # noqa
if _utils.supports_lone_surrogates:
# Use one extra step of indirection and create surrogates with
# eval. Not using this indirection would introduce an illegal
# unicode literal on platforms not supporting such lone
# surrogates.
assert invalid_unicode_no_surrogate[-1] == "]" and invalid_unicode_no_surrogate.count("]") == 1
invalid_unicode_re = re.compile(invalid_unicode_no_surrogate[:-1] +
eval('"\\uD800-\\uDFFF"') + # pylint:disable=eval-used
"]")
else:
invalid_unicode_re = re.compile(invalid_unicode_no_surrogate)
non_bmp_invalid_codepoints = {0x1FFFE, 0x1FFFF, 0x2FFFE, 0x2FFFF, 0x3FFFE,
0x3FFFF, 0x4FFFE, 0x4FFFF, 0x5FFFE, 0x5FFFF,
0x6FFFE, 0x6FFFF, 0x7FFFE, 0x7FFFF, 0x8FFFE,
0x8FFFF, 0x9FFFE, 0x9FFFF, 0xAFFFE, 0xAFFFF,
0xBFFFE, 0xBFFFF, 0xCFFFE, 0xCFFFF, 0xDFFFE,
0xDFFFF, 0xEFFFE, 0xEFFFF, 0xFFFFE, 0xFFFFF,
0x10FFFE, 0x10FFFF}
ascii_punctuation_re = re.compile("[\u0009-\u000D\u0020-\u002F\u003A-\u0040\u005C\u005B-\u0060\u007B-\u007E]")
# Cache for charsUntil()
charsUntilRegEx = {}
class BufferedStream(object):
"""Buffering for streams that do not have buffering of their own
The buffer is implemented as a list of chunks on the assumption that
joining many strings will be slow since it is O(n**2)
"""
def __init__(self, stream):
self.stream = stream
self.buffer = []
self.position = [-1, 0] # chunk number, offset
def tell(self):
pos = 0
for chunk in self.buffer[:self.position[0]]:
pos += len(chunk)
pos += self.position[1]
return pos
def seek(self, pos):
assert pos <= self._bufferedBytes()
offset = pos
i = 0
while len(self.buffer[i]) < offset:
offset -= len(self.buffer[i])
i += 1
self.position = [i, offset]
def read(self, bytes):
if not self.buffer:
return self._readStream(bytes)
elif (self.position[0] == len(self.buffer) and
self.position[1] == len(self.buffer[-1])):
return self._readStream(bytes)
else:
return self._readFromBuffer(bytes)
def _bufferedBytes(self):
return sum([len(item) for item in self.buffer])
def _readStream(self, bytes):
data = self.stream.read(bytes)
self.buffer.append(data)
self.position[0] += 1
self.position[1] = len(data)
return data
def _readFromBuffer(self, bytes):
remainingBytes = bytes
rv = []
bufferIndex = self.position[0]
bufferOffset = self.position[1]
while bufferIndex < len(self.buffer) and remainingBytes != 0:
assert remainingBytes > 0
bufferedData = self.buffer[bufferIndex]
if remainingBytes <= len(bufferedData) - bufferOffset:
bytesToRead = remainingBytes
self.position = [bufferIndex, bufferOffset + bytesToRead]
else:
bytesToRead = len(bufferedData) - bufferOffset
self.position = [bufferIndex, len(bufferedData)]
bufferIndex += 1
rv.append(bufferedData[bufferOffset:bufferOffset + bytesToRead])
remainingBytes -= bytesToRead
bufferOffset = 0
if remainingBytes:
rv.append(self._readStream(remainingBytes))
return b"".join(rv)
def HTMLInputStream(source, **kwargs):
# Work around Python bug #20007: read(0) closes the connection.
# http://bugs.python.org/issue20007
if (isinstance(source, http_client.HTTPResponse) or
# Also check for addinfourl wrapping HTTPResponse
(isinstance(source, urllib.response.addbase) and
isinstance(source.fp, http_client.HTTPResponse))):
isUnicode = False
elif hasattr(source, "read"):
isUnicode = isinstance(source.read(0), text_type)
else:
isUnicode = isinstance(source, text_type)
if isUnicode:
encodings = [x for x in kwargs if x.endswith("_encoding")]
if encodings:
raise TypeError("Cannot set an encoding with a unicode input, set %r" % encodings)
return HTMLUnicodeInputStream(source, **kwargs)
else:
return HTMLBinaryInputStream(source, **kwargs)
class HTMLUnicodeInputStream(object):
"""Provides a unicode stream of characters to the HTMLTokenizer.
This class takes care of character encoding and removing or replacing
incorrect byte-sequences and also provides column and line tracking.
"""
_defaultChunkSize = 10240
def __init__(self, source):
"""Initialises the HTMLInputStream.
HTMLInputStream(source, [encoding]) -> Normalized stream from source
for use by html5lib.
source can be either a file-object, local filename or a string.
The optional encoding parameter must be a string that indicates
the encoding. If specified, that encoding will be used,
regardless of any BOM or later declaration (such as in a meta
element)
"""
if not _utils.supports_lone_surrogates:
# Such platforms will have already checked for such
# surrogate errors, so no need to do this checking.
self.reportCharacterErrors = None
elif len("\U0010FFFF") == 1:
self.reportCharacterErrors = self.characterErrorsUCS4
else:
self.reportCharacterErrors = self.characterErrorsUCS2
# List of where new lines occur
self.newLines = [0]
self.charEncoding = (lookupEncoding("utf-8"), "certain")
self.dataStream = self.openStream(source)
self.reset()
def reset(self):
self.chunk = ""
self.chunkSize = 0
self.chunkOffset = 0
self.errors = []
# number of (complete) lines in previous chunks
self.prevNumLines = 0
# number of columns in the last line of the previous chunk
self.prevNumCols = 0
# Deal with CR LF and surrogates split over chunk boundaries
self._bufferedCharacter = None
def openStream(self, source):
"""Produces a file object from source.
source can be either a file object, local filename or a string.
"""
# Already a file object
if hasattr(source, 'read'):
stream = source
else:
stream = StringIO(source)
return stream
def _position(self, offset):
chunk = self.chunk
nLines = chunk.count('\n', 0, offset)
positionLine = self.prevNumLines + nLines
lastLinePos = chunk.rfind('\n', 0, offset)
if lastLinePos == -1:
positionColumn = self.prevNumCols + offset
else:
positionColumn = offset - (lastLinePos + 1)
return (positionLine, positionColumn)
def position(self):
"""Returns (line, col) of the current position in the stream."""
line, col = self._position(self.chunkOffset)
return (line + 1, col)
def char(self):
""" Read one character from the stream or queue if available. Return
EOF when EOF is reached.
"""
# Read a new chunk from the input stream if necessary
if self.chunkOffset >= self.chunkSize:
if not self.readChunk():
return EOF
chunkOffset = self.chunkOffset
char = self.chunk[chunkOffset]
self.chunkOffset = chunkOffset + 1
return char
def readChunk(self, chunkSize=None):
if chunkSize is None:
chunkSize = self._defaultChunkSize
self.prevNumLines, self.prevNumCols = self._position(self.chunkSize)
self.chunk = ""
self.chunkSize = 0
self.chunkOffset = 0
data = self.dataStream.read(chunkSize)
# Deal with CR LF and surrogates broken across chunks
if self._bufferedCharacter:
data = self._bufferedCharacter + data
self._bufferedCharacter = None
elif not data:
# We have no more data, bye-bye stream
return False
if len(data) > 1:
lastv = ord(data[-1])
if lastv == 0x0D or 0xD800 <= lastv <= 0xDBFF:
self._bufferedCharacter = data[-1]
data = data[:-1]
if self.reportCharacterErrors:
self.reportCharacterErrors(data)
# Replace invalid characters
data = data.replace("\r\n", "\n")
data = data.replace("\r", "\n")
self.chunk = data
self.chunkSize = len(data)
return True
def characterErrorsUCS4(self, data):
for _ in range(len(invalid_unicode_re.findall(data))):
self.errors.append("invalid-codepoint")
def characterErrorsUCS2(self, data):
# Someone picked the wrong compile option
# You lose
skip = False
for match in invalid_unicode_re.finditer(data):
if skip:
continue
codepoint = ord(match.group())
pos = match.start()
# Pretty sure there should be endianness issues here
if _utils.isSurrogatePair(data[pos:pos + 2]):
# We have a surrogate pair!
char_val = _utils.surrogatePairToCodepoint(data[pos:pos + 2])
if char_val in non_bmp_invalid_codepoints:
self.errors.append("invalid-codepoint")
skip = True
elif (codepoint >= 0xD800 and codepoint <= 0xDFFF and
pos == len(data) - 1):
self.errors.append("invalid-codepoint")
else:
skip = False
self.errors.append("invalid-codepoint")
def charsUntil(self, characters, opposite=False):
""" Returns a string of characters from the stream up to but not
including any character in 'characters' or EOF. 'characters' must be
a container that supports the 'in' method and iteration over its
characters.
"""
# Use a cache of regexps to find the required characters
try:
chars = charsUntilRegEx[(characters, opposite)]
except KeyError:
if __debug__:
for c in characters:
assert(ord(c) < 128)
regex = "".join(["\\x%02x" % ord(c) for c in characters])
if not opposite:
regex = "^%s" % regex
chars = charsUntilRegEx[(characters, opposite)] = re.compile("[%s]+" % regex)
rv = []
while True:
# Find the longest matching prefix
m = chars.match(self.chunk, self.chunkOffset)
if m is None:
# If nothing matched, and it wasn't because we ran out of chunk,
# then stop
if self.chunkOffset != self.chunkSize:
break
else:
end = m.end()
# If not the whole chunk matched, return everything
# up to the part that didn't match
if end != self.chunkSize:
rv.append(self.chunk[self.chunkOffset:end])
self.chunkOffset = end
break
# If the whole remainder of the chunk matched,
# use it all and read the next chunk
rv.append(self.chunk[self.chunkOffset:])
if not self.readChunk():
# Reached EOF
break
r = "".join(rv)
return r
def unget(self, char):
# Only one character is allowed to be ungotten at once - it must
# be consumed again before any further call to unget
if char is not EOF:
if self.chunkOffset == 0:
# unget is called quite rarely, so it's a good idea to do
# more work here if it saves a bit of work in the frequently
# called char and charsUntil.
# So, just prepend the ungotten character onto the current
# chunk:
self.chunk = char + self.chunk
self.chunkSize += 1
else:
self.chunkOffset -= 1
assert self.chunk[self.chunkOffset] == char
class HTMLBinaryInputStream(HTMLUnicodeInputStream):
"""Provides a unicode stream of characters to the HTMLTokenizer.
This class takes care of character encoding and removing or replacing
incorrect byte-sequences and also provides column and line tracking.
"""
def __init__(self, source, override_encoding=None, transport_encoding=None,
same_origin_parent_encoding=None, likely_encoding=None,
default_encoding="windows-1252", useChardet=True):
"""Initialises the HTMLInputStream.
HTMLInputStream(source, [encoding]) -> Normalized stream from source
for use by html5lib.
source can be either a file-object, local filename or a string.
The optional encoding parameter must be a string that indicates
the encoding. If specified, that encoding will be used,
regardless of any BOM or later declaration (such as in a meta
element)
"""
# Raw Stream - for unicode objects this will encode to utf-8 and set
# self.charEncoding as appropriate
self.rawStream = self.openStream(source)
HTMLUnicodeInputStream.__init__(self, self.rawStream)
# Encoding Information
# Number of bytes to use when looking for a meta element with
# encoding information
self.numBytesMeta = 1024
# Number of bytes to use when using detecting encoding using chardet
self.numBytesChardet = 100
# Things from args
self.override_encoding = override_encoding
self.transport_encoding = transport_encoding
self.same_origin_parent_encoding = same_origin_parent_encoding
self.likely_encoding = likely_encoding
self.default_encoding = default_encoding
# Determine encoding
self.charEncoding = self.determineEncoding(useChardet)
assert self.charEncoding[0] is not None
# Call superclass
self.reset()
def reset(self):
self.dataStream = self.charEncoding[0].codec_info.streamreader(self.rawStream, 'replace')
HTMLUnicodeInputStream.reset(self)
def openStream(self, source):
"""Produces a file object from source.
source can be either a file object, local filename or a string.
"""
# Already a file object
if hasattr(source, 'read'):
stream = source
else:
stream = BytesIO(source)
try:
stream.seek(stream.tell())
except Exception:
stream = BufferedStream(stream)
return stream
def determineEncoding(self, chardet=True):
# BOMs take precedence over everything
# This will also read past the BOM if present
charEncoding = self.detectBOM(), "certain"
if charEncoding[0] is not None:
return charEncoding
# If we've been overridden, we've been overridden
charEncoding = lookupEncoding(self.override_encoding), "certain"
if charEncoding[0] is not None:
return charEncoding
# Now check the transport layer
charEncoding = lookupEncoding(self.transport_encoding), "certain"
if charEncoding[0] is not None:
return charEncoding
# Look for meta elements with encoding information
charEncoding = self.detectEncodingMeta(), "tentative"
if charEncoding[0] is not None:
return charEncoding
# Parent document encoding
charEncoding = lookupEncoding(self.same_origin_parent_encoding), "tentative"
if charEncoding[0] is not None and not charEncoding[0].name.startswith("utf-16"):
return charEncoding
# "likely" encoding
charEncoding = lookupEncoding(self.likely_encoding), "tentative"
if charEncoding[0] is not None:
return charEncoding
# Guess with chardet, if available
if chardet:
try:
from pip._vendor.chardet.universaldetector import UniversalDetector
except ImportError:
pass
else:
buffers = []
detector = UniversalDetector()
while not detector.done:
buffer = self.rawStream.read(self.numBytesChardet)
assert isinstance(buffer, bytes)
if not buffer:
break
buffers.append(buffer)
detector.feed(buffer)
detector.close()
encoding = lookupEncoding(detector.result['encoding'])
self.rawStream.seek(0)
if encoding is not None:
return encoding, "tentative"
# Try the default encoding
charEncoding = lookupEncoding(self.default_encoding), "tentative"
if charEncoding[0] is not None:
return charEncoding
# Fallback to html5lib's default if even that hasn't worked
return lookupEncoding("windows-1252"), "tentative"
def changeEncoding(self, newEncoding):
assert self.charEncoding[1] != "certain"
newEncoding = lookupEncoding(newEncoding)
if newEncoding is None:
return
if newEncoding.name in ("utf-16be", "utf-16le"):
newEncoding = lookupEncoding("utf-8")
assert newEncoding is not None
elif newEncoding == self.charEncoding[0]:
self.charEncoding = (self.charEncoding[0], "certain")
else:
self.rawStream.seek(0)
self.charEncoding = (newEncoding, "certain")
self.reset()
raise _ReparseException("Encoding changed from %s to %s" % (self.charEncoding[0], newEncoding))
def detectBOM(self):
"""Attempts to detect at BOM at the start of the stream. If
an encoding can be determined from the BOM return the name of the
encoding otherwise return None"""
bomDict = {
codecs.BOM_UTF8: 'utf-8',
codecs.BOM_UTF16_LE: 'utf-16le', codecs.BOM_UTF16_BE: 'utf-16be',
codecs.BOM_UTF32_LE: 'utf-32le', codecs.BOM_UTF32_BE: 'utf-32be'
}
# Go to beginning of file and read in 4 bytes
string = self.rawStream.read(4)
assert isinstance(string, bytes)
# Try detecting the BOM using bytes from the string
encoding = bomDict.get(string[:3]) # UTF-8
seek = 3
if not encoding:
# Need to detect UTF-32 before UTF-16
encoding = bomDict.get(string) # UTF-32
seek = 4
if not encoding:
encoding = bomDict.get(string[:2]) # UTF-16
seek = 2
# Set the read position past the BOM if one was found, otherwise
# set it to the start of the stream
if encoding:
self.rawStream.seek(seek)
return lookupEncoding(encoding)
else:
self.rawStream.seek(0)
return None
def detectEncodingMeta(self):
"""Report the encoding declared by the meta element
"""
buffer = self.rawStream.read(self.numBytesMeta)
assert isinstance(buffer, bytes)
parser = EncodingParser(buffer)
self.rawStream.seek(0)
encoding = parser.getEncoding()
if encoding is not None and encoding.name in ("utf-16be", "utf-16le"):
encoding = lookupEncoding("utf-8")
return encoding
class EncodingBytes(bytes):
"""String-like object with an associated position and various extra methods
If the position is ever greater than the string length then an exception is
raised"""
def __new__(self, value):
assert isinstance(value, bytes)
return bytes.__new__(self, value.lower())
def __init__(self, value):
# pylint:disable=unused-argument
self._position = -1
def __iter__(self):
return self
def __next__(self):
p = self._position = self._position + 1
if p >= len(self):
raise StopIteration
elif p < 0:
raise TypeError
return self[p:p + 1]
def next(self):
# Py2 compat
return self.__next__()
def previous(self):
p = self._position
if p >= len(self):
raise StopIteration
elif p < 0:
raise TypeError
self._position = p = p - 1
return self[p:p + 1]
def setPosition(self, position):
if self._position >= len(self):
raise StopIteration
self._position = position
def getPosition(self):
if self._position >= len(self):
raise StopIteration
if self._position >= 0:
return self._position
else:
return None
position = property(getPosition, setPosition)
def getCurrentByte(self):
return self[self.position:self.position + 1]
currentByte = property(getCurrentByte)
def skip(self, chars=spaceCharactersBytes):
"""Skip past a list of characters"""
p = self.position # use property for the error-checking
while p < len(self):
c = self[p:p + 1]
if c not in chars:
self._position = p
return c
p += 1
self._position = p
return None
def skipUntil(self, chars):
p = self.position
while p < len(self):
c = self[p:p + 1]
if c in chars:
self._position = p
return c
p += 1
self._position = p
return None
def matchBytes(self, bytes):
"""Look for a sequence of bytes at the start of a string. If the bytes
are found return True and advance the position to the byte after the
match. Otherwise return False and leave the position alone"""
rv = self.startswith(bytes, self.position)
if rv:
self.position += len(bytes)
return rv
def jumpTo(self, bytes):
"""Look for the next sequence of bytes matching a given sequence. If
a match is found advance the position to the last byte of the match"""
try:
self._position = self.index(bytes, self.position) + len(bytes) - 1
except ValueError:
raise StopIteration
return True
class EncodingParser(object):
"""Mini parser for detecting character encoding from meta elements"""
def __init__(self, data):
"""string - the data to work on for encoding detection"""
self.data = EncodingBytes(data)
self.encoding = None
def getEncoding(self):
if b"<meta" not in self.data:
return None
methodDispatch = (
(b"<!--", self.handleComment),
(b"<meta", self.handleMeta),
(b"</", self.handlePossibleEndTag),
(b"<!", self.handleOther),
(b"<?", self.handleOther),
(b"<", self.handlePossibleStartTag))
for _ in self.data:
keepParsing = True
try:
self.data.jumpTo(b"<")
except StopIteration:
break
for key, method in methodDispatch:
if self.data.matchBytes(key):
try:
keepParsing = method()
break
except StopIteration:
keepParsing = False
break
if not keepParsing:
break
return self.encoding
def handleComment(self):
"""Skip over comments"""
return self.data.jumpTo(b"-->")
def handleMeta(self):
if self.data.currentByte not in spaceCharactersBytes:
# if we have <meta not followed by a space so just keep going
return True
# We have a valid meta element we want to search for attributes
hasPragma = False
pendingEncoding = None
while True:
# Try to find the next attribute after the current position
attr = self.getAttribute()
if attr is None:
return True
else:
if attr[0] == b"http-equiv":
hasPragma = attr[1] == b"content-type"
if hasPragma and pendingEncoding is not None:
self.encoding = pendingEncoding
return False
elif attr[0] == b"charset":
tentativeEncoding = attr[1]
codec = lookupEncoding(tentativeEncoding)
if codec is not None:
self.encoding = codec
return False
elif attr[0] == b"content":
contentParser = ContentAttrParser(EncodingBytes(attr[1]))
tentativeEncoding = contentParser.parse()
if tentativeEncoding is not None:
codec = lookupEncoding(tentativeEncoding)
if codec is not None:
if hasPragma:
self.encoding = codec
return False
else:
pendingEncoding = codec
def handlePossibleStartTag(self):
return self.handlePossibleTag(False)
def handlePossibleEndTag(self):
next(self.data)
return self.handlePossibleTag(True)
def handlePossibleTag(self, endTag):
data = self.data
if data.currentByte not in asciiLettersBytes:
# If the next byte is not an ascii letter either ignore this
# fragment (possible start tag case) or treat it according to
# handleOther
if endTag:
data.previous()
self.handleOther()
return True
c = data.skipUntil(spacesAngleBrackets)
if c == b"<":
# return to the first step in the overall "two step" algorithm
# reprocessing the < byte
data.previous()
else:
# Read all attributes
attr = self.getAttribute()
while attr is not None:
attr = self.getAttribute()
return True
def handleOther(self):
return self.data.jumpTo(b">")
def getAttribute(self):
"""Return a name,value pair for the next attribute in the stream,
if one is found, or None"""
data = self.data
# Step 1 (skip chars)
c = data.skip(spaceCharactersBytes | frozenset([b"/"]))
assert c is None or len(c) == 1
# Step 2
if c in (b">", None):
return None
# Step 3
attrName = []
attrValue = []
# Step 4 attribute name
while True:
if c == b"=" and attrName:
break
elif c in spaceCharactersBytes:
# Step 6!
c = data.skip()
break
elif c in (b"/", b">"):
return b"".join(attrName), b""
elif c in asciiUppercaseBytes:
attrName.append(c.lower())
elif c is None:
return None
else:
attrName.append(c)
# Step 5
c = next(data)
# Step 7
if c != b"=":
data.previous()
return b"".join(attrName), b""
# Step 8
next(data)
# Step 9
c = data.skip()
# Step 10
if c in (b"'", b'"'):
# 10.1
quoteChar = c
while True:
# 10.2
c = next(data)
# 10.3
if c == quoteChar:
next(data)
return b"".join(attrName), b"".join(attrValue)
# 10.4
elif c in asciiUppercaseBytes:
attrValue.append(c.lower())
# 10.5
else:
attrValue.append(c)
elif c == b">":
return b"".join(attrName), b""
elif c in asciiUppercaseBytes:
attrValue.append(c.lower())
elif c is None:
return None
else:
attrValue.append(c)
# Step 11
while True:
c = next(data)
if c in spacesAngleBrackets:
return b"".join(attrName), b"".join(attrValue)
elif c in asciiUppercaseBytes:
attrValue.append(c.lower())
elif c is None:
return None
else:
attrValue.append(c)
class ContentAttrParser(object):
def __init__(self, data):
assert isinstance(data, bytes)
self.data = data
def parse(self):
try:
# Check if the attr name is charset
# otherwise return
self.data.jumpTo(b"charset")
self.data.position += 1
self.data.skip()
if not self.data.currentByte == b"=":
# If there is no = sign keep looking for attrs
return None
self.data.position += 1
self.data.skip()
# Look for an encoding between matching quote marks
if self.data.currentByte in (b'"', b"'"):
quoteMark = self.data.currentByte
self.data.position += 1
oldPosition = self.data.position
if self.data.jumpTo(quoteMark):
return self.data[oldPosition:self.data.position]
else:
return None
else:
# Unquoted value
oldPosition = self.data.position
try:
self.data.skipUntil(spaceCharactersBytes)
return self.data[oldPosition:self.data.position]
except StopIteration:
# Return the whole remaining value
return self.data[oldPosition:]
except StopIteration:
return None
def lookupEncoding(encoding):
"""Return the python codec name corresponding to an encoding or None if the
string doesn't correspond to a valid encoding."""
if isinstance(encoding, bytes):
try:
encoding = encoding.decode("ascii")
except UnicodeDecodeError:
return None
if encoding is not None:
try:
return webencodings.lookup(encoding)
except AttributeError:
return None
else:
return None

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,5 @@
from __future__ import absolute_import, division, unicode_literals
from .py import Trie
__all__ = ["Trie"]

View File

@ -0,0 +1,40 @@
from __future__ import absolute_import, division, unicode_literals
try:
from collections.abc import Mapping
except ImportError: # Python 2.7
from collections import Mapping
class Trie(Mapping):
"""Abstract base class for tries"""
def keys(self, prefix=None):
# pylint:disable=arguments-differ
keys = super(Trie, self).keys()
if prefix is None:
return set(keys)
return {x for x in keys if x.startswith(prefix)}
def has_keys_with_prefix(self, prefix):
for key in self.keys():
if key.startswith(prefix):
return True
return False
def longest_prefix(self, prefix):
if prefix in self:
return prefix
for i in range(1, len(prefix) + 1):
if prefix[:-i] in self:
return prefix[:-i]
raise KeyError(prefix)
def longest_prefix_item(self, prefix):
lprefix = self.longest_prefix(prefix)
return (lprefix, self[lprefix])

View File

@ -0,0 +1,67 @@
from __future__ import absolute_import, division, unicode_literals
from pip._vendor.six import text_type
from bisect import bisect_left
from ._base import Trie as ABCTrie
class Trie(ABCTrie):
def __init__(self, data):
if not all(isinstance(x, text_type) for x in data.keys()):
raise TypeError("All keys must be strings")
self._data = data
self._keys = sorted(data.keys())
self._cachestr = ""
self._cachepoints = (0, len(data))
def __contains__(self, key):
return key in self._data
def __len__(self):
return len(self._data)
def __iter__(self):
return iter(self._data)
def __getitem__(self, key):
return self._data[key]
def keys(self, prefix=None):
if prefix is None or prefix == "" or not self._keys:
return set(self._keys)
if prefix.startswith(self._cachestr):
lo, hi = self._cachepoints
start = i = bisect_left(self._keys, prefix, lo, hi)
else:
start = i = bisect_left(self._keys, prefix)
keys = set()
if start == len(self._keys):
return keys
while self._keys[i].startswith(prefix):
keys.add(self._keys[i])
i += 1
self._cachestr = prefix
self._cachepoints = (start, i)
return keys
def has_keys_with_prefix(self, prefix):
if prefix in self._data:
return True
if prefix.startswith(self._cachestr):
lo, hi = self._cachepoints
i = bisect_left(self._keys, prefix, lo, hi)
else:
i = bisect_left(self._keys, prefix)
if i == len(self._keys):
return False
return self._keys[i].startswith(prefix)

Some files were not shown because too many files have changed in this diff Show More