# This file is part of Hypothesis, which may be found at
# https://github.com/HypothesisWorks/hypothesis/
#
# Most of this work is copyright (C) 2013-2021 David R. MacIver
# (david@drmaciver.com), but it contains contributions by others. See
# CONTRIBUTING.rst for a full list of people who may hold copyright, and
# consult the git log if you need to determine who owns an individual
# contribution.
#
# This Source Code Form is subject to the terms of the Mozilla Public License,
# v. 2.0. If a copy of the MPL was not distributed with this file, You can
# obtain one at https://mozilla.org/MPL/2.0/.
#
# END HEADER

import gzip
import json
import os
import sys
import tempfile
import unicodedata
from typing import Dict, Tuple

from hypothesis.configuration import mkdir_p, storage_directory
from hypothesis.errors import InvalidArgument

intervals = Tuple[Tuple[int, int], ...]
cache_type = Dict[Tuple[Tuple[str, ...], int, int, intervals], intervals]


def charmap_file():
    return storage_directory(
        "unicode_data", unicodedata.unidata_version, "charmap.json.gz"
    )


_charmap = None


def charmap():
"""Return a dict that maps a Unicode category, to a tuple of 2-tuples
|
||
|
covering the codepoint intervals for characters in that category.
|
||
|
|
||
|
>>> charmap()['Co']
|
||
|
((57344, 63743), (983040, 1048573), (1048576, 1114109))
|
||
|
"""
|
||
|
global _charmap
|
||
|
# Best-effort caching in the face of missing files and/or unwritable
|
||
|
# filesystems is fairly simple: check if loaded, else try loading,
|
||
|
# else calculate and try writing the cache.
|
||
|
if _charmap is None:
|
||
|
f = charmap_file()
|
||
|
try:
|
||
|
with gzip.GzipFile(f, "rb") as i:
|
||
|
tmp_charmap = dict(json.load(i))
|
||
|
|
||
|
except Exception:
|
||
|
# This loop is reduced to using only local variables for performance;
|
||
|
# indexing and updating containers is a ~3x slowdown. This doesn't fix
|
||
|
# https://github.com/HypothesisWorks/hypothesis/issues/2108 but it helps.
|
||
|
category = unicodedata.category # Local variable -> ~20% speedup!
|
||
|
tmp_charmap = {}
|
||
|
last_cat = category(chr(0))
|
||
|
last_start = 0
|
||
|
for i in range(1, sys.maxunicode + 1):
|
||
|
cat = category(chr(i))
|
||
|
if cat != last_cat:
|
||
|
tmp_charmap.setdefault(last_cat, []).append([last_start, i - 1])
|
||
|
last_cat, last_start = cat, i
|
||
|
tmp_charmap.setdefault(last_cat, []).append([last_start, sys.maxunicode])
|
||
|
|
||
|
try:
|
||
|
# Write the Unicode table atomically
|
||
|
tmpdir = storage_directory("tmp")
|
||
|
mkdir_p(tmpdir)
|
||
|
fd, tmpfile = tempfile.mkstemp(dir=tmpdir)
|
||
|
os.close(fd)
|
||
|
# Explicitly set the mtime to get reproducible output
|
||
|
with gzip.GzipFile(tmpfile, "wb", mtime=1) as o:
|
||
|
result = json.dumps(sorted(tmp_charmap.items()))
|
||
|
o.write(result.encode())
|
||
|
|
||
|
os.renames(tmpfile, f)
|
||
|
except Exception:
|
||
|
pass
|
||
|
|
||
|
# convert between lists and tuples
|
||
|
_charmap = {
|
||
|
k: tuple(tuple(pair) for pair in pairs) for k, pairs in tmp_charmap.items()
|
||
|
}
|
||
|
# each value is a tuple of 2-tuples (that is, tuples of length 2)
|
||
|
# and that both elements of that tuple are integers.
|
||
|
for vs in _charmap.values():
|
||
|
ints = list(sum(vs, ()))
|
||
|
assert all(isinstance(x, int) for x in ints)
|
||
|
assert ints == sorted(ints)
|
||
|
assert all(len(tup) == 2 for tup in vs)
|
||
|
|
||
|
assert _charmap is not None
|
||
|
return _charmap
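
# An illustrative lookup (a sketch of expected output, assuming the Unicode
# database bundled with CPython): the surrogate block U+D800..U+DFFF is the
# single interval for the "Cs" category.
#
#     >>> charmap()["Cs"]
#     ((55296, 57343),)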


_categories = None


def categories():
    """Return a tuple of Unicode categories in a normalised order.

    >>> categories() # doctest: +ELLIPSIS
    ('Zl', 'Zp', 'Co', 'Me', 'Pc', ..., 'Cc', 'Cs')
    """
    global _categories
    if _categories is None:
        cm = charmap()
        _categories = sorted(cm.keys(), key=lambda c: len(cm[c]))
        _categories.remove("Cc")  # Other, Control
        _categories.remove("Cs")  # Other, Surrogate
        _categories.append("Cc")
        _categories.append("Cs")
    return tuple(_categories)
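
# One observable consequence of the reordering above (a sketch): the control
# and surrogate categories always come last in the normalised order.
#
#     >>> categories()[-2:]
#     ('Cc', 'Cs')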


def as_general_categories(cats, name="cats"):
    """Return a tuple of Unicode categories in a normalised order.

    This function expands one-letter designations of a major class to include
    all subclasses:

    >>> as_general_categories(['N'])
    ('Nd', 'Nl', 'No')

    See section 4.5 of the Unicode standard for more on classes:
    https://www.unicode.org/versions/Unicode10.0.0/ch04.pdf

    If the collection ``cats`` includes any element that is neither a major
    class nor a valid two-letter category, InvalidArgument is raised.
    """
    if cats is None:
        return None
    major_classes = ("L", "M", "N", "P", "S", "Z", "C")
    cs = categories()
    out = set(cats)
    for c in cats:
        if c in major_classes:
            out.discard(c)
            out.update(x for x in cs if x.startswith(c))
        elif c not in cs:
            raise InvalidArgument(
                f"In {name}={cats!r}, {c!r} is not a valid Unicode category."
            )
    return tuple(c for c in cs if c in out)
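
# A sketch of the expansion behaviour: a major class expands to all of its
# subcategories, while two-letter categories pass through unchanged.  The
# result order follows categories(), so only set equality is asserted here.
#
#     >>> set(as_general_categories(["Z", "Lu"])) == {"Zl", "Zp", "Zs", "Lu"}
#     True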


def _union_intervals(x, y):
    """Merge two sequences of intervals into a single tuple of intervals.

    Any integer bounded by `x` or `y` is also bounded by the result.

    >>> _union_intervals([(3, 10)], [(1, 2), (5, 17)])
    ((1, 17),)
    """
    if not x:
        return tuple((u, v) for u, v in y)
    if not y:
        return tuple((u, v) for u, v in x)
    intervals = sorted(x + y, reverse=True)
    result = [intervals.pop()]
    while intervals:
        # 1. intervals is in descending order
        # 2. pop() takes from the RHS.
        # 3. (a, b) was popped 1st, then (u, v) was popped 2nd
        # 4. Therefore: a <= u
        # 5. We assume that u <= v and a <= b
        # 6. So we need to handle 2 cases of overlap, and one disjoint case
        #    | u--v   | u----v |    u--v |
        #    | a----b | a--b   | a--b    |
        u, v = intervals.pop()
        a, b = result[-1]
        if u <= b + 1:
            # Overlap cases
            result[-1] = (a, max(v, b))
        else:
            # Disjoint case
            result.append((u, v))
    return tuple(result)
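
# Note that "overlap" above includes adjacency: thanks to the ``u <= b + 1``
# test, intervals that merely touch are coalesced.  A small sketch:
#
#     >>> _union_intervals([(1, 2)], [(3, 5)])
#     ((1, 5),)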


def _subtract_intervals(x, y):
    """Set difference for lists of intervals. That is, returns a tuple of
    intervals that bounds all values bounded by x that are not also bounded by
    y. x and y are expected to be in sorted order.

    For example _subtract_intervals([(1, 10)], [(2, 3), (9, 15)]) would
    return ((1, 1), (4, 8)), removing the values 2, 3, 9 and 10 from the
    interval.
    """
    if not y:
        return tuple(x)
    x = list(map(list, x))
    i = 0
    j = 0
    result = []
    while i < len(x) and j < len(y):
        # Iterate in parallel over x and y. j stays pointing at the smallest
        # interval in y that could still overlap with some element of x at
        # index >= i.
        # Similarly, i is not incremented until we know that x[i] does not
        # overlap with any element of y at index >= j.

        xl, xr = x[i]
        assert xl <= xr
        yl, yr = y[j]
        assert yl <= yr

        if yr < xl:
            # The interval at y[j] is strictly to the left of the interval at
            # x[i], so will not overlap with it or any later interval of x.
            j += 1
        elif yl > xr:
            # The interval at y[j] is strictly to the right of the interval at
            # x[i], so all of x[i] goes into the result as no further intervals
            # in y will intersect it.
            result.append(x[i])
            i += 1
        elif yl <= xl:
            if yr >= xr:
                # x[i] is contained entirely in y[j], so we just skip over it
                # without adding it to the result.
                i += 1
            else:
                # The beginning of x[i] is contained in y[j], so we update the
                # left endpoint of x[i] to remove this, and increment j as we
                # now have moved past it. Note that this is not added to the
                # result as is, as more intervals from y may intersect it so it
                # may need updating further.
                x[i][0] = yr + 1
                j += 1
        else:
            # yl > xl, so the left hand part of x[i] is not contained in y[j],
            # so there are some values we should add to the result.
            result.append((xl, yl - 1))

            if yr + 1 <= xr:
                # If y[j] finishes before x[i] does, there may be some values
                # in x[i] left that should go in the result (or they may be
                # removed by a later interval in y), so we update x[i] to
                # reflect that and increment j because y[j] no longer overlaps
                # with any remaining element of x.
                x[i][0] = yr + 1
                j += 1
            else:
                # Every element of x[i] other than the initial part we have
                # already added is contained in y[j], so we move to the next
                # interval.
                i += 1
    # Any remaining intervals in x do not overlap with any of y, as if they did
    # we would not have incremented j to the end, so can be added to the result
    # as they are.
    result.extend(x[i:])
    return tuple(map(tuple, result))
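
# A sketch of the hole-punching behaviour used by ``exclude_characters`` in
# query() below: removing a sub-range splits the surrounding interval in two.
#
#     >>> _subtract_intervals([(48, 57)], [(50, 52)])
#     ((48, 49), (53, 57))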


def _intervals(s):
    """Return a tuple of intervals covering the codepoints of characters in
    `s`.

    >>> _intervals('abcdef0123456789')
    ((48, 57), (97, 102))
    """
    intervals = tuple((ord(c), ord(c)) for c in sorted(s))
    return _union_intervals(intervals, intervals)


category_index_cache = {(): ()}


def _category_key(exclude, include):
    """Return a normalised tuple of all Unicode categories that are in
    `include`, but not in `exclude`.

    If include is None then default to including all categories.
    Passing an item that is not a valid Unicode category will trip the
    assertions below.

    >>> _category_key(exclude=['So'], include=['Lu', 'Me', 'Cs', 'So'])
    ('Me', 'Lu', 'Cs')
    """
    cs = categories()
    if include is None:
        include = set(cs)
    else:
        include = set(include)
    exclude = set(exclude or ())
    assert include.issubset(cs)
    assert exclude.issubset(cs)
    include -= exclude
    return tuple(c for c in cs if c in include)


def _query_for_key(key):
    """Return a tuple of codepoint intervals covering characters that match one
    or more categories in the tuple of categories `key`.

    >>> _query_for_key(categories())
    ((0, 1114111),)
    >>> _query_for_key(('Zl', 'Zp', 'Co'))
    ((8232, 8233), (57344, 63743), (983040, 1048573), (1048576, 1114109))
    """
    try:
        return category_index_cache[key]
    except KeyError:
        pass
    assert key
    if set(key) == set(categories()):
        result = ((0, sys.maxunicode),)
    else:
        result = _union_intervals(_query_for_key(key[:-1]), charmap()[key[-1]])
    category_index_cache[key] = result
    return result
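
# Because the lookup recurses on key[:-1], querying one key also populates
# category_index_cache for every prefix of that key.  A small sketch:
#
#     >>> _query_for_key(("Zl", "Zp"))
#     ((8232, 8233),)
#     >>> category_index_cache[("Zl",)]
#     ((8232, 8232),)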


limited_category_index_cache = {}  # type: cache_type


def query(
    exclude_categories=(),
    include_categories=None,
    min_codepoint=None,
    max_codepoint=None,
    include_characters="",
    exclude_characters="",
):
    """Return a tuple of intervals covering the codepoints for all characters
    that meet the criteria: either ``min_codepoint <= ord(c) <= max_codepoint``
    and the Unicode category of ``c`` is in ``include_categories`` and not in
    ``exclude_categories``, or ``c`` appears in ``include_characters``.
    Characters listed in ``exclude_characters`` are always removed.

    >>> query()
    ((0, 1114111),)
    >>> query(min_codepoint=0, max_codepoint=128)
    ((0, 128),)
    >>> query(min_codepoint=0, max_codepoint=128, include_categories=['Lu'])
    ((65, 90),)
    >>> query(min_codepoint=0, max_codepoint=128, include_categories=['Lu'],
    ...       include_characters=u'☃')
    ((65, 90), (9731, 9731))
    """
    if min_codepoint is None:
        min_codepoint = 0
    if max_codepoint is None:
        max_codepoint = sys.maxunicode
    catkey = _category_key(exclude_categories, include_categories)
    character_intervals = _intervals(include_characters or "")
    exclude_intervals = _intervals(exclude_characters or "")
    qkey = (
        catkey,
        min_codepoint,
        max_codepoint,
        character_intervals,
        exclude_intervals,
    )
    try:
        return limited_category_index_cache[qkey]
    except KeyError:
        pass
    base = _query_for_key(catkey)
    result = []
    for u, v in base:
        if v >= min_codepoint and u <= max_codepoint:
            result.append((max(u, min_codepoint), min(v, max_codepoint)))
    result = tuple(result)
    result = _union_intervals(result, character_intervals)
    result = _subtract_intervals(result, exclude_intervals)
    limited_category_index_cache[qkey] = result
    return result
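
# An end-to-end sketch of how the pieces above compose: restrict to ASCII
# uppercase letters, then punch out a single character via exclude_characters.
#
#     >>> query(include_categories=["Lu"], max_codepoint=127,
#     ...       exclude_characters="M")
#     ((65, 76), (78, 90))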