# This file is part of Hypothesis, which may be found at
# https://github.com/HypothesisWorks/hypothesis/
#
# Most of this work is copyright (C) 2013-2021 David R. MacIver
# (david@drmaciver.com), but it contains contributions by others. See
# CONTRIBUTING.rst for a full list of people who may hold copyright, and
# consult the git log if you need to determine who owns an individual
# contribution.
#
# This Source Code Form is subject to the terms of the Mozilla Public License,
# v. 2.0. If a copy of the MPL was not distributed with this file, You can
# obtain one at https://mozilla.org/MPL/2.0/.
#
# END HEADER
|
|
|
|
import enum
|
|
import hashlib
|
|
import heapq
|
|
import math
|
|
import sys
|
|
from collections import OrderedDict, abc
|
|
from functools import lru_cache
|
|
|
|
from hypothesis.errors import InvalidArgument
|
|
from hypothesis.internal.compat import floor, int_from_bytes
|
|
from hypothesis.internal.floats import int_to_float
|
|
|
|
LABEL_MASK = 2 ** 64 - 1
|
|
|
|
|
|
def calc_label_from_name(name: str) -> int:
|
|
hashed = hashlib.sha384(name.encode()).digest()
|
|
return int_from_bytes(hashed[:8])
|
|
|
|
|
|
def calc_label_from_cls(cls: type) -> int:
|
|
return calc_label_from_name(cls.__qualname__)
|
|
|
|
|
|
def combine_labels(*labels: int) -> int:
|
|
label = 0
|
|
for l in labels:
|
|
label = (label << 1) & LABEL_MASK
|
|
label ^= l
|
|
return label
|
|
|
|
|
|
INTEGER_RANGE_DRAW_LABEL = calc_label_from_name("another draw in integer_range()")
|
|
BIASED_COIN_LABEL = calc_label_from_name("biased_coin()")
|
|
BIASED_COIN_INNER_LABEL = calc_label_from_name("inside biased_coin()")
|
|
SAMPLE_IN_SAMPLER_LABEL = calc_label_from_name("a sample() in Sampler")
|
|
ONE_FROM_MANY_LABEL = calc_label_from_name("one more from many()")
|
|
|
|
|
|
def unbounded_integers(data):
|
|
size = INT_SIZES[INT_SIZES_SAMPLER.sample(data)]
|
|
r = data.draw_bits(size)
|
|
sign = r & 1
|
|
r >>= 1
|
|
if sign:
|
|
r = -r
|
|
return int(r)
|
|
|
|
|
|
def integer_range(data, lower, upper, center=None):
|
|
assert lower <= upper
|
|
if lower == upper:
|
|
# Write a value even when this is trivial so that when a bound depends
|
|
# on other values we don't suddenly disappear when the gap shrinks to
|
|
# zero - if that happens then often the data stream becomes misaligned
|
|
# and we fail to shrink in cases where we really should be able to.
|
|
data.draw_bits(1, forced=0)
|
|
return int(lower)
|
|
|
|
if center is None:
|
|
center = lower
|
|
center = min(max(center, lower), upper)
|
|
|
|
if center == upper:
|
|
above = False
|
|
elif center == lower:
|
|
above = True
|
|
else:
|
|
above = not boolean(data)
|
|
|
|
if above:
|
|
gap = upper - center
|
|
else:
|
|
gap = center - lower
|
|
|
|
assert gap > 0
|
|
|
|
bits = gap.bit_length()
|
|
probe = gap + 1
|
|
|
|
if bits > 24 and data.draw_bits(3):
|
|
# For large ranges, we combine the uniform random distribution from draw_bits
|
|
# with a weighting scheme with moderate chance. Cutoff at 2 ** 24 so that our
|
|
# choice of unicode characters is uniform but the 32bit distribution is not.
|
|
idx = INT_SIZES_SAMPLER.sample(data)
|
|
bits = min(bits, INT_SIZES[idx])
|
|
|
|
while probe > gap:
|
|
data.start_example(INTEGER_RANGE_DRAW_LABEL)
|
|
probe = data.draw_bits(bits)
|
|
data.stop_example(discard=probe > gap)
|
|
|
|
if above:
|
|
result = center + probe
|
|
else:
|
|
result = center - probe
|
|
|
|
assert lower <= result <= upper
|
|
return int(result)
|
|
|
|
|
|
def check_sample(values, strategy_name):
|
|
if "numpy" in sys.modules and isinstance(values, sys.modules["numpy"].ndarray):
|
|
if values.ndim != 1:
|
|
raise InvalidArgument(
|
|
"Only one-dimensional arrays are supported for sampling, "
|
|
f"and the given value has {values.ndim} dimensions (shape "
|
|
f"{values.shape}). This array would give samples of array slices "
|
|
"instead of elements! Use np.ravel(values) to convert "
|
|
"to a one-dimensional array, or tuple(values) if you "
|
|
"want to sample slices."
|
|
)
|
|
elif not isinstance(values, (OrderedDict, abc.Sequence, enum.EnumMeta)):
|
|
raise InvalidArgument(
|
|
f"Cannot sample from {values!r}, not an ordered collection. "
|
|
f"Hypothesis goes to some length to ensure that the {strategy_name} "
|
|
"strategy has stable results between runs. To replay a saved "
|
|
"example, the sampled values must have the same iteration order "
|
|
"on every run - ruling out sets, dicts, etc due to hash "
|
|
"randomisation. Most cases can simply use `sorted(values)`, but "
|
|
"mixed types or special values such as math.nan require careful "
|
|
"handling - and note that when simplifying an example, "
|
|
"Hypothesis treats earlier values as simpler."
|
|
)
|
|
if isinstance(values, range):
|
|
return values
|
|
return tuple(values)
|
|
|
|
|
|
def choice(data, values):
|
|
return values[integer_range(data, 0, len(values) - 1)]
|
|
|
|
|
|
FLOAT_PREFIX = 0b1111111111 << 52
|
|
FULL_FLOAT = int_to_float(FLOAT_PREFIX | ((2 << 53) - 1)) - 1
|
|
|
|
|
|
def fractional_float(data):
|
|
return (int_to_float(FLOAT_PREFIX | data.draw_bits(52)) - 1) / FULL_FLOAT
|
|
|
|
|
|
def boolean(data):
|
|
return bool(data.draw_bits(1))
|
|
|
|
|
|
def biased_coin(data, p, *, forced=None):
|
|
"""Return True with probability p (assuming a uniform generator),
|
|
shrinking towards False. If ``forced`` is set to a non-None value, this
|
|
will always return that value but will write choices appropriate to having
|
|
drawn that value randomly."""
|
|
|
|
# NB this function is vastly more complicated than it may seem reasonable
|
|
# for it to be. This is because it is used in a lot of places and it's
|
|
# important for it to shrink well, so it's worth the engineering effort.
|
|
|
|
if p <= 0 or p >= 1:
|
|
bits = 1
|
|
else:
|
|
# When there is a meaningful draw, in order to shrink well we will
|
|
# set things up so that 0 and 1 always correspond to False and True
|
|
# respectively. This means we want enough bits available that in a
|
|
# draw we will always have at least one truthy value and one falsey
|
|
# value.
|
|
bits = math.ceil(-math.log(min(p, 1 - p), 2))
|
|
# In order to avoid stupidly large draws where the probability is
|
|
# effectively zero or one, we treat probabilities of under 2^-64 to be
|
|
# effectively zero.
|
|
if bits > 64:
|
|
# There isn't enough precision near one for this to occur for values
|
|
# far from 0.
|
|
p = 0.0
|
|
bits = 1
|
|
|
|
size = 2 ** bits
|
|
|
|
data.start_example(BIASED_COIN_LABEL)
|
|
while True:
|
|
# The logic here is a bit complicated and special cased to make it
|
|
# play better with the shrinker.
|
|
|
|
# We imagine partitioning the real interval [0, 1] into 256 equal parts
|
|
# and looking at each part and whether its interior is wholly <= p
|
|
# or wholly >= p. At most one part can be neither.
|
|
|
|
# We then pick a random part. If it's wholly on one side or the other
|
|
# of p then we use that as the answer. If p is contained in the
|
|
# interval then we start again with a new probability that is given
|
|
# by the fraction of that interval that was <= our previous p.
|
|
|
|
# We then take advantage of the fact that we have control of the
|
|
# labelling to make this shrink better, using the following tricks:
|
|
|
|
# If p is <= 0 or >= 1 the result of this coin is certain. We make sure
|
|
# to write a byte to the data stream anyway so that these don't cause
|
|
# difficulties when shrinking.
|
|
if p <= 0:
|
|
data.draw_bits(1, forced=0)
|
|
result = False
|
|
elif p >= 1:
|
|
data.draw_bits(1, forced=1)
|
|
result = True
|
|
else:
|
|
falsey = floor(size * (1 - p))
|
|
truthy = floor(size * p)
|
|
remainder = size * p - truthy
|
|
|
|
if falsey + truthy == size:
|
|
partial = False
|
|
else:
|
|
partial = True
|
|
|
|
if forced is None:
|
|
# We want to get to the point where True is represented by
|
|
# 1 and False is represented by 0 as quickly as possible, so
|
|
# we use the remove_discarded machinery in the shrinker to
|
|
# achieve that by discarding any draws that are > 1 and writing
|
|
# a suitable draw into the choice sequence at the end of the
|
|
# loop.
|
|
data.start_example(BIASED_COIN_INNER_LABEL)
|
|
i = data.draw_bits(bits)
|
|
data.stop_example(discard=i > 1)
|
|
else:
|
|
i = data.draw_bits(bits, forced=int(forced))
|
|
|
|
# We always label the region that causes us to repeat the loop as
|
|
# 255 so that shrinking this byte never causes us to need to draw
|
|
# more data.
|
|
if partial and i == size - 1:
|
|
p = remainder
|
|
continue
|
|
if falsey == 0:
|
|
# Every other partition is truthy, so the result is true
|
|
result = True
|
|
elif truthy == 0:
|
|
# Every other partition is falsey, so the result is false
|
|
result = False
|
|
elif i <= 1:
|
|
# We special case so that zero is always false and 1 is always
|
|
# true which makes shrinking easier because we can always
|
|
# replace a truthy block with 1. This has the slightly weird
|
|
# property that shrinking from 2 to 1 can cause the result to
|
|
# grow, but the shrinker always tries 0 and 1 first anyway, so
|
|
# this will usually be fine.
|
|
result = bool(i)
|
|
else:
|
|
# Originally everything in the region 0 <= i < falsey was false
|
|
# and everything above was true. We swapped one truthy element
|
|
# into this region, so the region becomes 0 <= i <= falsey
|
|
# except for i = 1. We know i > 1 here, so the test for truth
|
|
# becomes i > falsey.
|
|
result = i > falsey
|
|
|
|
if i > 1: # pragma: no branch
|
|
# Thanks to bytecode optimisations on CPython >= 3.7 and PyPy
|
|
# (see https://bugs.python.org/issue2506), coverage incorrectly
|
|
# thinks that this condition is always true. You can trivially
|
|
# check by adding `else: assert False` and running the tests.
|
|
data.draw_bits(bits, forced=int(result))
|
|
break
|
|
data.stop_example()
|
|
return result
|
|
|
|
|
|
class Sampler:
|
|
"""Sampler based on Vose's algorithm for the alias method. See
|
|
http://www.keithschwarz.com/darts-dice-coins/ for a good explanation.
|
|
|
|
The general idea is that we store a table of triples (base, alternate, p).
|
|
base. We then pick a triple uniformly at random, and choose its alternate
|
|
value with probability p and else choose its base value. The triples are
|
|
chosen so that the resulting mixture has the right distribution.
|
|
|
|
We maintain the following invariants to try to produce good shrinks:
|
|
|
|
1. The table is in lexicographic (base, alternate) order, so that choosing
|
|
an earlier value in the list always lowers (or at least leaves
|
|
unchanged) the value.
|
|
2. base[i] < alternate[i], so that shrinking the draw always results in
|
|
shrinking the chosen element.
|
|
"""
|
|
|
|
def __init__(self, weights):
|
|
|
|
n = len(weights)
|
|
|
|
self.table = [[i, None, None] for i in range(n)]
|
|
|
|
total = sum(weights)
|
|
|
|
num_type = type(total)
|
|
|
|
zero = num_type(0)
|
|
one = num_type(1)
|
|
|
|
small = []
|
|
large = []
|
|
|
|
probabilities = [w / total for w in weights]
|
|
scaled_probabilities = []
|
|
|
|
for i, p in enumerate(probabilities):
|
|
scaled = p * n
|
|
scaled_probabilities.append(scaled)
|
|
if scaled == 1:
|
|
self.table[i][2] = zero
|
|
elif scaled < 1:
|
|
small.append(i)
|
|
else:
|
|
large.append(i)
|
|
heapq.heapify(small)
|
|
heapq.heapify(large)
|
|
|
|
while small and large:
|
|
lo = heapq.heappop(small)
|
|
hi = heapq.heappop(large)
|
|
|
|
assert lo != hi
|
|
assert scaled_probabilities[hi] > one
|
|
assert self.table[lo][1] is None
|
|
self.table[lo][1] = hi
|
|
self.table[lo][2] = one - scaled_probabilities[lo]
|
|
scaled_probabilities[hi] = (
|
|
scaled_probabilities[hi] + scaled_probabilities[lo]
|
|
) - one
|
|
|
|
if scaled_probabilities[hi] < 1:
|
|
heapq.heappush(small, hi)
|
|
elif scaled_probabilities[hi] == 1:
|
|
self.table[hi][2] = zero
|
|
else:
|
|
heapq.heappush(large, hi)
|
|
while large:
|
|
self.table[large.pop()][2] = zero
|
|
while small:
|
|
self.table[small.pop()][2] = zero
|
|
|
|
for entry in self.table:
|
|
assert entry[2] is not None
|
|
if entry[1] is None:
|
|
entry[1] = entry[0]
|
|
elif entry[1] < entry[0]:
|
|
entry[0], entry[1] = entry[1], entry[0]
|
|
entry[2] = one - entry[2]
|
|
self.table.sort()
|
|
|
|
def sample(self, data):
|
|
data.start_example(SAMPLE_IN_SAMPLER_LABEL)
|
|
i = integer_range(data, 0, len(self.table) - 1)
|
|
base, alternate, alternate_chance = self.table[i]
|
|
use_alternate = biased_coin(data, alternate_chance)
|
|
data.stop_example()
|
|
if use_alternate:
|
|
return alternate
|
|
else:
|
|
return base
|
|
|
|
|
|
INT_SIZES = (8, 16, 32, 64, 128)
|
|
INT_SIZES_SAMPLER = Sampler((4.0, 8.0, 1.0, 1.0, 0.5))
|
|
|
|
|
|
class many:
|
|
"""Utility class for collections. Bundles up the logic we use for "should I
|
|
keep drawing more values?" and handles starting and stopping examples in
|
|
the right place.
|
|
|
|
Intended usage is something like:
|
|
|
|
elements = many(data, ...)
|
|
while elements.more():
|
|
add_stuff_to_result()
|
|
"""
|
|
|
|
def __init__(self, data, min_size, max_size, average_size):
|
|
assert 0 <= min_size <= average_size <= max_size
|
|
self.min_size = min_size
|
|
self.max_size = max_size
|
|
self.data = data
|
|
self.p_continue = _calc_p_continue(average_size - min_size, max_size - min_size)
|
|
self.count = 0
|
|
self.rejections = 0
|
|
self.drawn = False
|
|
self.force_stop = False
|
|
self.rejected = False
|
|
|
|
def more(self):
|
|
"""Should I draw another element to add to the collection?"""
|
|
if self.drawn:
|
|
self.data.stop_example(discard=self.rejected)
|
|
|
|
self.drawn = True
|
|
self.rejected = False
|
|
|
|
self.data.start_example(ONE_FROM_MANY_LABEL)
|
|
|
|
if self.min_size == self.max_size:
|
|
should_continue = self.count < self.min_size
|
|
else:
|
|
forced_result = None
|
|
if self.force_stop:
|
|
forced_result = False
|
|
elif self.count < self.min_size:
|
|
forced_result = True
|
|
elif self.count >= self.max_size:
|
|
forced_result = False
|
|
should_continue = biased_coin(
|
|
self.data, self.p_continue, forced=forced_result
|
|
)
|
|
|
|
if should_continue:
|
|
self.count += 1
|
|
return True
|
|
else:
|
|
self.data.stop_example()
|
|
return False
|
|
|
|
def reject(self):
|
|
"""Reject the last example (i.e. don't count it towards our budget of
|
|
elements because it's not going to go in the final collection)."""
|
|
assert self.count > 0
|
|
self.count -= 1
|
|
self.rejections += 1
|
|
self.rejected = True
|
|
# We set a minimum number of rejections before we give up to avoid
|
|
# failing too fast when we reject the first draw.
|
|
if self.rejections > max(3, 2 * self.count):
|
|
if self.count < self.min_size:
|
|
self.data.mark_invalid()
|
|
else:
|
|
self.force_stop = True
|
|
|
|
|
|
@lru_cache()
|
|
def _calc_p_continue(desired_avg, max_size):
|
|
"""Return the p_continue which will generate the desired average size."""
|
|
if desired_avg == max_size:
|
|
return 1.0
|
|
p_continue = 1 - 1.0 / (1 + desired_avg)
|
|
if p_continue == 0 or max_size == float("inf"):
|
|
assert 0 <= p_continue < 1, p_continue
|
|
return p_continue
|
|
# For small max_size, the infinite-series p_continue is a poor approximation,
|
|
# and while we can't solve the polynomial a few rounds of iteration quickly
|
|
# gets us a good approximate solution in almost all cases (sometimes exact!).
|
|
while _p_continue_to_avg(p_continue, max_size) > desired_avg:
|
|
# This is impossible over the reals, but *can* happen with floats.
|
|
p_continue -= 0.0001
|
|
# Let's binary-search our way to a better estimate! We tried fancier options
|
|
# like gradient descent, but this is numerically stable and works better.
|
|
hi = 1.0
|
|
while desired_avg - _p_continue_to_avg(p_continue, max_size) > 0.01:
|
|
assert p_continue < hi
|
|
mid = (p_continue + hi) / 2
|
|
if _p_continue_to_avg(mid, max_size) <= desired_avg:
|
|
p_continue = mid
|
|
else:
|
|
hi = mid
|
|
assert 0 < p_continue < 1
|
|
assert _p_continue_to_avg(p_continue, max_size) <= desired_avg
|
|
return p_continue
|
|
|
|
|
|
def _p_continue_to_avg(p_continue, max_size):
|
|
"""Return the average_size generated by this p_continue and max_size."""
|
|
if p_continue >= 1:
|
|
return max_size
|
|
return (1.0 / (1 - p_continue) - 1) * (1 - p_continue ** max_size)
|