microproduct/atmosphericDelay/ISCEApp/site-packages/hypothesis/internal/conjecture/shrinker.py


# This file is part of Hypothesis, which may be found at
# https://github.com/HypothesisWorks/hypothesis/
#
# Most of this work is copyright (C) 2013-2021 David R. MacIver
# (david@drmaciver.com), but it contains contributions by others. See
# CONTRIBUTING.rst for a full list of people who may hold copyright, and
# consult the git log if you need to determine who owns an individual
# contribution.
#
# This Source Code Form is subject to the terms of the Mozilla Public License,
# v. 2.0. If a copy of the MPL was not distributed with this file, You can
# obtain one at https://mozilla.org/MPL/2.0/.
#
# END HEADER
from collections import defaultdict
from typing import Dict
import attr
from hypothesis.internal.compat import int_from_bytes, int_to_bytes
from hypothesis.internal.conjecture.choicetree import (
ChoiceTree,
prefix_selection_order,
random_selection_order,
)
from hypothesis.internal.conjecture.data import ConjectureResult, Status
from hypothesis.internal.conjecture.floats import (
DRAW_FLOAT_LABEL,
float_to_lex,
lex_to_float,
)
from hypothesis.internal.conjecture.junkdrawer import (
binary_search,
find_integer,
replace_all,
)
from hypothesis.internal.conjecture.shrinking import Float, Integer, Lexical, Ordering
from hypothesis.internal.conjecture.shrinking.learned_dfas import SHRINKING_DFAS
def sort_key(buffer):
"""Returns a sort key such that "simpler" buffers are smaller than
"more complicated" ones.
We define sort_key so that x is simpler than y if x is shorter than y or if
they have the same length and x < y lexicographically. This is called the
shortlex order.
The reason for using the shortlex order is:
1. If x is shorter than y then that means we had to make fewer decisions
in constructing the test case when we ran x than we did when we ran y.
2. If x is the same length as y then replacing a byte with a lower byte
corresponds to reducing the value of an integer we drew with draw_bits
towards zero.
3. We want a total order, and given (2) the natural choices for things of
the same size are either the lexicographic or colexicographic orders
(the latter being the lexicographic order of the reverse of the string).
Values drawn early in generation potentially get used in more places,
so they tend to have a more significant impact on the final result. It
therefore makes sense to prioritise reducing earlier values over later
ones, which makes the lexicographic order the more natural choice.
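For example, under this order a longer buffer is always more complex
than a shorter one, and ties are broken bytewise from the left:

.. code-block:: python

    assert sort_key(bytes([1, 1])) < sort_key(bytes([0, 0, 0]))  # shorter wins
    assert sort_key(bytes([0, 1])) < sort_key(bytes([0, 2]))  # then lexicographic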
"""
return (len(buffer), buffer)
SHRINK_PASS_DEFINITIONS: Dict[str, "ShrinkPassDefinition"] = {}
@attr.s()
class ShrinkPassDefinition:
"""A shrink pass bundles together a large number of local changes to
the current shrink target.
Each shrink pass is defined by a function that takes a chooser, which it
uses to enumerate all of the choices that might be worth trying against
the current shrink target.
The guarantee made by passes defined this way is that after a pass has
been run to exhaustion *either* the shrink target has changed *or* the
function has been run for every reachable sequence of choices.
No guarantee is made that all of these will be tried if the shrink target
changes.
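For example, a pass is typically declared on ``Shrinker`` with the
``defines_shrink_pass`` decorator below. A minimal sketch (``my_pass`` is
an invented name, modelled on the real passes later in this file):

.. code-block:: python

    @defines_shrink_pass()
    def my_pass(self, chooser):
        # Pick one non-trivial block and try replacing it with zeroes.
        block = chooser.choose(self.blocks, lambda b: not b.trivial)
        self.incorporate_new_buffer(
            self.buffer[: block.start]
            + bytes(block.length)
            + self.buffer[block.end :]
        )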
"""
run_with_chooser = attr.ib()
@property
def name(self):
return self.run_with_chooser.__name__
def __attrs_post_init__(self):
assert self.name not in SHRINK_PASS_DEFINITIONS, self.name
SHRINK_PASS_DEFINITIONS[self.name] = self
def defines_shrink_pass():
"""A convenient decorator for defining shrink passes."""
def accept(run_step):
ShrinkPassDefinition(run_with_chooser=run_step)
def run(self):
raise NotImplementedError("Shrink passes should not be run directly")
run.__name__ = run_step.__name__
run.is_shrink_pass = True
return run
return accept
class Shrinker:
"""A shrinker is a child object of a ConjectureRunner which is designed to
manage the associated state of a particular shrink problem. That is, we
have some initial ConjectureData object and some property of interest
that it satisfies, and we want to find a ConjectureData object with a
shortlex (see sort_key above) smaller buffer that exhibits the same
property.
Currently the only property of interest we use is that the status is
INTERESTING and the interesting_origin takes on some fixed value, but we
may potentially be interested in other use cases later.
However we assume that data with a status < VALID never satisfies the predicate.
The shrinker keeps track of a value shrink_target which represents the
current best known ConjectureData object satisfying the predicate.
It refines this value by repeatedly running *shrink passes*, which are
methods that perform a series of transformations to the current shrink_target
and evaluate the underlying test function to find new ConjectureData
objects. If any of these satisfy the predicate, the shrink_target
is updated automatically. Shrinking runs until no shrink pass can
improve the shrink_target, at which point it stops. It may also be
terminated if the underlying engine throws RunIsComplete, but that
is handled by the calling code rather than the Shrinker.
=======================
Designing Shrink Passes
=======================
Generally a shrink pass is just any function that calls
cached_test_function and/or incorporate_new_buffer a number of times,
but there are a couple of useful things to bear in mind.
A shrink pass *makes progress* if running it changes self.shrink_target
(i.e. it finds a shortlex smaller ConjectureData object satisfying
the predicate). The desired end state of shrinking is to find a
value such that no shrink pass can make progress, i.e. that we
are at a local minimum for each shrink pass.
In aid of this goal, the main invariant that a shrink pass must
satisfy is that whether it makes progress must be deterministic.
It is fine (encouraged even) for the specific progress it makes
to be non-deterministic, but if you run a shrink pass, it makes
no progress, and then you immediately run it again, it should
never succeed on the second time. This allows us to stop as soon
as we have run each shrink pass and seen no progress on any of
them.
This means that e.g. it's fine to try each of N deletions
or replacements in a random order, but it's not OK to try N random
deletions (unless you have already shrunk at least once, though we
don't currently take advantage of this loophole).
Shrink passes need to be written so as to be robust against
changes in the underlying shrink target. It is generally safe
to assume that the shrink target does not change prior to the
point of first modification - e.g. if you change no bytes at or
before index ``i``, all examples whose start is ``<= i`` still exist,
as do all blocks, and the data object is still of length
``>= i + 1``. This can only be violated by bad user code which
relies on an external source of non-determinism.
When the underlying shrink_target changes, shrink
passes should not run substantially more test_function calls
on success than they do on failure. Say, no more than a constant
factor more. In particular shrink passes should not iterate to a
fixed point.
This means that shrink passes are often written with loops that
are carefully designed to do the right thing in the case that no
shrinks occurred and try to adapt to any changes to do a reasonable
job. e.g. say we wanted to write a shrink pass that tried deleting
each individual byte (this isn't an especially good choice,
but it leads to a simple illustrative example), we might do it
by iterating over the buffer like so:
.. code-block:: python
i = 0
while i < len(self.shrink_target.buffer):
if not self.incorporate_new_buffer(
self.shrink_target.buffer[:i] + self.shrink_target.buffer[i + 1 :]
):
i += 1
The reason for writing the loop this way is that i is always a
valid index into the current buffer, even if the current buffer
changes as a result of our actions. When the buffer changes,
we leave the index where it is rather than restarting from the
beginning, and carry on. This means that the number of steps we
run in this case is always bounded above by the number of steps
we would run if nothing works.
Another thing to bear in mind about shrink pass design is that
they should prioritise *progress*. If you have N operations that
you need to run, you should try to order them in such a way as
to avoid stalling, where you have long periods of test function
invocations where no shrinks happen. This is bad because whenever
we shrink we reduce the amount of work the shrinker has to do
in future, and often speed up the test function, so we ideally
want those shrinks to happen much earlier in the process.
Sometimes stalls are inevitable of course - e.g. if the pass
makes no progress, then the entire thing is just one long stall,
but it's helpful to design it so that stalls are less likely
in typical behaviour.
The two easiest ways to do this are:
* Just run the N steps in random order. As long as a
reasonably large proportion of the operations succeed, this
guarantees the expected stall length is quite short. The
bookkeeping for making sure this does the right thing when
it succeeds can be quite annoying.
* When you have any sort of nested loop, loop in such a way
that both loop variables change each time. This prevents
stalls which occur when one particular value for the outer
loop is impossible to make progress on, rendering the entire
inner loop into a stall.
However, although progress is good, too much progress can be
a bad sign! If you're *only* seeing successful reductions,
that's probably a sign that you are making changes that are
too timid. Two useful things to offset this:
* It's worth writing shrink passes which are *adaptive*, in
the sense that when operations seem to be working really
well we try to bundle multiple of them together. This can
often be used to turn what would be O(m) successful calls
into O(log(m)) - see the sketch at the end of this docstring.
* It's often worth trying one or two special minimal values
before trying anything more fine grained (e.g. replacing
the whole thing with zero).
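As an illustrative sketch of the adaptive style (``delete_run_at`` is an
invented helper, not one of the passes in this file), ``find_integer`` can
be used to grow the amount of work done per step, so that a run of m
adjacent successful deletions costs O(log(m)) calls rather than O(m):

.. code-block:: python

    def delete_run_at(self, i):
        initial = self.buffer
        # find_integer adaptively searches for a large k such that
        # deleting k bytes starting at index i still satisfies the
        # predicate, using O(log(k)) calls rather than trying each
        # k in turn.
        find_integer(
            lambda k: self.incorporate_new_buffer(initial[:i] + initial[i + k :])
        )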
"""
def derived_value(fn): # noqa: B902
"""It's useful during shrinking to have access to derived values of
the current shrink target.
This decorator allows you to define these as cached properties. They
are calculated once, then cached until the shrink target changes, then
recalculated the next time they are used."""
def accept(self):
try:
return self.__derived_values[fn.__name__]
except KeyError:
return self.__derived_values.setdefault(fn.__name__, fn(self))
accept.__name__ = fn.__name__
return property(accept)
def __init__(self, engine, initial, predicate, allow_transition):
"""Create a shrinker for a particular engine, with a given starting
point and predicate. When shrink() is called it will attempt to find an
example for which predicate is True and which is strictly smaller than
initial.
Note that initial is a ConjectureData object, and predicate
takes ConjectureData objects.
"""
assert predicate is not None or allow_transition is not None
self.engine = engine
self.__predicate = predicate or (lambda data: True)
self.__allow_transition = allow_transition or (lambda source, destination: True)
self.__derived_values = {}
self.__pending_shrink_explanation = None
self.initial_size = len(initial.buffer)
# We keep track of the current best example on the shrink_target
# attribute.
self.shrink_target = None
self.update_shrink_target(initial)
self.shrinks = 0
# We terminate shrinks that seem to have reached their logical
# conclusion: If we've called the underlying test function at
# least self.max_stall times since the last time we shrunk,
# it's time to stop shrinking.
self.max_stall = 200
self.initial_calls = self.engine.call_count
self.calls_at_last_shrink = self.initial_calls
self.passes_by_name = {}
self.passes = []
# Extra DFAs that may be installed. This is used solely for
# testing and learning purposes.
self.extra_dfas = {}
@derived_value # type: ignore
def cached_calculations(self):
return {}
def cached(self, *keys):
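"""Memoise a computation against the current shrink target. Used as a
decorator, this calls ``f`` (or returns the previously computed result)
keyed on ``f.__name__`` plus ``keys``, and binds the decorated name to the
resulting value. The cache lives in ``cached_calculations``, a derived
value, so it is discarded whenever the shrink target changes. See
``pass_to_descendant`` for a usage example."""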
def accept(f):
cache_key = (f.__name__,) + keys
try:
return self.cached_calculations[cache_key]
except KeyError:
return self.cached_calculations.setdefault(cache_key, f())
return accept
def add_new_pass(self, run):
"""Creates a shrink pass corresponding to calling ``run(self)``"""
definition = SHRINK_PASS_DEFINITIONS[run]
p = ShrinkPass(
run_with_chooser=definition.run_with_chooser,
shrinker=self,
index=len(self.passes),
)
self.passes.append(p)
self.passes_by_name[p.name] = p
return p
def shrink_pass(self, name):
"""Return the ShrinkPass object for the pass with the given name."""
if isinstance(name, ShrinkPass):
return name
if name not in self.passes_by_name:
self.add_new_pass(name)
return self.passes_by_name[name]
@derived_value # type: ignore
def match_cache(self):
return {}
def matching_regions(self, dfa):
"""Returns all pairs (u, v) such that self.buffer[u:v] is accepted
by this DFA."""
try:
return self.match_cache[dfa]
except KeyError:
pass
results = dfa.all_matching_regions(self.buffer)
results.sort(key=lambda t: (t[1] - t[0], t[1]))
assert all(dfa.matches(self.buffer[u:v]) for u, v in results)
self.match_cache[dfa] = results
return results
@property
def calls(self):
"""Return the number of calls that have been made to the underlying
test function."""
return self.engine.call_count
def consider_new_buffer(self, buffer):
"""Returns True if after running this buffer the result would be
the current shrink_target."""
buffer = bytes(buffer)
return buffer.startswith(self.buffer) or self.incorporate_new_buffer(buffer)
def incorporate_new_buffer(self, buffer):
"""Either runs the test function on this buffer and returns True if
that changed the shrink_target, or determines that doing so would
be useless and returns False without running it."""
buffer = bytes(buffer[: self.shrink_target.index])
# Sometimes an attempt at lexicographic minimization will do the wrong
# thing because the buffer has changed under it (e.g. something has
# turned into a write, the bit size has changed). The result would be
# an invalid string, but it's better for us to just ignore it here as
# it turns out to involve quite a lot of tricky book-keeping to get
# this right and it's better to just handle it in one place.
if sort_key(buffer) >= sort_key(self.shrink_target.buffer):
return False
if self.shrink_target.buffer.startswith(buffer):
return False
previous = self.shrink_target
self.cached_test_function(buffer)
return previous is not self.shrink_target
def incorporate_test_data(self, data):
"""Takes a ConjectureData or Overrun object updates the current
shrink_target if this data represents an improvement over it."""
if data.status < Status.VALID or data is self.shrink_target:
return
if (
self.__predicate(data)
and sort_key(data.buffer) < sort_key(self.shrink_target.buffer)
and self.__allow_transition(self.shrink_target, data)
):
self.update_shrink_target(data)
def cached_test_function(self, buffer):
"""Returns a cached version of the underlying test function, so
that the result is either an Overrun object (if the buffer is
too short to be a valid test case) or a ConjectureData object
with status >= INVALID that would result from running this buffer."""
buffer = bytes(buffer)
result = self.engine.cached_test_function(buffer)
self.incorporate_test_data(result)
if self.calls - self.calls_at_last_shrink >= self.max_stall:
raise StopShrinking()
return result
def debug(self, msg):
self.engine.debug(msg)
@property
def random(self):
return self.engine.random
def shrink(self):
"""Run the full set of shrinks and update shrink_target.
This method is "mostly idempotent" - calling it twice is unlikely to
have any effect, though it has a non-zero probability of doing so.
"""
# We assume that if an all-zero block of bytes is an interesting
# example then we're not going to do better than that.
# This might not technically be true: e.g. for integers() | booleans()
# the simplest example is actually [1, 0]. Missing this case is fairly
# harmless and this allows us to make various simplifying assumptions
# about the structure of the data (principally that we're never
# operating on a block of all zero bytes so can use non-zeroness as a
# signpost of complexity).
if not any(self.shrink_target.buffer) or self.incorporate_new_buffer(
bytes(len(self.shrink_target.buffer))
):
return
try:
self.greedy_shrink()
except StopShrinking:
pass
finally:
if self.engine.report_debug_info:
def s(n):
return "s" if n != 1 else ""
total_deleted = self.initial_size - len(self.shrink_target.buffer)
self.debug("---------------------")
self.debug("Shrink pass profiling")
self.debug("---------------------")
self.debug("")
calls = self.engine.call_count - self.initial_calls
self.debug(
"Shrinking made a total of %d call%s "
"of which %d shrank. This deleted %d byte%s out of %d."
% (
calls,
s(calls),
self.shrinks,
total_deleted,
s(total_deleted),
self.initial_size,
)
)
for useful in [True, False]:
self.debug("")
if useful:
self.debug("Useful passes:")
else:
self.debug("Useless passes:")
self.debug("")
for p in sorted(
self.passes, key=lambda t: (-t.calls, t.deletions, t.shrinks)
):
if p.calls == 0:
continue
if (p.shrinks != 0) != useful:
continue
self.debug(
" * %s made %d call%s of which "
"%d shrank, deleting %d byte%s."
% (
p.name,
p.calls,
s(p.calls),
p.shrinks,
p.deletions,
s(p.deletions),
)
)
self.debug("")
def greedy_shrink(self):
"""Run a full set of greedy shrinks (that is, ones that will only ever
move to a better target) and update shrink_target appropriately.
This method iterates to a fixed point and so is idempotent - calling
it twice will have exactly the same effect as calling it once.
"""
self.fixate_shrink_passes(
[
block_program("X" * 5),
block_program("X" * 4),
block_program("X" * 3),
block_program("X" * 2),
block_program("X" * 1),
"pass_to_descendant",
"reorder_examples",
"minimize_floats",
"minimize_duplicated_blocks",
block_program("-XX"),
"minimize_individual_blocks",
block_program("--X"),
"redistribute_block_pairs",
"lower_blocks_together",
]
+ [dfa_replacement(n) for n in SHRINKING_DFAS]
)
@derived_value # type: ignore
def shrink_pass_choice_trees(self):
return defaultdict(ChoiceTree)
def fixate_shrink_passes(self, passes):
"""Run steps from each pass in ``passes`` until the current shrink target
is a fixed point of all of them."""
passes = list(map(self.shrink_pass, passes))
any_ran = True
while any_ran:
any_ran = False
reordering = {}
# We run remove_discarded after every pass to do cleanup
# keeping track of whether that actually works. Either there is
# no discarded data and it is basically free, or it reliably works
# and deletes data, or it doesn't work. In that latter case we turn
# it off for the rest of this loop through the passes, but will
# try again once all of the passes have been run.
can_discard = self.remove_discarded()
calls_at_loop_start = self.calls
# We keep track of how many calls can be made by a single step
# without making progress and use this to test how much to pad
# out self.max_stall by as we go along.
max_calls_per_failing_step = 1
for sp in passes:
if can_discard:
can_discard = self.remove_discarded()
before_sp = self.shrink_target
# Run the shrink pass until it fails to make any progress
# max_failures times in a row. This implicitly boosts shrink
# passes that are more likely to work.
failures = 0
max_failures = 20
while failures < max_failures:
# We don't allow more than max_stall consecutive failures
# to shrink, but this means that if we're unlucky and the
# shrink passes are in a bad order where only the ones at
# the end are useful, if we're not careful this heuristic
# might stop us before we've tried everything. In order to
# avoid that happening, we make sure that there's always
# plenty of breathing room to make it through a single
# iteration of the fixate_shrink_passes loop.
self.max_stall = max(
self.max_stall,
2 * max_calls_per_failing_step
+ (self.calls - calls_at_loop_start),
)
prev = self.shrink_target
initial_calls = self.calls
# It's better for us to run shrink passes in a deterministic
# order, to avoid repeat work, but this can cause us to create
# long stalls when there are a lot of steps which fail to do
# anything useful. In order to avoid this, once we've noticed
# we're in a stall (i.e. half of max_failures calls have failed
# to do anything) we switch to randomly jumping around. If we
# find a success then we'll resume deterministic order from
# there which, with any luck, is in a new good region.
if not sp.step(random_order=failures >= max_failures // 2):
# step returns False when there is nothing to do because
# the entire choice tree is exhausted. If this happens
# we break because we literally can't run this pass any
# more than we already have until something else makes
# progress.
break
any_ran = True
# Don't count steps that didn't actually try to do
# anything as failures. Otherwise, this call is a failure
# if it failed to make any changes to the shrink target.
if initial_calls != self.calls:
if prev is not self.shrink_target:
failures = 0
else:
max_calls_per_failing_step = max(
max_calls_per_failing_step, self.calls - initial_calls
)
failures += 1
# We reorder the shrink passes so that on our next run through
# we try good ones first. The rule is that shrink passes that
# did nothing useful are the worst, shrink passes that reduced
# the length are the best.
if self.shrink_target is before_sp:
reordering[sp] = 1
elif len(self.buffer) < len(before_sp.buffer):
reordering[sp] = -1
else:
reordering[sp] = 0
passes.sort(key=reordering.__getitem__)
@property
def buffer(self):
return self.shrink_target.buffer
@property
def blocks(self):
return self.shrink_target.blocks
@property
def examples(self):
return self.shrink_target.examples
def all_block_bounds(self):
return self.shrink_target.blocks.all_bounds()
@derived_value # type: ignore
def examples_by_label(self):
"""An index of all examples grouped by their label, with
the examples stored in their normal index order."""
examples_by_label = defaultdict(list)
for ex in self.examples:
examples_by_label[ex.label].append(ex)
return dict(examples_by_label)
@derived_value # type: ignore
def distinct_labels(self):
return sorted(self.examples_by_label, key=str)
@defines_shrink_pass()
def pass_to_descendant(self, chooser):
"""Attempt to replace each example with a descendant example.
This is designed to deal with strategies that call themselves
recursively. For example, suppose we had:
binary_tree = st.deferred(
lambda: st.one_of(
st.integers(), st.tuples(binary_tree, binary_tree)))
This pass guarantees that we can replace any binary tree with one of
its subtrees - each of those will create an interval that the parent
could validly be replaced with, and this pass will try doing that.
This is pretty expensive - it takes O(len(intervals)^2) - so we run it
late in the process when we've got the number of intervals as far down
as possible.
"""
label = chooser.choose(
self.distinct_labels, lambda l: len(self.examples_by_label[l]) >= 2
)
ls = self.examples_by_label[label]
i = chooser.choose(range(len(ls) - 1))
ancestor = ls[i]
if i + 1 == len(ls) or ls[i + 1].start >= ancestor.end:
return
@self.cached(label, i)
def descendants():
lo = i + 1
hi = len(ls)
while lo + 1 < hi:
mid = (lo + hi) // 2
if ls[mid].start >= ancestor.end:
hi = mid
else:
lo = mid
return [t for t in ls[i + 1 : hi] if t.length < ancestor.length]
descendant = chooser.choose(descendants, lambda ex: ex.length > 0)
assert ancestor.start <= descendant.start
assert ancestor.end >= descendant.end
assert descendant.length < ancestor.length
self.incorporate_new_buffer(
self.buffer[: ancestor.start]
+ self.buffer[descendant.start : descendant.end]
+ self.buffer[ancestor.end :]
)
def lower_common_block_offset(self):
"""Sometimes we find ourselves in a situation where changes to one part
of the byte stream unlock changes to other parts. Sometimes this is
good, but sometimes this can cause us to exhibit exponential slow
downs!
e.g. suppose we had the following:
m = draw(integers(min_value=0))
n = draw(integers(min_value=0))
assert abs(m - n) > 1
If this fails then we'll end up with a loop where on each iteration we
reduce each of m and n by 2 - m can't go lower because of n, then n
can't go lower because of m.
This will take us O(m) iterations to complete, which is exponential in
the data size, as we gradually zig zag our way towards zero.
This can only happen if we're failing to reduce the size of the byte
stream: The number of iterations that reduce the length of the byte
stream is bounded by that length.
So what we do is this: We keep track of which blocks are changing, and
then if there's some non-zero common offset to them we try and minimize
them all at once by lowering that offset.
This may not work, and it definitely won't get us out of all possible
exponential slow downs (an example of where it doesn't is where the
shape of the blocks changes as a result of this bouncing behaviour),
but it fails fast when it doesn't work and gets us out of a really
nastily slow case when it does.
"""
if len(self.__changed_blocks) <= 1:
return
current = self.shrink_target
blocked = [current.buffer[u:v] for u, v in self.all_block_bounds()]
changed = [
i
for i in sorted(self.__changed_blocks)
if not self.shrink_target.blocks[i].trivial
]
if not changed:
return
ints = [int_from_bytes(blocked[i]) for i in changed]
offset = min(ints)
assert offset > 0
for i in range(len(ints)):
ints[i] -= offset
def reoffset(o):
new_blocks = list(blocked)
for i, v in zip(changed, ints):
new_blocks[i] = int_to_bytes(v + o, len(blocked[i]))
return self.incorporate_new_buffer(b"".join(new_blocks))
Integer.shrink(offset, reoffset, random=self.random)
self.clear_change_tracking()
def clear_change_tracking(self):
self.__last_checked_changed_at = self.shrink_target
self.__all_changed_blocks = set()
def mark_changed(self, i):
self.__changed_blocks.add(i)
@property
def __changed_blocks(self):
if self.__last_checked_changed_at is not self.shrink_target:
prev_target = self.__last_checked_changed_at
new_target = self.shrink_target
assert prev_target is not new_target
prev = prev_target.buffer
new = new_target.buffer
assert sort_key(new) < sort_key(prev)
if (
len(new_target.blocks) != len(prev_target.blocks)
or new_target.blocks.endpoints != prev_target.blocks.endpoints
):
self.__all_changed_blocks = set()
else:
blocks = new_target.blocks
# Index of last block whose contents have been modified, found
# by checking if the tail past this point has been modified.
last_changed = binary_search(
0,
len(blocks),
lambda i: prev[blocks.start(i) :] != new[blocks.start(i) :],
)
# Index of the first block whose contents have been changed. The
# predicate is trivially true at zero (the prefix before block zero
# is empty), so the search returns a block whose preceding prefix is
# unchanged but where the prefix up to the start of the next block
# differs, i.e. the first changed byte lies inside this block.
first_changed = binary_search(
0,
len(blocks),
lambda i: prev[: blocks.start(i)] == new[: blocks.start(i)],
)
# Between these two changed regions we now do a linear scan to
# check if any specific block values have changed.
for i in range(first_changed, last_changed + 1):
u, v = blocks.bounds(i)
if i not in self.__all_changed_blocks and prev[u:v] != new[u:v]:
self.__all_changed_blocks.add(i)
self.__last_checked_changed_at = new_target
assert self.__last_checked_changed_at is self.shrink_target
return self.__all_changed_blocks
def update_shrink_target(self, new_target):
assert isinstance(new_target, ConjectureResult)
if self.shrink_target is not None:
self.shrinks += 1
# If we are just taking a long time to shrink we don't want to
# trigger this heuristic, so whenever we shrink successfully
# we give ourselves a bit of breathing room to make sure we
# would find a shrink that took that long to find the next time.
# The case where we're taking a long time but making steady
# progress is handled by `finish_shrinking_deadline` in engine.py
self.max_stall = max(
self.max_stall, (self.calls - self.calls_at_last_shrink) * 2
)
self.calls_at_last_shrink = self.calls
else:
self.__all_changed_blocks = set()
self.__last_checked_changed_at = new_target
self.shrink_target = new_target
self.__derived_values = {}
def try_shrinking_blocks(self, blocks, b):
"""Attempts to replace each block in the blocks list with b. Returns
True if it succeeded (which may include some additional modifications
to shrink_target).
In current usage it is expected that each of the blocks currently have
the same value, although this is not essential. Note that b must be
< the block at min(blocks) or this is not a valid shrink.
This method will attempt to do some small amount of work to delete data
that occurs after the end of the blocks. This is useful for cases where
there is some size dependency on the value of a block.
"""
initial_attempt = bytearray(self.shrink_target.buffer)
for i, block in enumerate(blocks):
if block >= len(self.blocks):
blocks = blocks[:i]
break
u, v = self.blocks[block].bounds
n = min(self.blocks[block].length, len(b))
initial_attempt[v - n : v] = b[-n:]
if not blocks:
return False
start = self.shrink_target.blocks[blocks[0]].start
end = self.shrink_target.blocks[blocks[-1]].end
initial_data = self.cached_test_function(initial_attempt)
if initial_data is self.shrink_target:
self.lower_common_block_offset()
return True
# If this produced something completely invalid we ditch it
# here rather than trying to persevere.
if initial_data.status < Status.VALID:
return False
# We've shrunk inside our group of blocks, so we have no way to
# continue. (This only happens when shrinking more than one block at
# a time).
if len(initial_data.buffer) < v:
return False
lost_data = len(self.shrink_target.buffer) - len(initial_data.buffer)
# If this did not in fact cause the data size to shrink we
# bail here because it's not worth trying to delete stuff from
# the remainder.
if lost_data <= 0:
return False
# We now look for contiguous regions to delete that might help fix up
# this failed shrink. We only look for contiguous regions of the right
# lengths because doing anything more than that starts to get very
# expensive. See minimize_individual_blocks for where we
# try to be more aggressive.
regions_to_delete = {(end, end + lost_data)}
for j in (blocks[-1] + 1, blocks[-1] + 2):
if j >= min(len(initial_data.blocks), len(self.blocks)):
continue
# We look for a block very shortly after the last one that has
# lost some of its size, and try to delete from the beginning so
# that it retains the same integer value. This is a bit of a hyper
# specific trick designed to make our integers() strategy shrink
# well.
r1, s1 = self.shrink_target.blocks[j].bounds
r2, s2 = initial_data.blocks[j].bounds
lost = (s1 - r1) - (s2 - r2)
# Apparently a coverage bug? An assert False in the body of this
# will reliably fail, but it shows up as uncovered.
if lost <= 0 or r1 != r2: # pragma: no cover
continue
regions_to_delete.add((r1, r1 + lost))
for ex in self.shrink_target.examples:
if ex.start > start:
continue
if ex.end <= end:
continue
replacement = initial_data.examples[ex.index]
in_original = [c for c in ex.children if c.start >= end]
in_replaced = [c for c in replacement.children if c.start >= end]
if len(in_replaced) >= len(in_original) or not in_replaced:
continue
# We've found an example where some of the children went missing
# as a result of this change, and just replacing it with the data
# it would have had and removing the spillover didn't work. This
# means that some of its children towards the right must be
# important, so we try to arrange it so that it retains its
# rightmost children instead of its leftmost.
regions_to_delete.add(
(in_original[0].start, in_original[-len(in_replaced)].start)
)
for u, v in sorted(regions_to_delete, key=lambda x: x[1] - x[0], reverse=True):
try_with_deleted = bytearray(initial_attempt)
del try_with_deleted[u:v]
if self.incorporate_new_buffer(try_with_deleted):
return True
return False
def remove_discarded(self):
"""Try removing all bytes marked as discarded.
This is primarily to deal with data that has been ignored while
doing rejection sampling - e.g. as a result of an integer range, or a
filtered strategy.
Such data will also be handled by the adaptive_example_deletion pass,
but that pass is necessarily more conservative and will try deleting
each interval individually. The common case is that all data drawn and
rejected can just be thrown away immediately in one block, so this pass
will be much faster than trying each one individually when it works.
Returns False if there is discarded data and removing it does not work,
otherwise returns True.
"""
while self.shrink_target.has_discards:
discarded = []
for ex in self.shrink_target.examples:
if (
ex.length > 0
and ex.discarded
and (not discarded or ex.start >= discarded[-1][-1])
):
discarded.append((ex.start, ex.end))
# This can happen if we have discards but they are all of
# zero length. This shouldn't happen very often so it's
# faster to check for it here than at the point of example
# generation.
if not discarded:
break
attempt = bytearray(self.shrink_target.buffer)
for u, v in reversed(discarded):
del attempt[u:v]
if not self.incorporate_new_buffer(attempt):
return False
return True
@derived_value # type: ignore
def blocks_by_non_zero_suffix(self):
"""Returns a list of blocks grouped by their non-zero suffix,
as a list of (suffix, indices) pairs, skipping all groupings
where there is only one index.
This is only used for the arguments of minimize_duplicated_blocks.
"""
duplicates = defaultdict(list)
for block in self.blocks:
duplicates[non_zero_suffix(self.buffer[block.start : block.end])].append(
block.index
)
return duplicates
@derived_value # type: ignore
def duplicated_block_suffixes(self):
return sorted(self.blocks_by_non_zero_suffix)
@defines_shrink_pass()
def minimize_duplicated_blocks(self, chooser):
"""Find blocks that have been duplicated in multiple places and attempt
to minimize all of the duplicates simultaneously.
This lets us handle cases where two values can't be shrunk
independently of each other but can easily be shrunk together.
For example if we had something like:
ls = data.draw(lists(integers()))
y = data.draw(integers())
assert y not in ls
Suppose we drew y = 3 and after shrinking we have ls = [3]. If we were
to replace both 3s with 0, this would be a valid shrink, but if we were
to replace either 3 with 0 on its own the test would start passing.
It is also useful for when that duplication is accidental and the value
of the blocks doesn't matter very much because it allows us to replace
more values at once.
"""
block = chooser.choose(self.duplicated_block_suffixes)
targets = self.blocks_by_non_zero_suffix[block]
if len(targets) <= 1:
return
Lexical.shrink(
block,
lambda b: self.try_shrinking_blocks(targets, b),
random=self.random,
full=False,
)
@defines_shrink_pass()
def minimize_floats(self, chooser):
"""Some shrinks that we employ that only really make sense for our
specific floating point encoding that are hard to discover from any
sort of reasonable general principle. This allows us to make
transformations like replacing a NaN with an Infinity or replacing
a float with its nearest integers that we would otherwise not be
able to due to them requiring very specific transformations of
the bit sequence.
We only apply these transformations to blocks that "look like" our
standard float encodings because they are only really meaningful
there. The logic for detecting this is reasonably precise, but
it doesn't matter if it's wrong. These are always valid
transformations to make, they just don't necessarily correspond to
anything particularly meaningful for non-float values.
"""
ex = chooser.choose(
self.examples,
lambda ex: (
ex.label == DRAW_FLOAT_LABEL
and len(ex.children) == 2
and ex.children[0].length == 8
),
)
u = ex.children[0].start
v = ex.children[0].end
buf = self.shrink_target.buffer
b = buf[u:v]
f = lex_to_float(int_from_bytes(b))
b2 = int_to_bytes(float_to_lex(f), 8)
if b == b2 or self.consider_new_buffer(buf[:u] + b2 + buf[v:]):
Float.shrink(
f,
lambda x: self.consider_new_buffer(
self.shrink_target.buffer[:u]
+ int_to_bytes(float_to_lex(x), 8)
+ self.shrink_target.buffer[v:]
),
random=self.random,
)
@defines_shrink_pass()
def redistribute_block_pairs(self, chooser):
"""If there is a sum of generated integers that we need their sum
to exceed some bound, lowering one of them requires raising the
other. This pass enables that."""
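# An illustrative example of the problem this solves (not code from this
# module): given
#
#     m = data.draw(st.integers(0, 255))
#     n = data.draw(st.integers(0, 255))
#     assert m + n < 100
#
# the minimal failing example is m=0, n=100, but lowering m on its own
# makes the test pass. boost() below lowers one block by k while raising
# a later block of the same length by k, keeping the sum fixed.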
block = chooser.choose(self.blocks, lambda b: not b.all_zero)
for j in range(block.index + 1, len(self.blocks)):
next_block = self.blocks[j]
if next_block.length == block.length:
break
else:
return
buffer = self.buffer
m = int_from_bytes(buffer[block.start : block.end])
n = int_from_bytes(buffer[next_block.start : next_block.end])
def boost(k):
if k > m:
return False
attempt = bytearray(buffer)
attempt[block.start : block.end] = int_to_bytes(m - k, block.length)
try:
attempt[next_block.start : next_block.end] = int_to_bytes(
n + k, next_block.length
)
except OverflowError:
return False
return self.consider_new_buffer(attempt)
find_integer(boost)
@defines_shrink_pass()
def lower_blocks_together(self, chooser):
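"""Attempt to lower a pair of nearby blocks by the same amount
simultaneously. This is useful when two drawn integers are linked by
their ordering or difference - for example a test that only fails when
``m >= n``: lowering ``m`` on its own would make it pass, but lowering
``m`` and ``n`` together keeps it failing while still shrinking the
buffer. (The example is illustrative; the pass itself just subtracts
the same value from the two chosen blocks, as below.)"""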
block = chooser.choose(self.blocks, lambda b: not b.all_zero)
# Choose the next block to be up to eight blocks onwards. We don't
# want to go too far (to avoid quadratic time) but it's worth a
# reasonable amount of lookahead, especially as we expect most
# blocks are zero by this point anyway.
next_block = self.blocks[
chooser.choose(
range(block.index + 1, min(len(self.blocks), block.index + 9)),
lambda j: not self.blocks[j].all_zero,
)
]
buffer = self.buffer
m = int_from_bytes(buffer[block.start : block.end])
n = int_from_bytes(buffer[next_block.start : next_block.end])
def lower(k):
if k > min(m, n):
return False
attempt = bytearray(buffer)
attempt[block.start : block.end] = int_to_bytes(m - k, block.length)
attempt[next_block.start : next_block.end] = int_to_bytes(
n - k, next_block.length
)
assert len(attempt) == len(buffer)
return self.consider_new_buffer(attempt)
find_integer(lower)
@defines_shrink_pass()
def minimize_individual_blocks(self, chooser):
"""Attempt to minimize each block in sequence.
This is the pass that ensures that e.g. each integer we draw is a
minimum value. So it's the part that guarantees that if we e.g. do
x = data.draw(integers())
assert x < 10
then in our shrunk example, x = 10 rather than say 97.
If we are unsuccessful at minimizing a block of interest we then
check if that's because it's changing the size of the test case and,
if so, we also make an attempt to delete parts of the test case to
see if that fixes it.
We handle most of the common cases in try_shrinking_blocks which is
pretty good at clearing out large contiguous blocks of dead space,
but it fails when there is data that has to stay in particular places
in the list.
"""
block = chooser.choose(self.blocks, lambda b: not b.trivial)
initial = self.shrink_target
u, v = block.bounds
i = block.index
Lexical.shrink(
self.shrink_target.buffer[u:v],
lambda b: self.try_shrinking_blocks((i,), b),
random=self.random,
full=False,
)
if self.shrink_target is not initial:
return
lowered = (
self.buffer[: block.start]
+ int_to_bytes(
int_from_bytes(self.buffer[block.start : block.end]) - 1, block.length
)
+ self.buffer[block.end :]
)
attempt = self.cached_test_function(lowered)
if (
attempt.status < Status.VALID
or len(attempt.buffer) == len(self.buffer)
or len(attempt.buffer) == block.end
):
return
# If it were then the lexical shrink should have worked and we could
# never have got here.
assert attempt is not self.shrink_target
@self.cached(block.index)
def first_example_after_block():
lo = 0
hi = len(self.examples)
while lo + 1 < hi:
mid = (lo + hi) // 2
ex = self.examples[mid]
if ex.start >= block.end:
hi = mid
else:
lo = mid
return hi
ex = self.examples[
chooser.choose(
range(first_example_after_block, len(self.examples)),
lambda i: self.examples[i].length > 0,
)
]
u, v = block.bounds
buf = bytearray(lowered)
del buf[ex.start : ex.end]
self.incorporate_new_buffer(buf)
@defines_shrink_pass()
def reorder_examples(self, chooser):
"""This pass allows us to reorder the children of each example.
For example, consider the following:
.. code-block:: python
import hypothesis.strategies as st
from hypothesis import given
@given(st.text(), st.text())
def test_not_equal(x, y):
assert x != y
Without the ability to reorder x and y this could fail either with
``x=""``, ``y="0"``, or the other way around. With reordering it will
reliably fail with ``x=""``, ``y="0"``.
"""
ex = chooser.choose(self.examples)
label = chooser.choose(ex.children).label
group = [c for c in ex.children if c.label == label]
if len(group) <= 1:
return
st = self.shrink_target
pieces = [st.buffer[ex.start : ex.end] for ex in group]
endpoints = [(ex.start, ex.end) for ex in group]
Ordering.shrink(
pieces,
lambda ls: self.consider_new_buffer(
replace_all(st.buffer, [(u, v, r) for (u, v), r in zip(endpoints, ls)])
),
random=self.random,
)
def run_block_program(self, i, description, original, repeats=1):
"""Block programs are a mini-DSL for block rewriting, defined as a sequence
of commands that can be run at some index into the blocks
Commands are:
* "-", subtract one from this block.
* "X", delete this block
If a command does not apply (currently only because it's ``-`` on a zero
block) the attempt is abandoned and False is returned. For example,
running ``"-XX"`` at block index ``i`` subtracts one from block ``i`` and
deletes blocks ``i + 1`` and ``i + 2``.
This method runs the block program in ``description`` at block index
``i`` on the ConjectureData ``original``. If ``repeats > 1`` then it
will attempt to approximate the results of running it that many times.
Returns True if this successfully changes the underlying shrink target,
else False.
"""
if i + len(description) > len(original.blocks) or i < 0:
return False
attempt = bytearray(original.buffer)
for _ in range(repeats):
for k, d in reversed(list(enumerate(description))):
j = i + k
u, v = original.blocks[j].bounds
if v > len(attempt):
return False
if d == "-":
value = int_from_bytes(attempt[u:v])
if value == 0:
return False
else:
attempt[u:v] = int_to_bytes(value - 1, v - u)
elif d == "X":
del attempt[u:v]
else:
raise NotImplementedError(f"Unrecognised command {d!r}")
return self.incorporate_new_buffer(attempt)
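# ``shrink_pass_family`` defines a parameterised family of shrink passes:
# calling e.g. ``block_program("-XX")`` registers the corresponding pass
# (at most once) and returns its name, which is how the pass list in
# ``greedy_shrink`` above is assembled.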
def shrink_pass_family(f):
def accept(*args):
name = "{}({})".format(f.__name__, ", ".join(map(repr, args)))
if name not in SHRINK_PASS_DEFINITIONS:
def run(self, chooser):
return f(self, chooser, *args)
run.__name__ = name
defines_shrink_pass()(run)
assert name in SHRINK_PASS_DEFINITIONS
return name
return accept
@shrink_pass_family
def block_program(self, chooser, description):
"""Mini-DSL for block rewriting. A sequence of commands that will be run
over all contiguous sequences of blocks of the description length in order.
Commands are:
* ".", keep this block unchanged
* "-", subtract one from this block.
* "0", replace this block with zero
* "X", delete this block
If a command does not apply (currently only because it's ``-`` on a zero
block) then that particular attempt fails and the shrink target is left
unchanged.
"""
n = len(description)
"""Adaptively attempt to run the block program at the current
index. If this successfully applies the block program ``k`` times
then this runs in ``O(log(k))`` test function calls."""
i = chooser.choose(range(len(self.shrink_target.blocks) - n))
# First, run the block program at the chosen index. If this fails,
# don't do any extra work, so that failure is as cheap as possible.
if not self.run_block_program(i, description, original=self.shrink_target):
return
# Because we run in a random order we will often find ourselves in the middle
# of a region where we could run the block program. We thus start by moving
# left to the beginning of that region if possible, and start from there.
def offset_left(k):
return i - k * n
i = offset_left(
find_integer(
lambda k: self.run_block_program(
offset_left(k), description, original=self.shrink_target
)
)
)
original = self.shrink_target
# Now try to run the block program multiple times here.
find_integer(
lambda k: self.run_block_program(i, description, original=original, repeats=k)
)
@shrink_pass_family
def dfa_replacement(self, chooser, dfa_name):
"""Use one of our previously learned shrinking DFAs to reduce
the current test case. This works by finding a match of the DFA in the
current buffer that is not already minimal and attempting to replace it
with the minimal string matching that DFA.
"""
try:
dfa = SHRINKING_DFAS[dfa_name]
except KeyError:
dfa = self.extra_dfas[dfa_name]
matching_regions = self.matching_regions(dfa)
minimal = next(dfa.all_matching_strings())
u, v = chooser.choose(
matching_regions, lambda t: self.buffer[t[0] : t[1]] != minimal
)
p = self.buffer[u:v]
assert sort_key(minimal) < sort_key(p)
replaced = self.buffer[:u] + minimal + self.buffer[v:]
assert sort_key(replaced) < sort_key(self.buffer)
self.consider_new_buffer(replaced)
@attr.s(slots=True, eq=False)
class ShrinkPass:
run_with_chooser = attr.ib()
index = attr.ib()
shrinker = attr.ib()
last_prefix = attr.ib(default=())
successes = attr.ib(default=0)
calls = attr.ib(default=0)
shrinks = attr.ib(default=0)
deletions = attr.ib(default=0)
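# step() runs one unit of work for this pass: it asks the pass's
# ChoiceTree for a not-yet-exhausted sequence of choices (in prefix or
# random order) and runs run_with_chooser with it, recording how many
# calls, shrinks, and deleted bytes that step was responsible for.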
def step(self, random_order=False):
tree = self.shrinker.shrink_pass_choice_trees[self]
if tree.exhausted:
return False
initial_shrinks = self.shrinker.shrinks
initial_calls = self.shrinker.calls
size = len(self.shrinker.shrink_target.buffer)
self.shrinker.engine.explain_next_call_as(self.name)
if random_order:
selection_order = random_selection_order(self.shrinker.random)
else:
selection_order = prefix_selection_order(self.last_prefix)
try:
self.last_prefix = tree.step(
selection_order,
lambda chooser: self.run_with_chooser(self.shrinker, chooser),
)
finally:
self.calls += self.shrinker.calls - initial_calls
self.shrinks += self.shrinker.shrinks - initial_shrinks
self.deletions += size - len(self.shrinker.shrink_target.buffer)
self.shrinker.engine.clear_call_explanation()
return True
@property
def name(self):
return self.run_with_chooser.__name__
def non_zero_suffix(b):
"""Returns the longest suffix of b that starts with a non-zero
byte."""
i = 0
while i < len(b) and b[i] == 0:
i += 1
return b[i:]
class StopShrinking(Exception):
pass