microproduct/atmosphericDelay/ISCEApp/site-packages/whoosh/util/numeric.py

318 lines
11 KiB
Python

# Copyright 2010 Matt Chaput. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# 1. Redistributions of source code must retain the above copyright notice,
# this list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution.
#
# THIS SOFTWARE IS PROVIDED BY MATT CHAPUT ``AS IS'' AND ANY EXPRESS OR
# IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
# MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO
# EVENT SHALL MATT CHAPUT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
# INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA,
# OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
# EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
# The views and conclusions contained in the software and documentation are
# those of the authors and should not be interpreted as representing official
# policies, either expressed or implied, of Matt Chaput.
import math, struct
from array import array
from bisect import bisect_left
from struct import pack, unpack
from whoosh.compat import b, long_type
from whoosh.system import pack_byte, unpack_byte, pack_ushort, unpack_ushort
from whoosh.system import pack_int, unpack_int, pack_uint, unpack_uint
from whoosh.system import pack_long, unpack_long, pack_ulong, unpack_ulong
from whoosh.system import pack_float, unpack_float, pack_double, unpack_double
NaN = struct.unpack("<d", b('\xff\xff\xff\xff\xff\xff\xff\xff'))[0]
typecode_max = {"b": 127, "B": 255, "h": 2 ** 15 - 1, "H": 2 ** 16 - 1,
"i": 2 ** 31 - 1, "I": 2 ** 32 - 1,
"q": 2 ** 63 - 1, "Q": 2 ** 64 - 1}
typecode_min = {"b": 0 - 128, "B": 0, "h": 0 - 2 ** 15, "H": 0,
"i": 0 - 2 ** 31, "I": 0,
"q": 0 - 2 ** 63, "Q": 0}
typecode_pack = {"B": pack_byte, "H": pack_ushort, "i": pack_int,
"I": pack_uint, "q": pack_long, "Q": pack_ulong,
"f": pack_float, "d": pack_double}
typecode_unpack = {"B": unpack_byte, "H": unpack_ushort, "i": unpack_int,
"I": unpack_uint, "q": unpack_long, "Q": unpack_ulong,
"f": unpack_float, "d": unpack_double}
# Functions related to binary representations
def bits_required(maxnum):
"""Returns the number of bits required to represent the given (unsigned)
integer.
"""
return max(1, math.ceil(math.log(maxnum, 2)))
def typecode_required(maxnum):
if maxnum < 256:
return "B"
elif maxnum < 2 ** 16:
return "H"
elif maxnum < 2 ** 31 - 1:
return "i"
elif maxnum < 2 ** 32:
return "I"
elif maxnum < 2 ** 63 - 1:
return "q"
else:
return "Q"
def max_value(bitcount):
"""Returns the maximum (unsigned) integer representable in the given number
of bits.
"""
return ~(~0 << bitcount)
def bytes_for_bits(bitcount):
r = int(math.ceil((bitcount + 1) / 8.0))
return r
# Functions for converting numbers to and from sortable representations
_istruct = struct.Struct(">i")
_qstruct = struct.Struct(">q")
_dstruct = struct.Struct(">d")
_ipack, _iunpack = _istruct.pack, _istruct.unpack
_qpack, _qunpack = _qstruct.pack, _qstruct.unpack
_dpack, _dunpack = _dstruct.pack, _dstruct.unpack
def to_sortable(numtype, intsize, signed, x):
if numtype is int or numtype is long_type:
if signed:
x += (1 << intsize - 1)
return x
else:
return float_to_sortable_long(x, signed)
def from_sortable(numtype, intsize, signed, x):
if numtype is int or numtype is long_type:
if signed:
x -= (1 << intsize - 1)
return x
else:
return sortable_long_to_float(x, signed)
def float_to_sortable_long(x, signed):
x = _qunpack(_dpack(x))[0]
if x < 0:
x ^= 0x7fffffffffffffff
if signed:
x += 1 << 63
assert x >= 0
return x
def sortable_long_to_float(x, signed):
if signed:
x -= 1 << 63
if x < 0:
x ^= 0x7fffffffffffffff
x = _dunpack(_qpack(x))[0]
return x
# Functions for generating tiered ranges
def split_ranges(intsize, step, start, end):
"""Splits a range of numbers (from ``start`` to ``end``, inclusive)
into a sequence of trie ranges of the form ``(start, end, shift)``. The
consumer of these tuples is expected to shift the ``start`` and ``end``
right by ``shift``.
This is used for generating term ranges for a numeric field. The queries
for the edges of the range are generated at high precision and large blocks
in the middle are generated at low precision.
"""
shift = 0
while True:
diff = 1 << (shift + step)
mask = ((1 << step) - 1) << shift
setbits = lambda x: x | ((1 << shift) - 1)
haslower = (start & mask) != 0
hasupper = (end & mask) != mask
not_mask = ~mask & ((1 << intsize + 1) - 1)
nextstart = (start + diff if haslower else start) & not_mask
nextend = (end - diff if hasupper else end) & not_mask
if shift + step >= intsize or nextstart > nextend:
yield (start, setbits(end), shift)
break
if haslower:
yield (start, setbits(start | mask), shift)
if hasupper:
yield (end & not_mask, setbits(end), shift)
start = nextstart
end = nextend
shift += step
def tiered_ranges(numtype, intsize, signed, start, end, shift_step,
startexcl, endexcl):
assert numtype in (int, float)
assert intsize in (8, 16, 32, 64)
# Convert start and end values to sortable ints
if start is None:
start = 0
else:
start = to_sortable(numtype, intsize, signed, start)
if startexcl:
start += 1
if end is None:
end = 2 ** intsize - 1
else:
end = to_sortable(numtype, intsize, signed, end)
if endexcl:
end -= 1
if not shift_step:
return ((start, end, 0),)
# Yield (rstart, rend, shift) ranges for the different resolutions
return split_ranges(intsize, shift_step, start, end)
# Float-to-byte encoding/decoding
def float_to_byte(value, mantissabits=5, zeroexp=2):
"""Encodes a floating point number in a single byte.
"""
# Assume int size == float size
fzero = (63 - zeroexp) << mantissabits
bits = unpack("i", pack("f", value))[0]
smallfloat = bits >> (24 - mantissabits)
if smallfloat < fzero:
# Map negative numbers and 0 to 0
# Map underflow to next smallest non-zero number
if bits <= 0:
result = chr(0)
else:
result = chr(1)
elif smallfloat >= fzero + 0x100:
# Map overflow to largest number
result = chr(255)
else:
result = chr(smallfloat - fzero)
return b(result)
def byte_to_float(b, mantissabits=5, zeroexp=2):
"""Decodes a floating point number stored in a single byte.
"""
if type(b) is not int:
b = ord(b)
if b == 0:
return 0.0
bits = (b & 0xff) << (24 - mantissabits)
bits += (63 - zeroexp) << 24
return unpack("f", pack("i", bits))[0]
# Length-to-byte approximation functions
# Old implementation:
#def length_to_byte(length):
# """Returns a logarithmic approximation of the given number, in the range
# 0-255. The approximation has high precision at the low end (e.g.
# 1 -> 0, 2 -> 1, 3 -> 2 ...) and low precision at the high end. Numbers
# equal to or greater than 108116 all approximate to 255.
#
# This is useful for storing field lengths, where the general case is small
# documents and very large documents are more rare.
# """
#
# # This encoding formula works up to 108116 -> 255, so if the length is
# # equal to or greater than that limit, just return 255.
# if length >= 108116:
# return 255
#
# # The parameters of this formula where chosen heuristically so that low
# # numbers would approximate closely, and the byte range 0-255 would cover
# # a decent range of document lengths (i.e. 1 to ~100000).
# return int(round(log((length / 27.0) + 1, 1.033)))
#def _byte_to_length(n):
# return int(round((pow(1.033, n) - 1) * 27))
#_b2l_cache = array("i", (_byte_to_length(i) for i in xrange(256)))
#byte_to_length = _b2l_cache.__getitem__
# New implementation
# Instead of computing the actual formula to get the byte for any given length,
# precompute the length associated with each byte, and use bisect to find the
# nearest value. This gives quite a large speed-up.
#
# Note that this does not give all the same answers as the old, "real"
# implementation since this implementation always "rounds down" (thanks to the
# bisect_left) while the old implementation would "round up" or "round down"
# depending on the input. Since this is a fairly gross approximation anyway,
# I don't think it matters much.
# Values generated using the formula from the "old" implementation above
_length_byte_cache = array('i', [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 12, 13, 14,
16, 17, 18, 20, 21, 23, 25, 26, 28, 30, 32, 34, 36, 38, 40, 42, 45, 47, 49, 52,
54, 57, 60, 63, 66, 69, 72, 75, 79, 82, 86, 89, 93, 97, 101, 106, 110, 114,
119, 124, 129, 134, 139, 145, 150, 156, 162, 169, 175, 182, 189, 196, 203, 211,
219, 227, 235, 244, 253, 262, 271, 281, 291, 302, 313, 324, 336, 348, 360, 373,
386, 399, 414, 428, 443, 459, 475, 491, 508, 526, 544, 563, 583, 603, 623, 645,
667, 690, 714, 738, 763, 789, 816, 844, 873, 903, 933, 965, 998, 1032, 1066,
1103, 1140, 1178, 1218, 1259, 1302, 1345, 1391, 1438, 1486, 1536, 1587, 1641,
1696, 1753, 1811, 1872, 1935, 1999, 2066, 2135, 2207, 2280, 2356, 2435, 2516,
2600, 2687, 2777, 2869, 2965, 3063, 3165, 3271, 3380, 3492, 3608, 3728, 3852,
3980, 4112, 4249, 4390, 4536, 4686, 4842, 5002, 5168, 5340, 5517, 5700, 5889,
6084, 6286, 6494, 6709, 6932, 7161, 7398, 7643, 7897, 8158, 8428, 8707, 8995,
9293, 9601, 9918, 10247, 10586, 10936, 11298, 11671, 12057, 12456, 12868,
13294, 13733, 14187, 14656, 15141, 15641, 16159, 16693, 17244, 17814, 18403,
19011, 19640, 20289, 20959, 21652, 22367, 23106, 23869, 24658, 25472, 26314,
27183, 28081, 29009, 29967, 30957, 31979, 33035, 34126, 35254, 36418, 37620,
38863, 40146, 41472, 42841, 44256, 45717, 47227, 48786, 50397, 52061, 53780,
55556, 57390, 59285, 61242, 63264, 65352, 67510, 69739, 72041, 74419, 76876,
79414, 82035, 84743, 87541, 90430, 93416, 96499, 99684, 102975, 106374])
def length_to_byte(length):
if length is None:
return 0
if length >= 106374:
return 255
else:
return bisect_left(_length_byte_cache, length)
byte_to_length = _length_byte_cache.__getitem__