# Copyright 2009 Matt Chaput. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
#    1. Redistributions of source code must retain the above copyright notice,
#       this list of conditions and the following disclaimer.
#
#    2. Redistributions in binary form must reproduce the above copyright
#       notice, this list of conditions and the following disclaimer in the
#       documentation and/or other materials provided with the distribution.
#
# THIS SOFTWARE IS PROVIDED BY MATT CHAPUT ``AS IS'' AND ANY EXPRESS OR
# IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
# MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO
# EVENT SHALL MATT CHAPUT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
# INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA,
# OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
# EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
# The views and conclusions contained in the software and documentation are
# those of the authors and should not be interpreted as representing official
# policies, either expressed or implied, of Matt Chaput.

"""This module defines writer and reader classes for a fast, immutable
on-disk key-value database format. The current format is based heavily on
D. J. Bernstein's CDB format (http://cr.yp.to/cdb.html).
"""

import os
import struct
from binascii import crc32
from bisect import bisect_left
from hashlib import md5  # @UnresolvedImport

from whoosh.compat import b, bytes_type
from whoosh.compat import xrange
from whoosh.util.numlists import GrowableArray
from whoosh.system import _INT_SIZE, emptybytes


# Exceptions

class FileFormatError(Exception):
    pass


# Hash functions

def cdb_hash(key):
    h = 5381
    # Iterate over the key as integer byte values; bytearray yields ints on
    # both Python 2 and Python 3, unlike iterating over bytes directly
    for c in bytearray(key):
        h = (h + (h << 5)) & 0xffffffff ^ c
    return h


def md5_hash(key):
    return int(md5(key).hexdigest(), 16) & 0xffffffff


def crc_hash(key):
    return crc32(key) & 0xffffffff


_hash_functions = (md5_hash, crc_hash, cdb_hash)


# Structs

# Two ints before the key/value pair giving the lengths of the key and value
# (hence the 2 GB limit on each)
_lengths = struct.Struct("!ii")
# A pointer in a hash table, giving the hash value and the key position
_pointer = struct.Struct("!Iq")
# A pointer in the hash table directory, giving the position and number of
# slots
_dir_entry = struct.Struct("!qi")

_directory_size = 256 * _dir_entry.size


# Basic hash file

class HashWriter(object):
    """Implements a fast on-disk key-value store. This hash uses a two-level
    hashing scheme: a key is hashed, and the low eight bits of the hash value
    are used to index into one of 256 hash tables. This is basically the CDB
    algorithm, but unlike CDB this object writes all data serially (it doesn't
    seek backwards to overwrite information at the end).

    Also unlike CDB, this format uses 64-bit file pointers, so the file length
    is essentially unlimited. However, each key and value must be less than
    2 GB in length.
    """

    def __init__(self, dbfile, magic=b("HSH3"), hashtype=0):
        """
        :param dbfile: a :class:`~whoosh.filedb.structfile.StructFile` object
            to write to.
        :param magic: the format tag bytes to write at the start of the file.
        :param hashtype: an integer indicating which hashing algorithm to use.
            Possible values are 0 (MD5), 1 (CRC32), or 2 (CDB hash).
        """

        self.dbfile = dbfile
        self.hashtype = hashtype
        self.hashfn = _hash_functions[self.hashtype]
        # A place for subclasses to put extra metadata
        self.extras = {}

        self.startoffset = dbfile.tell()
        # Write format tag
        dbfile.write(magic)
        # Write hash type
        dbfile.write_byte(self.hashtype)
        # Unused future expansion bits
        dbfile.write_int(0)
        dbfile.write_int(0)

        # 256 lists of hashed keys and positions
        self.buckets = [[] for _ in xrange(256)]
        # List to remember the positions of the hash tables
        self.directory = []

    def tell(self):
        return self.dbfile.tell()

    def add(self, key, value):
        """Adds a key/value pair to the file. Note that keys DO NOT need to
        be unique. You can store multiple values under the same key and
        retrieve them using :meth:`HashReader.all`.
        """

        assert isinstance(key, bytes_type)
        assert isinstance(value, bytes_type)

        dbfile = self.dbfile
        pos = dbfile.tell()
        dbfile.write(_lengths.pack(len(key), len(value)))
        dbfile.write(key)
        dbfile.write(value)

        # Get hash value for the key
        h = self.hashfn(key)
        # Add hash and on-disk position to appropriate bucket
        self.buckets[h & 255].append((h, pos))

    def add_all(self, items):
        """Convenience method to add a sequence of ``(key, value)`` pairs.
        This is the same as calling :meth:`HashWriter.add` on each pair in
        the sequence.
        """

        add = self.add
        for key, value in items:
            add(key, value)

    def _write_hashes(self):
        # Writes 256 hash tables containing pointers to the key/value pairs

        dbfile = self.dbfile
        # Represent an empty slot in the hash table using 0,0 (no key can
        # start at position 0 because of the header)
        null = (0, 0)

        for entries in self.buckets:
            # Start position of this bucket's hash table
            pos = dbfile.tell()
            # Remember the start position and the number of slots
            numslots = 2 * len(entries)
            self.directory.append((pos, numslots))

            # Create the empty hash table
            hashtable = [null] * numslots
            # For each (hash value, key position) tuple in the bucket
            for hashval, position in entries:
                # Bitshift and wrap to get the slot for this entry
                slot = (hashval >> 8) % numslots
                # If the slot is taken, keep going until we find an empty slot
                while hashtable[slot] != null:
                    slot = (slot + 1) % numslots
                # Insert the entry into the hashtable
                hashtable[slot] = (hashval, position)

            # Write the hash table for this bucket to disk
            for hashval, position in hashtable:
                dbfile.write(_pointer.pack(hashval, position))

    def _write_directory(self):
        # Writes a directory of pointers to the 256 hash tables

        dbfile = self.dbfile
        for position, numslots in self.directory:
            dbfile.write(_dir_entry.pack(position, numslots))

    def _write_extras(self):
        self.dbfile.write_pickle(self.extras)

    def close(self):
        dbfile = self.dbfile

        # Write hash tables
        self._write_hashes()
        # Write directory of pointers to hash tables
        self._write_directory()

        expos = dbfile.tell()
        # Write extra information
        self._write_extras()
        # Write length of pickle
        dbfile.write_int(dbfile.tell() - expos)

        endpos = dbfile.tell()
        dbfile.close()
        return endpos


class HashReader(object):
    """Reader for the fast on-disk key-value files created by
    :class:`HashWriter`.
    """

    def __init__(self, dbfile, length=None, magic=b("HSH3"), startoffset=0):
        """
        :param dbfile: a :class:`~whoosh.filedb.structfile.StructFile` object
            to read from.
        :param length: the length of the file data. This is necessary since
            the hashing information is written at the end of the file.
        :param magic: the format tag bytes to look for at the start of the
            file. If the file's format tag does not match these bytes, the
            object raises a :class:`FileFormatError` exception.
        :param startoffset: the starting point of the file data.
        """

        self.dbfile = dbfile
        self.startoffset = startoffset
        self.is_closed = False

        if length is None:
            dbfile.seek(0, os.SEEK_END)
            length = dbfile.tell() - startoffset

        dbfile.seek(startoffset)
        # Check format tag
        filemagic = dbfile.read(4)
        if filemagic != magic:
            raise FileFormatError("Unknown file header %r" % filemagic)
        # Read hash type
        self.hashtype = dbfile.read_byte()
        self.hashfn = _hash_functions[self.hashtype]
        # Skip unused future expansion bits
        dbfile.read_int()
        dbfile.read_int()
        self.startofdata = dbfile.tell()

        exptr = startoffset + length - _INT_SIZE
        # Get the length of the extras from the end of the file
        exlen = dbfile.get_int(exptr)
        # Read the extras
        expos = exptr - exlen
        dbfile.seek(expos)
        self._read_extras()

        # Calculate the directory base from the beginning of the extras
        dbfile.seek(expos - _directory_size)
        # Read directory of hash tables
        self.tables = []
        entrysize = _dir_entry.size
        unpackentry = _dir_entry.unpack
        for _ in xrange(256):
            # position, numslots
            self.tables.append(unpackentry(dbfile.read(entrysize)))
        # The position of the first hash table marks the end of the key/value
        # pairs
        self.endofdata = self.tables[0][0]

    @classmethod
    def open(cls, storage, name):
        """Convenience method to open a hash file given a
        :class:`whoosh.filedb.filestore.Storage` object and a name. This takes
        care of opening the file and passing its length to the initializer.
        """

        length = storage.file_length(name)
        dbfile = storage.open_file(name)
        return cls(dbfile, length)

    def file(self):
        return self.dbfile

    def _read_extras(self):
        try:
            self.extras = self.dbfile.read_pickle()
        except EOFError:
            self.extras = {}

    def close(self):
        if self.is_closed:
            raise Exception("Tried to close %r twice" % self)
        self.dbfile.close()
        self.is_closed = True

    def key_at(self, pos):
        # Returns the key bytes at the given position

        dbfile = self.dbfile
        keylen = dbfile.get_uint(pos)
        return dbfile.get(pos + _lengths.size, keylen)

    def key_and_range_at(self, pos):
        # Returns a (keybytes, datapos, datalen) tuple for the key at the
        # given position
        dbfile = self.dbfile
        lenssize = _lengths.size

        if pos >= self.endofdata:
            return None

        keylen, datalen = _lengths.unpack(dbfile.get(pos, lenssize))
        keybytes = dbfile.get(pos + lenssize, keylen)
        datapos = pos + lenssize + keylen
        return keybytes, datapos, datalen

    def _ranges(self, pos=None, eod=None):
        # Yields a series of (keypos, keylength, datapos, datalength) tuples
        # for the key/value pairs in the file
        dbfile = self.dbfile
        pos = pos or self.startofdata
        eod = eod or self.endofdata
        lenssize = _lengths.size
        unpacklens = _lengths.unpack

        while pos < eod:
            keylen, datalen = unpacklens(dbfile.get(pos, lenssize))
            keypos = pos + lenssize
            datapos = keypos + keylen
            yield (keypos, keylen, datapos, datalen)
            pos = datapos + datalen

    def __getitem__(self, key):
        for value in self.all(key):
            return value
        raise KeyError(key)

    def __iter__(self):
        dbfile = self.dbfile
        for keypos, keylen, datapos, datalen in self._ranges():
            key = dbfile.get(keypos, keylen)
            value = dbfile.get(datapos, datalen)
            yield (key, value)

    def __contains__(self, key):
        for _ in self.ranges_for_key(key):
            return True
        return False

    def keys(self):
        dbfile = self.dbfile
        for keypos, keylen, _, _ in self._ranges():
            yield dbfile.get(keypos, keylen)

    def values(self):
        dbfile = self.dbfile
        for _, _, datapos, datalen in self._ranges():
            yield dbfile.get(datapos, datalen)

    def items(self):
        dbfile = self.dbfile
        for keypos, keylen, datapos, datalen in self._ranges():
            yield (dbfile.get(keypos, keylen), dbfile.get(datapos, datalen))

    def get(self, key, default=None):
        for value in self.all(key):
            return value
        return default

    def all(self, key):
        """Yields a sequence of values associated with the given key.
        """

        dbfile = self.dbfile
        for datapos, datalen in self.ranges_for_key(key):
            yield dbfile.get(datapos, datalen)

    def ranges_for_key(self, key):
        """Yields a sequence of ``(datapos, datalength)`` tuples associated
        with the given key.
        """

        if not isinstance(key, bytes_type):
            raise TypeError("Key %r should be bytes" % key)
        dbfile = self.dbfile

        # Hash the key
        keyhash = self.hashfn(key)
        # Get the position and number of slots for the hash table in which
        # the key may be found
        tablestart, numslots = self.tables[keyhash & 255]
        # If the hash table is empty, we know the key doesn't exist
        if not numslots:
            return

        ptrsize = _pointer.size
        unpackptr = _pointer.unpack
        lenssize = _lengths.size
        unpacklens = _lengths.unpack

        # Calculate where the key's slot should be
        slotpos = tablestart + (((keyhash >> 8) % numslots) * ptrsize)
        # Read slots looking for our key's hash value
        for _ in xrange(numslots):
            slothash, itempos = unpackptr(dbfile.get(slotpos, ptrsize))
            # If this slot is empty, we're done
            if not itempos:
                return

            # If the key hash in this slot matches our key's hash, we might
            # have a match, so read the actual key and see if it's our key
            if slothash == keyhash:
                # Read the key and value lengths
                keylen, datalen = unpacklens(dbfile.get(itempos, lenssize))
                # Only bother reading the actual key if the lengths match
                if keylen == len(key):
                    keystart = itempos + lenssize
                    if key == dbfile.get(keystart, keylen):
                        # The keys match, so yield (datapos, datalen)
                        yield (keystart + keylen, datalen)

            slotpos += ptrsize
            # If we reach the end of the hashtable, wrap around
            if slotpos == tablestart + (numslots * ptrsize):
                slotpos = tablestart

    def range_for_key(self, key):
        for item in self.ranges_for_key(key):
            return item
        raise KeyError(key)


# Ordered hash file

class OrderedHashWriter(HashWriter):
    """Implements an on-disk hash, but requires that keys be added in order.
    An :class:`OrderedHashReader` can then look up "nearest keys" based on
    the ordering.
    """

    def __init__(self, dbfile):
        HashWriter.__init__(self, dbfile)
        # Keep an array of the positions of all keys
        self.index = GrowableArray("H")
        # Keep track of the last key added
        self.lastkey = emptybytes

    def add(self, key, value):
        if key <= self.lastkey:
            raise ValueError("Keys must increase: %r..%r"
                             % (self.lastkey, key))
        self.index.append(self.dbfile.tell())
        HashWriter.add(self, key, value)
        self.lastkey = key

    def _write_extras(self):
        dbfile = self.dbfile
        index = self.index

        # Store metadata about the index array in the extras
        self.extras["indextype"] = index.typecode
        self.extras["indexlen"] = len(index)
        # Write the extras
        HashWriter._write_extras(self)
        # Write the index array
        index.to_file(dbfile)


class OrderedHashReader(HashReader):
    def closest_key(self, key):
        """Returns the closest key equal to or greater than the given key. If
        there is no key in the file equal to or greater than the given key,
        returns None.
        """

        pos = self.closest_key_pos(key)
        if pos is None:
            return None
        return self.key_at(pos)

    def ranges_from(self, key):
        """Yields a series of ``(keypos, keylen, datapos, datalen)`` tuples
        for the ordered series of keys equal to or greater than the given
        key.
        """

        pos = self.closest_key_pos(key)
        if pos is None:
            return

        for item in self._ranges(pos=pos):
            yield item

    def keys_from(self, key):
        """Yields an ordered series of keys equal to or greater than the
        given key.
        """

        dbfile = self.dbfile
        for keypos, keylen, _, _ in self.ranges_from(key):
            yield dbfile.get(keypos, keylen)

    def items_from(self, key):
        """Yields an ordered series of ``(key, value)`` tuples for keys equal
        to or greater than the given key.
        """

        dbfile = self.dbfile
        for keypos, keylen, datapos, datalen in self.ranges_from(key):
            yield (dbfile.get(keypos, keylen), dbfile.get(datapos, datalen))

    def _read_extras(self):
        dbfile = self.dbfile

        # Read the extras
        HashReader._read_extras(self)

        # Set up for reading the index array
        indextype = self.extras["indextype"]
        self.indexbase = dbfile.tell()
        self.indexlen = self.extras["indexlen"]
        self.indexsize = struct.calcsize(indextype)
        # Set up the function to read values from the index array
        if indextype == "B":
            self._get_pos = dbfile.get_byte
        elif indextype == "H":
            self._get_pos = dbfile.get_ushort
        elif indextype == "i":
            self._get_pos = dbfile.get_int
        elif indextype == "I":
            self._get_pos = dbfile.get_uint
        elif indextype == "q":
            self._get_pos = dbfile.get_long
        else:
            raise Exception("Unknown index type %r" % indextype)

    def closest_key_pos(self, key):
        # Given a key, returns the position of that key OR of the next
        # highest key if the given key does not exist
        if not isinstance(key, bytes_type):
            raise TypeError("Key %r should be bytes" % key)

        indexbase = self.indexbase
        indexsize = self.indexsize
        key_at = self.key_at
        _get_pos = self._get_pos

        # Do a binary search of the positions in the index array
        lo = 0
        hi = self.indexlen
        while lo < hi:
            mid = (lo + hi) // 2
            midkey = key_at(_get_pos(indexbase + mid * indexsize))
            if midkey < key:
                lo = mid + 1
            else:
                hi = mid

        # If we went off the end, return None
        if lo == self.indexlen:
            return None
        # Return the position of the closest key
        return _get_pos(indexbase + lo * indexsize)


# Fielded ordered hash file

class FieldedOrderedHashWriter(HashWriter):
    """Implements an on-disk hash, but writes separate position indexes for
    each field.
    """

    def __init__(self, dbfile):
        HashWriter.__init__(self, dbfile)
        # Map field names to (startpos, indexpos, length, typecode)
        self.fieldmap = self.extras["fieldmap"] = {}

        # Keep track of the last key added
        self.lastkey = emptybytes

    def start_field(self, fieldname):
        self.fieldstart = self.dbfile.tell()
        self.fieldname = fieldname
        # Keep an array of the positions of all keys
        self.poses = GrowableArray("H")
        self.lastkey = emptybytes

    def add(self, key, value):
        if key <= self.lastkey:
            raise ValueError("Keys must increase: %r..%r"
                             % (self.lastkey, key))
        self.poses.append(self.dbfile.tell() - self.fieldstart)
        HashWriter.add(self, key, value)
        self.lastkey = key

    def end_field(self):
        dbfile = self.dbfile
        fieldname = self.fieldname
        poses = self.poses
        self.fieldmap[fieldname] = (self.fieldstart, dbfile.tell(),
                                    len(poses), poses.typecode)
        poses.to_file(dbfile)


class FieldedOrderedHashReader(HashReader):
    def __init__(self, *args, **kwargs):
        HashReader.__init__(self, *args, **kwargs)
        self.fieldmap = self.extras["fieldmap"]
        # Make a sorted list of the field names with their start and end
        # ranges
        self.fieldlist = []
        for fieldname in sorted(self.fieldmap.keys()):
            startpos, ixpos, ixsize, ixtype = self.fieldmap[fieldname]
            self.fieldlist.append((fieldname, startpos, ixpos))

    def field_start(self, fieldname):
        return self.fieldmap[fieldname][0]

    def fielded_ranges(self, pos=None, eod=None):
        flist = self.fieldlist
        fpos = 0
        fieldname, start, end = flist[fpos]
        for keypos, keylen, datapos, datalen in self._ranges(pos, eod):
            if keypos >= end:
                fpos += 1
                fieldname, start, end = flist[fpos]
            yield fieldname, keypos, keylen, datapos, datalen

    def iter_terms(self):
        get = self.dbfile.get
        for fieldname, keypos, keylen, _, _ in self.fielded_ranges():
            yield fieldname, get(keypos, keylen)

    def iter_term_items(self):
        get = self.dbfile.get
        for item in self.fielded_ranges():
            fieldname, keypos, keylen, datapos, datalen = item
            yield fieldname, get(keypos, keylen), get(datapos, datalen)

    def contains_term(self, fieldname, btext):
        try:
            self.range_for_term(fieldname, btext)
            return True
        except KeyError:
            return False

    def range_for_term(self, fieldname, btext):
        start, ixpos, ixsize, code = self.fieldmap[fieldname]
        for datapos, datalen in self.ranges_for_key(btext):
            if start < datapos < ixpos:
                return datapos, datalen
        raise KeyError((fieldname, btext))

    def term_data(self, fieldname, btext):
        datapos, datalen = self.range_for_term(fieldname, btext)
        return self.dbfile.get(datapos, datalen)

    def term_get(self, fieldname, btext, default=None):
        try:
            return self.term_data(fieldname, btext)
        except KeyError:
            return default

    def closest_term_pos(self, fieldname, key):
        # Given a key, returns the position of that key OR of the next
        # highest key if the given key does not exist
        if not isinstance(key, bytes_type):
            raise TypeError("Key %r should be bytes" % key)

        dbfile = self.dbfile
        key_at = self.key_at
        startpos, ixpos, ixsize, ixtype = self.fieldmap[fieldname]
        # ixsize is the number of entries in the index array; the byte width
        # of each entry comes from the typecode (matching closest_key_pos in
        # OrderedHashReader, which strides by struct.calcsize, not the count)
        itemsize = struct.calcsize(ixtype)

        if ixtype == "B":
            get_pos = dbfile.get_byte
        elif ixtype == "H":
            get_pos = dbfile.get_ushort
        elif ixtype == "i":
            get_pos = dbfile.get_int
        elif ixtype == "I":
            get_pos = dbfile.get_uint
        elif ixtype == "q":
            get_pos = dbfile.get_long
        else:
            raise Exception("Unknown index type %r" % ixtype)

        # Do a binary search of the positions in the index array
        lo = 0
        hi = ixsize
        while lo < hi:
            mid = (lo + hi) // 2
            midkey = key_at(startpos + get_pos(ixpos + mid * itemsize))
            if midkey < key:
                lo = mid + 1
            else:
                hi = mid

        # If we went off the end, return None
        if lo == ixsize:
            return None
        # Return the position of the closest key
        return startpos + get_pos(ixpos + lo * itemsize)

    def closest_term(self, fieldname, btext):
        pos = self.closest_term_pos(fieldname, btext)
        if pos is None:
            return None
        return self.key_at(pos)

    def term_ranges_from(self, fieldname, btext):
        pos = self.closest_term_pos(fieldname, btext)
        if pos is None:
            return

        startpos, ixpos, ixsize, ixtype = self.fieldmap[fieldname]
        for item in self._ranges(pos, ixpos):
            yield item

    def terms_from(self, fieldname, btext):
        dbfile = self.dbfile
        for keypos, keylen, _, _ in self.term_ranges_from(fieldname, btext):
            yield dbfile.get(keypos, keylen)

    def term_items_from(self, fieldname, btext):
        dbfile = self.dbfile
        for item in self.term_ranges_from(fieldname, btext):
            keypos, keylen, datapos, datalen = item
            yield (dbfile.get(keypos, keylen), dbfile.get(datapos, datalen))