403 lines
12 KiB
Python
403 lines
12 KiB
Python
# Copyright 2009 Matt Chaput. All rights reserved.
|
|
#
|
|
# Redistribution and use in source and binary forms, with or without
|
|
# modification, are permitted provided that the following conditions are met:
|
|
#
|
|
# 1. Redistributions of source code must retain the above copyright notice,
|
|
# this list of conditions and the following disclaimer.
|
|
#
|
|
# 2. Redistributions in binary form must reproduce the above copyright
|
|
# notice, this list of conditions and the following disclaimer in the
|
|
# documentation and/or other materials provided with the distribution.
|
|
#
|
|
# THIS SOFTWARE IS PROVIDED BY MATT CHAPUT ``AS IS'' AND ANY EXPRESS OR
|
|
# IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
|
|
# MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO
|
|
# EVENT SHALL MATT CHAPUT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
|
|
# INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
|
# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA,
|
|
# OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
|
|
# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
|
# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
|
|
# EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|
#
|
|
# The views and conclusions contained in the software and documentation are
|
|
# those of the authors and should not be interpreted as representing official
|
|
# policies, either expressed or implied, of Matt Chaput.
|
|
|
|
from array import array
|
|
from copy import copy
|
|
from struct import calcsize
|
|
|
|
from whoosh.compat import BytesIO, bytes_type
|
|
from whoosh.compat import dump as dump_pickle
|
|
from whoosh.compat import load as load_pickle
|
|
from whoosh.compat import array_frombytes, array_tobytes
|
|
from whoosh.system import _INT_SIZE, _SHORT_SIZE, _FLOAT_SIZE, _LONG_SIZE
|
|
from whoosh.system import IS_LITTLE
|
|
from whoosh.system import pack_byte, unpack_byte, pack_sbyte, unpack_sbyte
|
|
from whoosh.system import pack_ushort, unpack_ushort
|
|
from whoosh.system import pack_ushort_le, unpack_ushort_le
|
|
from whoosh.system import pack_int, unpack_int, pack_uint, unpack_uint
|
|
from whoosh.system import pack_uint_le, unpack_uint_le
|
|
from whoosh.system import pack_long, unpack_long, pack_ulong, unpack_ulong
|
|
from whoosh.system import pack_float, unpack_float
|
|
from whoosh.util.varints import varint, read_varint
|
|
from whoosh.util.varints import signed_varint, decode_signed_varint
|
|
|
|
|
|
# Size in bytes (native ``struct`` sizing) of every array typecode the file
# classes below may be asked to read.
_SIZEMAP = dict(zip("bBiIhHqQf", map(calcsize, "bBiIhHqQf")))

# ``struct`` byte-order prefix for each byte-order name.
_ORDERMAP = dict(little="<", big=">")

# (name, struct typecode) pairs.
_types = (
    ("sbyte", "b"),
    ("ushort", "H"),
    ("int", "i"),
    ("long", "q"),
    ("float", "f"),
)
|
|
|
|
|
|
# Main class
|
|
|
|
class StructFile(object):
    """Wraps a file-like object and provides numerous additional methods for
    reading and writing structured binary data, such as "write_varint" and
    "write_long".

    The plain file methods (``read``, ``write``, ``seek``, ``tell``, ...) are
    forwarded unchanged to the wrapped object. The ``read_*`` methods read at
    the current file position; the ``get_*`` methods first seek to an explicit
    position and then read.
    """

    def __init__(self, fileobj, name=None, onclose=None):
        """
        :param fileobj: the file-like object to wrap.
        :param name: an optional name, used by ``repr()`` and ``str()`` and
            inherited by objects returned from ``subset()``.
        :param onclose: an optional callable; ``close()`` calls it with this
            object as the only argument before closing the wrapped file.
        """

        self.file = fileobj
        self._name = name
        self.onclose = onclose
        self.is_closed = False

        # An object exposing ``fileno`` is treated as a "real" file; the array
        # methods then use array.tofile()/fromfile() directly on it.
        self.is_real = hasattr(fileobj, "fileno")
        if self.is_real:
            self.fileno = fileobj.fileno

    def __repr__(self):
        return "%s(%r)" % (self.__class__.__name__, self._name)

    def __str__(self):
        # The name defaults to None; __str__ must always return a string or
        # str(obj) raises TypeError.
        return str(self._name)

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        self.close()

    def __iter__(self):
        return iter(self.file)

    def raw_file(self):
        """Returns the wrapped file object.
        """

        return self.file

    def read(self, *args, **kwargs):
        return self.file.read(*args, **kwargs)

    def readline(self, *args, **kwargs):
        return self.file.readline(*args, **kwargs)

    def write(self, *args, **kwargs):
        return self.file.write(*args, **kwargs)

    def tell(self, *args, **kwargs):
        return self.file.tell(*args, **kwargs)

    def seek(self, *args, **kwargs):
        return self.file.seek(*args, **kwargs)

    def truncate(self, *args, **kwargs):
        return self.file.truncate(*args, **kwargs)

    def flush(self):
        """Flushes the buffer of the wrapped file. This is a no-op if the
        wrapped file does not have a flush method.
        """

        if hasattr(self.file, "flush"):
            self.file.flush()

    def close(self):
        """Closes the wrapped file.

        The ``onclose`` callback (if any) is called first, while the wrapped
        file is still open. Raises ``Exception`` if this object has already
        been closed.
        """

        if self.is_closed:
            raise Exception("This file is already closed")
        if self.onclose:
            self.onclose(self)
        if hasattr(self.file, "close"):
            self.file.close()
        self.is_closed = True

    def subset(self, offset, length, name=None):
        """Returns a new StructFile over a ``SubFile`` built from the wrapped
        file, ``offset`` and ``length``. The new object takes this object's
        name unless ``name`` is given.
        """

        # Imported here rather than at module level (presumably to avoid a
        # circular import -- confirm before moving it).
        from whoosh.filedb.compound import SubFile

        name = name or self._name
        return StructFile(SubFile(self.file, offset, length), name=name)

    def write_string(self, s):
        """Writes a string to the wrapped file. This method writes the length
        of the string first, so you can read the string back without having to
        know how long it was.
        """

        self.write_varint(len(s))
        self.write(s)

    def write_string2(self, s):
        """Writes the string prefixed with its length packed as an unsigned
        short.
        """

        self.write(pack_ushort(len(s)) + s)

    def write_string4(self, s):
        """Writes the string prefixed with its length packed as an int.
        """

        self.write(pack_int(len(s)) + s)

    def read_string(self):
        """Reads a string from the wrapped file.
        """

        return self.read(self.read_varint())

    def read_string2(self):
        """Reads a string written by ``write_string2``.
        """

        length = self.read_ushort()
        return self.read(length)

    def read_string4(self):
        """Reads a string written by ``write_string4``.
        """

        length = self.read_int()
        return self.read(length)

    def get_string2(self, pos):
        """Reads an unsigned-short-prefixed string starting at ``pos``.
        Returns a tuple of ``(string, position just past the string)``.
        """

        length = self.get_ushort(pos)
        base = pos + _SHORT_SIZE
        return self.get(base, length), base + length

    def get_string4(self, pos):
        """Reads an int-prefixed string starting at ``pos``. Returns a tuple
        of ``(string, position just past the string)``.
        """

        length = self.get_int(pos)
        base = pos + _INT_SIZE
        return self.get(base, length), base + length

    def skip_string(self):
        """Skips over a varint-prefixed string without reading its contents.
        """

        length = self.read_varint()
        # whence=1: seek relative to the current position
        self.seek(length, 1)

    def write_varint(self, i):
        """Writes a variable-length unsigned integer to the wrapped file.
        """

        self.write(varint(i))

    def write_svarint(self, i):
        """Writes a variable-length signed integer to the wrapped file.
        """

        self.write(signed_varint(i))

    def read_varint(self):
        """Reads a variable-length encoded unsigned integer from the wrapped
        file.
        """

        return read_varint(self.read)

    def read_svarint(self):
        """Reads a variable-length encoded signed integer from the wrapped
        file.
        """

        return decode_signed_varint(read_varint(self.read))

    def write_tagint(self, i):
        """Writes a sometimes-compressed unsigned integer to the wrapped file.
        This is similar to the varint methods but uses a less compressed but
        faster format.
        """

        # Store numbers 0-253 in one byte. Byte 254 means "an unsigned 16-bit
        # int follows." Byte 255 means "An unsigned 32-bit int follows."
        #
        # The tag bytes are built with pack_byte() rather than chr() or a
        # "\xFE" text literal so that the value written is a byte string on
        # Python 3 as well as Python 2.
        if i <= 253:
            self.write(pack_byte(i))
        elif i <= 65535:
            self.write(pack_byte(254) + pack_ushort(i))
        else:
            self.write(pack_byte(255) + pack_uint(i))

    def read_tagint(self):
        """Reads a sometimes-compressed unsigned integer from the wrapped file.
        This is similar to the varint methods but uses a less compressed but
        faster format.
        """

        tb = ord(self.read(1))
        if tb == 254:
            return self.read_ushort()
        elif tb == 255:
            return self.read_uint()
        else:
            return tb

    def write_byte(self, n):
        """Writes a single byte to the wrapped file, shortcut for
        ``file.write(pack_byte(n))``.
        """

        self.write(pack_byte(n))

    def read_byte(self):
        """Reads a single byte and returns it as an integer.
        """

        return ord(self.read(1))

    def write_pickle(self, obj, protocol=-1):
        """Writes a pickled representation of obj to the wrapped file.
        """

        dump_pickle(obj, self.file, protocol)

    def read_pickle(self):
        """Reads a pickled object from the wrapped file.
        """

        # NOTE(review): unpickling can execute arbitrary code; only use this
        # on files from a trusted source.
        return load_pickle(self.file)

    def write_sbyte(self, n):
        self.write(pack_sbyte(n))

    def write_int(self, n):
        self.write(pack_int(n))

    def write_uint(self, n):
        self.write(pack_uint(n))

    def write_uint_le(self, n):
        self.write(pack_uint_le(n))

    def write_ushort(self, n):
        self.write(pack_ushort(n))

    def write_ushort_le(self, n):
        self.write(pack_ushort_le(n))

    def write_long(self, n):
        self.write(pack_long(n))

    def write_ulong(self, n):
        self.write(pack_ulong(n))

    def write_float(self, n):
        self.write(pack_float(n))

    def write_array(self, arry):
        """Writes the contents of an ``array.array`` to the wrapped file. The
        caller's array is not modified.
        """

        if IS_LITTLE:
            # Byte-swap a copy so the caller's array is left untouched
            arry = copy(arry)
            arry.byteswap()
        if self.is_real:
            arry.tofile(self.file)
        else:
            self.write(array_tobytes(arry))

    def read_sbyte(self):
        return unpack_sbyte(self.read(1))[0]

    def read_int(self):
        return unpack_int(self.read(_INT_SIZE))[0]

    def read_uint(self):
        return unpack_uint(self.read(_INT_SIZE))[0]

    def read_uint_le(self):
        return unpack_uint_le(self.read(_INT_SIZE))[0]

    def read_ushort(self):
        return unpack_ushort(self.read(_SHORT_SIZE))[0]

    def read_ushort_le(self):
        return unpack_ushort_le(self.read(_SHORT_SIZE))[0]

    def read_long(self):
        return unpack_long(self.read(_LONG_SIZE))[0]

    def read_ulong(self):
        return unpack_ulong(self.read(_LONG_SIZE))[0]

    def read_float(self):
        return unpack_float(self.read(_FLOAT_SIZE))[0]

    def read_array(self, typecode, length):
        """Reads ``length`` items (not bytes) of the given array typecode from
        the current position and returns them as an ``array.array``.
        """

        a = array(typecode)
        if self.is_real:
            a.fromfile(self.file, length)
        else:
            array_frombytes(a, self.read(length * _SIZEMAP[typecode]))
        if IS_LITTLE:
            # Undo the byte swap applied by write_array()
            a.byteswap()
        return a

    def get(self, position, length):
        """Seeks to ``position`` and returns the next ``length`` bytes. This
        moves the file position.
        """

        self.seek(position)
        return self.read(length)

    def get_byte(self, position):
        return unpack_byte(self.get(position, 1))[0]

    def get_sbyte(self, position):
        return unpack_sbyte(self.get(position, 1))[0]

    def get_int(self, position):
        return unpack_int(self.get(position, _INT_SIZE))[0]

    def get_uint(self, position):
        return unpack_uint(self.get(position, _INT_SIZE))[0]

    def get_ushort(self, position):
        return unpack_ushort(self.get(position, _SHORT_SIZE))[0]

    def get_long(self, position):
        return unpack_long(self.get(position, _LONG_SIZE))[0]

    def get_ulong(self, position):
        return unpack_ulong(self.get(position, _LONG_SIZE))[0]

    def get_float(self, position):
        return unpack_float(self.get(position, _FLOAT_SIZE))[0]

    def get_array(self, position, typecode, length):
        """Seeks to ``position`` and reads ``length`` items of the given
        array typecode (see ``read_array``).
        """

        self.seek(position)
        return self.read_array(typecode, length)
|
|
|
|
|
|
class BufferFile(StructFile):
    """A StructFile over an in-memory buffer.

    The sequential ``read_*`` methods go through a ``BytesIO`` built from the
    buffer, while the positional ``get*`` methods slice the buffer directly
    and therefore do not move the file position.
    """

    def __init__(self, buf, name=None, onclose=None):
        # The attributes are assigned by hand instead of delegating to
        # StructFile.__init__, so no ``fileno`` attribute is ever set here.
        self._buf = buf
        self._name = name
        self.file = BytesIO(buf)
        self.onclose = onclose

        self.is_closed = False
        self.is_real = False

    def subset(self, position, length, name=None):
        """Returns a new BufferFile over ``length`` bytes of the buffer
        starting at ``position``. The new object takes this object's name
        unless ``name`` is given.
        """

        if not name:
            name = self._name
        chunk = self.get(position, length)
        return BufferFile(chunk, name=name)

    def get(self, position, length):
        """Returns ``length`` bytes of the buffer starting at ``position``.
        """

        end = position + length
        return bytes_type(self._buf[position:end])

    def get_array(self, position, typecode, length):
        """Returns ``length`` items of the given array typecode read from the
        buffer starting at ``position``.
        """

        result = array(typecode)
        nbytes = length * _SIZEMAP[typecode]
        array_frombytes(result, self.get(position, nbytes))
        if IS_LITTLE:
            result.byteswap()
        return result
|
|
|
|
|
|
class ChecksumFile(StructFile):
    """A StructFile that keeps a running CRC-32 of the bytes passing through
    ``read()``, ``readline()``, ``write()`` and line iteration.

    Seeking is not allowed. Methods that hand the wrapped file directly to
    another API (for example the pickle methods) bypass the checksum.
    """

    def __init__(self, *args, **kwargs):
        """Takes the same arguments as ``StructFile``.
        """

        from zlib import crc32

        StructFile.__init__(self, *args, **kwargs)
        self._check = 0
        self._crc32 = crc32

    def __iter__(self):
        for line in self.file:
            self._check = self._crc32(line, self._check)
            yield line

    def seek(self, *args):
        raise Exception("Cannot seek on a ChecksumFile")

    def read(self, *args, **kwargs):
        b = self.file.read(*args, **kwargs)
        self._check = self._crc32(b, self._check)
        return b

    def readline(self, *args, **kwargs):
        # Lines read one at a time must count toward the checksum exactly as
        # lines produced by iteration do.
        line = self.file.readline(*args, **kwargs)
        self._check = self._crc32(line, self._check)
        return line

    def write(self, b):
        self._check = self._crc32(b, self._check)
        # Pass the wrapped file's return value through, as StructFile.write()
        # does.
        return self.file.write(b)

    def checksum(self):
        """Returns the running checksum as an unsigned 32-bit integer.
        """

        # The mask keeps the result non-negative and within 32 bits even if
        # crc32() returned a signed value.
        return self._check & 0xffffffff
|