133 lines
4.3 KiB
Python
133 lines
4.3 KiB
Python
# Copyright 2007 Matt Chaput. All rights reserved.
|
|
#
|
|
# Redistribution and use in source and binary forms, with or without
|
|
# modification, are permitted provided that the following conditions are met:
|
|
#
|
|
# 1. Redistributions of source code must retain the above copyright notice,
|
|
# this list of conditions and the following disclaimer.
|
|
#
|
|
# 2. Redistributions in binary form must reproduce the above copyright
|
|
# notice, this list of conditions and the following disclaimer in the
|
|
# documentation and/or other materials provided with the distribution.
|
|
#
|
|
# THIS SOFTWARE IS PROVIDED BY MATT CHAPUT ``AS IS'' AND ANY EXPRESS OR
|
|
# IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
|
|
# MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO
|
|
# EVENT SHALL MATT CHAPUT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
|
|
# INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
|
# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA,
|
|
# OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
|
|
# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
|
# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
|
|
# EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|
#
|
|
# The views and conclusions contained in the software and documentation are
|
|
# those of the authors and should not be interpreted as representing official
|
|
# policies, either expressed or implied, of Matt Chaput.
|
|
|
|
import codecs, re
|
|
|
|
from whoosh.compat import string_type, u, byte
|
|
|
|
|
|
# Note: these functions return a tuple of (text, length), so when you call
|
|
# them, you have to add [0] on the end, e.g. str = utf8encode(unicode)[0]
|
|
|
|
utf8encode = codecs.getencoder("utf-8")
|
|
utf8decode = codecs.getdecoder("utf-8")
|
|
|
|
|
|
# Prefix encoding functions
|
|
|
|
def first_diff(a, b):
|
|
"""
|
|
Returns the position of the first differing character in the sequences a
|
|
and b. For example, first_diff('render', 'rending') == 4. This function
|
|
limits the return value to 255 so the difference can be encoded in a single
|
|
byte.
|
|
"""
|
|
|
|
i = 0
|
|
while i <= 255 and i < len(a) and i < len(b) and a[i] == b[i]:
|
|
i += 1
|
|
return i
|
|
|
|
|
|
def prefix_encode(a, b):
|
|
"""
|
|
Compresses bytestring b as a byte representing the prefix it shares with a,
|
|
followed by the suffix bytes.
|
|
"""
|
|
|
|
i = first_diff(a, b)
|
|
return byte(i) + b[i:]
|
|
|
|
|
|
def prefix_encode_all(ls):
|
|
"""Compresses the given list of (unicode) strings by storing each string
|
|
(except the first one) as an integer (encoded in a byte) representing
|
|
the prefix it shares with its predecessor, followed by the suffix encoded
|
|
as UTF-8.
|
|
"""
|
|
|
|
last = u('')
|
|
for w in ls:
|
|
i = first_diff(last, w)
|
|
yield chr(i) + w[i:].encode("utf-8")
|
|
last = w
|
|
|
|
|
|
def prefix_decode_all(ls):
|
|
"""Decompresses a list of strings compressed by prefix_encode().
|
|
"""
|
|
|
|
last = u('')
|
|
for w in ls:
|
|
i = ord(w[0])
|
|
decoded = last[:i] + w[1:].decode("utf-8")
|
|
yield decoded
|
|
last = decoded
|
|
|
|
|
|
# Natural key sorting function
|
|
|
|
_nkre = re.compile(r"\D+|\d+", re.UNICODE)
|
|
|
|
|
|
def _nkconv(i):
|
|
try:
|
|
return int(i)
|
|
except ValueError:
|
|
return i.lower()
|
|
|
|
|
|
def natural_key(s):
|
|
"""Converts string ``s`` into a tuple that will sort "naturally" (i.e.,
|
|
``name5`` will come before ``name10`` and ``1`` will come before ``A``).
|
|
This function is designed to be used as the ``key`` argument to sorting
|
|
functions.
|
|
|
|
:param s: the str/unicode string to convert.
|
|
:rtype: tuple
|
|
"""
|
|
|
|
# Use _nkre to split the input string into a sequence of
|
|
# digit runs and non-digit runs. Then use _nkconv() to convert
|
|
# the digit runs into ints and the non-digit runs to lowercase.
|
|
return tuple(_nkconv(m) for m in _nkre.findall(s))
|
|
|
|
|
|
# Regular expression functions
|
|
|
|
def rcompile(pattern, flags=0, verbose=False):
|
|
"""A wrapper for re.compile that checks whether "pattern" is a regex object
|
|
or a string to be compiled, and automatically adds the re.UNICODE flag.
|
|
"""
|
|
|
|
if not isinstance(pattern, string_type):
|
|
# If it's not a string, assume it's already a compiled pattern
|
|
return pattern
|
|
if verbose:
|
|
flags |= re.VERBOSE
|
|
return re.compile(pattern, re.UNICODE | flags)
|