# Copyright 2009 Matt Chaput. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# 1. Redistributions of source code must retain the above copyright notice,
# this list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution.
#
# THIS SOFTWARE IS PROVIDED BY MATT CHAPUT ``AS IS'' AND ANY EXPRESS OR
# IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
# MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO
# EVENT SHALL MATT CHAPUT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
# INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA,
# OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
# EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
# The views and conclusions contained in the software and documentation are
# those of the authors and should not be interpreted as representing official
# policies, either expressed or implied, of Matt Chaput.

"""
|
||
|
The classes in this module encode and decode posting information for a field.
|
||
|
The field format essentially determines what information is stored about each
|
||
|
occurance of a term.
|
||
|
"""
|
||
|
|
||
|
from collections import defaultdict

from whoosh.analysis import unstopped, entoken
from whoosh.compat import iteritems, dumps, loads, b
from whoosh.system import emptybytes
from whoosh.system import _INT_SIZE, _FLOAT_SIZE
from whoosh.system import pack_uint, unpack_uint, pack_float, unpack_float


# Format base class

class Format(object):
    """Abstract base class representing a storage format for a field or
    vector. Format objects are responsible for writing and reading the
    low-level representation of a field, and control what kind and level of
    information is stored about the indexed field.
    """

    posting_size = -1
    textual = True
    __inittypes__ = dict(field_boost=float)

    def __init__(self, field_boost=1.0, **options):
        """
        :param field_boost: A constant boost factor to scale the score of all
            queries matching terms in this field.
        """

        self.field_boost = field_boost
        self.options = options

    def __eq__(self, other):
        return (other
                and self.__class__ is other.__class__
                and self.__dict__ == other.__dict__)

    def __repr__(self):
        return "%s(boost=%s)" % (self.__class__.__name__, self.field_boost)

    def fixed_value_size(self):
        if self.posting_size < 0:
            return None
        return self.posting_size

    def word_values(self, value, analyzer, **kwargs):
        """Takes the text value to be indexed and yields a series of
        ("tokentext", frequency, weight, valuestring) tuples, where frequency
        is the number of times "tokentext" appeared in the value, weight is
        the weight (a float usually equal to frequency in the absence of
        per-term boosts) and valuestring is the encoded field-specific posting
        value for the token. For example, in a Frequency format, the value
        string would encode the frequency; in a Positions format, the value
        string would encode a list of token positions at which "tokentext"
        occurred.

        :param value: The unicode text to index.
        :param analyzer: The analyzer to use to process the text.
        """

        raise NotImplementedError

    def supports(self, name):
        """Returns True if this format supports interpreting its posting
        value as 'name' (e.g. "frequency" or "positions").
        """

        return hasattr(self, "decode_" + name)

    def decoder(self, name):
        """Returns the bound method for interpreting the value as 'name',
        where 'name' is for example "frequency" or "positions". This object
        must have a corresponding Format.decode_<name>() method.
        """

        return getattr(self, "decode_" + name)

    def decode_as(self, astype, valuestring):
        """Interprets the encoded value string as 'astype', where 'astype' is
        for example "frequency" or "positions". This object must have a
        corresponding decode_<astype>() method.
        """

        return self.decoder(astype)(valuestring)

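
# A hedged sketch (not part of the original module): demonstrates the
# dynamic-dispatch protocol above, where supports()/decoder()/decode_as()
# all resolve to the decode_<name> methods a subclass defines. The function
# name is hypothetical; Frequency is the concrete format defined below.
def _format_protocol_example():
    fmt = Frequency()
    assert fmt.supports("frequency")
    assert not fmt.supports("characters")
    v = pack_uint(3)
    assert fmt.decode_as("frequency", v) == fmt.decoder("frequency")(v) == 3
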
# Concrete field classes

# TODO: as a legacy thing most of these formats store the frequency but not
# the weight in the value string, so if you use field or term boosts
# postreader.value_as("weight") will not match postreader.weight()

def tokens(value, analyzer, kwargs):
    # Accept either a pre-tokenized list/tuple of words or raw text to be run
    # through the analyzer; stopped tokens are filtered out either way
    if isinstance(value, (tuple, list)):
        gen = entoken(value, **kwargs)
    else:
        gen = analyzer(value, **kwargs)
    return unstopped(gen)

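
# Hedged sketch (not part of the original module) of how tokens() dispatches:
# a pre-tokenized list or tuple bypasses the analyzer via entoken(), while a
# string is run through the analyzer. The function name is hypothetical;
# StandardAnalyzer is part of Whoosh's public API.
def _tokens_example():
    from whoosh.analysis import StandardAnalyzer
    ana = StandardAnalyzer()
    from_text = [t.text for t in tokens(u"Alfa Bravo", ana, {})]
    from_list = [t.text for t in tokens([u"alfa", u"bravo"], ana, {})]
    assert from_text == from_list == ["alfa", "bravo"]
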
class Existence(Format):
    """Only indexes whether a given term occurred in a given document; it
    does not store frequencies or positions. This is useful for fields that
    should be searchable but not scorable, such as a file path.

    Supports: frequency, weight (always reports frequency = 1).
    """

    posting_size = 0
    __inittypes__ = dict(field_boost=float)

    def __init__(self, field_boost=1.0, **options):
        self.field_boost = field_boost
        self.options = options

    def word_values(self, value, analyzer, **kwargs):
        fb = self.field_boost
        wordset = set(t.text for t in tokens(value, analyzer, kwargs))
        return ((w, 1, fb, emptybytes) for w in wordset)

    def encode(self, value):
        return emptybytes

    def decode_frequency(self, valuestring):
        return 1

    def decode_weight(self, valuestring):
        return self.field_boost

    def combine(self, vs):
        return emptybytes

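
# Hedged sketch (not part of the original module): Existence discards
# everything except the terms themselves, so each distinct token yields one
# posting with frequency 1, the field boost as its weight, and an empty
# value string. The function name is hypothetical.
def _existence_example():
    from whoosh.analysis import StandardAnalyzer
    fmt = Existence(field_boost=2.0)
    wvs = sorted(fmt.word_values(u"alfa bravo alfa", StandardAnalyzer()))
    assert wvs == [(u"alfa", 1, 2.0, emptybytes),
                   (u"bravo", 1, 2.0, emptybytes)]
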
class Frequency(Format):
    """Stores frequency information for each posting.

    Supports: frequency, weight.
    """

    posting_size = _INT_SIZE
    __inittypes__ = dict(field_boost=float, boost_as_freq=bool)

    def __init__(self, field_boost=1.0, boost_as_freq=False,
                 **options):
        """
        :param field_boost: A constant boost factor to scale the score of all
            queries matching terms in this field.
        """

        # boost_as_freq is not used; it appears to be accepted only for
        # backwards compatibility
        assert isinstance(field_boost, float)
        self.field_boost = field_boost
        self.options = options

    def word_values(self, value, analyzer, **kwargs):
        fb = self.field_boost
        length = 0
        freqs = defaultdict(int)
        weights = defaultdict(float)

        kwargs["boosts"] = True
        for t in tokens(value, analyzer, kwargs):
            length += 1
            freqs[t.text] += 1
            weights[t.text] += t.boost

        wvs = ((w, freq, weights[w] * fb, pack_uint(freq)) for w, freq
               in iteritems(freqs))
        return wvs

    def decode_frequency(self, valuestring):
        return unpack_uint(valuestring)[0]

    def decode_weight(self, valuestring):
        freq = unpack_uint(valuestring)[0]
        return freq * self.field_boost

    def combine(self, vs):
        return pack_uint(sum(self.decode_frequency(v) for v in vs))

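
# Hedged sketch (not part of the original module): round-trips a Frequency
# posting. The value string is just the packed term frequency, so
# decode_frequency() recovers the count and decode_weight() scales it by the
# field boost. The function name is hypothetical.
def _frequency_example():
    from whoosh.analysis import StandardAnalyzer
    fmt = Frequency()
    vstrings = dict((w, v) for w, freq, weight, v
                    in fmt.word_values(u"alfa bravo alfa",
                                       StandardAnalyzer()))
    assert fmt.decode_frequency(vstrings[u"alfa"]) == 2
    assert fmt.decode_weight(vstrings[u"bravo"]) == 1.0
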
class Positions(Format):
    """Stores position information in each posting, to allow phrase searching
    and "near" queries.

    Supports: frequency, weight, positions, position_boosts (always reports
    position boost = 1.0).
    """

    def word_values(self, value, analyzer, **kwargs):
        fb = self.field_boost
        poses = defaultdict(list)
        weights = defaultdict(float)
        kwargs["positions"] = True
        kwargs["boosts"] = True
        for t in tokens(value, analyzer, kwargs):
            poses[t.text].append(t.pos)
            weights[t.text] += t.boost

        for w, poslist in iteritems(poses):
            value = self.encode(poslist)
            yield (w, len(poslist), weights[w] * fb, value)

    def encode(self, poslist):
        # Delta-code each position against the previous one
        deltas = []
        base = 0
        for pos in poslist:
            deltas.append(pos - base)
            base = pos
        return pack_uint(len(deltas)) + dumps(deltas, 2)

    def decode_positions(self, valuestring):
        if not valuestring.endswith(b(".")):
            valuestring += b(".")
        codes = loads(valuestring[_INT_SIZE:])
        position = 0
        positions = []
        for code in codes:
            position += code
            positions.append(position)
        return positions

    def decode_frequency(self, valuestring):
        return unpack_uint(valuestring[:_INT_SIZE])[0]

    def decode_weight(self, valuestring):
        return self.decode_frequency(valuestring) * self.field_boost

    def decode_position_boosts(self, valuestring):
        return [(pos, 1) for pos in self.decode_positions(valuestring)]

    def combine(self, vs):
        s = set()
        for v in vs:
            s.update(self.decode_positions(v))
        return self.encode(sorted(s))

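
# Hedged sketch (not part of the original module): the Positions value string
# is a packed count followed by a pickled list of position deltas, so
# decode_positions() reverses the delta coding. The function name is
# hypothetical.
def _positions_example():
    fmt = Positions()
    v = fmt.encode([1, 5, 20])  # stored internally as deltas [1, 4, 15]
    assert fmt.decode_positions(v) == [1, 5, 20]
    assert fmt.decode_frequency(v) == 3
    assert fmt.decode_position_boosts(v) == [(1, 1), (5, 1), (20, 1)]
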
class Characters(Positions):
    """Stores token position and character start and end information for each
    posting.

    Supports: frequency, weight, positions, position_boosts (always reports
    position boost = 1.0), characters.
    """

    def word_values(self, value, analyzer, **kwargs):
        fb = self.field_boost
        seen = defaultdict(list)
        weights = defaultdict(float)

        kwargs["positions"] = True
        kwargs["chars"] = True
        kwargs["boosts"] = True
        for t in tokens(value, analyzer, kwargs):
            seen[t.text].append((t.pos, t.startchar, t.endchar))
            weights[t.text] += t.boost

        for w, poslist in iteritems(seen):
            value = self.encode(poslist)
            yield (w, len(poslist), weights[w] * fb, value)

    def encode(self, poslist):
        # Delta-code each position against the previous position and each
        # start char against the previous end char
        deltas = []
        posbase = 0
        charbase = 0
        for pos, startchar, endchar in poslist:
            deltas.append((pos - posbase, startchar - charbase,
                           endchar - startchar))
            posbase = pos
            charbase = endchar
        return pack_uint(len(deltas)) + dumps(deltas, 2)

    def decode_characters(self, valuestring):
        if not valuestring.endswith(b(".")):
            valuestring += b(".")
        codes = loads(valuestring[_INT_SIZE:])
        position = 0
        endchar = 0
        posns_chars = []
        for code in codes:
            position = code[0] + position
            startchar = code[1] + endchar
            endchar = code[2] + startchar
            posns_chars.append((position, startchar, endchar))
        return posns_chars

    def decode_positions(self, valuestring):
        if not valuestring.endswith(b(".")):
            valuestring += b(".")
        codes = loads(valuestring[_INT_SIZE:])
        position = 0
        posns = []
        for code in codes:
            position = code[0] + position
            posns.append(position)
        return posns

    def combine(self, vs):
        s = {}
        for v in vs:
            for pos, sc, ec in self.decode_characters(v):
                if pos in s:
                    old_sc, old_ec = s[pos]
                    s[pos] = (min(sc, old_sc), max(ec, old_ec))
                else:
                    s[pos] = (sc, ec)
        poses = [(pos, s[pos][0], s[pos][1]) for pos in sorted(s.keys())]
        return self.encode(poses)

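
# Hedged sketch (not part of the original module): Characters stores
# (position, startchar, endchar) triples, delta-coded against the previous
# position and end char. The function name is hypothetical.
def _characters_example():
    fmt = Characters()
    v = fmt.encode([(0, 0, 4), (1, 5, 10)])
    assert fmt.decode_characters(v) == [(0, 0, 4), (1, 5, 10)]
    assert fmt.decode_positions(v) == [0, 1]
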
class PositionBoosts(Positions):
    """A format that stores positions and per-position boost information
    in each posting.

    Supports: frequency, weight, positions, position_boosts.
    """

    def word_values(self, value, analyzer, **kwargs):
        fb = self.field_boost
        seen = defaultdict(list)

        kwargs["positions"] = True
        kwargs["boosts"] = True
        for t in tokens(value, analyzer, kwargs):
            pos = t.pos
            boost = t.boost
            seen[t.text].append((pos, boost))

        for w, poses in iteritems(seen):
            value = self.encode(poses)
            yield (w, len(poses), sum(p[1] for p in poses) * fb, value)

    def encode(self, poses):
        # Prepend the summed boost so decode_weight() can read it without
        # unpickling the whole position list
        codes = []
        base = 0
        summedboost = 0
        for pos, boost in poses:
            summedboost += boost
            codes.append((pos - base, boost))
            base = pos
        return (pack_uint(len(poses)) + pack_float(summedboost)
                + dumps(codes, 2))

    def decode_position_boosts(self, valuestring):
        if not valuestring.endswith(b(".")):
            valuestring += b(".")
        codes = loads(valuestring[_INT_SIZE + _FLOAT_SIZE:])
        position = 0
        posns_boosts = []
        for code in codes:
            position = code[0] + position
            posns_boosts.append((position, code[1]))
        return posns_boosts

    def decode_positions(self, valuestring):
        if not valuestring.endswith(b(".")):
            valuestring += b(".")
        codes = loads(valuestring[_INT_SIZE + _FLOAT_SIZE:])
        position = 0
        posns = []
        for code in codes:
            position = code[0] + position
            posns.append(position)
        return posns

    def decode_weight(self, v):
        summedboost = unpack_float(v[_INT_SIZE:_INT_SIZE + _FLOAT_SIZE])[0]
        return summedboost * self.field_boost

    def combine(self, vs):
        s = defaultdict(float)
        for v in vs:
            for pos, boost in self.decode_position_boosts(v):
                s[pos] += boost
        return self.encode(sorted(s.items()))

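
# Hedged sketch (not part of the original module): PositionBoosts prepends
# the summed boost to the pickled (position, boost) pairs, so decode_weight()
# can read it without unpickling the list. The function name is hypothetical.
def _position_boosts_example():
    fmt = PositionBoosts()
    v = fmt.encode([(0, 1.0), (3, 2.0)])
    assert fmt.decode_position_boosts(v) == [(0, 1.0), (3, 2.0)]
    assert fmt.decode_positions(v) == [0, 3]
    assert fmt.decode_weight(v) == 3.0
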
class CharacterBoosts(Characters):
    """A format that stores positions, character start and end, and
    per-position boost information in each posting.

    Supports: frequency, weight, positions, position_boosts, characters,
    character_boosts.
    """

    def word_values(self, value, analyzer, **kwargs):
        seen = defaultdict(list)

        kwargs["positions"] = True
        kwargs["chars"] = True
        kwargs["boosts"] = True
        for t in tokens(value, analyzer, kwargs):
            seen[t.text].append((t.pos, t.startchar, t.endchar, t.boost))

        for w, poses in iteritems(seen):
            value, summedboost = self.encode(poses)
            yield (w, len(poses), summedboost, value)

    def encode(self, poses):
        fb = self.field_boost
        # poses = [(pos, startchar, endchar, boost), ...]
        codes = []
        posbase = 0
        charbase = 0
        summedboost = 0
        for pos, startchar, endchar, boost in poses:
            codes.append((pos - posbase, startchar - charbase,
                          endchar - startchar, boost))
            posbase = pos
            charbase = endchar
            summedboost += boost

        return ((pack_uint(len(poses)) + pack_float(summedboost * fb)
                 + dumps(codes, 2)), summedboost)

    def decode_character_boosts(self, valuestring):
        if not valuestring.endswith(b(".")):
            valuestring += b(".")
        codes = loads(valuestring[_INT_SIZE + _FLOAT_SIZE:])
        position = 0
        endchar = 0
        posn_char_boosts = []
        for code in codes:
            position = position + code[0]
            startchar = endchar + code[1]
            endchar = startchar + code[2]
            posn_char_boosts.append((position, startchar, endchar, code[3]))
        return posn_char_boosts

    def decode_positions(self, valuestring):
        return [item[0] for item in self.decode_character_boosts(valuestring)]

    def decode_characters(self, valuestring):
        return [(pos, startchar, endchar) for pos, startchar, endchar, _
                in self.decode_character_boosts(valuestring)]

    def decode_position_boosts(self, valuestring):
        return [(pos, boost) for pos, _, _, boost
                in self.decode_character_boosts(valuestring)]

    def combine(self, vs):
        s = {}
        for v in vs:
            for pos, sc, ec, boost in self.decode_character_boosts(v):
                if pos in s:
                    old_sc, old_ec, old_boost = s[pos]
                    s[pos] = (min(sc, old_sc), max(ec, old_ec),
                              old_boost + boost)
                else:
                    s[pos] = (sc, ec, boost)
        poses = [(pos, sc, ec, boost) for pos, (sc, ec, boost)
                 in sorted(s.items())]
        return self.encode(poses)[0]  # encode() returns (value, summedboost)

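
# Hedged sketch (not part of the original module): CharacterBoosts keeps a
# per-position boost alongside each character span, and the other decode_*
# views are derived from decode_character_boosts(). Note that encode()
# returns a (valuestring, summedboost) pair here. The function name is
# hypothetical.
def _character_boosts_example():
    fmt = CharacterBoosts()
    v, summedboost = fmt.encode([(0, 0, 4, 1.0), (1, 5, 10, 2.0)])
    assert summedboost == 3.0
    assert fmt.decode_character_boosts(v) == [(0, 0, 4, 1.0),
                                              (1, 5, 10, 2.0)]
    assert fmt.decode_positions(v) == [0, 1]
    assert fmt.decode_position_boosts(v) == [(0, 1.0), (1, 2.0)]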