882 lines
28 KiB
Python
882 lines
28 KiB
Python
# Copyright 2010 Matt Chaput. All rights reserved.
|
|
#
|
|
# Redistribution and use in source and binary forms, with or without
|
|
# modification, are permitted provided that the following conditions are met:
|
|
#
|
|
# 1. Redistributions of source code must retain the above copyright notice,
|
|
# this list of conditions and the following disclaimer.
|
|
#
|
|
# 2. Redistributions in binary form must reproduce the above copyright
|
|
# notice, this list of conditions and the following disclaimer in the
|
|
# documentation and/or other materials provided with the distribution.
|
|
#
|
|
# THIS SOFTWARE IS PROVIDED BY MATT CHAPUT ``AS IS'' AND ANY EXPRESS OR
|
|
# IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
|
|
# MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO
|
|
# EVENT SHALL MATT CHAPUT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
|
|
# INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
|
# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA,
|
|
# OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
|
|
# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
|
# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
|
|
# EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|
#
|
|
# The views and conclusions contained in the software and documentation are
|
|
# those of the authors and should not be interpreted as representing official
|
|
# policies, either expressed or implied, of Matt Chaput.
|
|
|
|
"""
|
|
This module contains Query objects that deal with "spans".
|
|
|
|
Span queries allow for positional constraints on matching documents. For
|
|
example, the :class:`whoosh.spans.SpanNear` query matches documents where one
|
|
term occurs near another. Because you can nest span queries, and wrap them
|
|
around almost any non-span query, you can create very complex constraints.
|
|
|
|
For example, to find documents containing "whoosh" at most 5 positions before
|
|
"library" in the "text" field::
|
|
|
|
from whoosh import query, spans
|
|
t1 = query.Term("text", "whoosh")
|
|
t2 = query.Term("text", "library")
|
|
q = spans.SpanNear(t1, t2, slop=5)
|
|
|
|
"""
|
|
|
|
from whoosh.matching import mcore, wrappers, binary
|
|
from whoosh.query import Query, And, AndMaybe, Or, Term
|
|
from whoosh.util import make_binary_tree
|
|
|
|
|
|
# Span class
|
|
|
|
class Span(object):
|
|
__slots__ = ("start", "end", "startchar", "endchar", "boost")
|
|
|
|
def __init__(self, start, end=None, startchar=None, endchar=None,
|
|
boost=1.0):
|
|
if end is None:
|
|
end = start
|
|
assert start <= end
|
|
self.start = start
|
|
self.end = end
|
|
self.startchar = startchar
|
|
self.endchar = endchar
|
|
self.boost = boost
|
|
|
|
def __repr__(self):
|
|
if self.startchar is not None or self.endchar is not None:
|
|
return "<%d-%d %d:%d>" % (self.start, self.end, self.startchar,
|
|
self.endchar)
|
|
else:
|
|
return "<%d-%d>" % (self.start, self.end)
|
|
|
|
def __eq__(self, span):
|
|
return (self.start == span.start
|
|
and self.end == span.end
|
|
and self.startchar == span.startchar
|
|
and self.endchar == span.endchar)
|
|
|
|
def __ne__(self, span):
|
|
return self.start != span.start or self.end != span.end
|
|
|
|
def __lt__(self, span):
|
|
return self.start < span.start
|
|
|
|
def __gt__(self, span):
|
|
return self.start > span.start
|
|
|
|
def __hash__(self):
|
|
return hash((self.start, self.end))
|
|
|
|
@classmethod
|
|
def merge(cls, spans):
|
|
"""Merges overlapping and touches spans in the given list of spans.
|
|
|
|
Note that this modifies the original list.
|
|
|
|
>>> spans = [Span(1,2), Span(3)]
|
|
>>> Span.merge(spans)
|
|
>>> spans
|
|
[<1-3>]
|
|
"""
|
|
|
|
i = 0
|
|
while i < len(spans) - 1:
|
|
here = spans[i]
|
|
j = i + 1
|
|
while j < len(spans):
|
|
there = spans[j]
|
|
if there.start > here.end + 1:
|
|
break
|
|
if here.touches(there) or here.overlaps(there):
|
|
here = here.to(there)
|
|
spans[i] = here
|
|
del spans[j]
|
|
else:
|
|
j += 1
|
|
i += 1
|
|
return spans
|
|
|
|
def to(self, span):
|
|
if self.startchar is None:
|
|
minchar = span.startchar
|
|
elif span.startchar is None:
|
|
minchar = self.startchar
|
|
else:
|
|
minchar = min(self.startchar, span.startchar)
|
|
if self.endchar is None:
|
|
maxchar = span.endchar
|
|
elif span.endchar is None:
|
|
maxchar = self.endchar
|
|
else:
|
|
maxchar = max(self.endchar, span.endchar)
|
|
|
|
minpos = min(self.start, span.start)
|
|
maxpos = max(self.end, span.end)
|
|
return self.__class__(minpos, maxpos, minchar, maxchar)
|
|
|
|
def overlaps(self, span):
|
|
return ((self.start >= span.start and self.start <= span.end)
|
|
or (self.end >= span.start and self.end <= span.end)
|
|
or (span.start >= self.start and span.start <= self.end)
|
|
or (span.end >= self.start and span.end <= self.end))
|
|
|
|
def surrounds(self, span):
|
|
return self.start < span.start and self.end > span.end
|
|
|
|
def is_within(self, span):
|
|
return self.start >= span.start and self.end <= span.end
|
|
|
|
def is_before(self, span):
|
|
return self.end < span.start
|
|
|
|
def is_after(self, span):
|
|
return self.start > span.end
|
|
|
|
def touches(self, span):
|
|
return self.start == span.end + 1 or self.end == span.start - 1
|
|
|
|
def distance_to(self, span):
|
|
if self.overlaps(span):
|
|
return 0
|
|
elif self.is_before(span):
|
|
return span.start - self.end
|
|
else:
|
|
return self.start - span.end
|
|
|
|
|
|
def bisect_spans(spans, start):
|
|
lo = 0
|
|
hi = len(spans)
|
|
while lo < hi:
|
|
mid = (lo + hi) // 2
|
|
if spans[mid].start < start:
|
|
lo = mid + 1
|
|
else:
|
|
hi = mid
|
|
return lo
|
|
|
|
|
|
# Base matchers
|
|
|
|
class SpanWrappingMatcher(wrappers.WrappingMatcher):
|
|
"""An abstract matcher class that wraps a "regular" matcher. This matcher
|
|
uses the sub-matcher's matching logic, but only matches documents that have
|
|
matching spans, i.e. where ``_get_spans()`` returns a non-empty list.
|
|
|
|
Subclasses must implement the ``_get_spans()`` method, which returns a list
|
|
of valid spans for the current document.
|
|
"""
|
|
|
|
def __init__(self, child):
|
|
super(SpanWrappingMatcher, self).__init__(child)
|
|
self._spans = None
|
|
if self.is_active():
|
|
self._find_next()
|
|
|
|
def copy(self):
|
|
m = self.__class__(self.child.copy())
|
|
m._spans = self._spans
|
|
return m
|
|
|
|
def _replacement(self, newchild):
|
|
return self.__class__(newchild)
|
|
|
|
def _find_next(self):
|
|
if not self.is_active():
|
|
return
|
|
|
|
child = self.child
|
|
r = False
|
|
|
|
spans = self._get_spans()
|
|
while child.is_active() and not spans:
|
|
r = child.next() or r
|
|
if not child.is_active():
|
|
return True
|
|
spans = self._get_spans()
|
|
self._spans = spans
|
|
|
|
return r
|
|
|
|
def spans(self):
|
|
return self._spans
|
|
|
|
def next(self):
|
|
self.child.next()
|
|
self._find_next()
|
|
|
|
def skip_to(self, id):
|
|
self.child.skip_to(id)
|
|
self._find_next()
|
|
|
|
def all_ids(self):
|
|
while self.is_active():
|
|
if self.spans():
|
|
yield self.id()
|
|
self.next()
|
|
|
|
|
|
class SpanBiMatcher(SpanWrappingMatcher):
|
|
def copy(self):
|
|
return self.__class__(self.a.copy(), self.b.copy())
|
|
|
|
def depth(self):
|
|
return 1 + max(self.a.depth(), self.b.depth())
|
|
|
|
def replace(self, minquality=0):
|
|
# TODO: fix this
|
|
if not self.is_active():
|
|
return mcore.NullMatcher()
|
|
return self
|
|
|
|
|
|
# Queries
|
|
|
|
class SpanQuery(Query):
|
|
"""Abstract base class for span-based queries. Each span query type wraps
|
|
a "regular" query that implements the basic document-matching functionality
|
|
(for example, SpanNear wraps an And query, because SpanNear requires that
|
|
the two sub-queries occur in the same documents. The wrapped query is
|
|
stored in the ``q`` attribute.
|
|
|
|
Subclasses usually only need to implement the initializer to set the
|
|
wrapped query, and ``matcher()`` to return a span-aware matcher object.
|
|
"""
|
|
|
|
def _subm(self, s, context=None):
|
|
return self.q.matcher(s, context)
|
|
|
|
def __repr__(self):
|
|
return "%s(%r)" % (self.__class__.__name__, self.q)
|
|
|
|
def __eq__(self, other):
|
|
return (other and self.__class__ is other.__class__
|
|
and self.q == other.q)
|
|
|
|
def __hash__(self):
|
|
return hash(self.__class__.__name__) ^ hash(self.q)
|
|
|
|
def field(self):
|
|
return None
|
|
|
|
def needs_spans(self):
|
|
return True
|
|
|
|
|
|
class WrappingSpan(SpanQuery):
|
|
def is_leaf(self):
|
|
return False
|
|
|
|
def apply(self, fn):
|
|
return self.__class__(fn(self.q), limit=self.limit)
|
|
|
|
def field(self):
|
|
return self.q.field()
|
|
|
|
|
|
class SpanFirst(WrappingSpan):
|
|
"""Matches spans that end within the first N positions. This lets you
|
|
for example only match terms near the beginning of the document.
|
|
"""
|
|
|
|
def __init__(self, q, limit=0):
|
|
"""
|
|
:param q: the query to match.
|
|
:param limit: the query must match within this position at the start
|
|
of a document. The default is ``0``, which means the query must
|
|
match at the first position.
|
|
"""
|
|
|
|
self.q = q
|
|
self.limit = limit
|
|
|
|
def __eq__(self, other):
|
|
return (other and self.__class__ is other.__class__
|
|
and self.q == other.q and self.limit == other.limit)
|
|
|
|
def __hash__(self):
|
|
return hash(self.q) ^ hash(self.limit)
|
|
|
|
def matcher(self, searcher, context=None):
|
|
m = self._subm(searcher, context)
|
|
return SpanFirst.SpanFirstMatcher(m, limit=self.limit)
|
|
|
|
class SpanFirstMatcher(SpanWrappingMatcher):
|
|
def __init__(self, child, limit=0):
|
|
self.limit = limit
|
|
super(SpanFirst.SpanFirstMatcher, self).__init__(child)
|
|
|
|
def copy(self):
|
|
return self.__class__(self.child.copy(), limit=self.limit)
|
|
|
|
def _replacement(self, newchild):
|
|
return self.__class__(newchild, limit=self.limit)
|
|
|
|
def _get_spans(self):
|
|
return [span for span in self.child.spans()
|
|
if span.end <= self.limit]
|
|
|
|
|
|
class SpanNear(SpanQuery):
|
|
"""
|
|
Note: for new code, use :class:`SpanNear2` instead of this class. SpanNear2
|
|
takes a list of sub-queries instead of requiring you to create a binary
|
|
tree of query objects.
|
|
|
|
Matches queries that occur near each other. By default, only matches
|
|
queries that occur right next to each other (slop=1) and in order
|
|
(ordered=True).
|
|
|
|
For example, to find documents where "whoosh" occurs next to "library"
|
|
in the "text" field::
|
|
|
|
from whoosh import query, spans
|
|
t1 = query.Term("text", "whoosh")
|
|
t2 = query.Term("text", "library")
|
|
q = spans.SpanNear(t1, t2)
|
|
|
|
To find documents where "whoosh" occurs at most 5 positions before
|
|
"library"::
|
|
|
|
q = spans.SpanNear(t1, t2, slop=5)
|
|
|
|
To find documents where "whoosh" occurs at most 5 positions before or after
|
|
"library"::
|
|
|
|
q = spans.SpanNear(t1, t2, slop=5, ordered=False)
|
|
|
|
You can use the ``phrase()`` class method to create a tree of SpanNear
|
|
queries to match a list of terms::
|
|
|
|
q = spans.SpanNear.phrase("text", ["whoosh", "search", "library"],
|
|
slop=2)
|
|
"""
|
|
|
|
def __init__(self, a, b, slop=1, ordered=True, mindist=1):
|
|
"""
|
|
:param a: the first query to match.
|
|
:param b: the second query that must occur within "slop" positions of
|
|
the first query.
|
|
:param slop: the number of positions within which the queries must
|
|
occur. Default is 1, meaning the queries must occur right next
|
|
to each other.
|
|
:param ordered: whether a must occur before b. Default is True.
|
|
:pram mindist: the minimum distance allowed between the queries.
|
|
"""
|
|
|
|
self.q = And([a, b])
|
|
self.a = a
|
|
self.b = b
|
|
self.slop = slop
|
|
self.ordered = ordered
|
|
self.mindist = mindist
|
|
|
|
def __repr__(self):
|
|
return ("%s(%r, slop=%d, ordered=%s, mindist=%d)"
|
|
% (self.__class__.__name__, self.q, self.slop, self.ordered,
|
|
self.mindist))
|
|
|
|
def __eq__(self, other):
|
|
return (other and self.__class__ == other.__class__
|
|
and self.q == other.q and self.slop == other.slop
|
|
and self.ordered == other.ordered
|
|
and self.mindist == other.mindist)
|
|
|
|
def __hash__(self):
|
|
return (hash(self.a) ^ hash(self.b) ^ hash(self.slop)
|
|
^ hash(self.ordered) ^ hash(self.mindist))
|
|
|
|
def is_leaf(self):
|
|
return False
|
|
|
|
def apply(self, fn):
|
|
return self.__class__(fn(self.a), fn(self.b), slop=self.slop,
|
|
ordered=self.ordered, mindist=self.mindist)
|
|
|
|
def matcher(self, searcher, context=None):
|
|
ma = self.a.matcher(searcher, context)
|
|
mb = self.b.matcher(searcher, context)
|
|
return SpanNear.SpanNearMatcher(ma, mb, slop=self.slop,
|
|
ordered=self.ordered,
|
|
mindist=self.mindist)
|
|
|
|
@classmethod
|
|
def phrase(cls, fieldname, words, slop=1, ordered=True):
|
|
"""Returns a tree of SpanNear queries to match a list of terms.
|
|
|
|
This class method is a convenience for constructing a phrase query
|
|
using a binary tree of SpanNear queries::
|
|
|
|
SpanNear.phrase("content", ["alfa", "bravo", "charlie", "delta"])
|
|
|
|
:param fieldname: the name of the field to search in.
|
|
:param words: a sequence of texts to search for.
|
|
:param slop: the number of positions within which the terms must
|
|
occur. Default is 1, meaning the terms must occur right next
|
|
to each other.
|
|
:param ordered: whether the terms must occur in order. Default is True.
|
|
"""
|
|
|
|
terms = [Term(fieldname, word) for word in words]
|
|
return make_binary_tree(cls, terms, slop=slop, ordered=ordered)
|
|
|
|
class SpanNearMatcher(SpanWrappingMatcher):
|
|
def __init__(self, a, b, slop=1, ordered=True, mindist=1):
|
|
self.a = a
|
|
self.b = b
|
|
self.slop = slop
|
|
self.ordered = ordered
|
|
self.mindist = mindist
|
|
isect = binary.IntersectionMatcher(a, b)
|
|
super(SpanNear.SpanNearMatcher, self).__init__(isect)
|
|
|
|
def copy(self):
|
|
return self.__class__(self.a.copy(), self.b.copy(), slop=self.slop,
|
|
ordered=self.ordered, mindist=self.mindist)
|
|
|
|
def replace(self, minquality=0):
|
|
# TODO: fix this
|
|
if not self.is_active():
|
|
return mcore.NullMatcher()
|
|
return self
|
|
|
|
def _get_spans(self):
|
|
slop = self.slop
|
|
mindist = self.mindist
|
|
ordered = self.ordered
|
|
spans = set()
|
|
|
|
bspans = self.b.spans()
|
|
for aspan in self.a.spans():
|
|
for bspan in bspans:
|
|
if (bspan.end < aspan.start - slop
|
|
or (ordered and aspan.start > bspan.start)):
|
|
# B is too far in front of A, or B is in front of A
|
|
# *at all* when ordered is True
|
|
continue
|
|
if bspan.start > aspan.end + slop:
|
|
# B is too far from A. Since spans are listed in
|
|
# start position order, we know that all spans after
|
|
# this one will also be too far.
|
|
break
|
|
|
|
# Check the distance between the spans
|
|
dist = aspan.distance_to(bspan)
|
|
if mindist <= dist <= slop:
|
|
spans.add(aspan.to(bspan))
|
|
|
|
return sorted(spans)
|
|
|
|
|
|
class SpanNear2(SpanQuery):
|
|
"""
|
|
Matches queries that occur near each other. By default, only matches
|
|
queries that occur right next to each other (slop=1) and in order
|
|
(ordered=True).
|
|
|
|
New code should use this query type instead of :class:`SpanNear`.
|
|
|
|
(Unlike :class:`SpanNear`, this query takes a list of subqueries instead of
|
|
requiring you to build a binary tree of query objects. This query should
|
|
also be slightly faster due to less overhead.)
|
|
|
|
For example, to find documents where "whoosh" occurs next to "library"
|
|
in the "text" field::
|
|
|
|
from whoosh import query, spans
|
|
t1 = query.Term("text", "whoosh")
|
|
t2 = query.Term("text", "library")
|
|
q = spans.SpanNear2([t1, t2])
|
|
|
|
To find documents where "whoosh" occurs at most 5 positions before
|
|
"library"::
|
|
|
|
q = spans.SpanNear2([t1, t2], slop=5)
|
|
|
|
To find documents where "whoosh" occurs at most 5 positions before or after
|
|
"library"::
|
|
|
|
q = spans.SpanNear2(t1, t2, slop=5, ordered=False)
|
|
"""
|
|
|
|
def __init__(self, qs, slop=1, ordered=True, mindist=1):
|
|
"""
|
|
:param qs: a sequence of sub-queries to match.
|
|
:param slop: the number of positions within which the queries must
|
|
occur. Default is 1, meaning the queries must occur right next
|
|
to each other.
|
|
:param ordered: whether a must occur before b. Default is True.
|
|
:pram mindist: the minimum distance allowed between the queries.
|
|
"""
|
|
|
|
self.qs = qs
|
|
self.slop = slop
|
|
self.ordered = ordered
|
|
self.mindist = mindist
|
|
|
|
def __repr__(self):
|
|
return ("%s(%r, slop=%d, ordered=%s, mindist=%d)"
|
|
% (self.__class__.__name__, self.qs, self.slop, self.ordered,
|
|
self.mindist))
|
|
|
|
def __eq__(self, other):
|
|
return (other and self.__class__ == other.__class__
|
|
and self.qs == other.qs and self.slop == other.slop
|
|
and self.ordered == other.ordered
|
|
and self.mindist == other.mindist)
|
|
|
|
def __hash__(self):
|
|
h = hash(self.slop) ^ hash(self.ordered) ^ hash(self.mindist)
|
|
for q in self.qs:
|
|
h ^= hash(q)
|
|
return h
|
|
|
|
def _and_query(self):
|
|
return q.And(self.qs)
|
|
|
|
def estimate_size(self, ixreader):
|
|
return self._and_query().estimate_size(ixreader)
|
|
|
|
def estimate_min_size(self, ixreader):
|
|
return self._and_query().estimate_min_size(ixreader)
|
|
|
|
def is_leaf(self):
|
|
return False
|
|
|
|
def children(self):
|
|
return self.qs
|
|
|
|
def apply(self, fn):
|
|
return self.__class__([fn(q) for q in self.qs], slop=self.slop,
|
|
ordered=self.ordered, mindist=self.mindist)
|
|
|
|
def matcher(self, searcher, context=None):
|
|
ms = [q.matcher(searcher, context) for q in self.qs]
|
|
return self.SpanNear2Matcher(ms, slop=self.slop, ordered=self.ordered,
|
|
mindist=self.mindist)
|
|
|
|
class SpanNear2Matcher(SpanWrappingMatcher):
|
|
def __init__(self, ms, slop=1, ordered=True, mindist=1):
|
|
self.ms = ms
|
|
self.slop = slop
|
|
self.ordered = ordered
|
|
self.mindist = mindist
|
|
isect = make_binary_tree(binary.IntersectionMatcher, ms)
|
|
super(SpanNear2.SpanNear2Matcher, self).__init__(isect)
|
|
|
|
def copy(self):
|
|
return self.__class__([m.copy() for m in self.ms], slop=self.slop,
|
|
ordered=self.ordered, mindist=self.mindist)
|
|
|
|
def replace(self, minquality=0):
|
|
# TODO: fix this
|
|
if not self.is_active():
|
|
return mcore.NullMatcher()
|
|
return self
|
|
|
|
def _get_spans(self):
|
|
slop = self.slop
|
|
mindist = self.mindist
|
|
ordered = self.ordered
|
|
ms = self.ms
|
|
|
|
aspans = ms[0].spans()
|
|
i = 1
|
|
while i < len(ms) and aspans:
|
|
bspans = ms[i].spans()
|
|
spans = set()
|
|
for aspan in aspans:
|
|
# Use a binary search to find the first position we should
|
|
# start looking for possible matches
|
|
if ordered:
|
|
start = aspan.start
|
|
else:
|
|
start = max(0, aspan.start - slop)
|
|
j = bisect_spans(bspans, start)
|
|
|
|
while j < len(bspans):
|
|
bspan = bspans[j]
|
|
j += 1
|
|
|
|
if (bspan.end < aspan.start - slop
|
|
or (ordered and aspan.start > bspan.start)):
|
|
# B is too far in front of A, or B is in front of A
|
|
# *at all* when ordered is True
|
|
continue
|
|
if bspan.start > aspan.end + slop:
|
|
# B is too far from A. Since spans are listed in
|
|
# start position order, we know that all spans after
|
|
# this one will also be too far.
|
|
break
|
|
|
|
# Check the distance between the spans
|
|
dist = aspan.distance_to(bspan)
|
|
if mindist <= dist <= slop:
|
|
spans.add(aspan.to(bspan))
|
|
aspans = sorted(spans)
|
|
i += 1
|
|
|
|
if i == len(ms):
|
|
return aspans
|
|
else:
|
|
return []
|
|
|
|
|
|
class SpanOr(SpanQuery):
|
|
"""Matches documents that match any of a list of sub-queries. Unlike
|
|
query.Or, this class merges together matching spans from the different
|
|
sub-queries when they overlap.
|
|
"""
|
|
|
|
def __init__(self, subqs):
|
|
"""
|
|
:param subqs: a list of queries to match.
|
|
"""
|
|
|
|
self.q = Or(subqs)
|
|
self.subqs = subqs
|
|
|
|
def is_leaf(self):
|
|
return False
|
|
|
|
def apply(self, fn):
|
|
return self.__class__([fn(sq) for sq in self.subqs])
|
|
|
|
def matcher(self, searcher, context=None):
|
|
matchers = [q.matcher(searcher, context) for q in self.subqs]
|
|
return make_binary_tree(SpanOr.SpanOrMatcher, matchers)
|
|
|
|
class SpanOrMatcher(SpanBiMatcher):
|
|
def __init__(self, a, b):
|
|
self.a = a
|
|
self.b = b
|
|
um = binary.UnionMatcher(a, b)
|
|
super(SpanOr.SpanOrMatcher, self).__init__(um)
|
|
|
|
def _get_spans(self):
|
|
a_active = self.a.is_active()
|
|
b_active = self.b.is_active()
|
|
|
|
if a_active:
|
|
a_id = self.a.id()
|
|
if b_active:
|
|
b_id = self.b.id()
|
|
if a_id == b_id:
|
|
spans = sorted(set(self.a.spans())
|
|
| set(self.b.spans()))
|
|
elif a_id < b_id:
|
|
spans = self.a.spans()
|
|
else:
|
|
spans = self.b.spans()
|
|
else:
|
|
spans = self.a.spans()
|
|
else:
|
|
spans = self.b.spans()
|
|
|
|
Span.merge(spans)
|
|
return spans
|
|
|
|
|
|
class SpanBiQuery(SpanQuery):
|
|
# Intermediate base class for methods common to "a/b" span query types
|
|
|
|
def is_leaf(self):
|
|
return False
|
|
|
|
def apply(self, fn):
|
|
return self.__class__(fn(self.a), fn(self.b))
|
|
|
|
def matcher(self, searcher, context=None):
|
|
ma = self.a.matcher(searcher, context)
|
|
mb = self.b.matcher(searcher, context)
|
|
return self._Matcher(ma, mb)
|
|
|
|
|
|
class SpanNot(SpanBiQuery):
|
|
"""Matches spans from the first query only if they don't overlap with
|
|
spans from the second query. If there are no non-overlapping spans, the
|
|
document does not match.
|
|
|
|
For example, to match documents that contain "bear" at most 2 places after
|
|
"apple" in the "text" field but don't have "cute" between them::
|
|
|
|
from whoosh import query, spans
|
|
t1 = query.Term("text", "apple")
|
|
t2 = query.Term("text", "bear")
|
|
near = spans.SpanNear(t1, t2, slop=2)
|
|
q = spans.SpanNot(near, query.Term("text", "cute"))
|
|
"""
|
|
|
|
def __init__(self, a, b):
|
|
"""
|
|
:param a: the query to match.
|
|
:param b: do not match any spans that overlap with spans from this
|
|
query.
|
|
"""
|
|
|
|
self.q = AndMaybe(a, b)
|
|
self.a = a
|
|
self.b = b
|
|
|
|
class _Matcher(SpanBiMatcher):
|
|
def __init__(self, a, b):
|
|
self.a = a
|
|
self.b = b
|
|
amm = binary.AndMaybeMatcher(a, b)
|
|
super(SpanNot._Matcher, self).__init__(amm)
|
|
|
|
def _get_spans(self):
|
|
if self.a.id() == self.b.id():
|
|
spans = []
|
|
bspans = self.b.spans()
|
|
for aspan in self.a.spans():
|
|
overlapped = False
|
|
for bspan in bspans:
|
|
if aspan.overlaps(bspan):
|
|
overlapped = True
|
|
break
|
|
if not overlapped:
|
|
spans.append(aspan)
|
|
return spans
|
|
else:
|
|
return self.a.spans()
|
|
|
|
|
|
class SpanContains(SpanBiQuery):
|
|
"""Matches documents where the spans of the first query contain any spans
|
|
of the second query.
|
|
|
|
For example, to match documents where "apple" occurs at most 10 places
|
|
before "bear" in the "text" field and "cute" is between them::
|
|
|
|
from whoosh import query, spans
|
|
t1 = query.Term("text", "apple")
|
|
t2 = query.Term("text", "bear")
|
|
near = spans.SpanNear(t1, t2, slop=10)
|
|
q = spans.SpanContains(near, query.Term("text", "cute"))
|
|
"""
|
|
|
|
def __init__(self, a, b):
|
|
"""
|
|
:param a: the query to match.
|
|
:param b: the query whose spans must occur within the matching spans
|
|
of the first query.
|
|
"""
|
|
|
|
self.q = And([a, b])
|
|
self.a = a
|
|
self.b = b
|
|
|
|
class _Matcher(SpanBiMatcher):
|
|
def __init__(self, a, b):
|
|
self.a = a
|
|
self.b = b
|
|
im = binary.IntersectionMatcher(a, b)
|
|
super(SpanContains._Matcher, self).__init__(im)
|
|
|
|
def _get_spans(self):
|
|
spans = []
|
|
bspans = self.b.spans()
|
|
for aspan in self.a.spans():
|
|
for bspan in bspans:
|
|
if aspan.start > bspan.end:
|
|
continue
|
|
if aspan.end < bspan.start:
|
|
break
|
|
|
|
if bspan.is_within(aspan):
|
|
spans.append(aspan)
|
|
break
|
|
return spans
|
|
|
|
|
|
class SpanBefore(SpanBiQuery):
|
|
"""Matches documents where the spans of the first query occur before any
|
|
spans of the second query.
|
|
|
|
For example, to match documents where "apple" occurs anywhere before
|
|
"bear"::
|
|
|
|
from whoosh import query, spans
|
|
t1 = query.Term("text", "apple")
|
|
t2 = query.Term("text", "bear")
|
|
q = spans.SpanBefore(t1, t2)
|
|
"""
|
|
|
|
def __init__(self, a, b):
|
|
"""
|
|
:param a: the query that must occur before the second.
|
|
:param b: the query that must occur after the first.
|
|
"""
|
|
|
|
self.a = a
|
|
self.b = b
|
|
self.q = And([a, b])
|
|
|
|
class _Matcher(SpanBiMatcher):
|
|
def __init__(self, a, b):
|
|
self.a = a
|
|
self.b = b
|
|
im = binary.IntersectionMatcher(a, b)
|
|
super(SpanBefore._Matcher, self).__init__(im)
|
|
|
|
def _get_spans(self):
|
|
bminstart = min(bspan.start for bspan in self.b.spans())
|
|
return [aspan for aspan in self.a.spans() if aspan.end < bminstart]
|
|
|
|
|
|
class SpanCondition(SpanBiQuery):
|
|
"""Matches documents that satisfy both subqueries, but only uses the spans
|
|
from the first subquery.
|
|
|
|
This is useful when you want to place conditions on matches but not have
|
|
those conditions affect the spans returned.
|
|
|
|
For example, to get spans for the term ``alfa`` in documents that also
|
|
must contain the term ``bravo``::
|
|
|
|
SpanCondition(Term("text", u"alfa"), Term("text", u"bravo"))
|
|
|
|
"""
|
|
|
|
def __init__(self, a, b):
|
|
self.a = a
|
|
self.b = b
|
|
self.q = And([a, b])
|
|
|
|
class _Matcher(SpanBiMatcher):
|
|
def __init__(self, a, b):
|
|
self.a = a
|
|
im = binary.IntersectionMatcher(a, b)
|
|
super(SpanCondition._Matcher, self).__init__(im)
|
|
|
|
def _get_spans(self):
|
|
return self.a.spans()
|
|
|
|
|
|
|
|
|
|
|