416 lines
15 KiB
Python
416 lines
15 KiB
Python
|
# Copyright 2012 Matt Chaput. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# 1. Redistributions of source code must retain the above copyright notice,
#    this list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright
#    notice, this list of conditions and the following disclaimer in the
#    documentation and/or other materials provided with the distribution.
#
# THIS SOFTWARE IS PROVIDED BY MATT CHAPUT ``AS IS'' AND ANY EXPRESS OR
# IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
# MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO
# EVENT SHALL MATT CHAPUT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
# INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA,
# OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
# EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
# The views and conclusions contained in the software and documentation are
# those of the authors and should not be interpreted as representing official
# policies, either expressed or implied, of Matt Chaput.
|
||
|
from whoosh import matching
|
||
|
from whoosh.compat import text_type, u, xrange
|
||
|
from whoosh.query import qcore
|
||
|
from whoosh.query.wrappers import WrappingQuery
|
||
|
|
||
|
|
||
|
class NestedParent(WrappingQuery):
    """A query that allows you to search for "nested" documents, where you can
    index (possibly multiple levels of) "parent" and "child" documents using
    the :meth:`~whoosh.writing.IndexWriter.group` and/or
    :meth:`~whoosh.writing.IndexWriter.start_group` methods of a
    :class:`whoosh.writing.IndexWriter` to indicate that hierarchically related
    documents should be kept together::

        schema = fields.Schema(type=fields.ID, text=fields.TEXT(stored=True))

        with ix.writer() as w:
            # Say we're indexing chapters (type=chap) and each chapter has a
            # number of paragraphs (type=p)
            with w.group():
                w.add_document(type="chap", text="Chapter 1")
                w.add_document(type="p", text="Able baker")
                w.add_document(type="p", text="Bright morning")
            with w.group():
                w.add_document(type="chap", text="Chapter 2")
                w.add_document(type="p", text="Car trip")
                w.add_document(type="p", text="Dog eared")
                w.add_document(type="p", text="Every day")
            with w.group():
                w.add_document(type="chap", text="Chapter 3")
                w.add_document(type="p", text="Fine day")

    The ``NestedParent`` query wraps two sub-queries: the "parent query"
    matches a class of "parent documents". The "sub query" matches nested
    documents you want to find. For each "sub document" the "sub query" finds,
    this query acts as if it found the corresponding "parent document".

    >>> with ix.searcher() as s:
    ...   r = s.search(query.Term("text", "day"))
    ...   for hit in r:
    ...     print(hit["text"])
    ...
    Chapter 2
    Chapter 3
    """

    def __init__(self, parents, subq, per_parent_limit=None, score_fn=sum):
        """
        :param parents: a query, DocIdSet object, or Results object
            representing the documents you want to use as the "parent"
            documents. Where the sub-query matches, the corresponding document
            in these results will be returned as the match.
        :param subq: a query matching the information you want to find.
        :param per_parent_limit: a maximum number of "sub documents" to search
            per parent. The default is None, meaning no limit.
        :param score_fn: a function to use to combine the scores of matching
            sub-documents to calculate the score returned for the parent
            document. The default is ``sum``, that is, add up the scores of the
            sub-documents.
        """

        self.parents = parents
        self.child = subq
        self.per_parent_limit = per_parent_limit
        self.score_fn = score_fn

    def normalize(self):
        # Normalize the parents (if they are a query and not a doc set) and
        # the child query, collapsing to NullQuery when either side is null
        p = self.parents
        if isinstance(p, qcore.Query):
            p = p.normalize()
        q = self.child.normalize()

        if p is qcore.NullQuery or q is qcore.NullQuery:
            return qcore.NullQuery

        # Bug fix: propagate per_parent_limit and score_fn to the normalized
        # copy. Previously they were silently dropped, so normalizing a query
        # configured with a custom limit or score function changed its
        # behavior back to the defaults.
        return self.__class__(p, q, per_parent_limit=self.per_parent_limit,
                              score_fn=self.score_fn)

    def requires(self):
        # This query's requirements are exactly those of its child query
        return self.child.requires()

    def matcher(self, searcher, context=None):
        bits = searcher._filter_to_comb(self.parents)
        if not bits:
            return matching.NullMatcher
        m = self.child.matcher(searcher, context)
        if not m.is_active():
            return matching.NullMatcher

        return self.NestedParentMatcher(bits, m, self.per_parent_limit,
                                        searcher.doc_count_all(),
                                        self.score_fn)

    def deletion_docs(self, searcher):
        """Yield the doc IDs of every document in each "group" (the parent
        and all docs up to the next parent) that contains a child match,
        so a whole group can be deleted together.
        """

        bits = searcher._filter_to_comb(self.parents)
        if not bits:
            return

        m = self.child.matcher(searcher, searcher.boolean_context())
        maxdoc = searcher.doc_count_all()
        while m.is_active():
            docnum = m.id()
            # Parent at or before the match, and the next parent after it
            # (or the end of the index if there is no next parent)
            parentdoc = bits.before(docnum + 1)
            nextparent = bits.after(docnum) or maxdoc
            for i in xrange(parentdoc, nextparent):
                yield i
            m.skip_to(nextparent)

    class NestedParentMatcher(matching.Matcher):
        def __init__(self, comb, child, per_parent_limit, maxdoc, score_fn):
            # comb: DocIdSet of all parent doc IDs
            # child: matcher for the sub-query
            # maxdoc: total doc count, used as the sentinel "next parent"
            self.comb = comb
            self.child = child
            self.per_parent_limit = per_parent_limit
            self.maxdoc = maxdoc
            self.score_fn = score_fn

            self._nextdoc = None
            # Bug fix: initialize the score so score() doesn't raise
            # AttributeError when the child matcher starts out inactive
            self._nextscore = None
            if self.child.is_active():
                self._gather()

        def is_active(self):
            return self._nextdoc is not None

        def supports_block_quality(self):
            return False

        def _gather(self):
            # This is where the magic happens ;)
            child = self.child
            pplimit = self.per_parent_limit

            # The next document returned by this matcher is the parent of the
            # child's current document. We don't have to worry about whether
            # the parent is deleted, because the query that gave us the parents
            # wouldn't return deleted documents.
            self._nextdoc = self.comb.before(child.id() + 1)
            # The next parent after the child matcher's current document
            nextparent = self.comb.after(child.id()) or self.maxdoc

            # Combine the scores of all matching documents under the parent
            count = 1
            scores = []
            while child.is_active() and child.id() < nextparent:
                if pplimit and count > pplimit:
                    # Hit the per-parent limit: jump past the rest of this
                    # parent's children
                    child.skip_to(nextparent)
                    break

                scores.append(child.score())
                child.next()
                count += 1

            score = self.score_fn(scores) if scores else 0
            self._nextscore = score

        def id(self):
            return self._nextdoc

        def score(self):
            return self._nextscore

        def reset(self):
            # NOTE(review): assumes the child is active after reset();
            # _gather() calls child.id() unconditionally — confirm upstream
            self.child.reset()
            self._gather()

        def next(self):
            if self.child.is_active():
                self._gather()
            else:
                if self._nextdoc is None:
                    raise matching.ReadTooFar
                else:
                    self._nextdoc = None

        def skip_to(self, id):
            self.child.skip_to(id)
            self._gather()

        def value(self):
            raise NotImplementedError(self.__class__)

        def spans(self):
            return []
|
||
|
|
||
|
|
||
|
class NestedChildren(WrappingQuery):
    """This is the reverse of a :class:`NestedParent` query: instead of taking
    a query that matches children but returns the parent, this query matches
    parents but returns the children.

    This is useful, for example, to search for an album title and return the
    songs in the album::

        schema = fields.Schema(type=fields.ID(stored=True),
                               album_name=fields.TEXT(stored=True),
                               track_num=fields.NUMERIC(stored=True),
                               track_name=fields.TEXT(stored=True),
                               lyrics=fields.TEXT)
        ix = RamStorage().create_index(schema)

        # Indexing
        with ix.writer() as w:
            # For each album, index a "group" of a parent "album" document and
            # multiple child "track" documents.
            with w.group():
                w.add_document(type="album",
                               artist="The Cure", album_name="Disintegration")
                w.add_document(type="track", track_num=1,
                               track_name="Plainsong")
                w.add_document(type="track", track_num=2,
                               track_name="Pictures of You")
                # ...
            # ...

        # Find songs where the song name has "heaven" in the title and the
        # album the song is on has "hell" in the title
        qp = QueryParser("lyrics", ix.schema)
        with ix.searcher() as s:
            # A query that matches all parents
            all_albums = qp.parse("type:album")

            # A query that matches the parents we want
            albums_with_hell = qp.parse("album_name:hell")

            # A query that matches the desired albums but returns the tracks
            songs_on_hell_albums = NestedChildren(all_albums, albums_with_hell)

            # A query that matches tracks with heaven in the title
            songs_with_heaven = qp.parse("track_name:heaven")

            # A query that finds tracks with heaven in the title on albums
            # with hell in the title
            q = query.And([songs_on_hell_albums, songs_with_heaven])

    """

    def __init__(self, parents, subq, boost=1.0):
        """
        :param parents: a query, DocIdSet object, or Results object
            representing ALL the "parent" documents in the index.
        :param subq: a query matching the parent documents whose children
            should be returned.
        :param boost: the score assigned to each returned child document.
        """

        self.parents = parents
        self.child = subq
        self.boost = boost

    def matcher(self, searcher, context=None):
        # Convert the parents into a set of doc IDs; if there are no parents
        # (or the wanted-parents matcher has no matches) nothing can match
        bits = searcher._filter_to_comb(self.parents)
        if not bits:
            return matching.NullMatcher

        m = self.child.matcher(searcher, context)
        if not m.is_active():
            return matching.NullMatcher

        return self.NestedChildMatcher(bits, m, searcher.doc_count_all(),
                                       searcher.reader().is_deleted,
                                       boost=self.boost)

    class NestedChildMatcher(matching.WrappingMatcher):
        def __init__(self, parent_comb, wanted_parent_matcher, limit,
                     is_deleted, boost=1.0):
            # parent_comb: DocIdSet of ALL parent doc IDs (wanted or not)
            self.parent_comb = parent_comb
            # wanted_parent_matcher: matcher over the parents whose children
            # we want to return
            self.child = wanted_parent_matcher
            # limit: last doc ID + 1, used as a sentinel when there is no
            # next parent
            self.limit = limit
            # is_deleted: callable(docnum) -> bool, from the reader
            self.is_deleted = is_deleted
            self.boost = boost
            # _nextchild is the current child doc ID to return; _nextparent
            # is the next parent boundary. -1/-1 makes is_active() False
            # until _find_next_children() establishes a real range.
            self._nextchild = -1
            self._nextparent = -1
            self._find_next_children()

        def __repr__(self):
            return "%s(%r, %r)" % (self.__class__.__name__,
                                   self.parent_comb,
                                   self.child)

        def reset(self):
            self.child.reset()
            self._reset()

        def _reset(self):
            # Restore the sentinel state and re-derive the first child range
            self._nextchild = -1
            self._nextparent = -1
            self._find_next_children()

        def is_active(self):
            # Active while the current child ID is inside the current
            # parent's group (i.e. before the next parent boundary)
            return self._nextchild < self._nextparent

        def replace(self, minquality=0):
            return self

        def _find_next_children(self):
            """Advance to the first non-deleted child of the next matching
            parent, setting self._nextchild and self._nextparent.
            """

            # "comb" contains the doc IDs of all parent documents
            comb = self.parent_comb
            # "m" is the matcher for "wanted" parents
            m = self.child
            # Last doc ID + 1
            limit = self.limit
            # A function that returns True if a doc ID is deleted
            is_deleted = self.is_deleted
            nextchild = self._nextchild
            nextparent = self._nextparent

            while m.is_active():
                # Move the "child id" to the document after the current match
                nextchild = m.id() + 1
                # Move the parent matcher to the next match
                m.next()

                # Find the next parent document (matching or not) after this
                nextparent = comb.after(nextchild)
                if nextparent is None:
                    nextparent = limit

                # Skip any deleted child documents
                # NOTE(review): this loop has no upper bound; presumably a
                # parent is never the last live doc with everything after it
                # deleted — confirm against the index invariants
                while is_deleted(nextchild):
                    nextchild += 1

                # If skipping deleted documents put us to or past the next
                # parent doc, go again
                if nextchild >= nextparent:
                    continue
                else:
                    # Otherwise, we're done
                    break

            self._nextchild = nextchild
            self._nextparent = nextparent

        def id(self):
            return self._nextchild

        def all_ids(self):
            # Generator over every remaining child doc ID
            while self.is_active():
                yield self.id()
                self.next()

        def next(self):
            is_deleted = self.is_deleted
            limit = self.limit
            nextparent = self._nextparent

            # Go to the next document
            nextchild = self._nextchild
            nextchild += 1

            # Skip over any deleted child documents
            while nextchild < nextparent and is_deleted(nextchild):
                nextchild += 1

            self._nextchild = nextchild
            # If we're at or past the next parent doc, go to the next set of
            # children
            if nextchild >= limit:
                # Past the end of the index: leave is_active() False
                return
            elif nextchild >= nextparent:
                self._find_next_children()

        def skip_to(self, docid):
            comb = self.parent_comb
            wanted = self.child

            # self._nextchild is the "current" matching child ID
            if docid <= self._nextchild:
                # Already at or past the target; matchers never move backward
                return

            # self._nextparent is the next parent ID (matching or not)
            if docid < self._nextparent:
                # Target is inside the current group: just iterate
                while self.is_active() and self.id() < docid:
                    self.next()
            elif wanted.is_active():
                # Find the parent before the target ID
                pid = comb.before(docid)
                # Skip the parent matcher to that ID
                wanted.skip_to(pid)
                # If that made the matcher inactive, then we're done
                if not wanted.is_active():
                    self._nextchild = self._nextparent = self.limit
                else:
                    # Reestablish for the next child after the next matching
                    # parent
                    self._find_next_children()
            else:
                # No more wanted parents: make the matcher inactive
                self._nextchild = self._nextparent = self.limit

        def value(self):
            raise NotImplementedError(self.__class__)

        def score(self):
            # Children are not scored individually; every child gets the
            # query's boost as its score
            return self.boost

        def spans(self):
            return []
|