# Copyright 2010 Matt Chaput. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# 1. Redistributions of source code must retain the above copyright notice,
#    this list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright
#    notice, this list of conditions and the following disclaimer in the
#    documentation and/or other materials provided with the distribution.
#
# THIS SOFTWARE IS PROVIDED BY MATT CHAPUT ``AS IS'' AND ANY EXPRESS OR
# IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
# MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO
# EVENT SHALL MATT CHAPUT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
# INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA,
# OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
# EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
# The views and conclusions contained in the software and documentation are
# those of the authors and should not be interpreted as representing official
# policies, either expressed or implied, of Matt Chaput.

from __future__ import division
import os.path
from optparse import OptionParser
from shutil import rmtree

from whoosh import index, qparser, query, scoring
from whoosh.util import now, find_object

try:
    import xappy
except ImportError:
    pass
try:
    import xapian
except ImportError:
    pass
try:
    import pysolr
except ImportError:
    pass

try:
    from persistent import Persistent

    class ZDoc(Persistent):
        def __init__(self, d):
            self.__dict__.update(d)
except ImportError:
    pass


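# Each search library is wrapped in a Module adapter below. The Bench
# driver only ever talks to this interface (indexer / index_document /
# finish / searcher / query / find / findterms / results), so supporting
# a new backend means writing one adapter class and registering it in
# Bench.libs at the bottom of this file.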
class Module(object):
    """Base adapter around one search library. Subclasses implement
    indexing and searching in terms of that library's API.
    """

    def __init__(self, bench, options, args):
        self.bench = bench
        self.options = options
        self.args = args

    def __repr__(self):
        return self.__class__.__name__

    def indexer(self, **kwargs):
        pass

    def index_document(self, d):
        raise NotImplementedError

    def finish(self, **kwargs):
        pass

    def _process_result(self, d):
        # Look up an optional per-library "process_result_<lib>" hook on
        # the spec, then rebind this method on the instance so the
        # hasattr/getattr lookup only happens on the first call.
        attrname = "process_result_%s" % self.options.lib
        if hasattr(self.bench.spec, attrname):
            method = getattr(self.bench.spec, attrname)
            self._process_result = method
            return method(d)
        else:
            self._process_result = lambda x: x
            return d

    def searcher(self):
        pass

    def query(self):
        raise NotImplementedError

    def find(self, q):
        raise NotImplementedError

    def findterms(self, terms):
        raise NotImplementedError

    def results(self, r):
        for hit in r:
            yield self._process_result(hit)


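# A Spec can post-process a backend's raw hits without subclassing the
# adapter: _process_result() above resolves the hook by name from
# options.lib. A minimal sketch of the convention (MySpec and its field
# names are hypothetical):
#
#     class MySpec(Spec):
#         def process_result_whoosh(self, hit):
#             # e.g. copy a whoosh Hit into a plain dict
#             return {"title": hit["title"], "body": hit["body"]}
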
class Spec(object):
    """Describes a benchmark corpus: which fields it has, how to iterate
    over its documents, and how to print search results.
    """

    headline_field = "title"
    main_field = "body"

    def __init__(self, options, args):
        self.options = options
        self.args = args

    def documents(self):
        raise NotImplementedError

    def setup(self):
        pass

    def print_results(self, ls):
        showbody = self.options.showbody
        snippets = self.options.snippets
        limit = int(self.options.limit)
        for i, hit in enumerate(ls):
            if i >= limit:
                break

            print("%d. %s" % (i + 1, hit.get(self.headline_field)))
            if snippets:
                print(self.show_snippet(hit))
            if showbody:
                print(hit.get(self.main_field))


class WhooshModule(Module):
    def indexer(self, create=True):
        schema = self.bench.spec.whoosh_schema()
        path = os.path.join(self.options.dir,
                            "%s_whoosh" % self.options.indexname)

        if not os.path.exists(path):
            os.mkdir(path)
        if create:
            ix = index.create_in(path, schema)
        else:
            ix = index.open_dir(path)

        poolclass = None
        if self.options.pool:
            poolclass = find_object(self.options.pool)

        self.writer = ix.writer(limitmb=int(self.options.limitmb),
                                poolclass=poolclass,
                                dir=self.options.tempdir,
                                procs=int(self.options.procs),
                                batchsize=int(self.options.batch),
                                multisegment=self.options.xms)
        self._procdoc = None
        if hasattr(self.bench.spec, "process_document_whoosh"):
            self._procdoc = self.bench.spec.process_document_whoosh

    def index_document(self, d):
        _procdoc = self._procdoc
        if _procdoc:
            _procdoc(d)
        self.writer.add_document(**d)

    def finish(self, merge=True, optimize=False):
        self.writer.commit(merge=merge, optimize=optimize)

    def searcher(self):
        path = os.path.join(self.options.dir,
                            "%s_whoosh" % self.options.indexname)
        ix = index.open_dir(path)
        self.srch = ix.searcher(weighting=scoring.PL2())
        self.parser = qparser.QueryParser(self.bench.spec.main_field,
                                          schema=ix.schema)

    def query(self):
        qstring = " ".join(self.args)
        if isinstance(qstring, bytes):
            # Command-line arguments are byte strings on Python 2
            qstring = qstring.decode("utf-8")
        return self.parser.parse(qstring)

    def find(self, q):
        return self.srch.search(q, limit=int(self.options.limit),
                                optimize=self.options.optimize)

    def findterms(self, terms):
        limit = int(self.options.limit)
        s = self.srch
        # Reuse a single Term query object, mutating its text for each
        # search instead of constructing a new query every time
        q = query.Term(self.bench.spec.main_field, None)
        for term in terms:
            q.text = term
            yield s.search(q, limit=limit)


class XappyModule(Module):
    def indexer(self, **kwargs):
        path = os.path.join(self.options.dir,
                            "%s_xappy" % self.options.indexname)
        conn = self.bench.spec.xappy_connection(path)
        return conn

    def index_document(self, conn, d):
        if hasattr(self.bench.spec, "process_document_xappy"):
            self.bench.spec.process_document_xappy(d)
        doc = xappy.UnprocessedDocument()
        for key, values in d.items():
            if not isinstance(values, list):
                values = [values]
            for value in values:
                doc.fields.append(xappy.Field(key, value))
        conn.add(doc)

    def finish(self, conn):
        conn.flush()

    def searcher(self):
        path = os.path.join(self.options.dir,
                            "%s_xappy" % self.options.indexname)
        return xappy.SearchConnection(path)

    def query(self, conn):
        return conn.query_parse(" ".join(self.args))

    def find(self, conn, q):
        return conn.search(q, 0, int(self.options.limit))

    def findterms(self, conn, terms):
        limit = int(self.options.limit)
        for term in terms:
            q = conn.query_field(self.bench.spec.main_field, term)
            yield conn.search(q, 0, limit)

    def results(self, r):
        hf = self.bench.spec.headline_field
        mf = self.bench.spec.main_field
        for hit in r:
            yield self._process_result({hf: hit.data[hf], mf: hit.data[mf]})


class XapianModule(Module):
    def indexer(self, **kwargs):
        path = os.path.join(self.options.dir,
                            "%s_xapian" % self.options.indexname)
        self.database = xapian.WritableDatabase(path, xapian.DB_CREATE_OR_OPEN)
        self.ixer = xapian.TermGenerator()

    def index_document(self, d):
        if hasattr(self.bench.spec, "process_document_xapian"):
            self.bench.spec.process_document_xapian(d)
        doc = xapian.Document()
        doc.add_value(0, d.get(self.bench.spec.headline_field, "-"))
        doc.set_data(d[self.bench.spec.main_field])
        self.ixer.set_document(doc)
        self.ixer.index_text(d[self.bench.spec.main_field])
        self.database.add_document(doc)

    def finish(self, **kwargs):
        self.database.flush()

    def searcher(self):
        path = os.path.join(self.options.dir,
                            "%s_xapian" % self.options.indexname)
        self.db = xapian.Database(path)
        self.enq = xapian.Enquire(self.db)
        self.qp = xapian.QueryParser()
        self.qp.set_database(self.db)

    def query(self):
        return self.qp.parse_query(" ".join(self.args))

    def find(self, q):
        self.enq.set_query(q)
        return self.enq.get_mset(0, int(self.options.limit))

    def findterms(self, terms):
        limit = int(self.options.limit)
        for term in terms:
            q = self.qp.parse_query(term)
            self.enq.set_query(q)
            yield self.enq.get_mset(0, limit)

    def results(self, matches):
        hf = self.bench.spec.headline_field
        mf = self.bench.spec.main_field
        for m in matches:
            yield self._process_result({hf: m.document.get_value(0),
                                        mf: m.document.get_data()})


class SolrModule(Module):
    def indexer(self, **kwargs):
        self.solr_doclist = []
        self.conn = pysolr.Solr(self.options.url)
        self.conn.delete(q="*:*")
        self.conn.commit()

    def index_document(self, d):
        # Buffer documents and send them to Solr in batches
        self.solr_doclist.append(d)
        if len(self.solr_doclist) >= int(self.options.batch):
            self.conn.add(self.solr_doclist, commit=False)
            self.solr_doclist = []

    def finish(self, **kwargs):
        if self.solr_doclist:
            self.conn.add(self.solr_doclist)
        del self.solr_doclist
        self.conn.optimize(block=True)

    def searcher(self):
        self.solr = pysolr.Solr(self.options.url)

    def query(self):
        return " ".join(self.args)

    def find(self, q):
        return self.solr.search(q, rows=int(self.options.limit))

    def findterms(self, terms):
        limit = int(self.options.limit)
        for term in terms:
            yield self.solr.search("body:" + term, rows=limit)


class ZcatalogModule(Module):
    def indexer(self, **kwargs):
        from ZODB.FileStorage import FileStorage  # @UnresolvedImport
        from ZODB.DB import DB  # @UnresolvedImport
        from zcatalog import catalog  # @UnresolvedImport
        import transaction  # @UnresolvedImport

        dirname = os.path.join(self.options.dir,
                               "%s_zcatalog" % self.options.indexname)
        if os.path.exists(dirname):
            rmtree(dirname)
        os.mkdir(dirname)

        storage = FileStorage(os.path.join(dirname, "index"))
        db = DB(storage)
        conn = db.open()

        self.cat = catalog.Catalog()
        self.bench.spec.zcatalog_setup(self.cat)
        conn.root()["cat"] = self.cat
        transaction.commit()

        self.zcatalog_count = 0

    def index_document(self, d):
        if hasattr(self.bench.spec, "process_document_zcatalog"):
            self.bench.spec.process_document_zcatalog(d)
        doc = ZDoc(d)
        self.cat.index_doc(doc)
        self.zcatalog_count += 1
        # Commit every 100 documents to keep transactions small
        if self.zcatalog_count >= 100:
            import transaction  # @UnresolvedImport
            transaction.commit()
            self.zcatalog_count = 0

    def finish(self, **kwargs):
        import transaction  # @UnresolvedImport
        transaction.commit()
        del self.zcatalog_count

    def searcher(self):
        from ZODB.FileStorage import FileStorage  # @UnresolvedImport
        from ZODB.DB import DB  # @UnresolvedImport

        path = os.path.join(self.options.dir,
                            "%s_zcatalog" % self.options.indexname, "index")
        storage = FileStorage(path)
        db = DB(storage)
        conn = db.open()

        self.cat = conn.root()["cat"]

    def query(self):
        return " ".join(self.args)

    def find(self, q):
        return self.cat.searchResults(body=q)

    def findterms(self, terms):
        for term in terms:
            yield self.cat.searchResults(body=term)

    def results(self, r):
        hf = self.bench.spec.headline_field
        mf = self.bench.spec.main_field
        for hit in r:
            # Have to access the attributes for them to be retrieved
            yield self._process_result({hf: getattr(hit, hf),
                                        mf: getattr(hit, mf)})


class NucularModule(Module):
    def indexer(self, create=True):
        import shutil
        from nucular import Nucular

        dirname = os.path.join(self.options.dir,
                               "%s_nucular" % self.options.indexname)
        if create:
            if os.path.exists(dirname):
                shutil.rmtree(dirname)
            os.mkdir(dirname)
        self.archive = Nucular.Nucular(dirname)
        if create:
            self.archive.create()
            # Only reset the document-id counter on a fresh index, so
            # re-opening between batches doesn't reuse ids
            self.count = 0

    def index_document(self, d):
        try:
            self.archive.indexDictionary(str(self.count), d)
        except ValueError:
            print("d=", d)
            raise
        self.count += 1
        if not self.count % int(self.options.batch):
            self.archive.store(lazy=True)
            # Re-open the archive to start a fresh transient store
            self.indexer(create=False)

    def finish(self, **kwargs):
        self.archive.store(lazy=False)
        self.archive.aggregateRecent(fast=False, verbose=True)
        self.archive.moveTransientToBase(verbose=True)
        self.archive.cleanUp()

    def searcher(self):
        from nucular import Nucular

        dirname = os.path.join(self.options.dir,
                               "%s_nucular" % self.options.indexname)
        self.archive = Nucular.Nucular(dirname)

    def query(self):
        return " ".join(self.args)

    def find(self, q):
        return self.archive.dictionaries(q)

    def findterms(self, terms):
        for term in terms:
            q = self.archive.Query()
            q.anyWord(term)
            yield q.resultDictionaries()


class Bench(object):
    libs = {"whoosh": WhooshModule, "xappy": XappyModule,
            "xapian": XapianModule, "solr": SolrModule,
            "zcatalog": ZcatalogModule, "nucular": NucularModule}

    def index(self, lib):
        print("Indexing with %s..." % lib)

        options = self.options
        every = None if options.every is None else int(options.every)
        merge = options.merge
        chunk = int(options.chunk)
        skip = int(options.skip)
        upto = int(options.upto)
        count = 0
        skipc = skip

        starttime = chunkstarttime = now()

        lib.indexer()

        for d in self.spec.documents():
            skipc -= 1
            if not skipc:
                lib.index_document(d)
                count += 1
                skipc = skip
                if chunk and not count % chunk:
                    t = now()
                    sofar = t - starttime
                    print("Done %d docs, %0.3f secs for %d, %0.3f total, "
                          "%0.3f docs/s"
                          % (count, t - chunkstarttime, chunk, sofar,
                             count / sofar))
                    chunkstarttime = t
                if count > upto:
                    break
                if every and not count % every:
                    print("----Commit")
                    lib.finish(merge=merge)
                    lib.indexer(create=False)

        spooltime = now()
        print("Spool time:", spooltime - starttime)
        lib.finish(merge=merge)
        committime = now()
        print("Commit time:", committime - spooltime)
        totaltime = committime - starttime
        print("Total time to index %d documents: %0.3f secs (%0.3f minutes)"
              % (count, totaltime, totaltime / 60.0))
        print("Indexed %0.3f docs/s" % (count / totaltime))

    def search(self, lib):
        lib.searcher()

        t = now()
        q = lib.query()
        print("Query:", q)
        r = lib.find(q)
        print("Search time:", now() - t)

        t = now()
        self.spec.print_results(lib.results(r))
        print("Print time:", now() - t)

    def search_file(self, lib):
        f = open(self.options.termfile, "rb")
        terms = [line.strip() for line in f]
        f.close()

        print("Searching %d terms with %s" % (len(terms), lib))
        lib.searcher()
        starttime = now()
        for r in lib.findterms(terms):
            pass
        searchtime = now() - starttime
        print("Search time:", searchtime,
              "searches/s:", float(len(terms)) / searchtime)

    def _parser(self, name):
        p = OptionParser()
        p.add_option("-x", "--lib", dest="lib",
                     help="Name of the library to use to index/search.",
                     default="whoosh")
        p.add_option("-d", "--dir", dest="dir", metavar="DIRNAME",
                     help="Directory in which to store index.", default=".")
        p.add_option("-s", "--setup", dest="setup", action="store_true",
                     help="Set up any support files or caches.", default=False)
        p.add_option("-i", "--index", dest="index", action="store_true",
                     help="Index the documents.", default=False)
        p.add_option("-n", "--name", dest="indexname", metavar="PREFIX",
                     help="Index name prefix.", default="%s_index" % name)
        p.add_option("-U", "--url", dest="url", metavar="URL",
                     help="Solr URL", default="http://localhost:8983/solr")
        p.add_option("-m", "--mb", dest="limitmb",
                     help="Max. memory usage, in MB", default="128")
        p.add_option("-c", "--chunk", dest="chunk",
                     help="Number of documents to index between progress "
                          "messages.",
                     default=1000)
        p.add_option("-B", "--batch", dest="batch",
                     help="Batch size for batch adding documents.",
                     default=1000)
        p.add_option("-k", "--skip", dest="skip", metavar="N",
                     help="Index every Nth document.", default=1)
        p.add_option("-e", "--commit-every", dest="every", metavar="NUM",
                     help="Commit every NUM documents", default=None)
        p.add_option("-M", "--no-merge", dest="merge", action="store_false",
                     help="Don't merge segments when doing multiple commits",
                     default=True)
        p.add_option("-u", "--upto", dest="upto", metavar="N",
                     help="Index up to this document number.", default=600000)
        p.add_option("-p", "--procs", dest="procs", metavar="NUMBER",
                     help="Number of processors to use.", default=0)
        p.add_option("-l", "--limit", dest="limit", metavar="N",
                     help="Maximum number of search results to retrieve.",
                     default=10)
        p.add_option("-b", "--body", dest="showbody", action="store_true",
                     help="Show the body text in search results.",
                     default=False)
        p.add_option("-g", "--gen", dest="generate", metavar="N",
                     help="Generate a list of at most N terms present in all "
                          "libraries.",
                     default=None)
        p.add_option("-f", "--file", dest="termfile", metavar="FILENAME",
                     help="Search using the list of terms in this file.",
                     default=None)
        p.add_option("-t", "--tempdir", dest="tempdir", metavar="DIRNAME",
                     help="Whoosh temp dir", default=None)
        p.add_option("-P", "--pool", dest="pool", metavar="CLASSNAME",
                     help="Whoosh pool class", default=None)
        p.add_option("-X", "--xms", dest="xms", action="store_true",
                     help="Experimental Whoosh feature", default=False)
        p.add_option("-Z", "--storebody", dest="storebody",
                     action="store_true",
                     help="Store the body text in index", default=False)
        p.add_option("-q", "--snippets", dest="snippets", action="store_true",
                     help="Show highlighted snippets", default=False)
        p.add_option("-O", "--no-optimize", dest="optimize",
                     action="store_false",
                     help="Turn off searcher optimization", default=True)

        return p

    def run(self, specclass):
        parser = self._parser(specclass.name)
        options, args = parser.parse_args()
        self.options = options
        self.args = args

        if options.lib not in self.libs:
            raise Exception("Unknown library: %r" % options.lib)
        lib = self.libs[options.lib](self, options, args)

        self.spec = specclass(options, args)

        if options.setup:
            self.spec.setup()

        action = self.search
        if options.index:
            action = self.index
        if options.termfile:
            action = self.search_file
        if options.generate:
            action = self.generate_search_file

        action(lib)
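
# ----------------------------------------------------------------------
# Example wiring (a sketch; "MySpec" and its documents are hypothetical.
# Real Spec subclasses live in the benchmark scripts that import this
# module):
#
#     from whoosh import fields
#
#     class MySpec(Spec):
#         name = "example"
#
#         def whoosh_schema(self):
#             return fields.Schema(title=fields.ID(stored=True),
#                                  body=fields.TEXT(stored=True))
#
#         def documents(self):
#             yield {"title": u"First doc", "body": u"hello world"}
#
#     if __name__ == "__main__":
#         Bench().run(MySpec)
#
# "python myspec.py -x whoosh -i" would then build the index, and
# "python myspec.py hello" would search it with the default action.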