# Copyright 2008 Matt Chaput. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# 1. Redistributions of source code must retain the above copyright notice,
#    this list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright
#    notice, this list of conditions and the following disclaimer in the
#    documentation and/or other materials provided with the distribution.
#
# THIS SOFTWARE IS PROVIDED BY MATT CHAPUT ``AS IS'' AND ANY EXPRESS OR
# IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
# MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO
# EVENT SHALL MATT CHAPUT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
# INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA,
# OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
# EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
# The views and conclusions contained in the software and documentation are
# those of the authors and should not be interpreted as representing official
# policies, either expressed or implied, of Matt Chaput.
"""Classes and functions for classifying and extracting information from
|
||
|
documents.
|
||
|
"""
|
||
|
|
||
|
from __future__ import division
|
||
|
import random
|
||
|
from collections import defaultdict
|
||
|
from math import log
|
||
|
|
||
|
from whoosh.compat import xrange, iteritems
|
||
|
|
||
|
|
||
|
# Expansion models
|
||
|
|
||
|
class ExpansionModel(object):
    def __init__(self, doc_count, field_length):
        self.N = doc_count
        self.collection_total = field_length

        if self.N:
            self.mean_length = self.collection_total / self.N
        else:
            self.mean_length = 0

    def normalizer(self, maxweight, top_total):
        raise NotImplementedError

    def score(self, weight_in_top, weight_in_collection, top_total):
        raise NotImplementedError


class Bo1Model(ExpansionModel):
    def normalizer(self, maxweight, top_total):
        f = maxweight / self.N
        return (maxweight * log((1.0 + f) / f) + log(1.0 + f)) / log(2.0)

    def score(self, weight_in_top, weight_in_collection, top_total):
        f = weight_in_collection / self.N
        return weight_in_top * log((1.0 + f) / f, 2) + log(1.0 + f, 2)


class Bo2Model(ExpansionModel):
    def normalizer(self, maxweight, top_total):
        f = maxweight * self.N / self.collection_total
        return maxweight * log((1.0 + f) / f, 2) + log(1.0 + f, 2)

    def score(self, weight_in_top, weight_in_collection, top_total):
        f = weight_in_top * top_total / self.collection_total
        return weight_in_top * log((1.0 + f) / f, 2) + log(1.0 + f, 2)


class KLModel(ExpansionModel):
    def normalizer(self, maxweight, top_total):
        return (maxweight * log(self.collection_total / top_total) / log(2.0)
                * top_total)

    def score(self, weight_in_top, weight_in_collection, top_total):
        wit_over_tt = weight_in_top / top_total
        wic_over_ct = weight_in_collection / self.collection_total

        if wit_over_tt < wic_over_ct:
            return 0
        else:
            # KL term score: p(t|top docs) * log2(p(t|top docs) / p(t|collection))
            return wit_over_tt * log(wit_over_tt / wic_over_ct, 2)


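# A minimal sketch (illustrative numbers, not part of the original module)
# of scoring a term directly with one of the models; normally the Expander
# class below drives these calls.
def _model_example():
    # 1,000 documents with 250,000 total terms in the field
    model = Bo1Model(doc_count=1000, field_length=250000)
    # term weight 12.0 in the top docs, 90.0 across the whole collection,
    # 400.0 total weight of all terms in the top docs
    return model.score(12.0, 90.0, 400.0)

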
class Expander(object):
    """Uses an ExpansionModel to expand the set of query terms based on the
    top N result documents.
    """

    def __init__(self, ixreader, fieldname, model=Bo1Model):
        """
        :param ixreader: A :class:`whoosh.reading.IndexReader` object.
        :param fieldname: The name of the field in which to search.
        :param model: (classify.ExpansionModel) The model to use for expanding
            the query terms. If you omit this parameter, the expander uses
            :class:`Bo1Model` by default.
        """

        self.ixreader = ixreader
        self.fieldname = fieldname
        doccount = self.ixreader.doc_count_all()
        fieldlen = self.ixreader.field_length(fieldname)

        # The model may be passed as a class or as an instance
        if type(model) is type:
            model = model(doccount, fieldlen)
        self.model = model

        # Maps words to their weight in the top N documents.
        self.topN_weight = defaultdict(float)

        # Total weight of all terms in the top N documents.
        self.top_total = 0

    def add(self, vector):
        """Adds forward-index information about one of the "top N" documents.

        :param vector: A series of (text, weight) tuples, such as is
            returned by Reader.vector_as("weight", docnum, fieldname).
        """

        total_weight = 0
        topN_weight = self.topN_weight

        for word, weight in vector:
            total_weight += weight
            topN_weight[word] += weight

        self.top_total += total_weight

    def add_document(self, docnum):
        # Add one of the top N documents, preferring its term vector and
        # falling back to re-indexing its stored value
        ixreader = self.ixreader
        if ixreader.has_vector(docnum, self.fieldname):
            self.add(ixreader.vector_as("weight", docnum, self.fieldname))
        elif ixreader.schema[self.fieldname].stored:
            self.add_text(ixreader.stored_fields(docnum).get(self.fieldname))
        else:
            raise Exception("Field %r in document %s is not vectored or stored"
                            % (self.fieldname, docnum))

    def add_text(self, string):
        # Unfortunately, since field.index() yields bytes texts and we want
        # unicode, we end up encoding and decoding unnecessarily.
        #
        # TODO: Find a way around this

        field = self.ixreader.schema[self.fieldname]
        from_bytes = field.from_bytes
        self.add((from_bytes(text), weight) for text, _, weight, _
                 in field.index(string))

    def expanded_terms(self, number, normalize=True):
        """Returns the N most important terms in the vectors added so far.

        :param number: The number of terms to return.
        :param normalize: Whether to normalize the weights.
        :returns: A list of ("term", weight) tuples.
        """

        model = self.model
        fieldname = self.fieldname
        ixreader = self.ixreader
        field = ixreader.schema[fieldname]
        tlist = []
        maxweight = 0

        # If no terms have been added, return an empty list
        if not self.topN_weight:
            return []

        for word, weight in iteritems(self.topN_weight):
            btext = field.to_bytes(word)
            if (fieldname, btext) in ixreader:
                cf = ixreader.frequency(fieldname, btext)
                score = model.score(weight, cf, self.top_total)
                if score > maxweight:
                    maxweight = score
                tlist.append((score, word))

        if normalize:
            norm = model.normalizer(maxweight, self.top_total)
        else:
            norm = maxweight
        tlist = [(weight / norm, t) for weight, t in tlist]
        # Sort by descending score, breaking ties alphabetically
        tlist.sort(key=lambda x: (0 - x[0], x[1]))

        return [(t, weight) for weight, t in tlist[:number]]


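# A minimal usage sketch, not part of the original module: given an open
# whoosh searcher and a query (both supplied by the caller), feed the top
# results to an Expander and return the most important terms in the field.
def _expander_example(searcher, fieldname, query, numterms=5):
    expander = Expander(searcher.reader(), fieldname)
    for hit in searcher.search(query):
        expander.add_document(hit.docnum)
    return expander.expanded_terms(numterms)

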
# Similarity functions

def shingles(input, size=2):
    # Count overlapping character n-grams ("shingles") of the given size
    d = defaultdict(int)
    for shingle in (input[i:i + size]
                    for i in xrange(len(input) - (size - 1))):
        d[shingle] += 1
    return iteritems(d)


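# Illustrative example (not part of the original module): the default
# bigrams of "hello" are "he", "el", "ll", "lo", each seen once, so
# dict(shingles("hello")) == {"he": 1, "el": 1, "ll": 1, "lo": 1}.

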
def simhash(features, hashbits=32):
    # Charikar-style similarity hash: hash each feature, then for every bit
    # position add the feature's weight if that bit is set and subtract it
    # if not. The fingerprint has a 1 wherever the weighted sum is positive.
    if hashbits == 32:
        hashfn = hash
    else:
        hashfn = lambda s: _hash(s, hashbits)

    vs = [0] * hashbits
    for feature, weight in features:
        h = hashfn(feature)
        for i in xrange(hashbits):
            if h & (1 << i):
                vs[i] += weight
            else:
                vs[i] -= weight

    out = 0
    for i, v in enumerate(vs):
        if v > 0:
            out |= 1 << i
    return out


def _hash(s, hashbits):
    # A variable-length version of Python's builtin hash
    if s == "":
        return 0
    else:
        x = ord(s[0]) << 7
        m = 1000003
        mask = 2 ** hashbits - 1
        for c in s:
            x = ((x * m) ^ ord(c)) & mask
        x ^= len(s)
        if x == -1:
            x = -2
        return x


def hamming_distance(first_hash, other_hash, hashbits=32):
    x = (first_hash ^ other_hash) & ((1 << hashbits) - 1)
    tot = 0
    while x:
        tot += 1
        x &= x - 1  # clear the lowest set bit
    return tot


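# A short sketch (not in the original module) tying the similarity pieces
# together: fingerprint two strings by their character bigrams and count
# how many hash bits differ; similar strings give small distances.
def _simhash_example():
    a = simhash(shingles("the quick brown fox"))
    b = simhash(shingles("the quick brown foxes"))
    return hamming_distance(a, b)

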
# Clustering

def kmeans(data, k, t=0.0001, distfun=None, maxiter=50, centers=None):
    """
    One-dimensional K-means clustering function.

    :param data: list of data points.
    :param k: number of clusters.
    :param t: tolerance; stop if changes between iterations are smaller than
        this value.
    :param distfun: a distance function (currently unused; squared euclidean
        distance is always used).
    :param centers: a list of centroids to start with.
    :param maxiter: maximum number of iterations to run.
    """

    # Adapted from a C version by Roger Zhang, <rogerz@cs.dal.ca>
    # http://cs.smu.ca/~r_zhang/code/kmeans.c

    DOUBLE_MAX = 1.797693e308
    n = len(data)

    error = DOUBLE_MAX  # sum of squared euclidean distance

    counts = [0] * k  # size of each cluster
    labels = [0] * n  # output cluster label for each data point

    # c1 is an array of len k of the temp centroids
    c1 = [0] * k

    # choose k initial centroids
    if centers:
        c = centers
    else:
        c = random.sample(data, k)

    niter = 0
    # main loop
    while True:
        # save error from last step
        old_error = error
        error = 0

        # clear old counts and temp centroids
        for i in xrange(k):
            counts[i] = 0
            c1[i] = 0

        for h in xrange(n):
            # identify the closest cluster
            min_distance = DOUBLE_MAX
            for i in xrange(k):
                distance = (data[h] - c[i]) ** 2
                if distance < min_distance:
                    labels[h] = i
                    min_distance = distance

            # update size and temp centroid of the destination cluster
            c1[labels[h]] += data[h]
            counts[labels[h]] += 1
            # accumulate the total squared error
            error += min_distance

        for i in xrange(k):  # update all centroids
            c[i] = c1[i] / counts[i] if counts[i] else c1[i]

        niter += 1
        if (abs(error - old_error) < t) or (niter > maxiter):
            break

    return labels, c


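# A quick sketch (illustrative data, not part of the original module) of
# clustering one-dimensional points into two groups; labels assigns each
# point to a cluster and centroids holds the two cluster centers.
def _kmeans_example():
    data = [1.0, 1.2, 0.9, 8.0, 8.3, 7.9]
    labels, centroids = kmeans(data, 2)
    return labels, centroids

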
# Sliding window clusters

def two_pass_variance(data):
    # Sample variance: the first pass computes the mean, the second sums
    # squared deviations from it; divides by n - 1
    n = 0
    sum1 = 0
    sum2 = 0

    for x in data:
        n += 1
        sum1 = sum1 + x

    mean = sum1 / n

    for x in data:
        sum2 += (x - mean) * (x - mean)

    variance = sum2 / (n - 1)
    return variance


def weighted_incremental_variance(data_weight_pairs):
    # Single-pass weighted variance using an incremental (Welford-style)
    # update of the mean and the sum of squared deviations
    mean = 0
    S = 0
    sumweight = 0
    for x, weight in data_weight_pairs:
        temp = weight + sumweight
        Q = x - mean
        R = Q * weight / temp
        S += sumweight * Q * R
        mean += R
        sumweight = temp
    variance = S / (sumweight - 1)  # if sample is the population, omit -1
    return variance


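# Sanity-check sketch (not in the original module): with all weights equal
# to 1, the weighted incremental variance matches the two-pass variance
# (both return 32 / 7 for this data).
def _variance_example():
    data = [2.0, 4.0, 4.0, 4.0, 5.0, 5.0, 7.0, 9.0]
    return (two_pass_variance(data),
            weighted_incremental_variance((x, 1) for x in data))

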
def swin(data, size):
    # Slide a window of the given width across the (sorted) data points,
    # recording each window as (left, right, length, variance)
    clusters = []
    for i, left in enumerate(data):
        j = i
        right = data[j]
        while j < len(data) - 1 and right - left < size:
            j += 1
            right = data[j]
        v = 99999
        if j - i > 1:
            v = two_pass_variance(data[i:j + 1])
        clusters.append((left, right, j - i, v))
    # Sort the longest, lowest-variance windows first
    clusters.sort(key=lambda x: (0 - x[2], x[3]))
    return clusters
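

# Example sketch (illustrative data, not part of the original module):
# find dense windows of width 5 in a sorted list of points; the longest,
# tightest cluster sorts first.
def _swin_example():
    return swin([1, 2, 3, 10, 11, 12, 13, 30], 5)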