571 lines
12 KiB
Python
571 lines
12 KiB
Python
"""This module implements the Lovins stemming algorithm. Use the ``stem()``
|
|
function::
|
|
|
|
stemmed_word = stem(word)
|
|
"""
|
|
|
|
from collections import defaultdict
|
|
|
|
|
|
# Conditions
|
|
|
|
def A(base):
    # Condition A: the ending may always be removed, whatever the stem is.
    return True
|
|
|
|
|
|
def B(base):
    # Condition B: the remaining stem must be at least 3 characters long.
    return len(base) >= 3
|
|
|
|
|
|
def C(base):
    # Condition C: the remaining stem must be at least 4 characters long.
    return len(base) >= 4
|
|
|
|
|
|
def D(base):
    # Condition D: the remaining stem must be at least 5 characters long.
    return len(base) >= 5
|
|
|
|
|
|
def E(base):
    # Condition E: keep the ending when the stem's final letter is "e".
    final = base[-1]
    return not final == "e"
|
|
|
|
|
|
def F(base):
    # Condition F: stem of 3+ characters whose final letter is not "e".
    if len(base) < 3:
        return False
    return base[-1] != "e"
|
|
|
|
|
|
def G(base):
    # Condition G: stem of 3+ characters whose final letter is "f".
    if len(base) < 3:
        return False
    return base[-1] == "f"
|
|
|
|
|
|
def H(base):
    # Condition H: the ending comes off only when the stem ends in "t"
    # or in "ll".
    before_last, last = base[-2:]
    if last == "t":
        return True
    return before_last + last == "ll"
|
|
|
|
|
|
def I(base):
    # Condition I: keep the ending when the stem's final letter is "o" or "e".
    return base[-1] not in ("o", "e")
|
|
|
|
|
|
def J(base):
    # Condition J: keep the ending when the stem's final letter is "a" or "e".
    return base[-1] not in ("a", "e")
|
|
|
|
|
|
def K(base):
    """Condition K: minimum stem length = 3 and remove ending only after
    l, i or u*e.

    ``u*e`` means "u", then any one letter, then "e" at the end of the stem.

    :param base: the candidate stem (the word with the ending cut off).
    :returns: True if the ending may be removed from this stem.
    """
    # Check the length first: looking at base[-3] on a shorter stem would
    # raise IndexError instead of simply rejecting the ending.
    if len(base) < 3:
        return False
    last = base[-1]
    return last in ("l", "i") or (last == "e" and base[-3] == "u")
|
|
|
|
|
|
def L(base):
    # Condition L: keep the ending after "u" or "x"; after "s" it may only
    # be removed when that "s" is preceded by "o".
    before_last, last = base[-2:]
    if last in ("u", "x"):
        return False
    return last != "s" or before_last == "o"
|
|
|
|
|
|
def M(base):
    # Condition M: keep the ending when the stem's final letter is
    # "a", "c", "e" or "m".
    return base[-1] not in ("a", "c", "e", "m")
|
|
|
|
|
|
def N(base):
    """Condition N: minimum stem length = 4 after s**, elsewhere = 3.

    ``s**`` means the stem ends in "s" followed by any two letters (the
    same ``*`` wildcard as in ``u*e``), so the letter that matters is the
    third from the end, not the last one.

    :param base: the candidate stem (the word with the ending cut off).
    :returns: True if the ending may be removed from this stem.
    """
    size = len(base)
    if size > 3:
        # Four or more letters satisfies both variants of the rule.
        return True
    # Exactly three letters is enough unless the stem has the shape s**.
    return size == 3 and base[-3] != "s"
|
|
|
|
|
|
def O(base):
    # Condition O: the ending comes off only when the stem ends in "l" or "i".
    return base[-1] in ("l", "i")
|
|
|
|
|
|
def P(base):
    # Condition P: keep the ending when the stem's final letter is "c".
    final = base[-1]
    return not final == "c"
|
|
|
|
|
|
def Q(base):
    # Condition Q: stem of 3+ characters whose final letter is neither
    # "l" nor "n".
    if len(base) < 3:
        return False
    return base[-1] not in ("l", "n")
|
|
|
|
|
|
def R(base):
    # Condition R: the ending comes off only when the stem ends in "n" or "r".
    return base[-1] in ("n", "r")
|
|
|
|
|
|
def S(base):
    """Condition S: remove ending only after dr or t, unless t follows t.

    :param base: the candidate stem (the word with the ending cut off).
    :returns: True if the stem ends in "dr", or ends in "t" that is not
        part of a double "tt".
    """
    # Compare the last TWO letters: a single character can never equal a
    # two-letter string, which would make both digraph tests dead code.
    tail = base[-2:]
    if tail == "dr":
        return True
    return tail.endswith("t") and tail != "tt"
|
|
|
|
|
|
def T(base):
    # Condition T: the ending comes off only after "s", or after a "t"
    # that is not preceded by "o".
    before_last, last = base[-2:]
    if last == "s":
        return True
    return last == "t" and before_last != "o"
|
|
|
|
|
|
def U(base):
    # Condition U: the ending comes off only when the stem ends in
    # "l", "m", "n" or "r".
    return base[-1] in ("l", "m", "n", "r")
|
|
|
|
|
|
def V(base):
    # Condition V: the ending comes off only when the stem ends in "c".
    final = base[-1]
    return final == "c"
|
|
|
|
|
|
def W(base):
    # Condition W: keep the ending when the stem's final letter is "s" or "u".
    return base[-1] not in ("s", "u")
|
|
|
|
|
|
def X(base):
    """Condition X: remove ending only after l, i or u*e.

    ``u*e`` means "u", then any one letter, then "e" at the end of the stem.
    Unlike condition K there is no minimum stem length here.

    :param base: the candidate stem (the word with the ending cut off).
    :returns: True if the ending may be removed from this stem.
    """
    last = base[-1:]
    if last in ("l", "i"):
        return True
    # Only look three letters back when the stem actually has three letters;
    # a two-letter stem such as "be" (from "bear") cannot match u*e and must
    # not raise IndexError.
    return last == "e" and len(base) > 2 and base[-3] == "u"
|
|
|
|
|
|
def Y(base):
    # Condition Y: the ending comes off only when the stem ends in "in".
    return base.endswith("in")
|
|
|
|
|
|
def Z(base):
    # Condition Z: keep the ending when the stem's final letter is "f".
    final = base[-1]
    return not final == "f"
|
|
|
|
|
|
def a(base):
    # Condition a: the ending comes off only when the stem ends in one of
    # d, f, l, t (single letters) or ph, th, er, or, es (letter pairs).
    final = base[-1]
    if final in ("d", "f", "l", "t"):
        return True
    return base[-2:] in ("ph", "th", "er", "or", "es")
|
|
|
|
|
|
def b(base):
    # Condition b: stem of 3+ characters that does not end in "met" or "ryst".
    if len(base) < 3:
        return False
    return not base.endswith(("met", "ryst"))
|
|
|
|
|
|
def c(base):
    # Condition c: the ending comes off only when the stem ends in "l".
    final = base[-1]
    return final == "l"
|
|
|
|
|
|
# Endings
|
|
|
|
# m[n] maps every removable ending of length n to the condition function
# that the remaining stem must satisfy. Index 0 is never filled in.
m = [None] * 12

m[11] = {
    "alistically": B,
    "arizability": A,
    "izationally": B,
}
m[10] = {
    "antialness": A,
    "arisations": A,
    "arizations": A,
    "entialness": A,
}
m[9] = {
    "allically": C,
    "antaneous": A,
    "antiality": A,
    "arisation": A,
    "arization": A,
    "ationally": B,
    "ativeness": A,
    "eableness": E,
    "entations": A,
    "entiality": A,
    "entialize": A,
    "entiation": A,
    "ionalness": A,
    "istically": A,
    "itousness": A,
    "izability": A,
    "izational": A,
}
m[8] = {
    "ableness": A,
    "arizable": A,
    "entation": A,
    "entially": A,
    "eousness": A,
    "ibleness": A,
    "icalness": A,
    "ionalism": A,
    "ionality": A,
    "ionalize": A,
    "iousness": A,
    "izations": A,
    "lessness": A,
}
m[7] = {
    "ability": A,
    "aically": A,
    "alistic": B,
    "alities": A,
    "ariness": E,
    "aristic": A,
    "arizing": A,
    "ateness": A,
    "atingly": A,
    "ational": B,
    "atively": A,
    "ativism": A,
    "elihood": E,
    "encible": A,
    "entally": A,
    "entials": A,
    "entiate": A,
    "entness": A,
    "fulness": A,
    "ibility": A,
    "icalism": A,
    "icalist": A,
    "icality": A,
    "icalize": A,
    "ication": G,
    "icianry": A,
    "ination": A,
    "ingness": A,
    "ionally": A,
    "isation": A,
    "ishness": A,
    "istical": A,
    "iteness": A,
    "iveness": A,
    "ivistic": A,
    "ivities": A,
    "ization": F,
    "izement": A,
    "oidally": A,
    "ousness": A,
}
m[6] = {
    "aceous": A,
    "acious": B,
    "action": G,
    "alness": A,
    "ancial": A,
    "ancies": A,
    "ancing": B,
    "ariser": A,
    "arized": A,
    "arizer": A,
    "atable": A,
    "ations": B,
    "atives": A,
    "eature": Z,
    "efully": A,
    "encies": A,
    "encing": A,
    "ential": A,
    "enting": C,
    "entist": A,
    "eously": A,
    "ialist": A,
    "iality": A,
    "ialize": A,
    "ically": A,
    "icance": A,
    "icians": A,
    "icists": A,
    "ifully": A,
    "ionals": A,
    "ionate": D,
    "ioning": A,
    "ionist": A,
    "iously": A,
    "istics": A,
    "izable": E,
    "lessly": A,
    "nesses": A,
    "oidism": A,
}
m[5] = {
    "acies": A,
    "acity": A,
    "aging": B,
    "aical": A,
    "alist": A,
    "alism": B,
    "ality": A,
    "alize": A,
    "allic": b,
    "anced": B,
    "ances": B,
    "antic": C,
    "arial": A,
    "aries": A,
    "arily": A,
    "arity": B,
    "arize": A,
    "aroid": A,
    "ately": A,
    "ating": I,
    "ation": B,
    "ative": A,
    "ators": A,
    "atory": A,
    "ature": E,
    "early": Y,
    "ehood": A,
    "eless": A,
    "elily": A,
    "ement": A,
    "enced": A,
    "ences": A,
    "eness": E,
    "ening": E,
    "ental": A,
    "ented": C,
    "ently": A,
    "fully": A,
    "ially": A,
    "icant": A,
    "ician": A,
    "icide": A,
    "icism": A,
    "icist": A,
    "icity": A,
    "idine": I,
    "iedly": A,
    "ihood": A,
    "inate": A,
    "iness": A,
    "ingly": B,
    "inism": J,
    "inity": c,
    "ional": A,
    "ioned": A,
    "ished": A,
    "istic": A,
    "ities": A,
    "itous": A,
    "ively": A,
    "ivity": A,
    "izers": F,
    "izing": F,
    "oidal": A,
    "oides": A,
    "otide": A,
    "ously": A,
}
m[4] = {
    "able": A,
    "ably": A,
    "ages": B,
    "ally": B,
    "ance": B,
    "ancy": B,
    "ants": B,
    "aric": A,
    "arly": K,
    "ated": I,
    "ates": A,
    "atic": B,
    "ator": A,
    "ealy": Y,
    "edly": E,
    "eful": A,
    "eity": A,
    "ence": A,
    "ency": A,
    "ened": E,
    "enly": E,
    "eous": A,
    "hood": A,
    "ials": A,
    "ians": A,
    "ible": A,
    "ibly": A,
    "ical": A,
    "ides": L,
    "iers": A,
    "iful": A,
    "ines": M,
    "ings": N,
    "ions": B,
    "ious": A,
    "isms": B,
    "ists": A,
    "itic": H,
    "ized": F,
    "izer": F,
    "less": A,
    "lily": A,
    "ness": A,
    "ogen": A,
    "ward": A,
    "wise": A,
    "ying": B,
    "yish": A,
}
m[3] = {
    "acy": A,
    "age": B,
    "aic": A,
    "als": b,
    "ant": B,
    "ars": O,
    "ary": F,
    "ata": A,
    "ate": A,
    "eal": Y,
    "ear": Y,
    "ely": E,
    "ene": E,
    "ent": C,
    "ery": E,
    "ese": A,
    "ful": A,
    "ial": A,
    "ian": A,
    "ics": A,
    "ide": L,
    "ied": A,
    "ier": A,
    "ies": P,
    "ily": A,
    "ine": M,
    "ing": N,
    "ion": Q,
    "ish": C,
    "ism": B,
    "ist": A,
    "ite": a,
    "ity": A,
    "ium": A,
    "ive": A,
    "ize": F,
    "oid": A,
    "one": R,
    "ous": A,
}
m[2] = {
    "ae": A,
    "al": b,
    "ar": X,
    "as": B,
    "ed": E,
    "en": F,
    "es": E,
    "ia": A,
    "ic": A,
    "is": A,
    "ly": B,
    "on": S,
    "or": T,
    "um": U,
    "us": V,
    "yl": R,
    "s'": A,
    "'s": A,
}
m[1] = {
    "a": A,
    "e": A,
    "i": A,
    "o": A,
    "s": W,
    "y": B,
}
|
|
|
|
|
|
def remove_ending(word):
    """Strips the longest known ending whose condition accepts the stem.

    Endings are tried from 11 characters down to 1. An ending is only
    considered when more than one character would be left in front of it,
    and it is only removed when the condition function registered for it
    in ``m`` returns a true value for the remaining stem.

    :param word: the word to strip.
    :returns: the stem, or ``word`` unchanged if no ending qualifies.
    """
    for size in range(11, 0, -1):
        cut = len(word) - size
        if cut < 2:
            # Fewer than two characters would remain; try a shorter ending.
            continue
        condition = m[size].get(word[cut:])
        if condition and condition(word[:cut]):
            return word[:cut]
    return word
|
|
|
|
|
|
_endings = (("iev", "ief"),
|
|
("uct", "uc"),
|
|
("iev", "ief"),
|
|
("uct", "uc"),
|
|
("umpt", "um"),
|
|
("rpt", "rb"),
|
|
("urs", "ur"),
|
|
("istr", "ister"),
|
|
("metr", "meter"),
|
|
("olv", "olut"),
|
|
("ul", "l", "aoi"),
|
|
("bex", "bic"),
|
|
("dex", "dic"),
|
|
("pex", "pic"),
|
|
("tex", "tic"),
|
|
("ax", "ac"),
|
|
("ex", "ec"),
|
|
("ix", "ic"),
|
|
("lux", "luc"),
|
|
("uad", "uas"),
|
|
("vad", "vas"),
|
|
("cid", "cis"),
|
|
("lid", "lis"),
|
|
("erid", "eris"),
|
|
("pand", "pans"),
|
|
("end", "ens", "s"),
|
|
("ond", "ons"),
|
|
("lud", "lus"),
|
|
("rud", "rus"),
|
|
("her", "hes", "pt"),
|
|
("mit", "mis"),
|
|
("ent", "ens", "m"),
|
|
("ert", "ers"),
|
|
("et", "es", "n"),
|
|
("yt", "ys"),
|
|
("yz", "ys"))
|
|
|
|
|
|
# Hash the ending rules by the last letter of the target ending
|
|
_endingrules = defaultdict(list)
|
|
for rule in _endings:
|
|
_endingrules[rule[0][-1]].append(rule)
|
|
|
|
_doubles = frozenset(("dd", "gg", "ll", "mm", "nn", "pp", "rr", "ss", "tt"))
|
|
|
|
|
|
def fix_ending(word):
|
|
if word[-2:] in _doubles:
|
|
word = word[:-1]
|
|
|
|
for endingrule in _endingrules[word[-1]]:
|
|
target, newend = endingrule[:2]
|
|
if word.endswith(target):
|
|
if len(endingrule) > 2:
|
|
exceptafter = endingrule[2]
|
|
c = word[0 - (len(target) + 1)]
|
|
if c in exceptafter:
|
|
return word
|
|
|
|
return word[:0 - len(target)] + newend
|
|
|
|
return word
|
|
|
|
|
|
def stem(word):
    """Returns the stemmed version of the argument string.

    The ending is stripped with ``remove_ending()`` and the result is then
    respelled with ``fix_ending()``.
    """
    stripped = remove_ending(word)
    return fix_ending(stripped)
|