243 lines
6.6 KiB
Python
243 lines
6.6 KiB
Python
"""This module contains an object that implements the Paice-Husk stemming
|
|
algorithm.
|
|
|
|
If you just want to use the standard Paice-Husk stemming rules, use the
|
|
module's ``stem()`` function::
|
|
|
|
stemmed_word = stem(word)
|
|
|
|
If you want to use a custom rule set, read the rules into a string where the
|
|
rules are separated by newlines, and instantiate the object with the string,
|
|
then use the object's stem method to stem words::
|
|
|
|
stemmer = PaiceHuskStemmer(my_rules_string)
|
|
stemmed_word = stemmer.stem(word)
|
|
"""
|
|
|
|
import re
|
|
from collections import defaultdict
|
|
|
|
|
|
class PaiceHuskStemmer(object):
    """Implements the Paice-Husk stemming algorithm.

    The stemmer is driven by a rule table. Parsed rules are stored in
    ``self.rules``, a dictionary that maps the last letter of a suffix to the
    list of ``(ending, intact, num, append, cont)`` tuples for rules whose
    suffix ends with that letter, in the order they appear in the table.
    """

    # One rule per line: the suffix written back to front, an optional "*"
    # (rule only applies to a word that has not been modified yet), the
    # number of trailing characters to remove, an optional string to append,
    # and a terminator ("." stops stemming, ">" keeps going). The pattern is
    # only anchored at the start, so anything after the terminator (such as
    # a "{ ... }" remark) is ignored.
    rule_expr = re.compile(r"""
    ^(?P<ending>\w+)
    (?P<intact>[*]?)
    (?P<num>\d+)
    (?P<append>\w*)
    (?P<cont>[.>])
    """, re.UNICODE | re.VERBOSE)

    # Leading run of word characters; only this part of the input is stemmed.
    # Raw string so that "\w" is not treated as a string escape sequence.
    stem_expr = re.compile(r"^\w+", re.UNICODE)

    # Prefixes removed by strip_prefix(), tried in this order.
    _prefixes = ("kilo", "micro", "milli", "intra", "ultra", "mega",
                 "nano", "pico", "pseudo")

    def __init__(self, ruletable):
        """
        :param ruletable: a string containing the rule data, separated
            by newlines.
        """
        self.rules = defaultdict(list)
        self.read_rules(ruletable)

    def read_rules(self, ruletable):
        """Parses the rules in ``ruletable`` and adds them to ``self.rules``.

        :param ruletable: a string containing one rule per line. Blank lines
            are skipped.
        :raises ValueError: if a non-blank line does not start with a valid
            rule.
        """
        rule_expr = self.rule_expr
        rules = self.rules

        for line in ruletable.split("\n"):
            line = line.strip()
            if not line:
                continue

            match = rule_expr.match(line)
            if not match:
                raise ValueError("Bad rule: %r" % line)

            # The table stores endings reversed; flip them back so they can
            # be compared with str.endswith().
            ending = match.group("ending")[::-1]
            lastchar = ending[-1]
            intact = match.group("intact") == "*"
            num = int(match.group("num"))
            append = match.group("append")
            cont = match.group("cont") == ">"

            rules[lastchar].append((ending, intact, num, append, cont))

    def first_vowel(self, word):
        """Returns the index of the first vowel in ``word``.

        The letters "aeiou" are vowels. The first "y" in the word also counts
        as a vowel, unless it is the first letter of the word.

        :returns: the index of the first vowel, or -1 if the word contains
            no vowel.
        """
        positions = [p for p in [word.find(v) for v in "aeiou"] if p > -1]
        yp = word.find("y")

        if not positions:
            # min() of an empty list raises ValueError, so words such as
            # "sky" or "brrr" have to be handled separately.
            if yp > 0:
                return yp
            return -1

        vp = min(positions)
        if yp > 0 and yp < vp:
            return yp
        return vp

    def strip_prefix(self, word):
        """Returns ``word`` with the first matching prefix removed, or
        ``word`` itself if it does not start with any known prefix.
        """
        for prefix in self._prefixes:
            if word.startswith(prefix):
                return word[len(prefix):]
        return word

    def stem(self, word):
        """Returns a stemmed version of the argument string.

        Only the leading run of word characters is stemmed. If ``word`` does
        not start with a word character it is returned unchanged.
        """
        rules = self.rules
        match = self.stem_expr.match(word)
        if not match:
            return word

        base = match.group(0)
        # A word that consists of nothing but a prefix (e.g. "kilo") would
        # be stripped to the empty string; keep the whole word in that case.
        stem = self.strip_prefix(base) or base

        is_intact = True
        continuing = True
        while continuing:
            pfv = self.first_vowel(stem)
            if pfv < 0:
                # No vowel at all, so no shortened form is acceptable.
                break

            rulelist = rules.get(stem[-1])
            if not rulelist:
                break

            continuing = False
            for ending, intact, num, append, cont in rulelist:
                if not stem.endswith(ending):
                    continue
                if intact and not is_intact:
                    continue

                newlen = len(stem) - num + len(append)
                if ((pfv == 0 and newlen < 2)
                        or (pfv > 0 and newlen < 3)):
                    # If word starts with vowel, minimum stem length is 2.
                    # If word starts with consonant, minimum stem length is
                    # 3.
                    continue

                is_intact = False
                # A "protect" rule removes zero characters. stem[:-0] is
                # stem[:0], i.e. the empty string, so only slice when there
                # is something to remove.
                if num:
                    stem = stem[:-num]
                stem = stem + append

                continuing = cont
                break

        return stem
|
|
|
|
# The default rules for the Paice-Husk stemming algorithm.
#
# One rule per line. Each rule starts with the suffix spelled backwards,
# optionally followed by "*" (only apply to an unmodified word), then the
# number of characters to remove, an optional replacement string, and a
# terminator: "." ends stemming, ">" allows further rules to be applied.
# The text in braces is a human-readable remark that the parser ignores.
# Every non-blank line in this string must be a rule; anything else is
# rejected when the table is loaded.

defaultrules = """
ai*2. { -ia > - if intact }
a*1. { -a > - if intact }
bb1. { -bb > -b }
city3s. { -ytic > -ys }
ci2> { -ic > - }
cn1t> { -nc > -nt }
dd1. { -dd > -d }
dei3y> { -ied > -y }
deec2ss. { -ceed > -cess }
dee1. { -eed > -ee }
de2> { -ed > - }
dooh4> { -hood > - }
e1> { -e > - }
feil1v. { -lief > -liev }
fi2> { -if > - }
gni3> { -ing > - }
gai3y. { -iag > -y }
ga2> { -ag > - }
gg1. { -gg > -g }
ht*2. { -th > - if intact }
hsiug5ct. { -guish > -ct }
hsi3> { -ish > - }
i*1. { -i > - if intact }
i1y> { -i > -y }
ji1d. { -ij > -id -- see nois4j> & vis3j> }
juf1s. { -fuj > -fus }
ju1d. { -uj > -ud }
jo1d. { -oj > -od }
jeh1r. { -hej > -her }
jrev1t. { -verj > -vert }
jsim2t. { -misj > -mit }
jn1d. { -nj > -nd }
j1s. { -j > -s }
lbaifi6. { -ifiabl > - }
lbai4y. { -iabl > -y }
lba3> { -abl > - }
lbi3. { -ibl > - }
lib2l> { -bil > -bl }
lc1. { -cl > c }
lufi4y. { -iful > -y }
luf3> { -ful > - }
lu2. { -ul > - }
lai3> { -ial > - }
lau3> { -ual > - }
la2> { -al > - }
ll1. { -ll > -l }
mui3. { -ium > - }
mu*2. { -um > - if intact }
msi3> { -ism > - }
mm1. { -mm > -m }
nois4j> { -sion > -j }
noix4ct. { -xion > -ct }
noi3> { -ion > - }
nai3> { -ian > - }
na2> { -an > - }
nee0. { protect -een }
ne2> { -en > - }
nn1. { -nn > -n }
pihs4> { -ship > - }
pp1. { -pp > -p }
re2> { -er > - }
rae0. { protect -ear }
ra2. { -ar > - }
ro2> { -or > - }
ru2> { -ur > - }
rr1. { -rr > -r }
rt1> { -tr > -t }
rei3y> { -ier > -y }
sei3y> { -ies > -y }
sis2. { -sis > -s }
si2> { -is > - }
ssen4> { -ness > - }
ss0. { protect -ss }
suo3> { -ous > - }
su*2. { -us > - if intact }
s*1> { -s > - if intact }
s0. { -s > -s }
tacilp4y. { -plicat > -ply }
ta2> { -at > - }
tnem4> { -ment > - }
tne3> { -ent > - }
tna3> { -ant > - }
tpir2b. { -ript > -rib }
tpro2b. { -orpt > -orb }
tcud1. { -duct > -duc }
tpmus2. { -sumpt > -sum }
tpec2iv. { -cept > -ceiv }
tulo2v. { -olut > -olv }
tsis0. { protect -sist }
tsi3> { -ist > - }
tt1. { -tt > -t }
uqi3. { -iqu > - }
ugo1. { -ogu > -og }
vis3j> { -siv > -j }
vie0. { protect -eiv }
vi2> { -iv > - }
ylb1> { -bly > -bl }
yli3y> { -ily > -y }
ylp0. { protect -ply }
yl2> { -ly > - }
ygo1. { -ogy > -og }
yhp1. { -phy > -ph }
ymo1. { -omy > -om }
ypo1. { -opy > -op }
yti3> { -ity > - }
yte3> { -ety > - }
ytl2. { -lty > -l }
yrtsi5. { -istry > - }
yra3> { -ary > - }
yro3> { -ory > - }
yfi3. { -ify > - }
ycn2t> { -ncy > -nt }
yca3> { -acy > - }
zi2> { -iz > - }
zy1s. { -yz > -ys }
"""
|
|
|
|
# Build one shared stemmer from the default rule table and expose its bound
# ``stem`` method as the module-level ``stem()`` function.

_default_stemmer = PaiceHuskStemmer(defaultrules)
stem = _default_stemmer.stem
|