microproduct/atmosphericDelay/ISCEApp/site-packages/whoosh/lang/snowball/english.py

466 lines
17 KiB
Python

from .bases import _StandardStemmer
from whoosh.compat import u
class EnglishStemmer(_StandardStemmer):
    """
    The English Snowball stemmer.

    :cvar __vowels: The English vowels.
    :type __vowels: unicode
    :cvar __double_consonants: The English double consonants.
    :type __double_consonants: tuple
    :cvar __li_ending: Letters that may directly appear before a word
        final 'li'.
    :type __li_ending: unicode
    :cvar __step0_suffixes: Suffixes to be deleted in step 0 of the algorithm.
    :type __step0_suffixes: tuple
    :cvar __step1a_suffixes: Suffixes to be deleted in step 1a of the
        algorithm.
    :type __step1a_suffixes: tuple
    :cvar __step1b_suffixes: Suffixes to be deleted in step 1b of the
        algorithm.
    :type __step1b_suffixes: tuple
    :cvar __step2_suffixes: Suffixes to be deleted in step 2 of the algorithm.
    :type __step2_suffixes: tuple
    :cvar __step3_suffixes: Suffixes to be deleted in step 3 of the algorithm.
    :type __step3_suffixes: tuple
    :cvar __step4_suffixes: Suffixes to be deleted in step 4 of the algorithm.
    :type __step4_suffixes: tuple
    :cvar __step5_suffixes: Suffixes to be deleted in step 5 of the algorithm.
    :type __step5_suffixes: tuple
    :cvar __special_words: A dictionary containing words
        which have to be stemmed specially.
    :type __special_words: dict
    :note: A detailed description of the English
        stemming algorithm can be found under
        http://snowball.tartarus.org/algorithms/english/stemmer.html
    """

    __vowels = "aeiouy"
    __double_consonants = ("bb", "dd", "ff", "gg", "mm", "nn",
                           "pp", "rr", "tt")
    __li_ending = "cdeghkmnrt"

    # Every suffix tuple below is ordered so that a longer suffix is tried
    # before any shorter suffix it ends with; the step loops in stem() stop
    # at the first match, which is therefore the longest match.
    __step0_suffixes = ("'s'", "'s", "'")
    __step1a_suffixes = ("sses", "ied", "ies", "us", "ss", "s")
    __step1b_suffixes = ("eedly", "ingly", "edly", "eed", "ing", "ed")
    __step2_suffixes = ('ization', 'ational', 'fulness', 'ousness',
                        'iveness', 'tional', 'biliti', 'lessli',
                        'entli', 'ation', 'alism', 'aliti', 'ousli',
                        'iviti', 'fulli', 'enci', 'anci', 'abli',
                        'izer', 'ator', 'alli', 'bli', 'ogi', 'li')
    __step3_suffixes = ('ational', 'tional', 'alize', 'icate', 'iciti',
                        'ative', 'ical', 'ness', 'ful')
    __step4_suffixes = ('ement', 'ance', 'ence', 'able', 'ible', 'ment',
                        'ant', 'ent', 'ism', 'ate', 'iti', 'ous',
                        'ive', 'ize', 'ion', 'al', 'er', 'ic')
    # Not referenced by stem(): step 5 tests for "e" and "l" directly.
    __step5_suffixes = ("e", "l")

    # Exceptional forms: looked up on the lower-cased input and returned
    # verbatim, bypassing every step of the algorithm.
    __special_words = {"skis": "ski",
                       "skies": "sky",
                       "dying": "die",
                       "lying": "lie",
                       "tying": "tie",
                       "idly": "idl",
                       "gently": "gentl",
                       "ugly": "ugli",
                       "early": "earli",
                       "only": "onli",
                       "singly": "singl",
                       "sky": "sky",
                       "news": "news",
                       "howe": "howe",
                       "atlas": "atlas",
                       "cosmos": "cosmos",
                       "bias": "bias",
                       "andes": "andes",
                       "inning": "inning",
                       "innings": "inning",
                       "outing": "outing",
                       "outings": "outing",
                       "canning": "canning",
                       "cannings": "canning",
                       "herring": "herring",
                       "herrings": "herring",
                       "earring": "earring",
                       "earrings": "earring",
                       "proceed": "proceed",
                       "proceeds": "proceed",
                       "proceeded": "proceed",
                       "proceeding": "proceed",
                       "exceed": "exceed",
                       "exceeds": "exceed",
                       "exceeded": "exceed",
                       "exceeding": "exceed",
                       "succeed": "succeed",
                       "succeeds": "succeed",
                       "succeeded": "succeed",
                       "succeeding": "succeed"}

    def stem(self, word):
        """
        Stem an English word and return the stemmed form.

        The word is lower-cased first.  Words of two letters or less are
        returned as they are, and words listed in the special-word table
        are mapped directly to their stored stem.  Every other word goes
        through steps 0 to 5 of the algorithm, during which ``r1`` and
        ``r2`` are kept in sync with the edits applied to the word.

        :param word: The word that is stemmed.
        :type word: str or unicode
        :return: The stemmed form.
        :rtype: unicode
        """
        word = word.lower()

        # The Snowball English algorithm leaves words of two letters or
        # less untouched.  Without this guard a lone apostrophe would be
        # stemmed to the empty string and tokens such as "'s" would lose
        # their apostrophe.
        if len(word) <= 2:
            return word

        if word in self.__special_words:
            return self.__special_words[word]

        # Map the different apostrophe characters to a single consistent one
        word = (word.replace(u("\u2019"), u("\x27"))
                    .replace(u("\u2018"), u("\x27"))
                    .replace(u("\u201B"), u("\x27")))

        # Drop a single leading apostrophe.
        if word.startswith(u("\x27")):
            word = word[1:]

        # Mark an initial 'y' and every 'y' that follows a vowel as 'Y'.
        # 'Y' is not in __vowels, so marked letters are treated as
        # consonants until they are lower-cased again at the very end.
        if word.startswith("y"):
            word = "".join(("Y", word[1:]))

        for i in range(1, len(word)):
            if word[i - 1] in self.__vowels and word[i] == "y":
                word = "".join((word[:i], "Y", word[i + 1:]))

        step1a_vowel_found = False
        step1b_vowel_found = False

        r1 = ""
        r2 = ""

        if word.startswith(("gener", "commun", "arsen")):
            # For these prefixes R1 starts right after the prefix.
            if word.startswith(("gener", "arsen")):
                r1 = word[5:]
            else:
                r1 = word[6:]

            # R2 starts after the first non-vowel that follows a vowel
            # inside R1.
            for i in range(1, len(r1)):
                if r1[i] not in self.__vowels and r1[i - 1] in self.__vowels:
                    r2 = r1[i + 1:]
                    break
        else:
            # Inherited helper; presumably returns the standard (R1, R2)
            # regions of the word -- confirm against _StandardStemmer.
            r1, r2 = self._r1r2_standard(word, self.__vowels)

        # STEP 0: strip a trailing apostrophe form.
        for suffix in self.__step0_suffixes:
            if word.endswith(suffix):
                word = word[:-len(suffix)]
                r1 = r1[:-len(suffix)]
                r2 = r2[:-len(suffix)]
                break

        # STEP 1a: plural-like endings.  "us" and "ss" match but are left
        # unchanged; they only stop the shorter "s" from being tried.
        for suffix in self.__step1a_suffixes:
            if word.endswith(suffix):

                if suffix == "sses":
                    word = word[:-2]
                    r1 = r1[:-2]
                    r2 = r2[:-2]

                elif suffix in ("ied", "ies"):
                    # Keep "i" when more than one letter precedes the
                    # suffix, otherwise keep "ie".
                    if len(word[:-len(suffix)]) > 1:
                        word = word[:-2]
                        r1 = r1[:-2]
                        r2 = r2[:-2]
                    else:
                        word = word[:-1]
                        r1 = r1[:-1]
                        r2 = r2[:-1]

                elif suffix == "s":
                    # Delete the "s" only if a vowel occurs somewhere
                    # before the letter directly preceding it.
                    for letter in word[:-2]:
                        if letter in self.__vowels:
                            step1a_vowel_found = True
                            break

                    if step1a_vowel_found:
                        word = word[:-1]
                        r1 = r1[:-1]
                        r2 = r2[:-1]
                break

        # STEP 1b
        for suffix in self.__step1b_suffixes:
            if word.endswith(suffix):
                if suffix in ("eed", "eedly"):

                    # Replace by "ee" only when the suffix lies in R1.
                    if r1.endswith(suffix):
                        word = "".join((word[:-len(suffix)], "ee"))

                        if len(r1) >= len(suffix):
                            r1 = "".join((r1[:-len(suffix)], "ee"))
                        else:
                            r1 = ""

                        if len(r2) >= len(suffix):
                            r2 = "".join((r2[:-len(suffix)], "ee"))
                        else:
                            r2 = ""
                else:
                    # "ed", "edly", "ing", "ingly": delete only if the
                    # part of the word before the suffix contains a vowel.
                    for letter in word[:-len(suffix)]:
                        if letter in self.__vowels:
                            step1b_vowel_found = True
                            break

                    if step1b_vowel_found:
                        word = word[:-len(suffix)]
                        r1 = r1[:-len(suffix)]
                        r2 = r2[:-len(suffix)]

                        if word.endswith(("at", "bl", "iz")):
                            word = "".join((word, "e"))
                            r1 = "".join((r1, "e"))

                            # NOTE(review): length-based heuristic deciding
                            # whether the appended "e" also belongs to R2;
                            # kept exactly as in the original code.
                            if len(word) > 5 or len(r1) >= 3:
                                r2 = "".join((r2, "e"))

                        elif word.endswith(self.__double_consonants):
                            # Undouble the final consonant.
                            word = word[:-1]
                            r1 = r1[:-1]
                            r2 = r2[:-1]

                        elif ((r1 == "" and len(word) >= 3 and
                               word[-1] not in self.__vowels and
                               word[-1] not in "wxY" and
                               word[-2] in self.__vowels and
                               word[-3] not in self.__vowels)
                              or
                              (r1 == "" and len(word) == 2 and
                               word[0] in self.__vowels and
                               word[1] not in self.__vowels)):
                            # Short word (empty R1 and ending in a short
                            # syllable): append "e".
                            word = "".join((word, "e"))

                            if len(r1) > 0:
                                r1 = "".join((r1, "e"))

                            if len(r2) > 0:
                                r2 = "".join((r2, "e"))
                break

        # STEP 1c: a final y/Y after a non-vowel becomes "i", provided the
        # word has more than two letters.
        if (len(word) > 2
                and word[-1] in "yY"
                and word[-2] not in self.__vowels):
            word = "".join((word[:-1], "i"))
            if len(r1) >= 1:
                r1 = "".join((r1[:-1], "i"))
            else:
                r1 = ""

            if len(r2) >= 1:
                r2 = "".join((r2[:-1], "i"))
            else:
                r2 = ""

        # STEP 2: the first (longest) matching suffix is acted on only if
        # it lies in R1; the search stops at that suffix either way.
        for suffix in self.__step2_suffixes:
            if word.endswith(suffix):
                if r1.endswith(suffix):
                    if suffix == "tional":
                        word = word[:-2]
                        r1 = r1[:-2]
                        r2 = r2[:-2]

                    elif suffix in ("enci", "anci", "abli"):
                        word = "".join((word[:-1], "e"))

                        if len(r1) >= 1:
                            r1 = "".join((r1[:-1], "e"))
                        else:
                            r1 = ""

                        if len(r2) >= 1:
                            r2 = "".join((r2[:-1], "e"))
                        else:
                            r2 = ""

                    elif suffix == "entli":
                        word = word[:-2]
                        r1 = r1[:-2]
                        r2 = r2[:-2]

                    elif suffix in ("izer", "ization"):
                        word = "".join((word[:-len(suffix)], "ize"))

                        if len(r1) >= len(suffix):
                            r1 = "".join((r1[:-len(suffix)], "ize"))
                        else:
                            r1 = ""

                        if len(r2) >= len(suffix):
                            r2 = "".join((r2[:-len(suffix)], "ize"))
                        else:
                            r2 = ""

                    elif suffix in ("ational", "ation", "ator"):
                        word = "".join((word[:-len(suffix)], "ate"))

                        if len(r1) >= len(suffix):
                            r1 = "".join((r1[:-len(suffix)], "ate"))
                        else:
                            r1 = ""

                        if len(r2) >= len(suffix):
                            r2 = "".join((r2[:-len(suffix)], "ate"))
                        else:
                            # NOTE(review): falls back to "e" rather than
                            # ""; presumably so that step 5 can still drop
                            # the final "e" of the new "ate" -- confirm.
                            r2 = "e"

                    elif suffix in ("alism", "aliti", "alli"):
                        word = "".join((word[:-len(suffix)], "al"))

                        if len(r1) >= len(suffix):
                            r1 = "".join((r1[:-len(suffix)], "al"))
                        else:
                            r1 = ""

                        if len(r2) >= len(suffix):
                            r2 = "".join((r2[:-len(suffix)], "al"))
                        else:
                            r2 = ""

                    elif suffix == "fulness":
                        word = word[:-4]
                        r1 = r1[:-4]
                        r2 = r2[:-4]

                    elif suffix in ("ousli", "ousness"):
                        word = "".join((word[:-len(suffix)], "ous"))

                        if len(r1) >= len(suffix):
                            r1 = "".join((r1[:-len(suffix)], "ous"))
                        else:
                            r1 = ""

                        if len(r2) >= len(suffix):
                            r2 = "".join((r2[:-len(suffix)], "ous"))
                        else:
                            r2 = ""

                    elif suffix in ("iveness", "iviti"):
                        word = "".join((word[:-len(suffix)], "ive"))

                        if len(r1) >= len(suffix):
                            r1 = "".join((r1[:-len(suffix)], "ive"))
                        else:
                            r1 = ""

                        if len(r2) >= len(suffix):
                            r2 = "".join((r2[:-len(suffix)], "ive"))
                        else:
                            # NOTE(review): same "e" fallback as for the
                            # "ate" replacements above -- confirm.
                            r2 = "e"

                    elif suffix in ("biliti", "bli"):
                        word = "".join((word[:-len(suffix)], "ble"))

                        if len(r1) >= len(suffix):
                            r1 = "".join((r1[:-len(suffix)], "ble"))
                        else:
                            r1 = ""

                        if len(r2) >= len(suffix):
                            r2 = "".join((r2[:-len(suffix)], "ble"))
                        else:
                            r2 = ""

                    elif suffix == "ogi" and word[-4] == "l":
                        # "logi" -> "log"
                        word = word[:-1]
                        r1 = r1[:-1]
                        r2 = r2[:-1]

                    elif suffix in ("fulli", "lessli"):
                        word = word[:-2]
                        r1 = r1[:-2]
                        r2 = r2[:-2]

                    elif suffix == "li" and word[-3] in self.__li_ending:
                        # Delete "li" only after a valid li-ending letter.
                        word = word[:-2]
                        r1 = r1[:-2]
                        r2 = r2[:-2]
                break

        # STEP 3: same matching scheme as step 2 (suffix must lie in R1).
        for suffix in self.__step3_suffixes:
            if word.endswith(suffix):
                if r1.endswith(suffix):
                    if suffix == "tional":
                        word = word[:-2]
                        r1 = r1[:-2]
                        r2 = r2[:-2]

                    elif suffix == "ational":
                        word = "".join((word[:-len(suffix)], "ate"))

                        if len(r1) >= len(suffix):
                            r1 = "".join((r1[:-len(suffix)], "ate"))
                        else:
                            r1 = ""

                        if len(r2) >= len(suffix):
                            r2 = "".join((r2[:-len(suffix)], "ate"))
                        else:
                            r2 = ""

                    elif suffix == "alize":
                        word = word[:-3]
                        r1 = r1[:-3]
                        r2 = r2[:-3]

                    elif suffix in ("icate", "iciti", "ical"):
                        word = "".join((word[:-len(suffix)], "ic"))

                        if len(r1) >= len(suffix):
                            r1 = "".join((r1[:-len(suffix)], "ic"))
                        else:
                            r1 = ""

                        if len(r2) >= len(suffix):
                            r2 = "".join((r2[:-len(suffix)], "ic"))
                        else:
                            r2 = ""

                    elif suffix in ("ful", "ness"):
                        word = word[:-len(suffix)]
                        r1 = r1[:-len(suffix)]
                        r2 = r2[:-len(suffix)]

                    elif suffix == "ative" and r2.endswith(suffix):
                        # "ative" must additionally lie in R2.
                        word = word[:-5]
                        r1 = r1[:-5]
                        r2 = r2[:-5]
                break

        # STEP 4: delete the first (longest) matching suffix if it lies
        # in R2.
        for suffix in self.__step4_suffixes:
            if word.endswith(suffix):
                if r2.endswith(suffix):
                    if suffix == "ion":
                        # "ion" is deleted only after "s" or "t".
                        if word[-4] in "st":
                            word = word[:-3]
                            r1 = r1[:-3]
                            r2 = r2[:-3]
                    else:
                        word = word[:-len(suffix)]
                        r1 = r1[:-len(suffix)]
                        r2 = r2[:-len(suffix)]
                break

        # STEP 5: drop a final "l" in R2 that follows another "l", or a
        # final "e" that is in R2, or in R1 when the letters before it do
        # not match the consonant-vowel-consonant pattern tested below.
        if r2.endswith("l") and word[-2] == "l":
            word = word[:-1]
        elif r2.endswith("e"):
            word = word[:-1]
        elif r1.endswith("e"):
            if len(word) >= 4 and (word[-2] in self.__vowels or
                                   word[-2] in "wxY" or
                                   word[-3] not in self.__vowels or
                                   word[-4] in self.__vowels):
                word = word[:-1]

        # Undo the consonant marking applied before step 0.
        word = word.replace("Y", "y")

        return word