# File listing metadata: 258 lines, 12 KiB, Python
from .bases import _StandardStemmer
|
|
|
|
from whoosh.compat import u
|
|
|
|
|
|
class RomanianStemmer(_StandardStemmer):

    """
    The Romanian Snowball stemmer.

    All non-ASCII literals are passed through ``u()`` so that every suffix
    compared against the word has the same text type as the suffix tables.

    :cvar __vowels: The Romanian vowels.
    :type __vowels: unicode
    :cvar __step0_suffixes: Suffixes to be deleted in step 0 of the algorithm.
    :type __step0_suffixes: tuple
    :cvar __step1_suffixes: Suffixes to be deleted in step 1 of the algorithm.
    :type __step1_suffixes: tuple
    :cvar __step2_suffixes: Suffixes to be deleted in step 2 of the algorithm.
    :type __step2_suffixes: tuple
    :cvar __step3_suffixes: Suffixes to be deleted in step 3 of the algorithm.
    :type __step3_suffixes: tuple
    :cvar __step4_suffixes: Final vowels to be deleted in step 4.
    :type __step4_suffixes: tuple
    :cvar __step0_delete: Step 0 suffixes that are removed outright.
    :type __step0_delete: tuple
    :cvar __step0_to_e: Step 0 suffixes that are replaced by "e".
    :type __step0_to_e: tuple
    :cvar __step0_to_i: Step 0 suffixes that are replaced by "i".
    :type __step0_to_i: tuple
    :cvar __step0_drop_last: Step 0 suffixes that lose their last letter.
    :type __step0_drop_last: tuple
    :cvar __step1_to_abil: Step 1 suffixes that are replaced by "abil".
    :type __step1_to_abil: tuple
    :cvar __step1_to_iv: Step 1 suffixes that are replaced by "iv".
    :type __step1_to_iv: tuple
    :cvar __step1_to_ic: Step 1 suffixes that are replaced by "ic".
    :type __step1_to_ic: tuple
    :cvar __step1_to_at: Step 1 suffixes that are replaced by "at".
    :type __step1_to_at: tuple
    :cvar __step1_to_it: Step 1 suffixes that are replaced by "it".
    :type __step1_to_it: tuple
    :cvar __step2_to_ist: Step 2 suffixes that are replaced by "ist".
    :type __step2_to_ist: tuple
    :cvar __step3_plain: Step 3 suffixes that are deleted without looking
                         at the preceding letter.
    :type __step3_plain: tuple
    :cvar __step3_guard_vowels: Letters that, when found directly before
                                the suffix in RV, prevent deletion of the
                                remaining step 3 suffixes.
    :type __step3_guard_vowels: unicode
    :note: A detailed description of the Romanian
           stemming algorithm can be found under
           http://snowball.tartarus.org/algorithms/romanian/stemmer.html

    """

    __vowels = u("aeiou\u0103\xE2\xEE")

    # The suffix tables are scanned in order and scanning stops at the first
    # suffix the word ends with, so the order of the entries matters.
    __step0_suffixes = ('iilor', 'ului', 'elor', 'iile', 'ilor',
                        'atei', u('a\u0163ie'), u('a\u0163ia'), 'aua',
                        'ele', 'iua', 'iei', 'ile', 'ul', 'ea',
                        'ii')
    __step1_suffixes = ('abilitate', 'abilitati', u('abilit\u0103\u0163i'),
                        'ibilitate', u('abilit\u0103i'), 'ivitate',
                        'ivitati', u('ivit\u0103\u0163i'), 'icitate',
                        'icitati', u('icit\u0103\u0163i'), 'icatori',
                        u('ivit\u0103i'), u('icit\u0103i'), 'icator',
                        u('a\u0163iune'), 'atoare', u('\u0103toare'),
                        u('i\u0163iune'), 'itoare', 'iciva', 'icive',
                        'icivi', u('iciv\u0103'), 'icala', 'icale',
                        'icali', u('ical\u0103'), 'ativa', 'ative',
                        'ativi', u('ativ\u0103'), 'atori', u('\u0103tori'),
                        'itiva', 'itive', 'itivi', u('itiv\u0103'),
                        'itori', 'iciv', 'ical', 'ativ', 'ator',
                        u('\u0103tor'), 'itiv', 'itor')
    __step2_suffixes = ('abila', 'abile', 'abili', u('abil\u0103'),
                        'ibila', 'ibile', 'ibili', u('ibil\u0103'),
                        'atori', 'itate', 'itati', u('it\u0103\u0163i'),
                        'abil', 'ibil', 'oasa', u('oas\u0103'), 'oase',
                        'anta', 'ante', 'anti', u('ant\u0103'), 'ator',
                        u('it\u0103i'), 'iune', 'iuni', 'isme', 'ista',
                        'iste', 'isti', u('ist\u0103'), u('i\u015Fti'),
                        'ata', u('at\u0103'), 'ati', 'ate', 'uta',
                        u('ut\u0103'), 'uti', 'ute', 'ita', u('it\u0103'),
                        'iti', 'ite', 'ica', 'ice', 'ici', u('ic\u0103'),
                        'osi', u('o\u015Fi'), 'ant', 'iva', 'ive', 'ivi',
                        u('iv\u0103'), 'ism', 'ist', 'at', 'ut', 'it',
                        'ic', 'os', 'iv')
    __step3_suffixes = (u('seser\u0103\u0163i'), u('aser\u0103\u0163i'),
                        u('iser\u0103\u0163i'), u('\xE2ser\u0103\u0163i'),
                        u('user\u0103\u0163i'), u('seser\u0103m'),
                        u('aser\u0103m'), u('iser\u0103m'), u('\xE2ser\u0103m'),
                        u('user\u0103m'), u('ser\u0103\u0163i'), u('sese\u015Fi'),
                        u('seser\u0103'), u('easc\u0103'), u('ar\u0103\u0163i'),
                        u('ur\u0103\u0163i'), u('ir\u0103\u0163i'),
                        u('\xE2r\u0103\u0163i'), u('ase\u015Fi'),
                        u('aser\u0103'), u('ise\u015Fi'), u('iser\u0103'),
                        u('\xe2se\u015Fi'), u('\xE2ser\u0103'),
                        u('use\u015Fi'), u('user\u0103'), u('ser\u0103m'),
                        'sesem', 'indu', u('\xE2ndu'), u('eaz\u0103'),
                        u('e\u015Fti'), u('e\u015Fte'), u('\u0103\u015Fti'),
                        u('\u0103\u015Fte'), u('ea\u0163i'), u('ia\u0163i'),
                        u('ar\u0103m'), u('ur\u0103m'), u('ir\u0103m'),
                        u('\xE2r\u0103m'), 'asem', 'isem',
                        u('\xE2sem'), 'usem', u('se\u015Fi'), u('ser\u0103'),
                        'sese', 'are', 'ere', 'ire', u('\xE2re'),
                        'ind', u('\xE2nd'), 'eze', 'ezi', 'esc',
                        u('\u0103sc'), 'eam', 'eai', 'eau', 'iam',
                        'iai', 'iau', u('a\u015Fi'), u('ar\u0103'),
                        u('u\u015Fi'), u('ur\u0103'), u('i\u015Fi'), u('ir\u0103'),
                        u('\xE2\u015Fi'), u('\xe2r\u0103'), 'ase',
                        'ise', u('\xE2se'), 'use', u('a\u0163i'),
                        u('e\u0163i'), u('i\u0163i'), u('\xe2\u0163i'), 'sei',
                        'ez', 'am', 'ai', 'au', 'ea', 'ia', 'ui',
                        u('\xE2i'), u('\u0103m'), 'em', 'im', u('\xE2m'),
                        'se')
    __step4_suffixes = ("ie", "a", "e", "i", u("\u0103"))

    # Suffix groups that select the action taken once a suffix has matched.
    # They are class constants so they are built once, not on every call.
    __step0_delete = ("ul", "ului")
    __step0_to_e = ("ea", "ele", "elor")
    __step0_to_i = ("ii", "iua", "iei", "iile", "iilor", "ilor")
    __step0_drop_last = (u("a\u0163ie"), u("a\u0163ia"))

    __step1_to_abil = ("abilitate", "abilitati", u("abilit\u0103i"),
                       u("abilit\u0103\u0163i"))
    __step1_to_iv = ("ivitate", "ivitati", u("ivit\u0103i"),
                     u("ivit\u0103\u0163i"))
    __step1_to_ic = ("icitate", "icitati", u("icit\u0103i"),
                     u("icit\u0103\u0163i"), "icator", "icatori",
                     "iciv", "iciva", "icive", "icivi", u("iciv\u0103"),
                     "ical", "icala", "icale", "icali", u("ical\u0103"))
    __step1_to_at = ("ativ", "ativa", "ative", "ativi", u("ativ\u0103"),
                     u("a\u0163iune"), "atoare", "ator", "atori",
                     u("\u0103toare"), u("\u0103tor"), u("\u0103tori"))
    __step1_to_it = ("itiv", "itiva", "itive", "itivi", u("itiv\u0103"),
                     u("i\u0163iune"), "itoare", "itor", "itori")

    __step2_to_ist = ("ism", "isme", "ist", "ista", "iste", "isti",
                      u("ist\u0103"), u("i\u015Fti"))

    __step3_plain = (u('seser\u0103\u0163i'), u('seser\u0103m'),
                     u('ser\u0103\u0163i'), u('sese\u015Fi'),
                     u('seser\u0103'), u('ser\u0103m'), 'sesem',
                     u('se\u015Fi'), u('ser\u0103'), 'sese',
                     u('a\u0163i'), u('e\u0163i'), u('i\u0163i'),
                     u('\xE2\u0163i'), 'sei', u('\u0103m'),
                     'em', 'im', u('\xE2m'), 'se')
    __step3_guard_vowels = u("aeio\u0103\xE2\xEE")

    def stem(self, word):
        """
        Stem a Romanian word and return the stemmed form.

        The word is lower-cased, "u" and "i" standing between two vowels
        are temporarily marked as "U" and "I", the suffix-removal steps 0
        to 4 are applied in order, and finally the markers are turned back
        into lower case.

        :param word: The word that is stemmed.
        :type word: str or unicode
        :return: The stemmed form.
        :rtype: unicode

        """
        # NOTE(review): the nesting of the ``break``/``else`` statements in
        # the steps below follows the "first (longest) matching suffix wins"
        # reading of the algorithm description -- confirm against upstream.
        word = word.lower()

        step1_success = False
        step2_success = False

        # Mark "u"/"i" between vowels in upper case. The upper-case markers
        # are not members of __vowels, so they influence both the later
        # iterations of this loop and the region computation below.
        for i in range(1, len(word) - 1):
            if word[i - 1] in self.__vowels and word[i + 1] in self.__vowels:
                if word[i] == "u":
                    word = "".join((word[:i], "U", word[i + 1:]))

                elif word[i] == "i":
                    word = "".join((word[:i], "I", word[i + 1:]))

        # Regions are supplied by the base class; presumably the Snowball
        # R1/R2/RV regions of the word -- verify against _StandardStemmer.
        r1, r2 = self._r1r2_standard(word, self.__vowels)
        rv = self._rv_standard(word, self.__vowels)

        # STEP 0: Removal of plurals and other simplifications
        for suffix in self.__step0_suffixes:
            if word.endswith(suffix):
                # Region membership is a substring test throughout.
                if suffix in r1:
                    if suffix in self.__step0_delete:
                        word = word[:-len(suffix)]

                        if suffix in rv:
                            rv = rv[:-len(suffix)]
                        else:
                            rv = ""

                    elif (suffix == "aua" or suffix == "atei" or
                          (suffix == "ile" and word[-5:-3] != "ab")):
                        word = word[:-2]

                    elif suffix in self.__step0_to_e:
                        word = "".join((word[:-len(suffix)], "e"))

                        if suffix in rv:
                            rv = "".join((rv[:-len(suffix)], "e"))
                        else:
                            rv = ""

                    elif suffix in self.__step0_to_i:
                        word = "".join((word[:-len(suffix)], "i"))

                        if suffix in rv:
                            rv = "".join((rv[:-len(suffix)], "i"))
                        else:
                            rv = ""

                    elif suffix in self.__step0_drop_last:
                        word = word[:-1]
                # Only the first matching suffix is considered.
                break

        # STEP 1: Reduction of combining suffixes
        # Repeated until a pass performs no replacement. Every replacement
        # shortens the word, so the loop terminates.
        while True:

            replacement_done = False

            for suffix in self.__step1_suffixes:
                if word.endswith(suffix):
                    if suffix in r1:
                        step1_success = True
                        replacement_done = True

                        if suffix in self.__step1_to_abil:
                            word = "".join((word[:-len(suffix)], "abil"))

                        elif suffix == "ibilitate":
                            # Dropping "itate" leaves "ibil".
                            word = word[:-5]

                        elif suffix in self.__step1_to_iv:
                            word = "".join((word[:-len(suffix)], "iv"))

                        elif suffix in self.__step1_to_ic:
                            word = "".join((word[:-len(suffix)], "ic"))

                        elif suffix in self.__step1_to_at:
                            word = "".join((word[:-len(suffix)], "at"))

                            # Keep R2 in step with the word so that step 2
                            # can still find the new "at" ending in it.
                            if suffix in r2:
                                r2 = "".join((r2[:-len(suffix)], "at"))

                        elif suffix in self.__step1_to_it:
                            word = "".join((word[:-len(suffix)], "it"))

                            if suffix in r2:
                                r2 = "".join((r2[:-len(suffix)], "it"))
                    else:
                        step1_success = False
                    break

            if not replacement_done:
                break

        # STEP 2: Removal of standard suffixes
        for suffix in self.__step2_suffixes:
            if word.endswith(suffix):
                if suffix in r2:
                    step2_success = True

                    if suffix in ("iune", "iuni"):
                        # Only removed when preceded by t-cedilla, which
                        # is then replaced by a plain "t".
                        if word[-5] == u("\u0163"):
                            word = "".join((word[:-5], "t"))

                    elif suffix in self.__step2_to_ist:
                        word = "".join((word[:-len(suffix)], "ist"))

                    else:
                        word = word[:-len(suffix)]
                break

        # STEP 3: Removal of verb suffixes
        # Runs only when neither step 1 nor step 2 changed anything.
        if not step1_success and not step2_success:
            for suffix in self.__step3_suffixes:
                try:
                    if word.endswith(suffix):
                        if suffix in rv:
                            if suffix in self.__step3_plain:
                                word = word[:-len(suffix)]
                                rv = rv[:-len(suffix)]
                            else:
                                # Delete only if the suffix does not open RV
                                # and the letter before its first occurrence
                                # in RV is not one of the guard vowels.
                                if (not rv.startswith(suffix) and
                                    rv[rv.index(suffix) - 1] not in
                                        self.__step3_guard_vowels):
                                    word = word[:-len(suffix)]
                            break
                except UnicodeDecodeError:
                    # One operand is unicode and the other is a non-ASCII
                    # byte string; skip this suffix.
                    continue

        # STEP 4: Removal of final vowel
        for suffix in self.__step4_suffixes:
            if word.endswith(suffix):
                if suffix in rv:
                    word = word[:-len(suffix)]
                break

        # Undo the temporary upper-case markers.
        word = word.replace("I", "i").replace("U", "u")
        return word
|