# microproduct/atmosphericDelay/ISCEApp/site-packages/whoosh/lang/snowball/romanian.py
#
# 258 lines
# 12 KiB
# Python

from .bases import _StandardStemmer
from whoosh.compat import u
class RomanianStemmer(_StandardStemmer):
    """
    The Romanian Snowball stemmer.

    Every string literal in this class that contains a non-ASCII escape is
    passed through ``u()``, so the suffix tables and the literals they are
    compared against always have the same text type.

    :cvar __vowels: The Romanian vowels.
    :type __vowels: unicode
    :cvar __step0_suffixes: Suffixes to be deleted in step 0 of the algorithm.
    :type __step0_suffixes: tuple
    :cvar __step1_suffixes: Suffixes to be deleted in step 1 of the algorithm.
    :type __step1_suffixes: tuple
    :cvar __step2_suffixes: Suffixes to be deleted in step 2 of the algorithm.
    :type __step2_suffixes: tuple
    :cvar __step3_suffixes: Suffixes to be deleted in step 3 of the algorithm.
    :type __step3_suffixes: tuple
    :cvar __step4_suffixes: Final vowels to be deleted in step 4.
    :type __step4_suffixes: tuple
    :note: A detailed description of the Romanian
           stemming algorithm can be found under
           http://snowball.tartarus.org/algorithms/romanian/stemmer.html
    """

    __vowels = u("aeiou\u0103\xE2\xEE")

    # NOTE: the order of every suffix table below is significant -- each
    # step stops at the first entry the word ends with.
    __step0_suffixes = ('iilor', 'ului', 'elor', 'iile', 'ilor',
                        'atei', u('a\u0163ie'), u('a\u0163ia'), 'aua',
                        'ele', 'iua', 'iei', 'ile', 'ul', 'ea',
                        'ii')
    __step1_suffixes = ('abilitate', 'abilitati', u('abilit\u0103\u0163i'),
                        'ibilitate', u('abilit\u0103i'), 'ivitate',
                        'ivitati', u('ivit\u0103\u0163i'), 'icitate',
                        'icitati', u('icit\u0103\u0163i'), 'icatori',
                        u('ivit\u0103i'), u('icit\u0103i'), 'icator',
                        u('a\u0163iune'), 'atoare', u('\u0103toare'),
                        u('i\u0163iune'), 'itoare', 'iciva', 'icive',
                        'icivi', u('iciv\u0103'), 'icala', 'icale',
                        'icali', u('ical\u0103'), 'ativa', 'ative',
                        'ativi', u('ativ\u0103'), 'atori', u('\u0103tori'),
                        'itiva', 'itive', 'itivi', u('itiv\u0103'),
                        'itori', 'iciv', 'ical', 'ativ', 'ator',
                        u('\u0103tor'), 'itiv', 'itor')
    __step2_suffixes = ('abila', 'abile', 'abili', u('abil\u0103'),
                        'ibila', 'ibile', 'ibili', u('ibil\u0103'),
                        'atori', 'itate', 'itati', u('it\u0103\u0163i'),
                        'abil', 'ibil', 'oasa', u('oas\u0103'), 'oase',
                        'anta', 'ante', 'anti', u('ant\u0103'), 'ator',
                        u('it\u0103i'), 'iune', 'iuni', 'isme', 'ista',
                        'iste', 'isti', u('ist\u0103'), u('i\u015Fti'),
                        'ata', u('at\u0103'), 'ati', 'ate', 'uta',
                        u('ut\u0103'), 'uti', 'ute', 'ita', u('it\u0103'),
                        'iti', 'ite', 'ica', 'ice', 'ici', u('ic\u0103'),
                        'osi', u('o\u015Fi'), 'ant', 'iva', 'ive', 'ivi',
                        u('iv\u0103'), 'ism', 'ist', 'at', 'ut', 'it',
                        'ic', 'os', 'iv')
    # Every entry containing a non-ASCII escape goes through u(); a plain
    # '\xE2...' literal is a byte string on Python 2 and cannot be matched
    # against a unicode word.
    __step3_suffixes = (u('seser\u0103\u0163i'), u('aser\u0103\u0163i'),
                        u('iser\u0103\u0163i'), u('\xE2ser\u0103\u0163i'),
                        u('user\u0103\u0163i'), u('seser\u0103m'),
                        u('aser\u0103m'), u('iser\u0103m'), u('\xE2ser\u0103m'),
                        u('user\u0103m'), u('ser\u0103\u0163i'), u('sese\u015Fi'),
                        u('seser\u0103'), u('easc\u0103'), u('ar\u0103\u0163i'),
                        u('ur\u0103\u0163i'), u('ir\u0103\u0163i'),
                        u('\xE2r\u0103\u0163i'), u('ase\u015Fi'),
                        u('aser\u0103'), u('ise\u015Fi'), u('iser\u0103'),
                        u('\xe2se\u015Fi'), u('\xE2ser\u0103'),
                        u('use\u015Fi'), u('user\u0103'), u('ser\u0103m'),
                        'sesem', 'indu', u('\xE2ndu'), u('eaz\u0103'),
                        u('e\u015Fti'), u('e\u015Fte'), u('\u0103\u015Fti'),
                        u('\u0103\u015Fte'), u('ea\u0163i'), u('ia\u0163i'),
                        u('ar\u0103m'), u('ur\u0103m'), u('ir\u0103m'),
                        u('\xE2r\u0103m'), 'asem', 'isem',
                        u('\xE2sem'), 'usem', u('se\u015Fi'), u('ser\u0103'),
                        'sese', 'are', 'ere', 'ire', u('\xE2re'),
                        'ind', u('\xE2nd'), 'eze', 'ezi', 'esc',
                        u('\u0103sc'), 'eam', 'eai', 'eau', 'iam',
                        'iai', 'iau', u('a\u015Fi'), u('ar\u0103'),
                        u('u\u015Fi'), u('ur\u0103'), u('i\u015Fi'), u('ir\u0103'),
                        u('\xE2\u015Fi'), u('\xe2r\u0103'), 'ase',
                        'ise', u('\xE2se'), 'use', u('a\u0163i'),
                        u('e\u0163i'), u('i\u0163i'), u('\xe2\u0163i'), 'sei',
                        'ez', 'am', 'ai', 'au', 'ea', 'ia', 'ui',
                        u('\xE2i'), u('\u0103m'), 'em', 'im', u('\xE2m'),
                        'se')
    __step4_suffixes = ("ie", "a", "e", "i", u("\u0103"))

    # --- Comparison groups used inside stem() -----------------------------
    # They are built once here, through u(), so that membership tests
    # against the (unicode) suffix tables above behave the same on every
    # supported Python version.

    # Step 0: suffixes that only lose their final vowel.
    __step0_to_ati = (u("a\u0163ie"), u("a\u0163ia"))

    # Step 1: each group is replaced by the stem named in the constant.
    __step1_to_abil = ("abilitate", "abilitati",
                       u("abilit\u0103i"), u("abilit\u0103\u0163i"))
    __step1_to_iv = ("ivitate", "ivitati",
                     u("ivit\u0103i"), u("ivit\u0103\u0163i"))
    __step1_to_ic = ("icitate", "icitati", u("icit\u0103i"),
                     u("icit\u0103\u0163i"), "icator",
                     "icatori", "iciv", "iciva",
                     "icive", "icivi", u("iciv\u0103"),
                     "ical", "icala", "icale", "icali",
                     u("ical\u0103"))
    __step1_to_at = ("ativ", "ativa", "ative", "ativi",
                     u("ativ\u0103"), u("a\u0163iune"),
                     "atoare", "ator", "atori",
                     u("\u0103toare"),
                     u("\u0103tor"), u("\u0103tori"))
    __step1_to_it = ("itiv", "itiva", "itive", "itivi",
                     u("itiv\u0103"), u("i\u0163iune"),
                     "itoare", "itor", "itori")

    # Step 2: suffixes replaced by "ist" instead of being deleted.
    __step2_to_ist = ("ism", "isme", "ist", "ista", "iste",
                      "isti", u("ist\u0103"), u("i\u015Fti"))
    # Step 2: letter that must precede "iune"/"iuni" for the rewrite to "t".
    __step2_t_comma = u("\u0163")

    # Step 3: suffixes deleted unconditionally once they are found in rv.
    __step3_plain_delete = (u('seser\u0103\u0163i'), u('seser\u0103m'),
                            u('ser\u0103\u0163i'), u('sese\u015Fi'),
                            u('seser\u0103'), u('ser\u0103m'), 'sesem',
                            u('se\u015Fi'), u('ser\u0103'), 'sese',
                            u('a\u0163i'), u('e\u0163i'), u('i\u0163i'),
                            u('\xE2\u0163i'), 'sei', u('\u0103m'),
                            'em', 'im', u('\xE2m'), 'se')
    # Step 3: a remaining suffix is NOT deleted when one of these characters
    # precedes it in rv (every vowel except "u").
    __step3_blocking_vowels = u("aeio\u0103\xE2\xEE")

    def stem(self, word):
        """
        Stem a Romanian word and return the stemmed form.

        The word is lower-cased, ``i``/``u`` standing between two vowels
        are temporarily upper-cased, the regions ``r1``, ``r2`` and ``rv``
        are obtained from the base-class helpers, and then steps 0-4 each
        strip or rewrite at most one suffix (step 1 repeats until nothing
        is replaced). Step 3 only runs when neither step 1 nor step 2
        succeeded.

        :param word: The word that is stemmed.
        :type word: str or unicode
        :return: The stemmed form.
        :rtype: unicode
        """
        word = word.lower()

        step1_success = False
        step2_success = False

        # Mark "u"/"i" between two vowels with their upper-case form. The
        # upper-case letters are not in __vowels, so from here on they are
        # no longer treated as vowels; they are lower-cased again at the end.
        # The test reads the progressively updated word on purpose.
        for i in range(1, len(word) - 1):
            if word[i - 1] in self.__vowels and word[i + 1] in self.__vowels:
                if word[i] == "u":
                    word = "".join((word[:i], "U", word[i + 1:]))
                elif word[i] == "i":
                    word = "".join((word[:i], "I", word[i + 1:]))

        # Region helpers come from the base class (presumably the standard
        # Snowball R1/R2/RV definitions -- confirm in bases.py).
        # NOTE(review): r1 is never updated afterwards and r2/rv only in a
        # few branches, and the region tests below are substring tests
        # (``suffix in r1``), not ``endswith``. This is kept as-is so that
        # stems stay identical to what existing indexes contain.
        r1, r2 = self._r1r2_standard(word, self.__vowels)
        rv = self._rv_standard(word, self.__vowels)

        # STEP 0: Removal of plurals and other simplifications
        for suffix in self.__step0_suffixes:
            if word.endswith(suffix):
                if suffix in r1:
                    if suffix in ("ul", "ului"):
                        word = word[:-len(suffix)]

                        if suffix in rv:
                            rv = rv[:-len(suffix)]
                        else:
                            rv = ""

                    elif (suffix == "aua" or suffix == "atei" or
                          (suffix == "ile" and word[-5:-3] != "ab")):
                        # aua -> a, atei -> at, ile -> i (unless after "ab")
                        word = word[:-2]

                    elif suffix in ("ea", "ele", "elor"):
                        word = "".join((word[:-len(suffix)], "e"))

                        if suffix in rv:
                            rv = "".join((rv[:-len(suffix)], "e"))
                        else:
                            rv = ""

                    elif suffix in ("ii", "iua", "iei",
                                    "iile", "iilor", "ilor"):
                        word = "".join((word[:-len(suffix)], "i"))

                        if suffix in rv:
                            rv = "".join((rv[:-len(suffix)], "i"))
                        else:
                            rv = ""

                    elif suffix in self.__step0_to_ati:
                        # Only the final vowel is dropped.
                        word = word[:-1]
                # Only the first (longest) matching suffix is considered.
                break

        # STEP 1: Reduction of combining suffixes
        while True:
            replacement_done = False

            for suffix in self.__step1_suffixes:
                if word.endswith(suffix):
                    if suffix in r1:
                        step1_success = True
                        replacement_done = True

                        # Every entry of __step1_suffixes belongs to exactly
                        # one branch below, so the word always gets shorter
                        # here and the enclosing loop terminates.
                        if suffix in self.__step1_to_abil:
                            word = "".join((word[:-len(suffix)], "abil"))

                        elif suffix == "ibilitate":
                            # ibilitate -> ibil
                            word = word[:-5]

                        elif suffix in self.__step1_to_iv:
                            word = "".join((word[:-len(suffix)], "iv"))

                        elif suffix in self.__step1_to_ic:
                            word = "".join((word[:-len(suffix)], "ic"))

                        elif suffix in self.__step1_to_at:
                            word = "".join((word[:-len(suffix)], "at"))

                            if suffix in r2:
                                r2 = "".join((r2[:-len(suffix)], "at"))

                        elif suffix in self.__step1_to_it:
                            word = "".join((word[:-len(suffix)], "it"))

                            if suffix in r2:
                                r2 = "".join((r2[:-len(suffix)], "it"))
                    else:
                        # A matching suffix outside r1 resets the flag, even
                        # if an earlier pass of this loop had succeeded.
                        step1_success = False
                    break

            if not replacement_done:
                break

        # STEP 2: Removal of standard suffixes
        for suffix in self.__step2_suffixes:
            if word.endswith(suffix):
                if suffix in r2:
                    step2_success = True

                    if suffix in ("iune", "iuni"):
                        # Rewritten only after t-comma; otherwise the word is
                        # left untouched but the step still counts as done.
                        if word[-5] == self.__step2_t_comma:
                            word = "".join((word[:-5], "t"))

                    elif suffix in self.__step2_to_ist:
                        word = "".join((word[:-len(suffix)], "ist"))

                    else:
                        word = word[:-len(suffix)]
                break

        # STEP 3: Removal of verb suffixes
        if not step1_success and not step2_success:
            for suffix in self.__step3_suffixes:
                try:
                    if word.endswith(suffix):
                        if suffix in rv:
                            if suffix in self.__step3_plain_delete:
                                word = word[:-len(suffix)]
                                rv = rv[:-len(suffix)]
                            else:
                                # Delete only when the character before the
                                # suffix's first occurrence in rv is not one
                                # of the blocking vowels.
                                if (not rv.startswith(suffix) and
                                        rv[rv.index(suffix) - 1] not in
                                        self.__step3_blocking_vowels):
                                    word = word[:-len(suffix)]
                            # Unlike the other steps, the search continues
                            # with shorter suffixes when this one is not in
                            # rv.
                            break
                except UnicodeDecodeError:
                    # Kept as a best-effort guard: presumably only reachable
                    # on Python 2 when ``word`` is a byte string holding
                    # non-ASCII bytes. The suffix is skipped in that case.
                    continue

        # STEP 4: Removal of final vowel
        for suffix in self.__step4_suffixes:
            if word.endswith(suffix):
                if suffix in rv:
                    word = word[:-len(suffix)]
                break

        # Undo the temporary upper-case marking from the start.
        word = word.replace("I", "i").replace("U", "u")

        return word