microproduct/atmosphericDelay/ISCEApp/site-packages/whoosh/lang/snowball/hungarian.py

269 lines
11 KiB
Python

from whoosh.compat import u
class HungarianStemmer(object):
    """
    The Hungarian Snowball stemmer.

    A word is lower-cased and then passed through nine ordered steps, each
    of which tries a fixed list of suffixes.  A suffix is only acted upon
    when it is found at the end of the region R1 of the word (see
    ``__r1_hungarian``).

    :cvar __vowels: The Hungarian vowels.
    :type __vowels: unicode
    :cvar __digraphs: The Hungarian digraphs.
    :type __digraphs: tuple
    :cvar __double_consonants: The Hungarian double consonants.
    :type __double_consonants: tuple
    :cvar __step1_suffixes: Suffixes to be deleted in step 1 of the algorithm.
    :type __step1_suffixes: tuple
    :cvar __step2_suffixes: Suffixes to be deleted in step 2 of the algorithm.
    :type __step2_suffixes: tuple
    :cvar __step3_suffixes: Suffixes to be deleted in step 3 of the algorithm.
    :type __step3_suffixes: tuple
    :cvar __step4_suffixes: Suffixes to be deleted in step 4 of the algorithm.
    :type __step4_suffixes: tuple
    :cvar __step5_suffixes: Suffixes to be deleted in step 5 of the algorithm.
    :type __step5_suffixes: tuple
    :cvar __step6_suffixes: Suffixes to be deleted in step 6 of the algorithm.
    :type __step6_suffixes: tuple
    :cvar __step7_suffixes: Suffixes to be deleted in step 7 of the algorithm.
    :type __step7_suffixes: tuple
    :cvar __step8_suffixes: Suffixes to be deleted in step 8 of the algorithm.
    :type __step8_suffixes: tuple
    :cvar __step9_suffixes: Suffixes to be deleted in step 9 of the algorithm.
    :type __step9_suffixes: tuple
    :note: A detailed description of the Hungarian
           stemming algorithm can be found under
           http://snowball.tartarus.org/algorithms/hungarian/stemmer.html
    """

    __vowels = u("aeiou\xF6\xFC\xE1\xE9\xED\xF3\xF5\xFA\xFB")
    __digraphs = ("cs", "dz", "dzs", "gy", "ly", "ny", "ty", "zs")
    __double_consonants = ("bb", "cc", "ccs", "dd", "ff", "gg",
                           "ggy", "jj", "kk", "ll", "lly", "mm",
                           "nn", "nny", "pp", "rr", "ss", "ssz",
                           "tt", "tty", "vv", "zz", "zzs")

    # The order inside each suffix tuple matters: every step stops at the
    # first suffix that matches, so longer suffixes are listed before the
    # shorter suffixes they contain.
    __step1_suffixes = ("al", "el")
    __step2_suffixes = (u('k\xE9ppen'), u('onk\xE9nt'), u('enk\xE9nt'),
                        u('ank\xE9nt'), u('k\xE9pp'), u('k\xE9nt'), 'ban',
                        'ben', 'nak', 'nek', 'val', 'vel', u('t\xF3l'),
                        u('t\xF5l'), u('r\xF3l'), u('r\xF5l'), u('b\xF3l'),
                        u('b\xF5l'), 'hoz', 'hez', u('h\xF6z'),
                        u('n\xE1l'), u('n\xE9l'), u('\xE9rt'), 'kor',
                        'ba', 'be', 'ra', 're', 'ig', 'at', 'et',
                        'ot', u('\xF6t'), 'ul', u('\xFCl'), u('v\xE1'),
                        u('v\xE9'), 'en', 'on', 'an', u('\xF6n'),
                        'n', 't')
    __step3_suffixes = (u("\xE1nk\xE9nt"), u("\xE1n"), u("\xE9n"))
    __step4_suffixes = ('astul', u('est\xFCl'), u('\xE1stul'),
                        u('\xE9st\xFCl'), 'stul', u('st\xFCl'))
    __step5_suffixes = (u("\xE1"), u("\xE9"))
    __step6_suffixes = (u('ok\xE9'), u('\xF6k\xE9'), u('ak\xE9'),
                        u('ek\xE9'), u('\xE1k\xE9'), u('\xE1\xE9i'),
                        u('\xE9k\xE9'), u('\xE9\xE9i'), u('k\xE9'),
                        u('\xE9i'), u('\xE9\xE9'), u('\xE9'))
    __step7_suffixes = (u('\xE1juk'), u('\xE9j\xFCk'), u('\xFCnk'),
                        'unk', 'juk', u('j\xFCk'), u('\xE1nk'),
                        u('\xE9nk'), 'nk', 'uk', u('\xFCk'), 'em',
                        'om', 'am', 'od', 'ed', 'ad', u('\xF6d'),
                        'ja', 'je', u('\xE1m'), u('\xE1d'), u('\xE9m'),
                        u('\xE9d'), 'm', 'd', 'a', 'e', 'o',
                        u('\xE1'), u('\xE9'))
    __step8_suffixes = ('jaitok', 'jeitek', 'jaink', 'jeink', 'aitok',
                        'eitek', u('\xE1itok'), u('\xE9itek'), 'jaim',
                        'jeim', 'jaid', 'jeid', 'eink', 'aink',
                        'itek', 'jeik', 'jaik', u('\xE1ink'),
                        u('\xE9ink'), 'aim', 'eim', 'aid', 'eid',
                        'jai', 'jei', 'ink', 'aik', 'eik',
                        u('\xE1im'), u('\xE1id'), u('\xE1ik'), u('\xE9im'),
                        u('\xE9id'), u('\xE9ik'), 'im', 'id', 'ai',
                        'ei', 'ik', u('\xE1i'), u('\xE9i'), 'i')
    __step9_suffixes = (u("\xE1k"), u("\xE9k"), u("\xF6k"), "ok",
                        "ek", "ak", "k")

    def stem(self, word):
        """
        Stem an Hungarian word and return the stemmed form.

        :param word: The word that is stemmed.
        :type word: str or unicode
        :return: The stemmed form.  An empty word is returned unchanged.
        :rtype: unicode
        """
        word = word.lower()

        # Nothing to strip from an empty token; returning early also keeps
        # the R1 computation from indexing into an empty string.
        if not word:
            return word

        r1 = self.__r1_hungarian(word, self.__vowels, self.__digraphs)

        # STEP 1: Remove instrumental case
        # Only acts when the suffix follows a double consonant: the suffix
        # and one letter of the doubled consonant are dropped.
        if r1.endswith(self.__step1_suffixes):
            for double_cons in self.__double_consonants:
                if word[-2 - len(double_cons):-2] == double_cons:
                    word = word[:-4] + word[-3]

                    if r1[-2 - len(double_cons):-2] == double_cons:
                        r1 = r1[:-4] + r1[-3]
                    break

        # STEP 2: Remove frequent cases
        # The first suffix the *word* ends with decides the step; it is
        # removed only if it also lies in R1.
        for suffix in self.__step2_suffixes:
            if word.endswith(suffix):
                if r1.endswith(suffix):
                    word = word[:-len(suffix)]
                    r1 = r1[:-len(suffix)]

                    # A long final vowel left behind is shortened.
                    if r1.endswith(u("\xE1")):
                        word = word[:-1] + "a"
                        r1 = r1[:-1] + "a"

                    elif r1.endswith(u("\xE9")):
                        word = word[:-1] + "e"
                        r1 = r1[:-1] + "e"
                break

        # STEP 3: Remove special cases
        for suffix in self.__step3_suffixes:
            if r1.endswith(suffix):
                if suffix == u("\xE9n"):
                    word = word[:-2] + "e"
                    r1 = r1[:-2] + "e"
                else:
                    word = word[:-len(suffix)] + "a"
                    r1 = r1[:-len(suffix)] + "a"
                break

        # STEP 4: Remove other cases
        for suffix in self.__step4_suffixes:
            if r1.endswith(suffix):
                if suffix == u("\xE1stul"):
                    word = word[:-5] + "a"
                    r1 = r1[:-5] + "a"

                elif suffix == u("\xE9st\xFCl"):
                    word = word[:-5] + "e"
                    r1 = r1[:-5] + "e"
                else:
                    word = word[:-len(suffix)]
                    r1 = r1[:-len(suffix)]
                break

        # STEP 5: Remove factive case
        # As in step 1, only acts when the suffix follows a double consonant.
        for suffix in self.__step5_suffixes:
            if r1.endswith(suffix):
                for double_cons in self.__double_consonants:
                    if word[-1 - len(double_cons):-1] == double_cons:
                        word = word[:-3] + word[-2]

                        if r1[-1 - len(double_cons):-1] == double_cons:
                            r1 = r1[:-3] + r1[-2]
                        break

        # STEP 6: Remove owned
        for suffix in self.__step6_suffixes:
            if r1.endswith(suffix):
                if suffix in (u("\xE1k\xE9"), u("\xE1\xE9i")):
                    word = word[:-3] + "a"
                    r1 = r1[:-3] + "a"

                elif suffix in (u("\xE9k\xE9"), u("\xE9\xE9i"),
                                u("\xE9\xE9")):
                    word = word[:-len(suffix)] + "e"
                    r1 = r1[:-len(suffix)] + "e"
                else:
                    word = word[:-len(suffix)]
                    r1 = r1[:-len(suffix)]
                break

        # STEP 7: Remove singular owner suffixes
        for suffix in self.__step7_suffixes:
            if word.endswith(suffix):
                if r1.endswith(suffix):
                    if suffix in (u("\xE1nk"), u("\xE1juk"), u("\xE1m"),
                                  u("\xE1d"), u("\xE1")):
                        word = word[:-len(suffix)] + "a"
                        r1 = r1[:-len(suffix)] + "a"

                    elif suffix in (u("\xE9nk"), u("\xE9j\xFCk"),
                                    u("\xE9m"), u("\xE9d"), u("\xE9")):
                        word = word[:-len(suffix)] + "e"
                        r1 = r1[:-len(suffix)] + "e"
                    else:
                        word = word[:-len(suffix)]
                        r1 = r1[:-len(suffix)]
                break

        # STEP 8: Remove plural owner suffixes
        for suffix in self.__step8_suffixes:
            if word.endswith(suffix):
                if r1.endswith(suffix):
                    if suffix in (u("\xE1im"), u("\xE1id"), u("\xE1i"),
                                  u("\xE1ink"), u("\xE1itok"), u("\xE1ik")):
                        word = word[:-len(suffix)] + "a"
                        r1 = r1[:-len(suffix)] + "a"

                    elif suffix in (u("\xE9im"), u("\xE9id"), u("\xE9i"),
                                    u("\xE9ink"), u("\xE9itek"), u("\xE9ik")):
                        word = word[:-len(suffix)] + "e"
                        r1 = r1[:-len(suffix)] + "e"
                    else:
                        word = word[:-len(suffix)]
                        r1 = r1[:-len(suffix)]
                break

        # STEP 9: Remove plural suffixes
        # Last step, so R1 no longer needs to be kept in sync with the word.
        for suffix in self.__step9_suffixes:
            if word.endswith(suffix):
                if r1.endswith(suffix):
                    if suffix == u("\xE1k"):
                        word = word[:-2] + "a"
                    elif suffix == u("\xE9k"):
                        word = word[:-2] + "e"
                    else:
                        word = word[:-len(suffix)]
                break

        return word

    def __r1_hungarian(self, word, vowels, digraphs):
        """
        Return the region R1 that is used by the Hungarian stemmer.

        If the word begins with a vowel, R1 is defined as the region
        after the first consonant or digraph (= two letters stand for
        one phoneme) in the word. If the word begins with a consonant,
        it is defined as the region after the first vowel in the word.
        If the word does not contain both a vowel and consonant, R1
        is the null region at the end of the word.

        :param word: The Hungarian word whose region R1 is determined.
        :type word: str or unicode
        :param vowels: The Hungarian vowels that are used to determine
                       the region R1.
        :type vowels: unicode
        :param digraphs: The digraphs that are used to determine the
                         region R1.
        :type digraphs: tuple
        :return: the region R1 for the respective word; the empty string
                 for an empty word.
        :rtype: unicode
        :note: This helper method is invoked by the stem method of
               HungarianStemmer. It is not to be invoked directly!
        """
        r1 = ""

        # An empty word has no regions; without this guard ``word[0]``
        # below raises IndexError.
        if not word:
            return r1

        if word[0] in vowels:
            # NOTE(review): this accepts the first listed digraph found
            # anywhere after the first letter and cuts after the first
            # occurrence of that digraph's last character, which is not
            # necessarily the first consonant described above.  Kept as-is
            # so that existing stems do not change.
            for digraph in digraphs:
                if digraph in word[1:]:
                    r1 = word[word.index(digraph[-1]) + 1:]
                    return r1

            # No digraph present: R1 starts after the first consonant.
            for i in range(1, len(word)):
                if word[i] not in vowels:
                    r1 = word[i + 1:]
                    break
        else:
            # Word starts with a consonant: R1 starts after the first vowel.
            for i in range(1, len(word)):
                if word[i] in vowels:
                    r1 = word[i + 1:]
                    break

        return r1