269 lines
11 KiB
Python
269 lines
11 KiB
Python
from whoosh.compat import u
|
|
|
|
class HungarianStemmer(object):
    """
    The Hungarian Snowball stemmer.

    :cvar __vowels: The Hungarian vowels.
    :type __vowels: unicode
    :cvar __digraphs: The Hungarian digraphs.
    :type __digraphs: tuple
    :cvar __double_consonants: The Hungarian double consonants.
    :type __double_consonants: tuple
    :cvar __step1_suffixes: Suffixes to be deleted in step 1 of the algorithm.
    :type __step1_suffixes: tuple
    :cvar __step2_suffixes: Suffixes to be deleted in step 2 of the algorithm.
    :type __step2_suffixes: tuple
    :cvar __step3_suffixes: Suffixes to be deleted in step 3 of the algorithm.
    :type __step3_suffixes: tuple
    :cvar __step4_suffixes: Suffixes to be deleted in step 4 of the algorithm.
    :type __step4_suffixes: tuple
    :cvar __step5_suffixes: Suffixes to be deleted in step 5 of the algorithm.
    :type __step5_suffixes: tuple
    :cvar __step6_suffixes: Suffixes to be deleted in step 6 of the algorithm.
    :type __step6_suffixes: tuple
    :cvar __step7_suffixes: Suffixes to be deleted in step 7 of the algorithm.
    :type __step7_suffixes: tuple
    :cvar __step8_suffixes: Suffixes to be deleted in step 8 of the algorithm.
    :type __step8_suffixes: tuple
    :cvar __step9_suffixes: Suffixes to be deleted in step 9 of the algorithm.
    :type __step9_suffixes: tuple
    :note: A detailed description of the Hungarian
           stemming algorithm can be found under
           http://snowball.tartarus.org/algorithms/hungarian/stemmer.html

    """

    # The two double-acute vowels appear twice on purpose: U+0151 / U+0171
    # are the real Unicode letters, U+00F5 / U+00FB are the code points the
    # table historically used (the Latin-1 reading of the ISO-8859-2 bytes).
    # Both spellings are accepted so existing callers keep working.
    __vowels = u("aeiou\xF6\xFC\xE1\xE9\xED\xF3\xF5\xFA\xFB\u0151\u0171")
    __digraphs = ("cs", "dz", "dzs", "gy", "ly", "ny", "ty", "zs")
    __double_consonants = ("bb", "cc", "ccs", "dd", "ff", "gg",
                           "ggy", "jj", "kk", "ll", "lly", "mm",
                           "nn", "nny", "pp", "rr", "ss", "ssz",
                           "tt", "tty", "vv", "zz", "zzs")

    __step1_suffixes = ("al", "el")
    # Ordered longest first: the first suffix the word ends with wins.
    # The "o with double acute" suffixes are listed in both the legacy
    # (U+00F5) and the Unicode (U+0151) spelling.
    __step2_suffixes = (u('k\xE9ppen'), u('onk\xE9nt'), u('enk\xE9nt'),
                        u('ank\xE9nt'), u('k\xE9pp'), u('k\xE9nt'), 'ban',
                        'ben', 'nak', 'nek', 'val', 'vel', u('t\xF3l'),
                        u('t\xF5l'), u('t\u0151l'), u('r\xF3l'),
                        u('r\xF5l'), u('r\u0151l'), u('b\xF3l'),
                        u('b\xF5l'), u('b\u0151l'), 'hoz', 'hez',
                        u('h\xF6z'), u('n\xE1l'), u('n\xE9l'),
                        u('\xE9rt'), 'kor',
                        'ba', 'be', 'ra', 're', 'ig', 'at', 'et',
                        'ot', u('\xF6t'), 'ul', u('\xFCl'), u('v\xE1'),
                        u('v\xE9'), 'en', 'on', 'an', u('\xF6n'),
                        'n', 't')
    __step3_suffixes = (u("\xE1nk\xE9nt"), u("\xE1n"), u("\xE9n"))
    __step4_suffixes = ('astul', u('est\xFCl'), u('\xE1stul'),
                        u('\xE9st\xFCl'), 'stul', u('st\xFCl'))
    __step5_suffixes = (u("\xE1"), u("\xE9"))
    __step6_suffixes = (u('ok\xE9'), u('\xF6k\xE9'), u('ak\xE9'),
                        u('ek\xE9'), u('\xE1k\xE9'), u('\xE1\xE9i'),
                        u('\xE9k\xE9'), u('\xE9\xE9i'), u('k\xE9'),
                        u('\xE9i'), u('\xE9\xE9'), u('\xE9'))
    __step7_suffixes = (u('\xE1juk'), u('\xE9j\xFCk'), u('\xFCnk'),
                        'unk', 'juk', u('j\xFCk'), u('\xE1nk'),
                        u('\xE9nk'), 'nk', 'uk', u('\xFCk'), 'em',
                        'om', 'am', 'od', 'ed', 'ad', u('\xF6d'),
                        'ja', 'je', u('\xE1m'), u('\xE1d'), u('\xE9m'),
                        u('\xE9d'), 'm', 'd', 'a', 'e', 'o',
                        u('\xE1'), u('\xE9'))
    __step8_suffixes = ('jaitok', 'jeitek', 'jaink', 'jeink', 'aitok',
                        'eitek', u('\xE1itok'), u('\xE9itek'), 'jaim',
                        'jeim', 'jaid', 'jeid', 'eink', 'aink',
                        'itek', 'jeik', 'jaik', u('\xE1ink'),
                        u('\xE9ink'), 'aim', 'eim', 'aid', 'eid',
                        'jai', 'jei', 'ink', 'aik', 'eik',
                        u('\xE1im'), u('\xE1id'), u('\xE1ik'), u('\xE9im'),
                        u('\xE9id'), u('\xE9ik'), 'im', 'id', 'ai',
                        'ei', 'ik', u('\xE1i'), u('\xE9i'), 'i')
    __step9_suffixes = (u("\xE1k"), u("\xE9k"), u("\xF6k"), "ok",
                        "ek", "ak", "k")

    def stem(self, word):
        """
        Stem an Hungarian word and return the stemmed form.

        The word is lower-cased, its region R1 is located, and nine
        suffix-stripping steps are applied in order. A suffix is only
        touched when it lies completely inside R1. An empty word is
        returned unchanged.

        :param word: The word that is stemmed.
        :type word: str or unicode
        :return: The stemmed form.
        :rtype: unicode

        """
        word = word.lower()
        if not word:
            # Nothing to stem; also keeps the R1 lookup off an empty string.
            return word

        a_acute = u("\xE1")
        e_acute = u("\xE9")

        r1 = self.__r1_hungarian(word, self.__vowels, self.__digraphs)
        # R1 starts at a fixed offset in the word. Every step below edits
        # only `word` and re-slices `r1` from this offset, so the two can
        # never drift apart (slicing past the end simply yields "").
        r1_start = len(word) - len(r1)

        # STEP 1: Remove instrumental case
        if r1.endswith(self.__step1_suffixes):
            for double_cons in self.__double_consonants:
                if word[-2 - len(double_cons):-2] == double_cons:
                    # Drop the two-letter suffix and undouble the
                    # consonant in front of it ("zzel" -> "z").
                    word = word[:-4] + word[-3]
                    r1 = word[r1_start:]
                    break

        # STEP 2: Remove frequent cases
        for suffix in self.__step2_suffixes:
            if word.endswith(suffix):
                if r1.endswith(suffix):
                    word = word[:-len(suffix)]
                    r1 = word[r1_start:]

                    # A long final vowel left behind is shortened.
                    if r1.endswith(a_acute):
                        word = word[:-1] + "a"
                    elif r1.endswith(e_acute):
                        word = word[:-1] + "e"
                    r1 = word[r1_start:]
                # Only the first (longest) matching suffix is considered,
                # whether or not it was inside R1.
                break

        # STEP 3: Remove special cases
        for suffix in self.__step3_suffixes:
            if r1.endswith(suffix):
                if suffix == u("\xE9n"):
                    word = word[:-2] + "e"
                else:
                    word = word[:-len(suffix)] + "a"
                r1 = word[r1_start:]
                break

        # STEP 4: Remove other cases
        for suffix in self.__step4_suffixes:
            if r1.endswith(suffix):
                if suffix == u("\xE1stul"):
                    word = word[:-5] + "a"
                elif suffix == u("\xE9st\xFCl"):
                    word = word[:-5] + "e"
                else:
                    word = word[:-len(suffix)]
                r1 = word[r1_start:]
                break

        # STEP 5: Remove factive case
        for suffix in self.__step5_suffixes:
            if r1.endswith(suffix):
                for double_cons in self.__double_consonants:
                    if word[-1 - len(double_cons):-1] == double_cons:
                        # Drop the final vowel and undouble the consonant.
                        word = word[:-3] + word[-2]
                        r1 = word[r1_start:]
                        break

        # STEP 6: Remove owned
        for suffix in self.__step6_suffixes:
            if r1.endswith(suffix):
                if suffix in (u("\xE1k\xE9"), u("\xE1\xE9i")):
                    word = word[:-3] + "a"
                elif suffix in (u("\xE9k\xE9"), u("\xE9\xE9i"),
                                u("\xE9\xE9")):
                    word = word[:-len(suffix)] + "e"
                else:
                    word = word[:-len(suffix)]
                r1 = word[r1_start:]
                break

        # STEP 7: Remove singular owner suffixes
        for suffix in self.__step7_suffixes:
            if word.endswith(suffix):
                if r1.endswith(suffix):
                    if suffix in (u("\xE1nk"), u("\xE1juk"), u("\xE1m"),
                                  u("\xE1d"), u("\xE1")):
                        word = word[:-len(suffix)] + "a"
                    elif suffix in (u("\xE9nk"), u("\xE9j\xFCk"),
                                    u("\xE9m"), u("\xE9d"), u("\xE9")):
                        word = word[:-len(suffix)] + "e"
                    else:
                        word = word[:-len(suffix)]
                    r1 = word[r1_start:]
                break

        # STEP 8: Remove plural owner suffixes
        for suffix in self.__step8_suffixes:
            if word.endswith(suffix):
                if r1.endswith(suffix):
                    if suffix in (u("\xE1im"), u("\xE1id"), u("\xE1i"),
                                  u("\xE1ink"), u("\xE1itok"), u("\xE1ik")):
                        word = word[:-len(suffix)] + "a"
                    elif suffix in (u("\xE9im"), u("\xE9id"), u("\xE9i"),
                                    u("\xE9ink"), u("\xE9itek"),
                                    u("\xE9ik")):
                        word = word[:-len(suffix)] + "e"
                    else:
                        word = word[:-len(suffix)]
                    r1 = word[r1_start:]
                break

        # STEP 9: Remove plural suffixes
        for suffix in self.__step9_suffixes:
            if word.endswith(suffix):
                if r1.endswith(suffix):
                    if suffix == u("\xE1k"):
                        word = word[:-2] + "a"
                    elif suffix == u("\xE9k"):
                        word = word[:-2] + "e"
                    else:
                        word = word[:-len(suffix)]
                break

        return word

    def __r1_hungarian(self, word, vowels, digraphs):
        """
        Return the region R1 that is used by the Hungarian stemmer.

        If the word begins with a vowel, R1 is defined as the region
        after the first consonant or digraph (= two letters stand for
        one phoneme) in the word. If the word begins with a consonant,
        it is defined as the region after the first vowel in the word.
        If the word does not contain both a vowel and consonant, R1
        is the null region at the end of the word. The empty word has
        an empty R1.

        :param word: The Hungarian word whose region R1 is determined.
        :type word: str or unicode
        :param vowels: The Hungarian vowels that are used to determine
                       the region R1.
        :type vowels: unicode
        :param digraphs: The digraphs that are used to determine the
                         region R1.
        :type digraphs: tuple
        :return: the region R1 for the respective word.
        :rtype: unicode
        :note: This helper method is invoked by the stem method of
               HungarianStemmer. It is not to be invoked directly!

        """
        if not word:
            return ""

        if word[0] in vowels:
            # Try the longest digraph first so that "dzs" wins over "dz".
            longest_first = sorted(digraphs, key=len, reverse=True)
            for i in range(1, len(word)):
                if word[i] not in vowels:
                    # A digraph only counts when it starts exactly at the
                    # first consonant; one further right is irrelevant.
                    for digraph in longest_first:
                        if word.startswith(digraph, i):
                            return word[i + len(digraph):]
                    return word[i + 1:]
        else:
            for i in range(1, len(word)):
                if word[i] in vowels:
                    return word[i + 1:]

        # Only vowels or only consonants: R1 is the null region.
        return ""
|