134 lines
4.8 KiB
Python
134 lines
4.8 KiB
Python
|
# Base classes
|
||
|
|
||
|
|
||
|
class _ScandinavianStemmer(object):
    """
    Base class that encapsulates a method for defining the string region R1.

    It is used by the Danish, Norwegian, and Swedish stemmer.
    """

    def _r1_scandinavian(self, word, vowels):
        """
        Return the region R1 that is used by the Scandinavian stemmers.

        R1 is the region after the first non-vowel following a vowel,
        or is the null region at the end of the word if there is no
        such non-vowel. But then R1 is adjusted so that the region
        before it contains at least three letters.

        :param word: The word whose region R1 is determined.
        :type word: str or unicode
        :param vowels: The vowels of the respective language that are
                       used to determine the region R1.
        :type vowels: unicode
        :return: the region R1 for the respective word; the empty string
                 if the word contains no non-vowel preceded by a vowel.
        :rtype: unicode
        :note: This helper method is invoked by the respective stem method of
               the subclasses DanishStemmer, NorwegianStemmer, and
               SwedishStemmer. It is not to be invoked directly!

        """
        for i in range(1, len(word)):
            if word[i] not in vowels and word[i - 1] in vowels:
                # R1 normally starts right after the non-vowel (i + 1), but
                # it is pushed back to index 3 when fewer than three letters
                # would otherwise precede it. Since i >= 1, the prefix
                # word[:i + 1] always has at least two letters, so no other
                # case can occur.
                return word[max(i + 1, 3):]

        # No vowel/non-vowel pair found: R1 is the null region.
        return ""
|
||
|
|
||
|
|
||
|
class _StandardStemmer(object):
    """
    Base class that encapsulates two methods for defining the standard
    versions of the string regions R1, R2, and RV.
    """

    def _r1r2_standard(self, word, vowels):
        """
        Return the standard interpretations of the string regions R1 and R2.

        R1 is the region after the first non-vowel following a vowel,
        or is the null region at the end of the word if there is no
        such non-vowel.

        R2 is the region after the first non-vowel following a vowel
        in R1, or is the null region at the end of the word if there
        is no such non-vowel.

        :param word: The word whose regions R1 and R2 are determined.
        :type word: str or unicode
        :param vowels: The vowels of the respective language that are
                       used to determine the regions R1 and R2.
        :type vowels: unicode
        :return: (r1,r2), the regions R1 and R2 for the respective word.
        :rtype: tuple
        :note: This helper method is invoked by the respective stem method of
               the subclasses DutchStemmer, FinnishStemmer,
               FrenchStemmer, GermanStemmer, ItalianStemmer,
               PortugueseStemmer, RomanianStemmer, and SpanishStemmer.
               It is not to be invoked directly!
        :note: A detailed description of how to define R1 and R2
               can be found at http://snowball.tartarus.org/texts/r1r2.html

        """
        r1 = ""
        r2 = ""

        # R1: everything after the first vowel -> non-vowel transition.
        for i in range(1, len(word)):
            if word[i] not in vowels and word[i - 1] in vowels:
                r1 = word[i + 1:]
                break

        # R2: the same rule applied again, this time inside R1.
        for i in range(1, len(r1)):
            if r1[i] not in vowels and r1[i - 1] in vowels:
                r2 = r1[i + 1:]
                break

        return (r1, r2)

    def _rv_standard(self, word, vowels):
        """
        Return the standard interpretation of the string region RV.

        If the second letter is a consonant, RV is the region after the
        next following vowel. If the first two letters are vowels, RV is
        the region after the next following consonant. Otherwise, RV is
        the region after the third letter.

        :param word: The word whose region RV is determined.
        :type word: str or unicode
        :param vowels: The vowels of the respective language that are
                       used to determine the region RV.
        :type vowels: unicode
        :return: the region RV for the respective word; the empty string
                 if the word has fewer than two letters or if the letter
                 searched for does not occur.
        :rtype: unicode
        :note: This helper method is invoked by the respective stem method of
               the subclasses ItalianStemmer, PortugueseStemmer,
               RomanianStemmer, and SpanishStemmer. It is not to be
               invoked directly!

        """
        rv = ""
        if len(word) >= 2:
            if word[1] not in vowels:
                # Second letter is a consonant: RV follows the next vowel.
                for i in range(2, len(word)):
                    if word[i] in vowels:
                        rv = word[i + 1:]
                        break

            # Test each letter separately. Checking the two-letter prefix
            # against the vowel string would be a substring test and would
            # miss vowel pairs that are not adjacent in that string.
            elif word[0] in vowels and word[1] in vowels:
                # First two letters are vowels: RV follows the next consonant.
                for i in range(2, len(word)):
                    if word[i] not in vowels:
                        rv = word[i + 1:]
                        break
            else:
                # Consonant followed by vowel: RV follows the third letter.
                rv = word[3:]

        return rv
|