microproduct/atmosphericDelay/ISCEApp/site-packages/whoosh/lang/snowball/finnish.py

from .bases import _StandardStemmer

from whoosh.compat import u


class FinnishStemmer(_StandardStemmer):
    """
    The Finnish Snowball stemmer.

    :cvar __vowels: The Finnish vowels.
    :type __vowels: unicode
    :cvar __restricted_vowels: A subset of the Finnish vowels.
    :type __restricted_vowels: unicode
    :cvar __long_vowels: The Finnish vowels in their long forms.
    :type __long_vowels: tuple
    :cvar __consonants: The Finnish consonants.
    :type __consonants: unicode
    :cvar __double_consonants: The Finnish double consonants.
    :type __double_consonants: tuple
    :cvar __step1_suffixes: Suffixes to be deleted in step 1 of the algorithm.
    :type __step1_suffixes: tuple
    :cvar __step2_suffixes: Suffixes to be deleted in step 2 of the algorithm.
    :type __step2_suffixes: tuple
    :cvar __step3_suffixes: Suffixes to be deleted in step 3 of the algorithm.
    :type __step3_suffixes: tuple
    :cvar __step4_suffixes: Suffixes to be deleted in step 4 of the algorithm.
    :type __step4_suffixes: tuple
    :note: A detailed description of the Finnish
           stemming algorithm can be found under
           http://snowball.tartarus.org/algorithms/finnish/stemmer.html
    """

    __vowels = u("aeiouy\xE4\xF6")
    __restricted_vowels = u("aeiou\xE4\xF6")
    __long_vowels = ("aa", "ee", "ii", "oo", "uu", u("\xE4\xE4"),
                     u("\xF6\xF6"))
    __consonants = "bcdfghjklmnpqrstvwxz"
    __double_consonants = ("bb", "cc", "dd", "ff", "gg", "hh", "jj",
                           "kk", "ll", "mm", "nn", "pp", "qq", "rr",
                           "ss", "tt", "vv", "ww", "xx", "zz")
    __step1_suffixes = ('kaan', u('k\xE4\xE4n'), 'sti', 'kin', 'han',
                        u('h\xE4n'), 'ko', u('k\xF6'), 'pa', u('p\xE4'))
    __step2_suffixes = ('nsa', u('ns\xE4'), 'mme', 'nne', 'si', 'ni',
                        'an', u('\xE4n'), 'en')
    __step3_suffixes = ('siin', 'tten', 'seen', 'han', 'hen', 'hin',
                        'hon', u('h\xE4n'), u('h\xF6n'), 'den', 'tta',
                        u('tt\xE4'), 'ssa', u('ss\xE4'), 'sta',
                        u('st\xE4'), 'lla', u('ll\xE4'), 'lta',
                        u('lt\xE4'), 'lle', 'ksi', 'ine', 'ta',
                        u('t\xE4'), 'na', u('n\xE4'), 'a', u('\xE4'),
                        'n')
    __step4_suffixes = ('impi', 'impa', u('imp\xE4'), 'immi', 'imma',
                        u('imm\xE4'), 'mpi', 'mpa', u('mp\xE4'), 'mmi',
                        'mma', u('mm\xE4'), 'eja', u('ej\xE4'))

    def stem(self, word):
        """
        Stem a Finnish word and return the stemmed form.

        :param word: The word that is stemmed.
        :type word: str or unicode
        :return: The stemmed form.
        :rtype: unicode

        """
        word = word.lower()

        step3_success = False

        r1, r2 = self._r1r2_standard(word, self.__vowels)

        # STEP 1: Particles etc.
        for suffix in self.__step1_suffixes:
            if r1.endswith(suffix):
                if suffix == "sti":
                    if suffix in r2:
                        word = word[:-3]
                        r1 = r1[:-3]
                        r2 = r2[:-3]
                else:
                    if word[-len(suffix) - 1] in u("ntaeiouy\xE4\xF6"):
                        word = word[:-len(suffix)]
                        r1 = r1[:-len(suffix)]
                        r2 = r2[:-len(suffix)]
                break

        # STEP 2: Possessives
        for suffix in self.__step2_suffixes:
            if r1.endswith(suffix):
                if suffix == "si":
                    if word[-3] != "k":
                        word = word[:-2]
                        r1 = r1[:-2]
                        r2 = r2[:-2]

                elif suffix == "ni":
                    word = word[:-2]
                    r1 = r1[:-2]
                    r2 = r2[:-2]
                    if word.endswith("kse"):
                        word = "".join((word[:-3], "ksi"))

                    if r1.endswith("kse"):
                        r1 = "".join((r1[:-3], "ksi"))

                    if r2.endswith("kse"):
                        r2 = "".join((r2[:-3], "ksi"))

                elif suffix == "an":
                    if (word[-4:-2] in ("ta", "na") or
                        word[-5:-2] in ("ssa", "sta", "lla", "lta")):
                        word = word[:-2]
                        r1 = r1[:-2]
                        r2 = r2[:-2]

                elif suffix == u("\xE4n"):
                    if (word[-4:-2] in (u("t\xE4"), u("n\xE4")) or
                        word[-5:-2] in (u("ss\xE4"), u("st\xE4"),
                                        u("ll\xE4"), u("lt\xE4"))):
                        word = word[:-2]
                        r1 = r1[:-2]
                        r2 = r2[:-2]

                elif suffix == "en":
                    if word[-5:-2] in ("lle", "ine"):
                        word = word[:-2]
                        r1 = r1[:-2]
                        r2 = r2[:-2]
                else:
                    word = word[:-3]
                    r1 = r1[:-3]
                    r2 = r2[:-3]
                break

        # STEP 3: Cases
        for suffix in self.__step3_suffixes:
            if r1.endswith(suffix):
                if suffix in ("han", "hen", "hin", "hon", u("h\xE4n"),
                              u("h\xF6n")):
                    if ((suffix == "han" and word[-4] == "a") or
                        (suffix == "hen" and word[-4] == "e") or
                        (suffix == "hin" and word[-4] == "i") or
                        (suffix == "hon" and word[-4] == "o") or
                        (suffix == u("h\xE4n") and word[-4] == u("\xE4")) or
                        (suffix == u("h\xF6n") and word[-4] == u("\xF6"))):
                        word = word[:-3]
                        r1 = r1[:-3]
                        r2 = r2[:-3]
                        step3_success = True

                elif suffix in ("siin", "den", "tten"):
                    if (word[-len(suffix) - 1] == "i" and
                        word[-len(suffix) - 2] in self.__restricted_vowels):
                        word = word[:-len(suffix)]
                        r1 = r1[:-len(suffix)]
                        r2 = r2[:-len(suffix)]
                        step3_success = True
                    else:
                        continue

                elif suffix == "seen":
                    if word[-6:-4] in self.__long_vowels:
                        word = word[:-4]
                        r1 = r1[:-4]
                        r2 = r2[:-4]
                        step3_success = True
                    else:
                        continue

                elif suffix in ("a", u("\xE4")):
                    if word[-2] in self.__vowels and word[-3] in self.__consonants:
                        word = word[:-1]
                        r1 = r1[:-1]
                        r2 = r2[:-1]
                        step3_success = True

                elif suffix in ("tta", u("tt\xE4")):
                    if word[-4] == "e":
                        word = word[:-3]
                        r1 = r1[:-3]
                        r2 = r2[:-3]
                        step3_success = True

                elif suffix == "n":
                    word = word[:-1]
                    r1 = r1[:-1]
                    r2 = r2[:-1]
                    step3_success = True

                    if word[-2:] == "ie" or word[-2:] in self.__long_vowels:
                        word = word[:-1]
                        r1 = r1[:-1]
                        r2 = r2[:-1]
                else:
                    word = word[:-len(suffix)]
                    r1 = r1[:-len(suffix)]
                    r2 = r2[:-len(suffix)]
                    step3_success = True
                break

        # STEP 4: Other endings
        for suffix in self.__step4_suffixes:
            if r2.endswith(suffix):
                if suffix in ("mpi", "mpa", u("mp\xE4"), "mmi", "mma",
                              u("mm\xE4")):
                    if word[-5:-3] != "po":
                        word = word[:-3]
                        r1 = r1[:-3]
                        r2 = r2[:-3]
                else:
                    word = word[:-len(suffix)]
                    r1 = r1[:-len(suffix)]
                    r2 = r2[:-len(suffix)]
                break

        # STEP 5: Plurals
        if step3_success and len(r1) >= 1 and r1[-1] in "ij":
            word = word[:-1]
            r1 = r1[:-1]

        elif (not step3_success and len(r1) >= 2 and
              r1[-1] == "t" and r1[-2] in self.__vowels):
            word = word[:-1]
            r1 = r1[:-1]
            r2 = r2[:-1]
            if r2.endswith("imma"):
                word = word[:-4]
                r1 = r1[:-4]
            elif r2.endswith("mma") and r2[-5:-3] != "po":
                word = word[:-3]
                r1 = r1[:-3]

        # STEP 6: Tidying up
        if r1[-2:] in self.__long_vowels:
            word = word[:-1]
            r1 = r1[:-1]

        if (len(r1) >= 2 and r1[-2] in self.__consonants and
            r1[-1] in u("a\xE4ei")):
            word = word[:-1]
            r1 = r1[:-1]

        if r1.endswith(("oj", "uj")):
            word = word[:-1]
            r1 = r1[:-1]

        if r1.endswith("jo"):
            word = word[:-1]
            r1 = r1[:-1]

        # If the word ends with a double consonant
        # followed by zero or more vowels, the last consonant is removed.
        for i in range(1, len(word)):
            if word[-i] in self.__vowels:
                continue
            else:
                if i == 1:
                    if word[-i - 1:] in self.__double_consonants:
                        word = word[:-1]
                else:
                    if word[-i - 1:-i + 1] in self.__double_consonants:
                        word = "".join((word[:-i], word[-i + 1:]))
                break


        return word