microproduct/atmosphericDelay/ISCEApp/site-packages/whoosh/lang/snowball/danish.py

from .bases import _ScandinavianStemmer

from whoosh.compat import u


class DanishStemmer(_ScandinavianStemmer):
    """
    The Danish Snowball stemmer.

    :cvar __vowels: The Danish vowels.
    :type __vowels: unicode
    :cvar __consonants: The Danish consonants.
    :type __consonants: unicode
    :cvar __double_consonants: The Danish double consonants.
    :type __double_consonants: tuple
    :cvar __s_ending: Letters that may directly appear before a word final 's'.
    :type __s_ending: unicode
    :cvar __step1_suffixes: Suffixes to be deleted in step 1 of the algorithm.
    :type __step1_suffixes: tuple
    :cvar __step2_suffixes: Suffixes to be deleted in step 2 of the algorithm.
    :type __step2_suffixes: tuple
    :cvar __step3_suffixes: Suffixes to be deleted in step 3 of the algorithm.
    :type __step3_suffixes: tuple
    :note: A detailed description of the Danish
           stemming algorithm can be found under
           http://snowball.tartarus.org/algorithms/danish/stemmer.html

    """

    # The language's vowels and other important characters are defined.
    __vowels = u("aeiouy\xE6\xE5\xF8")
    __consonants = "bcdfghjklmnpqrstvwxz"
    __double_consonants = ("bb", "cc", "dd", "ff", "gg", "hh", "jj",
                           "kk", "ll", "mm", "nn", "pp", "qq", "rr",
                           "ss", "tt", "vv", "ww", "xx", "zz")
    __s_ending = u("abcdfghjklmnoprtvyz\xE5")

    # The different suffixes, divided into the algorithm's steps
    # and organized by length, are listed in tuples.
    __step1_suffixes = ("erendes", "erende", "hedens", "ethed",
                        "erede", "heden", "heder", "endes",
                        "ernes", "erens", "erets", "ered",
                        "ende", "erne", "eren", "erer", "heds",
                        "enes", "eres", "eret", "hed", "ene", "ere",
                        "ens", "ers", "ets", "en", "er", "es", "et",
                        "e", "s")
    __step2_suffixes = ("gd", "dt", "gt", "kt")
    __step3_suffixes = ("elig", u("l\xF8st"), "lig", "els", "ig")

    def stem(self, word):
        """
        Stem a Danish word and return the stemmed form.

        :param word: The word that is stemmed.
        :type word: str or unicode
        :return: The stemmed form.
        :rtype: unicode

        """
        # Every word is put into lower case for normalization.
        word = word.lower()

        # After this, the required regions are generated
        # by the respective helper method.
        r1 = self._r1_scandinavian(word, self.__vowels)

        # Then the actual stemming process starts.
        # Every new step is explicitly indicated
        # according to the descriptions on the Snowball website.

        # STEP 1
        for suffix in self.__step1_suffixes:
            if r1.endswith(suffix):
                if suffix == "s":
                    if word[-2] in self.__s_ending:
                        word = word[:-1]
                        r1 = r1[:-1]
                else:
                    word = word[:-len(suffix)]
                    r1 = r1[:-len(suffix)]
                break

        # STEP 2
        for suffix in self.__step2_suffixes:
            if r1.endswith(suffix):
                word = word[:-1]
                r1 = r1[:-1]
                break

        # STEP 3
        if r1.endswith("igst"):
            word = word[:-2]
            r1 = r1[:-2]

        for suffix in self.__step3_suffixes:
            if r1.endswith(suffix):
                if suffix == u("l\xF8st"):
                    word = word[:-1]
                    r1 = r1[:-1]
                else:
                    word = word[:-len(suffix)]
                    r1 = r1[:-len(suffix)]

                    if r1.endswith(self.__step2_suffixes):
                        word = word[:-1]
                        r1 = r1[:-1]
                break

        # STEP 4: Undouble
        for double_cons in self.__double_consonants:
            if word.endswith(double_cons) and len(word) > 3:
                word = word[:-1]
                break

        return word
入所测试定稿上传 2023-08-28 10:17:29 +00:00			`from .bases import _ScandinavianStemmer`

			`from whoosh.compat import u`


			`class DanishStemmer(_ScandinavianStemmer):`
			`"""`
			`The Danish Snowball stemmer.`

			`:cvar __vowels: The Danish vowels.`
			`:type __vowels: unicode`
			`:cvar __consonants: The Danish consonants.`
			`:type __consonants: unicode`
			`:cvar __double_consonants: The Danish double consonants.`
			`:type __double_consonants: tuple`
			`:cvar __s_ending: Letters that may directly appear before a word final 's'.`
			`:type __s_ending: unicode`
			`:cvar __step1_suffixes: Suffixes to be deleted in step 1 of the algorithm.`
			`:type __step1_suffixes: tuple`
			`:cvar __step2_suffixes: Suffixes to be deleted in step 2 of the algorithm.`
			`:type __step2_suffixes: tuple`
			`:cvar __step3_suffixes: Suffixes to be deleted in step 3 of the algorithm.`
			`:type __step3_suffixes: tuple`
			`:note: A detailed description of the Danish`
			`stemming algorithm can be found under`
			`http://snowball.tartarus.org/algorithms/danish/stemmer.html`

			`"""`

			`# The language's vowels and other important characters are defined.`
			`__vowels = u("aeiouy\xE6\xE5\xF8")`
			`__consonants = "bcdfghjklmnpqrstvwxz"`
			`__double_consonants = ("bb", "cc", "dd", "ff", "gg", "hh", "jj",`
			`"kk", "ll", "mm", "nn", "pp", "qq", "rr",`
			`"ss", "tt", "vv", "ww", "xx", "zz")`
			`__s_ending = u("abcdfghjklmnoprtvyz\xE5")`

			`# The different suffixes, divided into the algorithm's steps`
			`# and organized by length, are listed in tuples.`
			`__step1_suffixes = ("erendes", "erende", "hedens", "ethed",`
			`"erede", "heden", "heder", "endes",`
			`"ernes", "erens", "erets", "ered",`
			`"ende", "erne", "eren", "erer", "heds",`
			`"enes", "eres", "eret", "hed", "ene", "ere",`
			`"ens", "ers", "ets", "en", "er", "es", "et",`
			`"e", "s")`
			`__step2_suffixes = ("gd", "dt", "gt", "kt")`
			`__step3_suffixes = ("elig", u("l\xF8st"), "lig", "els", "ig")`

			`def stem(self, word):`
			`"""`
			`Stem a Danish word and return the stemmed form.`

			`:param word: The word that is stemmed.`
			`:type word: str or unicode`
			`:return: The stemmed form.`
			`:rtype: unicode`

			`"""`
			`# Every word is put into lower case for normalization.`
			`word = word.lower()`

			`# After this, the required regions are generated`
			`# by the respective helper method.`
			`r1 = self._r1_scandinavian(word, self.__vowels)`

			`# Then the actual stemming process starts.`
			`# Every new step is explicitly indicated`
			`# according to the descriptions on the Snowball website.`

			`# STEP 1`
			`for suffix in self.__step1_suffixes:`
			`if r1.endswith(suffix):`
			`if suffix == "s":`
			`if word[-2] in self.__s_ending:`
			`word = word[:-1]`
			`r1 = r1[:-1]`
			`else:`
			`word = word[:-len(suffix)]`
			`r1 = r1[:-len(suffix)]`
			`break`

			`# STEP 2`
			`for suffix in self.__step2_suffixes:`
			`if r1.endswith(suffix):`
			`word = word[:-1]`
			`r1 = r1[:-1]`
			`break`

			`# STEP 3`
			`if r1.endswith("igst"):`
			`word = word[:-2]`
			`r1 = r1[:-2]`

			`for suffix in self.__step3_suffixes:`
			`if r1.endswith(suffix):`
			`if suffix == u("l\xF8st"):`
			`word = word[:-1]`
			`r1 = r1[:-1]`
			`else:`
			`word = word[:-len(suffix)]`
			`r1 = r1[:-len(suffix)]`

			`if r1.endswith(self.__step2_suffixes):`
			`word = word[:-1]`
			`r1 = r1[:-1]`
			`break`

			`# STEP 4: Undouble`
			`for double_cons in self.__double_consonants:`
			`if word.endswith(double_cons) and len(word) > 3:`
			`word = word[:-1]`
			`break`

			`return word`