microproduct/atmosphericDelay-s1a/ISCEApp/site-packages/whoosh/lang/snowball/russian.py

from whoosh.compat import u

class RussianStemmer(object):
    """
    The Russian Snowball stemmer.

    :cvar __perfective_gerund_suffixes: Suffixes to be deleted.
    :type __perfective_gerund_suffixes: tuple
    :cvar __adjectival_suffixes: Suffixes to be deleted.
    :type __adjectival_suffixes: tuple
    :cvar __reflexive_suffixes: Suffixes to be deleted.
    :type __reflexive_suffixes: tuple
    :cvar __verb_suffixes: Suffixes to be deleted.
    :type __verb_suffixes: tuple
    :cvar __noun_suffixes: Suffixes to be deleted.
    :type __noun_suffixes: tuple
    :cvar __superlative_suffixes: Suffixes to be deleted.
    :type __superlative_suffixes: tuple
    :cvar __derivational_suffixes: Suffixes to be deleted.
    :type __derivational_suffixes: tuple
    :note: A detailed description of the Russian
           stemming algorithm can be found under
           http://snowball.tartarus.org/algorithms/russian/stemmer.html

    """

    __perfective_gerund_suffixes = ("ivshis'", "yvshis'", "vshis'",
                                      "ivshi", "yvshi", "vshi", "iv",
                                      "yv", "v")
    __adjectival_suffixes = ('ui^ushchi^ui^u', 'ui^ushchi^ai^a',
                               'ui^ushchimi', 'ui^ushchymi', 'ui^ushchego',
                               'ui^ushchogo', 'ui^ushchemu', 'ui^ushchomu',
                               'ui^ushchikh', 'ui^ushchykh',
                               'ui^ushchui^u', 'ui^ushchaia',
                               'ui^ushchoi^u', 'ui^ushchei^u',
                               'i^ushchi^ui^u', 'i^ushchi^ai^a',
                               'ui^ushchee', 'ui^ushchie',
                               'ui^ushchye', 'ui^ushchoe', 'ui^ushchei`',
                               'ui^ushchii`', 'ui^ushchyi`',
                               'ui^ushchoi`', 'ui^ushchem', 'ui^ushchim',
                               'ui^ushchym', 'ui^ushchom', 'i^ushchimi',
                               'i^ushchymi', 'i^ushchego', 'i^ushchogo',
                               'i^ushchemu', 'i^ushchomu', 'i^ushchikh',
                               'i^ushchykh', 'i^ushchui^u', 'i^ushchai^a',
                               'i^ushchoi^u', 'i^ushchei^u', 'i^ushchee',
                               'i^ushchie', 'i^ushchye', 'i^ushchoe',
                               'i^ushchei`', 'i^ushchii`',
                               'i^ushchyi`', 'i^ushchoi`', 'i^ushchem',
                               'i^ushchim', 'i^ushchym', 'i^ushchom',
                               'shchi^ui^u', 'shchi^ai^a', 'ivshi^ui^u',
                               'ivshi^ai^a', 'yvshi^ui^u', 'yvshi^ai^a',
                               'shchimi', 'shchymi', 'shchego', 'shchogo',
                               'shchemu', 'shchomu', 'shchikh', 'shchykh',
                               'shchui^u', 'shchai^a', 'shchoi^u',
                               'shchei^u', 'ivshimi', 'ivshymi',
                               'ivshego', 'ivshogo', 'ivshemu', 'ivshomu',
                               'ivshikh', 'ivshykh', 'ivshui^u',
                               'ivshai^a', 'ivshoi^u', 'ivshei^u',
                               'yvshimi', 'yvshymi', 'yvshego', 'yvshogo',
                               'yvshemu', 'yvshomu', 'yvshikh', 'yvshykh',
                               'yvshui^u', 'yvshai^a', 'yvshoi^u',
                               'yvshei^u', 'vshi^ui^u', 'vshi^ai^a',
                               'shchee', 'shchie', 'shchye', 'shchoe',
                               'shchei`', 'shchii`', 'shchyi`', 'shchoi`',
                               'shchem', 'shchim', 'shchym', 'shchom',
                               'ivshee', 'ivshie', 'ivshye', 'ivshoe',
                               'ivshei`', 'ivshii`', 'ivshyi`',
                               'ivshoi`', 'ivshem', 'ivshim', 'ivshym',
                               'ivshom', 'yvshee', 'yvshie', 'yvshye',
                               'yvshoe', 'yvshei`', 'yvshii`',
                               'yvshyi`', 'yvshoi`', 'yvshem',
                               'yvshim', 'yvshym', 'yvshom', 'vshimi',
                               'vshymi', 'vshego', 'vshogo', 'vshemu',
                               'vshomu', 'vshikh', 'vshykh', 'vshui^u',
                               'vshai^a', 'vshoi^u', 'vshei^u',
                               'emi^ui^u', 'emi^ai^a', 'nni^ui^u',
                               'nni^ai^a', 'vshee',
                               'vshie', 'vshye', 'vshoe', 'vshei`',
                               'vshii`', 'vshyi`', 'vshoi`',
                               'vshem', 'vshim', 'vshym', 'vshom',
                               'emimi', 'emymi', 'emego', 'emogo',
                               'ememu', 'emomu', 'emikh', 'emykh',
                               'emui^u', 'emai^a', 'emoi^u', 'emei^u',
                               'nnimi', 'nnymi', 'nnego', 'nnogo',
                               'nnemu', 'nnomu', 'nnikh', 'nnykh',
                               'nnui^u', 'nnai^a', 'nnoi^u', 'nnei^u',
                               'emee', 'emie', 'emye', 'emoe',
                               'emei`', 'emii`', 'emyi`',
                               'emoi`', 'emem', 'emim', 'emym',
                               'emom', 'nnee', 'nnie', 'nnye', 'nnoe',
                               'nnei`', 'nnii`', 'nnyi`',
                               'nnoi`', 'nnem', 'nnim', 'nnym',
                               'nnom', 'i^ui^u', 'i^ai^a', 'imi', 'ymi',
                               'ego', 'ogo', 'emu', 'omu', 'ikh',
                               'ykh', 'ui^u', 'ai^a', 'oi^u', 'ei^u',
                               'ee', 'ie', 'ye', 'oe', 'ei`',
                               'ii`', 'yi`', 'oi`', 'em',
                               'im', 'ym', 'om')
    __reflexive_suffixes = ("si^a", "s'")
    __verb_suffixes = ("esh'", 'ei`te', 'ui`te', 'ui^ut',
                         "ish'", 'ete', 'i`te', 'i^ut', 'nno',
                         'ila', 'yla', 'ena', 'ite', 'ili', 'yli',
                         'ilo', 'ylo', 'eno', 'i^at', 'uet', 'eny',
                         "it'", "yt'", 'ui^u', 'la', 'na', 'li',
                         'em', 'lo', 'no', 'et', 'ny', "t'",
                         'ei`', 'ui`', 'il', 'yl', 'im',
                         'ym', 'en', 'it', 'yt', 'i^u', 'i`',
                         'l', 'n')
    __noun_suffixes = ('ii^ami', 'ii^akh', 'i^ami', 'ii^am', 'i^akh',
                         'ami', 'iei`', 'i^am', 'iem', 'akh',
                         'ii^u', "'i^u", 'ii^a', "'i^a", 'ev', 'ov',
                         'ie', "'e", 'ei', 'ii', 'ei`',
                         'oi`', 'ii`', 'em', 'am', 'om',
                         'i^u', 'i^a', 'a', 'e', 'i', 'i`',
                         'o', 'u', 'y', "'")
    __superlative_suffixes = ("ei`she", "ei`sh")
    __derivational_suffixes = ("ost'", "ost")

    def stem(self, word):
        """
        Stem a Russian word and return the stemmed form.

        :param word: The word that is stemmed.
        :type word: str or unicode
        :return: The stemmed form.
        :rtype: unicode

        """
        chr_exceeded = False
        for i in range(len(word)):
            if ord(word[i]) > 255:
                chr_exceeded = True
                break

        if chr_exceeded:
            word = self.__cyrillic_to_roman(word)

        step1_success = False
        adjectival_removed = False
        verb_removed = False
        undouble_success = False
        superlative_removed = False

        rv, r2 = self.__regions_russian(word)

        # Step 1
        for suffix in self.__perfective_gerund_suffixes:
            if rv.endswith(suffix):
                if suffix in ("v", "vshi", "vshis'"):
                    if (rv[-len(suffix) - 3:-len(suffix)] == "i^a" or
                        rv[-len(suffix) - 1:-len(suffix)] == "a"):
                        word = word[:-len(suffix)]
                        r2 = r2[:-len(suffix)]
                        rv = rv[:-len(suffix)]
                        step1_success = True
                        break
                else:
                    word = word[:-len(suffix)]
                    r2 = r2[:-len(suffix)]
                    rv = rv[:-len(suffix)]
                    step1_success = True
                    break

        if not step1_success:
            for suffix in self.__reflexive_suffixes:
                if rv.endswith(suffix):
                    word = word[:-len(suffix)]
                    r2 = r2[:-len(suffix)]
                    rv = rv[:-len(suffix)]
                    break

            for suffix in self.__adjectival_suffixes:
                if rv.endswith(suffix):
                    if suffix in ('i^ushchi^ui^u', 'i^ushchi^ai^a',
                              'i^ushchui^u', 'i^ushchai^a', 'i^ushchoi^u',
                              'i^ushchei^u', 'i^ushchimi', 'i^ushchymi',
                              'i^ushchego', 'i^ushchogo', 'i^ushchemu',
                              'i^ushchomu', 'i^ushchikh', 'i^ushchykh',
                              'shchi^ui^u', 'shchi^ai^a', 'i^ushchee',
                              'i^ushchie', 'i^ushchye', 'i^ushchoe',
                              'i^ushchei`', 'i^ushchii`', 'i^ushchyi`',
                              'i^ushchoi`', 'i^ushchem', 'i^ushchim',
                              'i^ushchym', 'i^ushchom', 'vshi^ui^u',
                              'vshi^ai^a', 'shchui^u', 'shchai^a',
                              'shchoi^u', 'shchei^u', 'emi^ui^u',
                              'emi^ai^a', 'nni^ui^u', 'nni^ai^a',
                              'shchimi', 'shchymi', 'shchego', 'shchogo',
                              'shchemu', 'shchomu', 'shchikh', 'shchykh',
                              'vshui^u', 'vshai^a', 'vshoi^u', 'vshei^u',
                              'shchee', 'shchie', 'shchye', 'shchoe',
                              'shchei`', 'shchii`', 'shchyi`', 'shchoi`',
                              'shchem', 'shchim', 'shchym', 'shchom',
                              'vshimi', 'vshymi', 'vshego', 'vshogo',
                              'vshemu', 'vshomu', 'vshikh', 'vshykh',
                              'emui^u', 'emai^a', 'emoi^u', 'emei^u',
                              'nnui^u', 'nnai^a', 'nnoi^u', 'nnei^u',
                              'vshee', 'vshie', 'vshye', 'vshoe',
                              'vshei`', 'vshii`', 'vshyi`', 'vshoi`',
                              'vshem', 'vshim', 'vshym', 'vshom',
                              'emimi', 'emymi', 'emego', 'emogo',
                              'ememu', 'emomu', 'emikh', 'emykh',
                              'nnimi', 'nnymi', 'nnego', 'nnogo',
                              'nnemu', 'nnomu', 'nnikh', 'nnykh',
                              'emee', 'emie', 'emye', 'emoe', 'emei`',
                              'emii`', 'emyi`', 'emoi`', 'emem', 'emim',
                              'emym', 'emom', 'nnee', 'nnie', 'nnye',
                              'nnoe', 'nnei`', 'nnii`', 'nnyi`', 'nnoi`',
                              'nnem', 'nnim', 'nnym', 'nnom'):
                        if (rv[-len(suffix) - 3:-len(suffix)] == "i^a" or
                            rv[-len(suffix) - 1:-len(suffix)] == "a"):
                            word = word[:-len(suffix)]
                            r2 = r2[:-len(suffix)]
                            rv = rv[:-len(suffix)]
                            adjectival_removed = True
                            break
                    else:
                        word = word[:-len(suffix)]
                        r2 = r2[:-len(suffix)]
                        rv = rv[:-len(suffix)]
                        adjectival_removed = True
                        break

            if not adjectival_removed:
                for suffix in self.__verb_suffixes:
                    if rv.endswith(suffix):
                        if suffix in ("la", "na", "ete", "i`te", "li",
                                      "i`", "l", "em", "n", "lo", "no",
                                      "et", "i^ut", "ny", "t'", "esh'",
                                      "nno"):
                            if (rv[-len(suffix) - 3:-len(suffix)] == "i^a" or
                                rv[-len(suffix) - 1:-len(suffix)] == "a"):
                                word = word[:-len(suffix)]
                                r2 = r2[:-len(suffix)]
                                rv = rv[:-len(suffix)]
                                verb_removed = True
                                break
                        else:
                            word = word[:-len(suffix)]
                            r2 = r2[:-len(suffix)]
                            rv = rv[:-len(suffix)]
                            verb_removed = True
                            break

            if not adjectival_removed and not verb_removed:
                for suffix in self.__noun_suffixes:
                    if rv.endswith(suffix):
                        word = word[:-len(suffix)]
                        r2 = r2[:-len(suffix)]
                        rv = rv[:-len(suffix)]
                        break

        # Step 2
        if rv.endswith("i"):
            word = word[:-1]
            r2 = r2[:-1]

        # Step 3
        for suffix in self.__derivational_suffixes:
            if r2.endswith(suffix):
                word = word[:-len(suffix)]
                break

        # Step 4
        if word.endswith("nn"):
            word = word[:-1]
            undouble_success = True

        if not undouble_success:
            for suffix in self.__superlative_suffixes:
                if word.endswith(suffix):
                    word = word[:-len(suffix)]
                    superlative_removed = True
                    break
            if word.endswith("nn"):
                word = word[:-1]

        if not undouble_success and not superlative_removed:
            if word.endswith("'"):
                word = word[:-1]

        if chr_exceeded:
            word = self.__roman_to_cyrillic(word)
        return word

    def __regions_russian(self, word):
        """
        Return the regions RV and R2 which are used by the Russian stemmer.

        In any word, RV is the region after the first vowel,
        or the end of the word if it contains no vowel.

        R2 is the region after the first non-vowel following
        a vowel in R1, or the end of the word if there is no such non-vowel.

        R1 is the region after the first non-vowel following a vowel,
        or the end of the word if there is no such non-vowel.

        :param word: The Russian word whose regions RV and R2 are determined.
        :type word: str or unicode
        :return: the regions RV and R2 for the respective Russian word.
        :rtype: tuple
        :note: This helper method is invoked by the stem method of the subclass
               RussianStemmer. It is not to be invoked directly!

        """
        r1 = ""
        r2 = ""
        rv = ""

        vowels = ("A", "U", "E", "a", "e", "i", "o", "u", "y")
        word = (word.replace("i^a", "A")
                    .replace("i^u", "U")
                    .replace("e`", "E"))

        for i in range(1, len(word)):
            if word[i] not in vowels and word[i - 1] in vowels:
                r1 = word[i + 1:]
                break

        for i in range(1, len(r1)):
            if r1[i] not in vowels and r1[i - 1] in vowels:
                r2 = r1[i + 1:]
                break

        for i in range(len(word)):
            if word[i] in vowels:
                rv = word[i + 1:]
                break

        r2 = (r2.replace("A", "i^a")
                .replace("U", "i^u")
                .replace("E", "e`"))
        rv = (rv.replace("A", "i^a")
              .replace("U", "i^u")
              .replace("E", "e`"))
        return (rv, r2)

    def __cyrillic_to_roman(self, word):
        """
        Transliterate a Russian word into the Roman alphabet.

        A Russian word whose letters consist of the Cyrillic
        alphabet are transliterated into the Roman alphabet
        in order to ease the forthcoming stemming process.

        :param word: The word that is transliterated.
        :type word: unicode
        :return: the transliterated word.
        :rtype: unicode
        :note: This helper method is invoked by the stem method of the subclass
               RussianStemmer. It is not to be invoked directly!

        """
        word = (word.replace(u("\u0410"), "a").replace(u("\u0430"), "a")
                    .replace(u("\u0411"), "b").replace(u("\u0431"), "b")
                    .replace(u("\u0412"), "v").replace(u("\u0432"), "v")
                    .replace(u("\u0413"), "g").replace(u("\u0433"), "g")
                    .replace(u("\u0414"), "d").replace(u("\u0434"), "d")
                    .replace(u("\u0415"), "e").replace(u("\u0435"), "e")
                    .replace(u("\u0401"), "e").replace(u("\u0451"), "e")
                    .replace(u("\u0416"), "zh").replace(u("\u0436"), "zh")
                    .replace(u("\u0417"), "z").replace(u("\u0437"), "z")
                    .replace(u("\u0418"), "i").replace(u("\u0438"), "i")
                    .replace(u("\u0419"), "i`").replace(u("\u0439"), "i`")
                    .replace(u("\u041A"), "k").replace(u("\u043A"), "k")
                    .replace(u("\u041B"), "l").replace(u("\u043B"), "l")
                    .replace(u("\u041C"), "m").replace(u("\u043C"), "m")
                    .replace(u("\u041D"), "n").replace(u("\u043D"), "n")
                    .replace(u("\u041E"), "o").replace(u("\u043E"), "o")
                    .replace(u("\u041F"), "p").replace(u("\u043F"), "p")
                    .replace(u("\u0420"), "r").replace(u("\u0440"), "r")
                    .replace(u("\u0421"), "s").replace(u("\u0441"), "s")
                    .replace(u("\u0422"), "t").replace(u("\u0442"), "t")
                    .replace(u("\u0423"), "u").replace(u("\u0443"), "u")
                    .replace(u("\u0424"), "f").replace(u("\u0444"), "f")
                    .replace(u("\u0425"), "kh").replace(u("\u0445"), "kh")
                    .replace(u("\u0426"), "t^s").replace(u("\u0446"), "t^s")
                    .replace(u("\u0427"), "ch").replace(u("\u0447"), "ch")
                    .replace(u("\u0428"), "sh").replace(u("\u0448"), "sh")
                    .replace(u("\u0429"), "shch").replace(u("\u0449"), "shch")
                    .replace(u("\u042A"), "''").replace(u("\u044A"), "''")
                    .replace(u("\u042B"), "y").replace(u("\u044B"), "y")
                    .replace(u("\u042C"), "'").replace(u("\u044C"), "'")
                    .replace(u("\u042D"), "e`").replace(u("\u044D"), "e`")
                    .replace(u("\u042E"), "i^u").replace(u("\u044E"), "i^u")
                    .replace(u("\u042F"), "i^a").replace(u("\u044F"), "i^a"))
        return word

    def __roman_to_cyrillic(self, word):
        """
        Transliterate a Russian word back into the Cyrillic alphabet.

        A Russian word formerly transliterated into the Roman alphabet
        in order to ease the stemming process, is transliterated back
        into the Cyrillic alphabet, its original form.

        :param word: The word that is transliterated.
        :type word: str or unicode
        :return: word, the transliterated word.
        :rtype: unicode
        :note: This helper method is invoked by the stem method of the subclass
               RussianStemmer. It is not to be invoked directly!

        """
        word = (word.replace("i^u", u("\u044E")).replace("i^a", u("\u044F"))
                    .replace("shch", u("\u0449")).replace("kh", u("\u0445"))
                    .replace("t^s", u("\u0446")).replace("ch", u("\u0447"))
                    .replace("e`", u("\u044D")).replace("i`", u("\u0439"))
                    .replace("sh", u("\u0448")).replace("k", u("\u043A"))
                    .replace("e", u("\u0435")).replace("zh", u("\u0436"))
                    .replace("a", u("\u0430")).replace("b", u("\u0431"))
                    .replace("v", u("\u0432")).replace("g", u("\u0433"))
                    .replace("d", u("\u0434")).replace("e", u("\u0435"))
                    .replace("z", u("\u0437")).replace("i", u("\u0438"))
                    .replace("l", u("\u043B")).replace("m", u("\u043C"))
                    .replace("n", u("\u043D")).replace("o", u("\u043E"))
                    .replace("p", u("\u043F")).replace("r", u("\u0440"))
                    .replace("s", u("\u0441")).replace("t", u("\u0442"))
                    .replace("u", u("\u0443")).replace("f", u("\u0444"))
                    .replace("''", u("\u044A")).replace("y", u("\u044B"))
                    .replace("'", u("\u044C")))
        return word