microproduct/atmosphericDelay/ISCEApp/site-packages/whoosh/lang/porter2.py

314 lines
8.1 KiB
Python

"""An implementation of the Porter2 stemming algorithm.
See http://snowball.tartarus.org/algorithms/english/stemmer.html
Adapted from pyporter2 by Michael Dirolf.
This algorithm is more correct but (at least in this implementation)
several times slower than the original porter algorithm as implemented
in stemming.porter.
"""
import re
r_exp = re.compile(r"[^aeiouy]*[aeiouy]+[^aeiouy](\w*)")
ewss_exp1 = re.compile(r"^[aeiouy][^aeiouy]$")
ewss_exp2 = re.compile(r".*[^aeiouy][aeiouy][^aeiouywxY]$")
ccy_exp = re.compile(r"([aeiouy])y")
s1a_exp = re.compile(r"[aeiouy].")
s1b_exp = re.compile(r"[aeiouy]")
def get_r1(word):
# exceptional forms
if word.startswith('gener') or word.startswith('arsen'):
return 5
if word.startswith('commun'):
return 6
# normal form
match = r_exp.match(word)
if match:
return match.start(1)
return len(word)
def get_r2(word):
match = r_exp.match(word, get_r1(word))
if match:
return match.start(1)
return len(word)
def ends_with_short_syllable(word):
if len(word) == 2:
if ewss_exp1.match(word):
return True
if ewss_exp2.match(word):
return True
return False
def is_short_word(word):
if ends_with_short_syllable(word):
if get_r1(word) == len(word):
return True
return False
def remove_initial_apostrophe(word):
if word.startswith("'"):
return word[1:]
return word
def capitalize_consonant_ys(word):
if word.startswith('y'):
word = 'Y' + word[1:]
return ccy_exp.sub('\g<1>Y', word)
def step_0(word):
if word.endswith("'s'"):
return word[:-3]
if word.endswith("'s"):
return word[:-2]
if word.endswith("'"):
return word[:-1]
return word
def step_1a(word):
if word.endswith('sses'):
return word[:-4] + 'ss'
if word.endswith('ied') or word.endswith('ies'):
if len(word) > 4:
return word[:-3] + 'i'
else:
return word[:-3] + 'ie'
if word.endswith('us') or word.endswith('ss'):
return word
if word.endswith('s'):
preceding = word[:-1]
if s1a_exp.search(preceding):
return preceding
return word
return word
doubles = ('bb', 'dd', 'ff', 'gg', 'mm', 'nn', 'pp', 'rr', 'tt')
def ends_with_double(word):
for double in doubles:
if word.endswith(double):
return True
return False
def step_1b_helper(word):
if word.endswith('at') or word.endswith('bl') or word.endswith('iz'):
return word + 'e'
if ends_with_double(word):
return word[:-1]
if is_short_word(word):
return word + 'e'
return word
s1b_suffixes = ('ed', 'edly', 'ing', 'ingly')
def step_1b(word, r1):
if word.endswith('eedly'):
if len(word) - 5 >= r1:
return word[:-3]
return word
if word.endswith('eed'):
if len(word) - 3 >= r1:
return word[:-1]
return word
for suffix in s1b_suffixes:
if word.endswith(suffix):
preceding = word[:-len(suffix)]
if s1b_exp.search(preceding):
return step_1b_helper(preceding)
return word
return word
def step_1c(word):
if word.endswith('y') or word.endswith('Y') and len(word) > 1:
if word[-2] not in 'aeiouy':
if len(word) > 2:
return word[:-1] + 'i'
return word
def step_2_helper(word, r1, end, repl, prev):
if word.endswith(end):
if len(word) - len(end) >= r1:
if prev == []:
return word[:-len(end)] + repl
for p in prev:
if word[:-len(end)].endswith(p):
return word[:-len(end)] + repl
return word
return None
s2_triples = (('ization', 'ize', []),
('ational', 'ate', []),
('fulness', 'ful', []),
('ousness', 'ous', []),
('iveness', 'ive', []),
('tional', 'tion', []),
('biliti', 'ble', []),
('lessli', 'less', []),
('entli', 'ent', []),
('ation', 'ate', []),
('alism', 'al', []),
('aliti', 'al', []),
('ousli', 'ous', []),
('iviti', 'ive', []),
('fulli', 'ful', []),
('enci', 'ence', []),
('anci', 'ance', []),
('abli', 'able', []),
('izer', 'ize', []),
('ator', 'ate', []),
('alli', 'al', []),
('bli', 'ble', []),
('ogi', 'og', ['l']),
('li', '', ['c', 'd', 'e', 'g', 'h', 'k', 'm', 'n', 'r', 't']))
def step_2(word, r1):
for trip in s2_triples:
attempt = step_2_helper(word, r1, trip[0], trip[1], trip[2])
if attempt:
return attempt
return word
def step_3_helper(word, r1, r2, end, repl, r2_necessary):
if word.endswith(end):
if len(word) - len(end) >= r1:
if not r2_necessary:
return word[:-len(end)] + repl
else:
if len(word) - len(end) >= r2:
return word[:-len(end)] + repl
return word
return None
s3_triples = (('ational', 'ate', False),
('tional', 'tion', False),
('alize', 'al', False),
('icate', 'ic', False),
('iciti', 'ic', False),
('ative', '', True),
('ical', 'ic', False),
('ness', '', False),
('ful', '', False))
def step_3(word, r1, r2):
for trip in s3_triples:
attempt = step_3_helper(word, r1, r2, trip[0], trip[1], trip[2])
if attempt:
return attempt
return word
s4_delete_list = ('al', 'ance', 'ence', 'er', 'ic', 'able', 'ible', 'ant', 'ement',
'ment', 'ent', 'ism', 'ate', 'iti', 'ous', 'ive', 'ize')
def step_4(word, r2):
for end in s4_delete_list:
if word.endswith(end):
if len(word) - len(end) >= r2:
return word[:-len(end)]
return word
if word.endswith('sion') or word.endswith('tion'):
if len(word) - 3 >= r2:
return word[:-3]
return word
def step_5(word, r1, r2):
if word.endswith('l'):
if len(word) - 1 >= r2 and word[-2] == 'l':
return word[:-1]
return word
if word.endswith('e'):
if len(word) - 1 >= r2:
return word[:-1]
if len(word) - 1 >= r1 and not ends_with_short_syllable(word[:-1]):
return word[:-1]
return word
def normalize_ys(word):
return word.replace('Y', 'y')
exceptional_forms = {'skis': 'ski',
'skies': 'sky',
'dying': 'die',
'lying': 'lie',
'tying': 'tie',
'idly': 'idl',
'gently': 'gentl',
'ugly': 'ugli',
'early': 'earli',
'only': 'onli',
'singly': 'singl',
'sky': 'sky',
'news': 'news',
'howe': 'howe',
'atlas': 'atlas',
'cosmos': 'cosmos',
'bias': 'bias',
'andes': 'andes'}
exceptional_early_exit_post_1a = frozenset(['inning', 'outing', 'canning', 'herring',
'earring', 'proceed', 'exceed', 'succeed'])
def stem(word):
if len(word) <= 2:
return word
word = remove_initial_apostrophe(word)
# handle some exceptional forms
if word in exceptional_forms:
return exceptional_forms[word]
word = capitalize_consonant_ys(word)
r1 = get_r1(word)
r2 = get_r2(word)
word = step_0(word)
word = step_1a(word)
# handle some more exceptional forms
if word in exceptional_early_exit_post_1a:
return word
word = step_1b(word, r1)
word = step_1c(word)
word = step_2(word, r1)
word = step_3(word, r1, r2)
word = step_4(word, r2)
word = step_5(word, r1, r2)
word = normalize_ys(word)
return word