416 lines
17 KiB
Python
416 lines
17 KiB
Python
# coding= utf-8
|
|
|
|
# This script implements the Double Metaphone algorithm (c) 1998, 1999 by
|
|
# Lawrence Philips. It was translated to Python from the C source written by
|
|
# Kevin Atkinson (http://aspell.net/metaphone/) By Andrew Collins - January 12,
|
|
# 2007 who claims no rights to this work.
|
|
# http://atomboy.isa-geek.com:8080/plone/Members/acoil/programing/double-metaphone
|
|
|
|
import re
|
|
|
|
from whoosh.compat import u
|
|
|
|
# Letters treated as vowels by the algorithm ('Y' included).
vowels = frozenset("AEIOUY")
# Substrings whose presence anywhere in the word flags it as Slavo-Germanic.
slavo_germ_exp = re.compile("W|K|CZ|WITZ")
# Two-letter word starts whose first letter is silent and therefore skipped.
silent_starts = re.compile("GN|KN|PN|WR|PS")


def double_metaphone(text):
    """Return the Double Metaphone phonetic keys for ``text``.

    The word is upper-cased and scanned left to right; each rule emits a
    code for the primary key, optionally a different code for the
    secondary (alternate pronunciation) key, and the number of characters
    to advance. The keys are not truncated to a maximum length.

    :param text: the word to encode (a text string).
    :returns: a ``(primary, secondary)`` tuple. ``secondary`` is ``None``
        when it would be identical to ``primary``.

    >>> double_metaphone("Smith")
    ('SM0', 'XMT')
    >>> double_metaphone("Thomas")
    ('TMS', None)
    """
    text = text.upper()
    slavo_germanic = bool(slavo_germ_exp.search(text))

    length = len(text)
    # Pad both ends so the look-behind / look-ahead indexing below never
    # falls off the string.
    text = "--" + text + " "
    first = pos = 2
    last = first + length - 1
    primary = secondary = ""

    # Skip the silent first letter of e.g. 'KNight', 'WRite'.
    if silent_starts.match(text, pos):
        pos += 1

    while pos < length + 2:
        ch = text[pos]

        # ``step`` is (code, advance) when both keys get the same code, or
        # (primary_code, secondary_code, advance) when they differ. A falsy
        # code (None or '') adds nothing to that key.
        if ch in vowels:
            # all init vowels now map to 'A'; other vowels are dropped
            if pos != first:
                step = (None, 1)
            else:
                step = ("A", 1)
        elif ch == "B":
            # "-mb", e.g. "dumb", already skipped over... see 'M' below
            if text[pos + 1] == "B":
                step = ("P", 2)
            else:
                step = ("P", 1)
        elif ch == "C":
            # various germanic
            if (pos > (first + 1) and text[pos - 2] not in vowels
                    and text[pos - 1:pos + 2] == "ACH"
                    and (text[pos + 2] not in ("I", "E")
                         or text[pos - 2:pos + 4] in ("BACHER", "MACHER"))):
                step = ("K", 2)
            # special case 'CAESAR'
            elif pos == first and text[first:first + 6] == "CAESAR":
                step = ("S", 2)
            elif text[pos:pos + 4] == "CHIA":  # italian 'chianti'
                step = ("K", 2)
            elif text[pos:pos + 2] == "CH":
                # find 'michael'
                if pos > first and text[pos:pos + 4] == "CHAE":
                    step = ("K", "X", 2)
                elif (pos == first
                      and (text[pos + 1:pos + 6] in ("HARAC", "HARIS")
                           or text[pos + 1:pos + 4] in ("HOR", "HYM", "HIA", "HEM"))
                      and text[first:first + 5] != "CHORE"):
                    step = ("K", 2)
                # germanic, greek, or otherwise 'ch' for 'kh' sound
                elif (text[first:first + 4] in ("VAN ", "VON ")
                      or text[first:first + 3] == "SCH"
                      or text[pos - 2:pos + 4] in ("ORCHES", "ARCHIT", "ORCHID")
                      or text[pos + 2] in ("T", "S")
                      or ((text[pos - 1] in ("A", "O", "U", "E") or pos == first)
                          and text[pos + 2] in ("L", "R", "N", "M", "B", "H",
                                                "F", "V", "W", " "))):
                    # Advance by one only; the following 'H' comes after a
                    # consonant and is dropped by the 'H' rule.
                    step = ("K", 1)
                else:
                    if pos > first:
                        if text[first:first + 2] == "MC":
                            step = ("K", 2)
                        else:
                            step = ("X", "K", 2)
                    else:
                        step = ("X", 2)
            # e.g, 'czerny'
            elif text[pos:pos + 2] == "CZ" and text[pos - 2:pos + 2] != "WICZ":
                step = ("S", "X", 2)
            # e.g., 'focaccia'
            elif text[pos + 1:pos + 4] == "CIA":
                step = ("X", 3)
            # double 'C', but not if e.g. 'McClellan'
            elif (text[pos:pos + 2] == "CC"
                  and not (pos == (first + 1) and text[first] == "M")):
                # 'bellocchio' but not 'bacchus'
                if text[pos + 2] in ("I", "E", "H") and text[pos + 2:pos + 4] != "HU":
                    # 'accident', 'accede' 'succeed'
                    if ((pos == (first + 1) and text[first] == "A")
                            or text[pos - 1:pos + 4] in ("UCCEE", "UCCES")):
                        step = ("KS", 3)
                    # 'bacci', 'bertucci', other italian
                    else:
                        step = ("X", 3)
                else:
                    step = ("K", 2)
            elif text[pos:pos + 2] in ("CK", "CG", "CQ"):
                step = ("K", "K", 2)
            elif text[pos:pos + 2] in ("CI", "CE", "CY"):
                # italian vs. english
                if text[pos:pos + 3] in ("CIO", "CIE", "CIA"):
                    step = ("S", "X", 2)
                else:
                    step = ("S", 2)
            else:
                # name sent in 'mac caffrey', 'mac gregor'
                if text[pos + 1:pos + 3] in (" C", " Q", " G"):
                    step = ("K", 3)
                else:
                    if (text[pos + 1] in ("C", "K", "Q")
                            and text[pos + 1:pos + 3] not in ("CE", "CI")):
                        step = ("K", 2)
                    else:  # default for 'C'
                        step = ("K", 1)
        elif ch == u"\xc7":  # C with cedilla
            step = ("S", 1)
        elif ch == "D":
            if text[pos:pos + 2] == "DG":
                if text[pos + 2] in ("I", "E", "Y"):  # e.g. 'edge'
                    step = ("J", 3)
                else:
                    step = ("TK", 2)
            elif text[pos:pos + 2] in ("DT", "DD"):
                step = ("T", 2)
            else:
                step = ("T", 1)
        elif ch == "F":
            if text[pos + 1] == "F":
                step = ("F", 2)
            else:
                step = ("F", 1)
        elif ch == "G":
            if text[pos + 1] == "H":
                # Every path below assigns ``step``; previously two paths
                # assigned nothing and silently re-applied the previous
                # letter's result.
                if pos > first and text[pos - 1] not in vowels:
                    step = ("K", 2)
                elif pos == first:  # 'ghislane', ghiradelli
                    if text[pos + 2] == "I":
                        step = ("J", 2)
                    else:
                        step = ("K", 2)
                # Parker's rule (with some further refinements) - e.g., 'hugh'
                elif ((pos > (first + 1) and text[pos - 2] in ("B", "H", "D"))
                      or (pos > (first + 2) and text[pos - 3] in ("B", "H", "D"))
                      or (pos > (first + 3) and text[pos - 4] in ("B", "H"))):
                    step = (None, 2)
                # e.g., 'laugh', 'McLaughlin', 'cough', 'gough', 'rough', 'tough'
                elif (pos > (first + 2) and text[pos - 1] == "U"
                      and text[pos - 3] in ("C", "G", "L", "R", "T")):
                    step = ("F", 2)
                # pos > first is guaranteed here (pos == first handled above)
                elif text[pos - 1] != "I":
                    step = ("K", 2)
                else:
                    # 'GH' after 'I' is silent
                    step = (None, 2)
            elif text[pos + 1] == "N":
                if pos == (first + 1) and text[first] in vowels and not slavo_germanic:
                    step = ("KN", "N", 2)
                else:
                    # not e.g. 'cagney'
                    if (text[pos + 2:pos + 4] != "EY" and text[pos + 1] != "Y"
                            and not slavo_germanic):
                        step = ("N", "KN", 2)
                    else:
                        step = ("KN", 2)
            # 'tagliaro'
            elif text[pos + 1:pos + 3] == "LI" and not slavo_germanic:
                step = ("KL", "L", 2)
            # -ges-,-gep-,-gel-, -gie- at beginning
            elif pos == first and (
                    text[pos + 1] == "Y"
                    or text[pos + 1:pos + 3] in ("ES", "EP", "EB", "EL", "EY", "IB",
                                                 "IL", "IN", "IE", "EI", "ER")):
                step = ("K", "J", 2)
            # -ger-, -gy-
            # (the 'ER' test needs a two-character slice: pos+1 .. pos+3)
            elif ((text[pos + 1:pos + 3] == "ER" or text[pos + 1] == "Y")
                  and text[first:first + 6] not in ("DANGER", "RANGER", "MANGER")
                  and text[pos - 1] not in ("E", "I")
                  and text[pos - 1:pos + 2] not in ("RGY", "OGY")):
                step = ("K", "J", 2)
            # italian e.g, 'biaggi'
            elif (text[pos + 1] in ("E", "I", "Y")
                  or text[pos - 1:pos + 3] in ("AGGI", "OGGI")):
                # obvious germanic
                if (text[first:first + 4] in ("VON ", "VAN ")
                        or text[first:first + 3] == "SCH"
                        or text[pos + 1:pos + 3] == "ET"):
                    step = ("K", 2)
                else:
                    # always soft if french ending
                    if text[pos + 1:pos + 5] == "IER ":
                        step = ("J", 2)
                    else:
                        step = ("J", "K", 2)
            elif text[pos + 1] == "G":
                step = ("K", 2)
            else:
                step = ("K", 1)
        elif ch == "H":
            # only keep if first & before vowel or btw. 2 vowels
            if (pos == first or text[pos - 1] in vowels) and text[pos + 1] in vowels:
                step = ("H", 2)
            else:  # (also takes care of 'HH')
                step = (None, 1)
        elif ch == "J":
            # obvious spanish, 'jose', 'san jacinto'
            if text[pos:pos + 4] == "JOSE" or text[first:first + 4] == "SAN ":
                if ((pos == first and text[pos + 4] == " ")
                        or text[first:first + 4] == "SAN "):
                    step = ("H",)
                else:
                    step = ("J", "H")
            elif pos == first and text[pos:pos + 4] != "JOSE":
                step = ("J", "A")  # Yankelovich/Jankelowicz
            else:
                # spanish pron. of e.g. 'bajador'
                if (text[pos - 1] in vowels and not slavo_germanic
                        and text[pos + 1] in ("A", "O")):
                    step = ("J", "H")
                else:
                    if pos == last:
                        # Final 'J' goes to the primary key only. The empty
                        # string adds nothing to the secondary key (a ' '
                        # here would append a literal space).
                        step = ("J", "")
                    else:
                        if (text[pos + 1] not in ("L", "T", "K", "S", "N", "M", "B", "Z")
                                and text[pos - 1] not in ("S", "K", "L")):
                            step = ("J",)
                        else:
                            step = (None,)
            # The advance is appended last: 2 for a doubled 'JJ', else 1.
            if text[pos + 1] == "J":
                step = step + (2,)
            else:
                step = step + (1,)
        elif ch == "K":
            if text[pos + 1] == "K":
                step = ("K", 2)
            else:
                step = ("K", 1)
        elif ch == "L":
            if text[pos + 1] == "L":
                # spanish e.g. 'cabrillo', 'gallegos'
                if ((pos == (last - 2)
                     and text[pos - 1:pos + 3] in ("ILLO", "ILLA", "ALLE"))
                        or ((text[last - 1:last + 1] in ("AS", "OS")
                             or text[last] in ("A", "O"))
                            and text[pos - 1:pos + 3] == "ALLE")):
                    step = ("L", "", 2)
                else:
                    step = ("L", 2)
            else:
                step = ("L", 1)
        elif ch == "M":
            # Silent 'B' in '-UMB' / '-UMBER' ('dumb', 'plumber'): the window
            # is centred on this 'M' (pos-1 .. pos+1), and the 'B' is
            # consumed together with it.
            if ((text[pos - 1:pos + 2] == "UMB"
                 and (pos + 1 == last or text[pos + 2:pos + 4] == "ER"))
                    or text[pos + 1] == "M"):
                step = ("M", 2)
            else:
                step = ("M", 1)
        elif ch == "N":
            if text[pos + 1] == "N":
                step = ("N", 2)
            else:
                step = ("N", 1)
        elif ch == u"\xd1":  # N with tilde
            step = ("N", 1)
        elif ch == "P":
            if text[pos + 1] == "H":
                step = ("F", 2)
            elif text[pos + 1] in ("P", "B"):  # also account for "campbell", "raspberry"
                step = ("P", 2)
            else:
                step = ("P", 1)
        elif ch == "Q":
            if text[pos + 1] == "Q":
                step = ("K", 2)
            else:
                step = ("K", 1)
        elif ch == "R":
            # french e.g. 'rogier', but exclude 'hochmeier'
            if (pos == last and not slavo_germanic
                    and text[pos - 2:pos] == "IE"
                    and text[pos - 4:pos - 2] not in ("ME", "MA")):
                step = ("", "R")
            else:
                step = ("R",)
            if text[pos + 1] == "R":
                step = step + (2,)
            else:
                step = step + (1,)
        elif ch == "S":
            # special cases 'island', 'isle', 'carlisle', 'carlysle'
            if text[pos - 1:pos + 2] in ("ISL", "YSL"):
                step = (None, 1)
            # special case 'sugar-'
            elif pos == first and text[first:first + 5] == "SUGAR":
                step = ("X", "S", 1)
            elif text[pos:pos + 2] == "SH":
                # germanic
                if text[pos + 1:pos + 5] in ("HEIM", "HOEK", "HOLM", "HOLZ"):
                    step = ("S", 2)
                else:
                    step = ("X", 2)
            # italian & armenian
            elif text[pos:pos + 3] in ("SIO", "SIA") or text[pos:pos + 4] == "SIAN":
                if not slavo_germanic:
                    step = ("S", "X", 3)
                else:
                    step = ("S", 3)
            # german & anglicisations, e.g. 'smith' match 'schmidt',
            # 'snider' match 'schneider'
            # also, -sz- in slavic language altho in hungarian it is pronounced 's'
            elif ((pos == first and text[pos + 1] in ("M", "N", "L", "W"))
                  or text[pos + 1] == "Z"):
                step = ("S", "X")
                if text[pos + 1] == "Z":
                    step = step + (2,)
                else:
                    step = step + (1,)
            elif text[pos:pos + 2] == "SC":
                # Schlesinger's rule
                if text[pos + 2] == "H":
                    # dutch origin, e.g. 'school', 'schooner'
                    if text[pos + 3:pos + 5] in ("OO", "ER", "EN", "UY", "ED", "EM"):
                        # 'schermerhorn', 'schenker'
                        if text[pos + 3:pos + 5] in ("ER", "EN"):
                            step = ("X", "SK", 3)
                        else:
                            step = ("SK", 3)
                    else:
                        if (pos == first and text[first + 3] not in vowels
                                and text[first + 3] != "W"):
                            step = ("X", "S", 3)
                        else:
                            step = ("X", 3)
                elif text[pos + 2] in ("I", "E", "Y"):
                    step = ("S", 3)
                else:
                    step = ("SK", 3)
            # french e.g. 'resnais', 'artois'
            elif pos == last and text[pos - 2:pos] in ("AI", "OI"):
                step = ("", "S", 1)
            else:
                step = ("S",)
                if text[pos + 1] in ("S", "Z"):
                    step = step + (2,)
                else:
                    step = step + (1,)
        elif ch == "T":
            if text[pos:pos + 4] == "TION":
                step = ("X", 3)
            elif text[pos:pos + 3] in ("TIA", "TCH"):
                step = ("X", 3)
            elif text[pos:pos + 2] == "TH" or text[pos:pos + 3] == "TTH":
                # special case 'thomas', 'thames' or germanic
                if (text[pos + 2:pos + 4] in ("OM", "AM")
                        or text[first:first + 4] in ("VON ", "VAN ")
                        or text[first:first + 3] == "SCH"):
                    step = ("T", 2)
                else:
                    # '0' (zero) is the Metaphone code for the 'th' sound
                    step = ("0", "T", 2)
            elif text[pos + 1] in ("T", "D"):
                step = ("T", 2)
            else:
                step = ("T", 1)
        elif ch == "V":
            if text[pos + 1] == "V":
                step = ("F", 2)
            else:
                step = ("F", 1)
        elif ch == "W":
            # can also be in middle of word
            if text[pos:pos + 2] == "WR":
                step = ("R", 2)
            elif pos == first and (text[pos + 1] in vowels
                                   or text[pos:pos + 2] == "WH"):
                # Wasserman should match Vasserman
                if text[pos + 1] in vowels:
                    step = ("A", "F", 1)
                else:
                    step = ("A", 1)
            # Arnow should match Arnoff
            # (the -EWSKI/-OWSKI window is five characters: pos-1 .. pos+3)
            elif ((pos == last and text[pos - 1] in vowels)
                  or text[pos - 1:pos + 4] in ("EWSKI", "EWSKY", "OWSKI", "OWSKY")
                  or text[first:first + 3] == "SCH"):
                step = ("", "F", 1)
            # polish e.g. 'filipowicz'
            elif text[pos:pos + 4] in ("WICZ", "WITZ"):
                step = ("TS", "FX", 4)
            else:  # default is to skip it
                step = (None, 1)
        elif ch == "X":
            # french e.g. breaux
            step = (None,)
            if not (pos == last and (text[pos - 3:pos] in ("IAU", "EAU")
                                     or text[pos - 2:pos] in ("AU", "OU"))):
                step = ("KS",)
            if text[pos + 1] in ("C", "X"):
                step = step + (2,)
            else:
                step = step + (1,)
        elif ch == "Z":
            # chinese pinyin e.g. 'zhao'
            if text[pos + 1] == "H":
                step = ("J",)
            elif (text[pos + 1:pos + 3] in ("ZO", "ZI", "ZA")
                  or (slavo_germanic and pos > first and text[pos - 1] != "T")):
                step = ("S", "TS")
            else:
                step = ("S",)
            if text[pos + 1] == "Z":
                step = step + (2,)
            else:
                step = step + (1,)
        else:
            # any other character (digits, punctuation, spaces, ...) is skipped
            step = (None, 1)

        # Apply the step: extend the keys and advance the cursor.
        if len(step) == 2:
            if step[0]:
                primary += step[0]
                secondary += step[0]
            pos += step[1]
        elif len(step) == 3:
            if step[0]:
                primary += step[0]
            if step[1]:
                secondary += step[1]
            pos += step[2]

    if primary == secondary:
        return (primary, None)
    else:
        return (primary, secondary)
|
|
|