On Mon, Aug 11, 2008 at 12:13 PM, Dave Webster <[EMAIL PROTECTED]> wrote: > Thanks, Timothy. I'm pretty sure that there is no such thing as a "beautiful" > implementation of double-metaphone but I would personally like to have a copy > of your python implementation. I have a fairly elegant version of the > original > metaphone algorithm I wrote myself (in PERL, many years ago) but I've > never found > the time to reverse-engineer the original C++ code for double-metaphone and > "pythonize" it. > > On Mon, Aug 11, 2008 at 2:08 PM, Timothy Grant <[EMAIL PROTECTED]> wrote: >> On Mon, Aug 11, 2008 at 8:44 AM, Casey <[EMAIL PROTECTED]> wrote: >>> My first thought is that you should be looking at implementations of >>> Hamming Distance. If you are actually looking for something like >>> SOUNDEX you might also want to look at the double metaphor algorithm, >>> which is significantly harder to implement but provides better >>> matching and is less susceptible to differences based on name origins. >>> -- >>> http://mail.python.org/mailman/listinfo/python-list >>> >> >> I responded in the thread of the poster's original message on this >> subject, but will do the same here. I have a horribly ugly version of >> the double-metaphone algorithm in python that does work, and may be of >> some use in solving this problem. >> >> -- >> Stand Fast, >> tjg. [Timothy Grant] >> > This is truly cringe-worthy, and pretty much a direct port of the C++ code. It need unit tests (which are on my "to-do someday" list) but even though it's ugly it does work and I have managed to do real work with it.
-- Stand Fast, tjg. [Timothy Grant]
# # DMetaph.py # # Copyright 2006 by Timothy (rhacer) Grant # # Based on an algorithm and code by Lawrence Philips # # This code is licensed under the terms of the GNU Public License v2 # import re DEBUG = False class DMetaph(str): """ DMetaph(<word>) creates a Double Metaphone encoding of the word passed in. """ def __init__(self, s): self.s = s.upper() + ' ' # Padding allows safe indexing beyond the end self.length = len(s) self.last = self.length - 1 self.current = 0 self.primary = "" self.secondary = "" self.alternate = False if self.StringAt(0, 2, 'GN', 'KN', 'PN', 'WR', 'PS'): self.current += 1 if self.s[0] == 'X': self.MetaphAdd('S') self.current += 1 while len(self.primary) < 4 or len(self.secondary) < 4: if DEBUG: print "processing character: %s current: %s length: %s" % ( self.s[self.current], self.current, self.length) if self.current >= self.length: break self.ProcessString() self.metaph = self.primary self.metaph2 = '' if len(self.metaph) > 4: self.metaph = self.metaph[:4] if self.alternate: self.metaph2 = self.secondary if len(self.metaph2) > 4: self.metaph2 = self.metaph2[:4] def SlavoGermanic(self): if ('W' in self.s or 'K' in self.s or 'CZ' in self.s or 'WITZ' in self.s): return True return False def MetaphAdd(self, main, alt=None): if main: self.primary += main if alt: self.alternate = True if (alt[0] <> ' '): self.secondary += alt else: if (main[0] <> ' '): self.secondary += main def IsVowel(self, at): if (at < 0) or (at > self.length): return False if self.s[at] in 'AEIOUY': return True return False def StringAt(self, start, length, *sub_strings): if self.s[start:start + length] in sub_strings: return True return False def print_diags(self, section): print"section: %s current: %s" % (section, self.current) def ProcessString(self): if self.IsVowel(self.current): if DEBUG: self.print_diags('VOWEL') if self.current == 0: self.MetaphAdd('A') self.current += 1 return elif self.s[self.current] == 'B': if DEBUG: self.print_diags('B') self.MetaphAdd('P') if self.s[self.current + 1] == 'B': self.current += 2 else: self.current += 1 return elif self.s[self.current] == 'C': if DEBUG: self.print_diags('C') if (self.current > 1 and not self.IsVowel(self.current - 2) and self.StringAt(self.current - 1, 3, 'ACH') and self.s[self.current + 2] <> 'I' and self.s[self.current + 2] <> 'E' or self.StringAt(self.current - 2, 6, 'BACHER', 'MACHER')): self.MetaphAdd('K') self.current += 2 return if self.current == 0 and self.StringAt(self.current, 6, 'CAESAR'): self.MetaphAdd('S') self.current += 2 return if self.StringAt(self.current, 4, 'CHIA'): self.MetaphAdd('K') self.current += 2 return if self.StringAt(self.current, 2, 'CH'): if self.current > 0 and self.StringAt(self.current, 4, 'CHAE'): self.MetaphAdd('K', 'X') self.current += 2 return if (self.current == 0 and (self.StringAt(self.current + 1, 5, 'HARAC', 'HARIS') or self.StringAt(self.current + 1, 3, 'HOR', 'HYM', 'HIA', 'HEM')) and not self.StringAt(0, 5, 'CHORE')): self.MetaphAdd('K') self.current += 2 return if ((self.StringAt(0, 4, 'VAN ', 'VON ') or self.StringAt(0, 3, 'SCH')) or self.StringAt(self.current - 2, 6, 'ORCHES', 'ARCHIT', 'ORCHID') or self.s[self.current - 2] in 'TS' or (self.s[self.current - 1] in 'AOUE' or self.current == 0) and self.s[self.current + 2] in 'LRNMBHFVW '): self.MetaphAdd('K') else: if self.current > 0: if self.StringAt(0, 2, 'MC'): self.MetaphAdd('K') else: self.MetaphAdd('X', 'K') else: self.MetaphAdd('X') self.current += 2 return if (self.StringAt(self.current, 2, 'CZ') and not self.StringAt(self.current - 2, 4, 'WICZ')): self.MetaphAdd('S', 'X') self.current += 2 return if self.StringAt(self.current + 1, 3, 'CIA'): self.MetaphAdd('X') self.current += 3 return if (self.StringAt(self.current, 2, 'CC') and not (self.current == 1 and self.s[0] == 'M')): if (self.s[self.current + 2] in 'IEH' and not self.StringAt(self.current + 2, 2, 'HU')): if (self.current == 1 and self.s[self.current - 1] == 'A' or self.StringAt(self.current - 1, 5, 'UCCEE', 'UCCES')): self.MetaphAdd('KS') else: self.MetaphAdd('X') self.current += 3 return else: self.MetaphAdd('K') self.current += 2 return if self.StringAt(self.current, 2, 'CK', 'CG', 'CQ'): self.MetaphAdd('K') self.current += 2 return if self.StringAt(self.current, 2, 'CI', 'CE', 'CY'): if self.StringAt(self.current, 3, 'CIO', 'CIE', 'CIA'): self.MetaphAdd('S', 'X') else: self.MetaphAdd('S') self.current += 2 return self.MetaphAdd('K') if self.StringAt(self.current + 1, 2, ' C', ' Q', ' G'): self.current += 3 else: if (self.s[self.current + 1] in 'CKQ' and not self.StringAt(self.current + 1, 2, 'CE', 'CI')): self.current += 2 else: self.current += 1 return elif self.s[self.current] == 'D': if DEBUG: self.print_diags('D') if self.StringAt(self.current, 2, 'DG'): if self.s[self.current + 2] in 'IEY': self.MetaphAdd('J') self.current += 3 return else: self.MetaphAdd('TK') self.current += 2 if self.StringAt(self.current, 2, 'DT', 'DD'): self.MetaphAdd('T') self.current += 2 return self.MetaphAdd('T') self.current += 1 return elif self.s[self.current] == 'F': if DEBUG: self.print_diags('F') if self.s[self.current + 1] == 'F': self.current += 2 else: self.current += 1 self.MetaphAdd('F') return elif self.s[self.current] == 'G': if DEBUG: self.print_diags('G') if self.s[self.current + 1] == 'H': if self.current > 0 and not self.IsVowel(self.current - 1): self.MetaphAdd('K') self.current += 2 return if self.current < 3: if self.current == 0: if self.s[self.current + 2] == 'I': self.MetaphAdd('J') else: self.MetaphAdd('K') self.current += 2 return if ((self.current > 1 and self.s[self.current - 2] in 'BHD') or (self.current > 2 and self.s[self.current - 3] in 'BHD') or (self.current > 3 and self.s[self.current - 4] in 'BH')): self.current += 2 return else: if (self.current > 2 and self.s[self.current - 1] == 'U' and self.s[self.current - 3] in 'CGLRT'): self.MetaphAdd('F') else: if self.current > 0 and self.s[self.current - 1] <> 'I': self.MetaphAdd('K') self.current += 2 return if self.s[self.current + 1] == 'N': if self.current == 1 and self.IsVowel(0) and not self.SlavoGermanic(): self.MetaphAdd('KN', 'N') else: if (not self.StringAt(self.current + 2, 2, 'EY') and self.s[self.current + 1] <> 'Y' and not self.SlavoGermanic()): self.MetaphAdd('N', 'KN') else: self.MetaphAdd('KN') self.current += 2 return if self.StringAt(self.current + 1, 2, 'LI') and not self.SlavoGermanic(): self.MetaphAdd('KL', 'L') self.current += 2 return if ((self.current == 0 and self.s[self.current + 1] == 'Y') or self.StringAt(self.current + 1, 2, 'ES', 'EP', 'EB', 'EL', 'EY', 'IB', 'IL', 'IN', 'IE', 'EI', 'ER')): self.MetaphAdd('K', 'J') self.current += 2 return if (self.StringAt(self.current + 1, 2, 'ER') or self.s[self.current + 1] == 'Y' and not self.StringAt(0, 6, 'DANGER', 'RANGER', 'MANGER') and not self.s[self.current - 1] in 'EI' and not self.StringAt(self.current - 1, 3, 'RGY', 'OGY')): self.MetaphAdd('K', 'J') self.current += 2 return if (self.s[self.current + 1] in 'EIY' or self.StringAt(self.current - 1, 4, 'AGGI', 'OGGI')): if ((self.StringAt(0, 4, 'VAN ', 'VON ') or self.StringAt(0, 3, 'SCH')) or self.StringAt(self.current + 1, 2, 'ET')): self.MetaphAdd('K') else: if self.StringAt(self.current + 1, 4, 'IER'): self.MetaphAdd('J') else: self.MetaphAdd('J', 'K') self.current += 2 return if self.s[self.current + 1] == 'G': self.current += 2 else: self.current += 1 self.MetaphAdd('K') return elif self.s[self.current] == 'H': if DEBUG: self.print_diags('H') if ((self.current == 0 or self.IsVowel(self.current - 1)) and self.IsVowel(self.current + 1)): self.MetaphAdd('H') self.current += 2 else: self.current += 1 return elif self.s[self.current] == 'J': if DEBUG: self.print_diags('J') if self.StringAt(self.current, 4, 'JOSE') or self.StringAt(0, 4, 'SAN '): if ((self.current == 0 and self.s[self.current + 4] == ' ') or self.StringAt(0, 4, 'SAN ')): self.MetaphAdd('H') else: self.MetaphAdd('J', 'H') self.current += 1 return if self.current == 0 and not self.StringAt(self.current, 4, 'JOSE'): self.MetaphAdd('J', 'A') else: if (self.IsVowel(self.current - 1) and not self.SlavoGermanic() and (self.s[self.current + 1] == 'A' or self.s[self.current + 1] == 'O')): self.MetaphAdd('J', 'H') else: if self.current == self.last: self.MetaphAdd('J', ' ') else: if (not self.s[self.current + 1] in 'LTKSNMBZ' and not self.s[self.current - 1] in 'SKL'): self.MetaphAdd('J') if self.s[self.current + 1] == 'J': self.current += 2 else: self.current += 1 return elif self.s[self.current] == 'K': if DEBUG: self.print_diags('K') if self.s[self.current + 1] == 'K': self.current += 2 else: self.current += 1 self.MetaphAdd('K') return elif self.s[self.current] == 'L': if DEBUG: self.print_diags('L') if self.s[self.current + 1] == 'L': if ((self.current == self.length - 3 and self.StringAt(self.current - 1, 4, 'ILLO', 'ILLA', 'ALLE')) or ((self.StringAt(self.last - 1, 2, 'AS', 'OS') or self.StringAt(self.last, 1, 'A', 'O')) and self.StringAt(self.current - 1, 4, 'ALLE'))): self.MetaphAdd('L', ' ') self.current += 2 return self.current += 2 else: self.current += 1 self.MetaphAdd('L') return elif self.s[self.current] == 'M': if DEBUG: self.print_diags('M') if ((self.StringAt(self.current - 1, 3, 'UMB') and ((self.current + 1 == self.last) or self.StringAt(self.current + 2, 2, 'ER'))) or (self.s[self.current + 1] == 'M')): self.current += 2 else: self.current += 1 self.MetaphAdd('M') return elif self.s[self.current] == 'N': if DEBUG: self.print_diags('N') if self.s[self.current + 1] == 'N': self.current += 2 else: self.current += 1 self.MetaphAdd('N') return elif self.s[self.current] == 'P': if DEBUG: self.print_diags('P') if self.s[self.current + 1] == 'H': self.MetaphAdd('F') self.current += 2 return if self.s[self.current + 1] in 'PB': self.current += 2 else: self.current += 1 self.MetaphAdd('P') return elif self.s[self.current] == 'Q': if DEBUG: self.print_diags('Q') if self.s[self.current + 1] == 'Q': self.current += 2 else: self.current += 1 self.MetaphAdd('K') return elif self.s[self.current] == 'R': if DEBUG: self.print_diags('R') if (self.current == self.last and not self.SlavoGermanic() and self.StringAt(self.current - 2, 2, 'IE') and not self.StringAt(self.current - 4, 2, 'ME', 'MA')): self.MetaphAdd('','R') else: self.MetaphAdd('R') if self.s[self.current + 1] == 'R': self.current += 2 else: self.current += 1 return elif self.s[self.current] == 'S': if DEBUG: self.print_diags('S') if self.StringAt(self.current + 1, 3, 'ISL', 'YSL'): self.current += 1 return if self.current == 0 and self.StringAt(self.current, 5, 'SUGAR'): self.MetaphAdd('X', 'S') self.current += 1 if self.StringAt(self.current, 2, 'SH'): if self.StringAt(self.current + 1, 4, 'HEIM', 'HOEK', 'HOLM', 'HOLZ'): self.MetaphAdd('S') else: self.MetaphAdd('X') self.current += 2 return if (self.StringAt(self.current, 3, 'SIO', 'SIA') or self.StringAt(self.current, 4, 'SIAN')): if not self.SlavoGermanic(): self.MetaphAdd('S', 'X') else: self.MetaphAdd('S') self.current += 3 return if ((self.current == 0 and self.s[self.current + 1] in 'MNLW') or self.s[self.current + 1] == 'Z'): self.MetaphAdd('S', 'X') if self.s[self.current + 1] == 'Z': self.current += 2 else: self.current += 1 return if self.StringAt(self.current, 2, 'SC'): if self.s[self.current + 2] == 'H': if self.StringAt(self.current + 3, 2, 'OO', 'ER', 'EN', 'UY', 'ED', 'EM'): if self.StringAt(self.current + 3, 2, 'ER', 'EN'): self.MetaphAdd('X', 'SK') else: self.MetaphAdd('SK') self.current += 3 return else: if (self.current == 0 and not self.IsVowel(3) and self.s[3] <> 'W'): self.MetaphAdd('X', 'S') else: self.MetaphAdd('X') self.current += 3 return if self.s[self.current + 2] in 'IEY': self.MetaphAdd('S') self.current += 3 return self.MetaphAdd('SK') self.current += 3 return if (self.current == self.last and self.StringAt(self.current - 2, 2, 'AI', 'OI')): self.MetaphAdd('', 'S') else: self.MetaphAdd('S') if self.s[self.current + 1] in 'SZ': self.current += 2 else: self.current += 1 return elif self.s[self.current] == 'T': if DEBUG: self.print_diags('T') if self.StringAt(self.current, 4, 'TION'): self.MetaphAdd('X') self.current += 3 return if self.StringAt(self.current, 3, 'TIA', 'TCH'): self.MetaphAdd('X') self.current += 3 return if (self.StringAt(self.current, 2, 'TH') or self.StringAt(self.current, 3, 'TTH')): if (self.StringAt(self.current + 2, 2, 'OM', 'AM') or self.StringAt(0, 4, 'VAN ', 'VON ') or self.StringAt(0, 3, 'SCH')): self.MetaphAdd('T') else: self.MetaphAdd('0', 'T') self.current += 2 return if self.s[self.current + 1] in 'TD': self.current += 2 else: self.current += 1 self.MetaphAdd('T') return elif self.s[self.current] == 'V': if DEBUG: self.print_diags('V') if self.s[self.current + 1] == 'V': self.current += 2 else: self.current += 1 self.MetaphAdd('F') return elif self.s[self.current] == 'W': if DEBUG: self.print_diags('W') if self.StringAt(self.current, 2, 'WR'): self.MetaphAdd('R') self.current += 2 return if (self.current == 0 and (self.IsVowel(self.current + 1) or self.StringAt(self.current, 2, 'WH'))): if self.IsVowel(self.current + 1): self.MetaphAdd('A', 'F') else: self.MetaphAdd('A') if ((self.current == self.last and self.IsVowel(self.current - 1)) or self.StringAt(self.current - 1, 5, 'EWSKI', 'EWSKY', 'OWSKI', 'OWSKY') or self.StringAt(0, 3, 'SCH')): self.MetaphAdd('', 'F') self.current += 1 return if self.StringAt(self.current, 4, 'WICZ', 'WITZ'): self.MetaphAdd('TS', 'FX') self.current += 4 return self.current += 1 return elif self.s[self.current] == 'X': if DEBUG: self.print_diags('X') if (not (self.current == self.last and (self.StringAt(self.current - 3, 3, 'IAU', 'EAU') or self.StringAt(self.current - 2, 2, 'AU', 'OU')))): self.MetaphAdd('KS') if self.s[self.current + 1] in 'CX': self.current += 2 else: self.current += 1 elif self.s[self.current] == 'Z': if DEBUG: self.print_diags('Z') if self.s[self.current + 1] == 'H': self.MetaphAdd('J') self.current += 2 return else: if (self.StringAt(self.current + 1, 2, 'ZO', 'ZI', 'ZA') or (self.SlavoGermanic() and (self.current > 0 and self.s[self.current - 1] <> 'T'))): self.MetaphAdd('S', 'TS') else: self.MetaphAdd('S') if self.s[self.current + 1] == 'Z': self.current += 2 else: self.current += 1 else: self.current += 1 def __str__(self): if self.metaph2: return "<%s: %s, %s>" % (self.s.strip(), self.metaph, self.metaph2) else: return "<%s: %s>" % (self.s.strip(), self.metaph) PUNCTUATION = """\.\,\?\-\(\)\:\;\"\'\!\#""" def metaphonize_string(words): words = re.sub(r'[%s]' % PUNCTUATION, '', words) metaphones = [] for word in words.split(): w = DMetaph(word) metaphones.append(w.metaph) return metaphones if __name__ == '__main__': for word in open('/usr/share/dict/words', 'rb'): w = DMetaph(word) print w print metaphonize_string('Now is the time for all good men to come to the aid of their country')
-- http://mail.python.org/mailman/listinfo/python-list