On Mon, Aug 11, 2008 at 12:13 PM, Dave Webster <[EMAIL PROTECTED]> wrote:
> Thanks, Timothy.  I'm pretty sure that there is no such thing as a "beautiful"
> implementation of double-metaphone but I would personally like to have a copy
> of your python implementation.  I have a fairly elegant version of the 
> original
> metaphone algorithm I wrote myself (in PERL, many years ago) but I've
> never found
> the time to reverse-engineer the original C++ code for double-metaphone and
> "pythonize" it.
>
> On Mon, Aug 11, 2008 at 2:08 PM, Timothy Grant <[EMAIL PROTECTED]> wrote:
>> On Mon, Aug 11, 2008 at 8:44 AM, Casey <[EMAIL PROTECTED]> wrote:
>>> My first thought is that you should be looking at implementations of
>>> Hamming Distance.  If you are actually looking for something like
>>> SOUNDEX you might also want to look at the double metaphor algorithm,
>>> which is significantly harder to implement but provides better
>>> matching and is less susceptible to differences based on name origins.
>>> --
>>> http://mail.python.org/mailman/listinfo/python-list
>>>
>>
>> I responded in the thread of the poster's original message on this
>> subject, but will do the same here. I have a horribly ugly version of
>> the double-metaphone algorithm in python that does work, and may be of
>> some use in solving this problem.
>>
>> --
>> Stand Fast,
>> tjg. [Timothy Grant]
>>
>
This is truly cringe-worthy, and pretty much a direct port of the C++
code. It need unit tests (which are on my "to-do someday" list) but
even though it's ugly it does work and I have managed to do real work
with it.


-- 
Stand Fast,
tjg. [Timothy Grant]
#
# DMetaph.py
#
# Copyright 2006 by Timothy (rhacer) Grant
#
# Based on an algorithm and code by Lawrence Philips
# 
# This code is licensed under the terms of the GNU Public License v2
#

import re

DEBUG = False

class DMetaph(str):
    """
    DMetaph(<word>) creates a Double Metaphone encoding of the word passed in.
    
    
    """
    def __init__(self, s):
        self.s = s.upper() + '     '  # Padding allows safe indexing beyond the end
        self.length = len(s)
        self.last = self.length - 1
        
        self.current = 0
        self.primary = ""
        self.secondary = ""
        self.alternate = False
        
        if self.StringAt(0, 2, 'GN', 'KN', 'PN', 'WR', 'PS'):
            self.current += 1
            
        if self.s[0] == 'X':
            self.MetaphAdd('S')
            self.current += 1
        
        while len(self.primary) < 4 or len(self.secondary) < 4:
            if DEBUG: print "processing character: %s current: %s length: %s" % (
                    self.s[self.current], self.current, self.length)
            
            if self.current >= self.length:
                break
        
            self.ProcessString()
            
        self.metaph = self.primary
        self.metaph2 = ''
        if len(self.metaph) > 4:
            self.metaph = self.metaph[:4]
        
        if self.alternate:
            self.metaph2 = self.secondary
            if len(self.metaph2) > 4:
                self.metaph2 = self.metaph2[:4]
        
        
    def SlavoGermanic(self):
        if ('W' in self.s or 'K' in self.s 
                or 'CZ' in self.s or 'WITZ' in self.s):
            return True
        return False

    def MetaphAdd(self, main, alt=None):
        if main:
            self.primary += main
        
        if alt:
            self.alternate = True

            if (alt[0] <> ' '):
                self.secondary += alt
        else:
            if (main[0] <> ' '):
                self.secondary += main

    def IsVowel(self, at):
        if (at < 0) or (at > self.length):
            return False
        
        if self.s[at] in 'AEIOUY':
            return True
            
        return False

    def StringAt(self, start, length, *sub_strings):
        if self.s[start:start + length] in sub_strings:
            return True

        return False
        
    def print_diags(self, section):
        print"section: %s current: %s" % (section, self.current)

    def ProcessString(self):
        if self.IsVowel(self.current):
            if DEBUG: self.print_diags('VOWEL')

            if self.current == 0:
                self.MetaphAdd('A')

            self.current += 1
            return

        elif self.s[self.current] == 'B':
            if DEBUG: self.print_diags('B')

            self.MetaphAdd('P')
            
            if self.s[self.current + 1] == 'B':
                self.current += 2
            else:
                self.current += 1

            return
                                
        elif self.s[self.current] == 'C':
            if DEBUG: self.print_diags('C')

            if (self.current > 1
                    and not self.IsVowel(self.current - 2)
                        and self.StringAt(self.current - 1, 3, 'ACH')
                            and self.s[self.current + 2] <> 'I' 
                                and self.s[self.current + 2] <> 'E'
                                    or self.StringAt(self.current - 2, 6, 'BACHER', 'MACHER')):

                self.MetaphAdd('K')
                self.current += 2
                return

            if self.current == 0 and self.StringAt(self.current, 6, 'CAESAR'):
                self.MetaphAdd('S')
                self.current += 2
                return

            if self.StringAt(self.current, 4, 'CHIA'):
                self.MetaphAdd('K')
                self.current += 2
                return

            if self.StringAt(self.current, 2, 'CH'):
                if self.current > 0 and self.StringAt(self.current, 4, 'CHAE'):
                    self.MetaphAdd('K', 'X')
                    self.current += 2
                    return

                if (self.current == 0
                        and (self.StringAt(self.current + 1, 5, 'HARAC', 'HARIS')
                            or self.StringAt(self.current + 1, 3, 'HOR', 'HYM', 'HIA', 'HEM'))
                        and not self.StringAt(0, 5, 'CHORE')):
                    self.MetaphAdd('K')
                    self.current += 2
                    return

                if ((self.StringAt(0, 4, 'VAN ', 'VON ') or self.StringAt(0, 3, 'SCH'))
                        or self.StringAt(self.current - 2, 6, 'ORCHES', 'ARCHIT', 'ORCHID')
                        or self.s[self.current - 2] in 'TS'
                        or (self.s[self.current - 1] in 'AOUE' or self.current == 0)
                        and self.s[self.current + 2] in 'LRNMBHFVW '):
                    self.MetaphAdd('K')
                else:
                    if self.current > 0:
                        if self.StringAt(0, 2, 'MC'):
                            self.MetaphAdd('K')
                        else:
                            self.MetaphAdd('X', 'K')
                    else:
                        self.MetaphAdd('X')

                self.current += 2
                return

            if (self.StringAt(self.current, 2, 'CZ') 
                    and not self.StringAt(self.current - 2, 4, 'WICZ')):
                self.MetaphAdd('S', 'X')
                self.current += 2
                return

            if self.StringAt(self.current + 1, 3, 'CIA'):
                self.MetaphAdd('X')
                self.current += 3
                return

            if (self.StringAt(self.current, 2, 'CC') 
                    and not (self.current == 1 and self.s[0] == 'M')):

                if (self.s[self.current + 2] in 'IEH' 
                        and not self.StringAt(self.current + 2, 2, 'HU')):
                    if (self.current == 1 and self.s[self.current - 1] == 'A'
                            or self.StringAt(self.current - 1, 5, 'UCCEE', 'UCCES')):
                        self.MetaphAdd('KS')
                    else:
                        self.MetaphAdd('X')
                    self.current += 3
                    return
                else:
                    self.MetaphAdd('K')
                    self.current += 2
                    return

            if self.StringAt(self.current, 2, 'CK', 'CG', 'CQ'):
                self.MetaphAdd('K')
                self.current += 2
                return

            if self.StringAt(self.current, 2, 'CI', 'CE', 'CY'):
                if self.StringAt(self.current, 3, 'CIO', 'CIE', 'CIA'):
                    self.MetaphAdd('S', 'X')
                else:
                    self.MetaphAdd('S')

                self.current += 2
                return
            
            self.MetaphAdd('K')
            
            if self.StringAt(self.current + 1, 2, ' C', ' Q', ' G'):
                self.current += 3
            else:
                if (self.s[self.current + 1] in 'CKQ' 
                        and not self.StringAt(self.current + 1, 2, 'CE', 'CI')):
                    self.current += 2
                else:
                    self.current += 1
            return

        elif self.s[self.current] == 'D':
            if DEBUG: self.print_diags('D')

            if self.StringAt(self.current, 2, 'DG'):
                if self.s[self.current + 2] in 'IEY':
                    self.MetaphAdd('J')
                    self.current += 3
                    return
                else:
                    self.MetaphAdd('TK')
                    self.current += 2

            if self.StringAt(self.current, 2, 'DT', 'DD'):
                self.MetaphAdd('T')
                self.current += 2
                return

            self.MetaphAdd('T')
            self.current += 1
            return

        elif self.s[self.current] == 'F':
            if DEBUG: self.print_diags('F')

            if self.s[self.current + 1] == 'F':
                self.current += 2
            else:
                self.current += 1

            self.MetaphAdd('F')
            return

        elif self.s[self.current] == 'G':
            if DEBUG: self.print_diags('G')

            if self.s[self.current + 1] == 'H':
                if self.current > 0 and not self.IsVowel(self.current - 1):
                    self.MetaphAdd('K')
                    self.current += 2
                    return

                if self.current < 3:
                    if self.current == 0:
                        if self.s[self.current + 2] == 'I':
                            self.MetaphAdd('J')
                        else:
                            self.MetaphAdd('K')
                        self.current += 2
                        return

                if ((self.current > 1 and self.s[self.current - 2] in 'BHD')
                        or (self.current > 2 and self.s[self.current - 3] in 'BHD')
                        or (self.current > 3 and self.s[self.current - 4] in 'BH')):
                    self.current += 2
                    return
                else:
                    if (self.current > 2 
                            and self.s[self.current - 1] == 'U' 
                                and self.s[self.current - 3] in 'CGLRT'):
                        self.MetaphAdd('F')
                    else:
                        if self.current > 0 and self.s[self.current - 1] <> 'I':
                            self.MetaphAdd('K')

                    self.current += 2
                    return

            if self.s[self.current + 1] == 'N':
                if self.current == 1 and self.IsVowel(0) and not self.SlavoGermanic():
                    self.MetaphAdd('KN', 'N')
                else:
                    if (not self.StringAt(self.current + 2, 2, 'EY')
                            and self.s[self.current + 1] <> 'Y'
                                and not self.SlavoGermanic()):
                        self.MetaphAdd('N', 'KN')
                    else:
                        self.MetaphAdd('KN')

                self.current += 2
                return

            if self.StringAt(self.current + 1, 2, 'LI') and not self.SlavoGermanic():
                self.MetaphAdd('KL', 'L')
                self.current += 2
                return

            if ((self.current == 0 
                    and self.s[self.current + 1] == 'Y')
                        or self.StringAt(self.current + 1, 2, 'ES', 'EP', 'EB', 
                                                              'EL', 'EY', 'IB',
                                                              'IL', 'IN', 'IE',
                                                              'EI', 'ER')):
                self.MetaphAdd('K', 'J')
                self.current += 2
                return

            if (self.StringAt(self.current + 1, 2, 'ER') 
                    or self.s[self.current + 1] == 'Y'
                        and not self.StringAt(0, 6, 'DANGER', 'RANGER', 'MANGER')
                            and not self.s[self.current - 1] in 'EI'
                                and not self.StringAt(self.current - 1, 3, 'RGY', 'OGY')):

                self.MetaphAdd('K', 'J')
                self.current += 2
                return

            if (self.s[self.current + 1] in 'EIY' 
                    or self.StringAt(self.current - 1, 4, 'AGGI', 'OGGI')):

                if ((self.StringAt(0, 4, 'VAN ', 'VON ') or self.StringAt(0, 3, 'SCH'))
                        or self.StringAt(self.current + 1, 2, 'ET')):

                    self.MetaphAdd('K')

                else:
                    if self.StringAt(self.current + 1, 4, 'IER'):
                        self.MetaphAdd('J')
                    else:
                        self.MetaphAdd('J', 'K')

                self.current += 2
                return

            if self.s[self.current + 1] == 'G':
                self.current += 2
            else:
                self.current += 1

            self.MetaphAdd('K')
            return

        elif self.s[self.current] == 'H':
            if DEBUG: self.print_diags('H')

            if ((self.current == 0 or self.IsVowel(self.current - 1))
                    and self.IsVowel(self.current + 1)):

                self.MetaphAdd('H')
                self.current += 2

            else:
                self.current += 1
            return

        elif self.s[self.current] == 'J':
            if DEBUG: self.print_diags('J')

            if self.StringAt(self.current, 4, 'JOSE') or self.StringAt(0, 4, 'SAN '):
                if ((self.current == 0 and self.s[self.current + 4] == ' ')
                        or self.StringAt(0, 4, 'SAN ')):
                    self.MetaphAdd('H')
                else:
                    self.MetaphAdd('J', 'H')
                self.current += 1
                return

            if self.current == 0 and not self.StringAt(self.current, 4, 'JOSE'):
                self.MetaphAdd('J', 'A')
            else:
                if (self.IsVowel(self.current - 1) 
                        and not self.SlavoGermanic()
                            and (self.s[self.current + 1] == 'A' 
                                    or self.s[self.current + 1] == 'O')):
                    self.MetaphAdd('J', 'H')
                else:
                    if self.current == self.last:
                        self.MetaphAdd('J', ' ')
                    else:
                        if (not self.s[self.current + 1] in 'LTKSNMBZ'
                                and not self.s[self.current - 1] in 'SKL'):
                            self.MetaphAdd('J')

            if self.s[self.current + 1] == 'J':
                self.current += 2
            else:
                self.current += 1
            return

        elif self.s[self.current] == 'K':
            if DEBUG: self.print_diags('K')

            if self.s[self.current + 1] == 'K':
                self.current += 2
            else:
                self.current += 1
            self.MetaphAdd('K')
            return

        elif self.s[self.current] == 'L':
            if DEBUG: self.print_diags('L')

            if self.s[self.current + 1] == 'L':
                if ((self.current == self.length - 3 
                        and self.StringAt(self.current - 1, 4, 'ILLO', 'ILLA', 'ALLE'))
                                 or ((self.StringAt(self.last - 1, 2, 'AS', 'OS') 
                                        or self.StringAt(self.last, 1, 'A', 'O')) 
                                            and self.StringAt(self.current - 1, 4, 'ALLE'))):

                    self.MetaphAdd('L', ' ')
                    self.current += 2
                    return

                self.current += 2
            else:
                self.current += 1
            self.MetaphAdd('L')
            return

        elif self.s[self.current] == 'M':
            if DEBUG: self.print_diags('M')

            if ((self.StringAt(self.current - 1, 3, 'UMB') 
                    and ((self.current + 1 == self.last) 
                            or self.StringAt(self.current + 2, 2, 'ER')))
                        or (self.s[self.current + 1] == 'M')):

                self.current += 2
            else:
                self.current += 1

            self.MetaphAdd('M')
            return

        elif self.s[self.current] == 'N':
            if DEBUG: self.print_diags('N')

            if self.s[self.current + 1] == 'N':
                self.current += 2
            else:
                self.current += 1
            self.MetaphAdd('N')
            return

        elif self.s[self.current] == 'P':
            if DEBUG: self.print_diags('P')

            if self.s[self.current + 1] == 'H':
                self.MetaphAdd('F')
                self.current += 2
                return

            if self.s[self.current + 1] in 'PB':
                self.current += 2
            else:
                self.current += 1

            self.MetaphAdd('P')
            return

        elif self.s[self.current] == 'Q':
            if DEBUG: self.print_diags('Q')

            if self.s[self.current + 1] == 'Q':
                self.current += 2
            else:
                self.current += 1

            self.MetaphAdd('K')
            return

        elif self.s[self.current] == 'R':
            if DEBUG: self.print_diags('R')

            if (self.current == self.last
                    and not self.SlavoGermanic()
                        and self.StringAt(self.current - 2, 2, 'IE')
                            and not self.StringAt(self.current - 4, 2, 'ME', 'MA')):

                self.MetaphAdd('','R')
            else:
                self.MetaphAdd('R')

            if self.s[self.current + 1] == 'R':
                self.current += 2
            else:
                self.current += 1

            return

        elif self.s[self.current] == 'S':
            if DEBUG: self.print_diags('S')

            if self.StringAt(self.current + 1, 3, 'ISL', 'YSL'):
                self.current += 1
                return

            if self.current == 0 and self.StringAt(self.current, 5, 'SUGAR'):
                self.MetaphAdd('X', 'S')
                self.current += 1

            if self.StringAt(self.current, 2, 'SH'):
                if self.StringAt(self.current + 1, 4, 'HEIM', 'HOEK', 'HOLM', 'HOLZ'):
                    self.MetaphAdd('S')
                else:
                    self.MetaphAdd('X')
                self.current += 2
                return

            if (self.StringAt(self.current, 3, 'SIO', 'SIA')
                    or self.StringAt(self.current, 4, 'SIAN')):

                if not self.SlavoGermanic():
                    self.MetaphAdd('S', 'X')
                else:
                    self.MetaphAdd('S')
                self.current += 3
                return

            if ((self.current == 0
                    and self.s[self.current + 1] in 'MNLW')
                        or self.s[self.current + 1] == 'Z'):

                self.MetaphAdd('S', 'X')

                if self.s[self.current + 1] == 'Z':
                    self.current += 2
                else:
                    self.current += 1
                return

            if self.StringAt(self.current, 2, 'SC'):
                if self.s[self.current + 2] == 'H':
                    if self.StringAt(self.current + 3, 2, 'OO', 'ER', 'EN', 
                                                          'UY', 'ED', 'EM'):
                        if self.StringAt(self.current + 3, 2, 'ER', 'EN'):
                            self.MetaphAdd('X', 'SK')
                        else:
                            self.MetaphAdd('SK')
                        self.current += 3
                        return
                    else:
                        if (self.current == 0 and not self.IsVowel(3) and self.s[3] <> 'W'):
                            self.MetaphAdd('X', 'S')
                        else:
                            self.MetaphAdd('X')
                        self.current += 3
                        return

                if self.s[self.current + 2] in 'IEY':
                    self.MetaphAdd('S')
                    self.current += 3
                    return

                self.MetaphAdd('SK')
                self.current += 3
                return

            if (self.current == self.last
                    and self.StringAt(self.current - 2, 2, 'AI', 'OI')):
                self.MetaphAdd('', 'S')
            else:
                self.MetaphAdd('S')

            if self.s[self.current + 1] in 'SZ':
                self.current += 2
            else:
                self.current += 1
            return

        elif self.s[self.current] == 'T':
            if DEBUG: self.print_diags('T')

            if self.StringAt(self.current, 4, 'TION'):
                self.MetaphAdd('X')
                self.current += 3
                return

            if self.StringAt(self.current, 3, 'TIA', 'TCH'):
                self.MetaphAdd('X')
                self.current += 3
                return

            if (self.StringAt(self.current, 2, 'TH')
                    or self.StringAt(self.current, 3, 'TTH')):

                if (self.StringAt(self.current + 2, 2, 'OM', 'AM')
                        or self.StringAt(0, 4, 'VAN ', 'VON ')
                            or self.StringAt(0, 3, 'SCH')):
                    self.MetaphAdd('T')
                else:
                    self.MetaphAdd('0', 'T')

                self.current += 2
                return

            if self.s[self.current + 1] in 'TD':
                self.current += 2
            else:
                self.current += 1

            self.MetaphAdd('T')
            return

        elif self.s[self.current] == 'V':
            if DEBUG: self.print_diags('V')

            if self.s[self.current + 1] == 'V':
                self.current += 2
            else:
                self.current += 1

            self.MetaphAdd('F')
            return

        elif self.s[self.current] == 'W':
            if DEBUG: self.print_diags('W')

            if self.StringAt(self.current, 2, 'WR'):
                self.MetaphAdd('R')
                self.current += 2
                return

            if (self.current == 0
                    and (self.IsVowel(self.current + 1)
                            or self.StringAt(self.current, 2, 'WH'))):
                if self.IsVowel(self.current + 1):
                    self.MetaphAdd('A', 'F')
                else:
                    self.MetaphAdd('A')

            if ((self.current == self.last 
                    and self.IsVowel(self.current - 1)) 
                        or self.StringAt(self.current - 1, 5, 'EWSKI', 'EWSKY', 'OWSKI', 'OWSKY') 
                            or self.StringAt(0, 3, 'SCH')):

                self.MetaphAdd('', 'F')
                self.current += 1
                return

            if self.StringAt(self.current, 4, 'WICZ', 'WITZ'):
                self.MetaphAdd('TS', 'FX')
                self.current += 4
                return

            self.current += 1
            return

        elif self.s[self.current] == 'X':
            if DEBUG: self.print_diags('X')

            if (not (self.current == self.last 
                    and (self.StringAt(self.current - 3, 3, 'IAU', 'EAU') 
                        or self.StringAt(self.current - 2, 2, 'AU', 'OU')))):

                self.MetaphAdd('KS')

            if self.s[self.current + 1] in 'CX':
                self.current += 2
            else:
                self.current += 1

        elif self.s[self.current] == 'Z':
            if DEBUG: self.print_diags('Z')

            if self.s[self.current + 1] == 'H':
                self.MetaphAdd('J')
                self.current += 2
                return
            else:
                if (self.StringAt(self.current + 1, 2, 'ZO', 'ZI', 'ZA') 
                        or (self.SlavoGermanic() 
                                and (self.current > 0 
                                        and self.s[self.current - 1] <> 'T'))):
                    self.MetaphAdd('S', 'TS')
                else:
                    self.MetaphAdd('S')

            if self.s[self.current + 1] == 'Z':
                self.current += 2
            else:
                self.current += 1
        else:
            self.current += 1

    def __str__(self):
        if self.metaph2:
            return "<%s: %s, %s>" % (self.s.strip(), self.metaph, self.metaph2)
        else:
            return "<%s: %s>" % (self.s.strip(), self.metaph)

PUNCTUATION = """\.\,\?\-\(\)\:\;\"\'\!\#"""            
def metaphonize_string(words):
    words = re.sub(r'[%s]' % PUNCTUATION, '', words)
    
    metaphones = []
    for word in words.split():
        w = DMetaph(word)
        metaphones.append(w.metaph)
    return metaphones
            
if __name__ == '__main__':
    for word in open('/usr/share/dict/words', 'rb'):
        w = DMetaph(word)
        print w
        
    print metaphonize_string('Now is the time for all good men to come to the aid of their country')
--
http://mail.python.org/mailman/listinfo/python-list

Reply via email to