PyXR

c:\projects\bitpim\src \ native \ strings \ jarowpy.py


0001 # Copyright Roger Binns <rogerb@rogerbinns.com>
0002 #
0003 # This file is under the BitPim license.
0004 #
0005 # Alternatively you can use it under any OSI certified license as listed
0006 # at http://www.opensource.org/licenses/alphabetical
0007 #
0008 # Jaro/Winkler string match algorithm.  There is a native C
0009 # implementation in this directory as well.  There is an excellent
0010 # paper on the "quality" of various string matching algorithms at
0011 # http://www.isi.edu/info-agents/workshops/ijcai03/papers/Cohen-p.pdf
0012 # as well as Java implementations by the same authors at
0013 # http://secondstring.sf.net
0014 #
0015 # Most implementations make copies of the source strings, scribble '*'
0016 # all over the copies, and will compute the entire commonchars even if
0017 # the lengths won't match early on.
0018 #
0019 # My version uses a bitset to track which chars have been seen when
0020 # looking for the common ones, won't give misleading results if '*' is
0021 # present in the string and various other goodness.
0022 #
0023 # The general algorithm is as follows:
0024 #
0025 #   - Compute the matching characters from s1 in s2.  The range
0026 #     searched is half of the length of the shortest string on either
0027 #     side of the current position.  Characters should only be matched
0028 #     once (we use a bitset to mark matched chars - many
0029 #     implementations make a copied string and replace the characters
0030 #     with '*'
0031 # 
0032 #   - Repeat the process with s2 in s1
0033 #
0034 #   - If the matching chars are zero or different length, then return
0035 #     zero.
0036 #
0037 #   - Do the math as show below, which also takes into account how
0038 #     different the two match strings are (transitions)
0039 #
0040 #   - The Winkler addition gives a bonus for how many characters at
0041 #     the begining of both strings are the same since most strings
0042 #     that are the same have misspellings later in the string.
0043 #
0044 # Note that if you want caseless comparison then the strings should be
0045 # converted beforehand. It isn't efficient to do it in this code.
0046 #
0047 # This implementation treats all characters equally.  A refinement is
0048 # partial scores for similar characters (such as 'i' and 'y' or 'b'
0049 # and 'd').  That requires training on a body of strings so we don't
0050 # bother.
0051 
0052 
0053 def jarow(s1, s2, winkleradjust=0):
0054     if len(s1)==0 or len(s2)==0: return 0
0055     if s1==s2: return 1
0056     halflen=min(len(s1)/2+1, len(s2)/2+1)
0057 
0058     s1pos=0
0059     s1seenins2=[0]*len(s2)
0060     s2pos=0
0061     s2seenins1=[0]*len(s1)
0062     transpositions=0
0063     commonlen=0
0064 
0065     while s1pos<len(s1) or s2pos<len(s2):
0066         # find the next common char from s1 in s2
0067         s1char=None
0068         while s1pos<len(s1):
0069             c=s1[s1pos]
0070             for i in xrange(max(0,s1pos-halflen), min(len(s2),s1pos+halflen)):
0071                 if s1seenins2[i]: continue
0072                 if c==s2[i]:
0073                     s1char=c
0074                     s1seenins2[i]=1
0075                     break
0076             s1pos+=1
0077             if s1char is not None:
0078                 break
0079         # find the next common char from s2 in s1
0080         s2char=None
0081         while s2pos<len(s2):
0082             c=s2[s2pos]
0083             for i in xrange(max(0,s2pos-halflen), min(len(s1),s2pos+halflen)):
0084                 if s2seenins1[i]: continue
0085                 if c==s1[i]:
0086                     s2char=c
0087                     s2seenins1[i]=1
0088                     break
0089             s2pos+=1
0090             if s2char is not None:
0091                 break
0092 
0093         if s1char==None and s2char==None:
0094             break
0095         if s1char!=None and s2char==None:
0096             return 0
0097         if s1char==None and s2char!=None:
0098             return 0
0099         commonlen+=1
0100         if s1char!=s2char:
0101             transpositions+=1
0102 
0103     if commonlen==0: return 0
0104     transpositions/=2
0105     dist=commonlen/float(len(s1)) + commonlen/float(len(s2)) + (commonlen-transpositions)/float(commonlen)
0106     dist/=3.0
0107 
0108     if winkleradjust:
0109         for common in range(min(len(s1)+1, len(s2)+1, winkleradjust)):
0110             if common>=len(s1) or common>=len(s2): break
0111             if s1[common]!=s2[common]:
0112                 break
0113         dist = dist + common * 0.1 * (1-dist)
0114 
0115     return dist
0116
Generated by PyXR 0.9.4