0001 # Copyright Roger Binns <rogerb@rogerbinns.com> 0002 # 0003 # This file is under the BitPim license. 0004 # 0005 # Alternatively you can use it under any OSI certified license as listed 0006 # at http://www.opensource.org/licenses/alphabetical 0007 # 0008 # Jaro/Winkler string match algorithm. There is a native C 0009 # implementation in this directory as well. There is an excellent 0010 # paper on the "quality" of various string matching algorithms at 0011 # http://www.isi.edu/info-agents/workshops/ijcai03/papers/Cohen-p.pdf 0012 # as well as Java implementations by the same authors at 0013 # http://secondstring.sf.net 0014 # 0015 # Most implementations make copies of the source strings, scribble '*' 0016 # all over the copies, and will compute the entire commonchars even if 0017 # the lengths won't match early on. 0018 # 0019 # My version uses a bitset to track which chars have been seen when 0020 # looking for the common ones, won't give misleading results if '*' is 0021 # present in the string and various other goodness. 0022 # 0023 # The general algorithm is as follows: 0024 # 0025 # - Compute the matching characters from s1 in s2. The range 0026 # searched is half of the length of the shortest string on either 0027 # side of the current position. Characters should only be matched 0028 # once (we use a bitset to mark matched chars - many 0029 # implementations make a copied string and replace the characters 0030 # with '*' 0031 # 0032 # - Repeat the process with s2 in s1 0033 # 0034 # - If the matching chars are zero or different length, then return 0035 # zero. 0036 # 0037 # - Do the math as show below, which also takes into account how 0038 # different the two match strings are (transitions) 0039 # 0040 # - The Winkler addition gives a bonus for how many characters at 0041 # the begining of both strings are the same since most strings 0042 # that are the same have misspellings later in the string. 0043 # 0044 # Note that if you want caseless comparison then the strings should be 0045 # converted beforehand. It isn't efficient to do it in this code. 0046 # 0047 # This implementation treats all characters equally. A refinement is 0048 # partial scores for similar characters (such as 'i' and 'y' or 'b' 0049 # and 'd'). That requires training on a body of strings so we don't 0050 # bother. 0051 0052 0053 def jarow(s1, s2, winkleradjust=0): 0054 if len(s1)==0 or len(s2)==0: return 0 0055 if s1==s2: return 1 0056 halflen=min(len(s1)/2+1, len(s2)/2+1) 0057 0058 s1pos=0 0059 s1seenins2=[0]*len(s2) 0060 s2pos=0 0061 s2seenins1=[0]*len(s1) 0062 transpositions=0 0063 commonlen=0 0064 0065 while s1pos<len(s1) or s2pos<len(s2): 0066 # find the next common char from s1 in s2 0067 s1char=None 0068 while s1pos<len(s1): 0069 c=s1[s1pos] 0070 for i in xrange(max(0,s1pos-halflen), min(len(s2),s1pos+halflen)): 0071 if s1seenins2[i]: continue 0072 if c==s2[i]: 0073 s1char=c 0074 s1seenins2[i]=1 0075 break 0076 s1pos+=1 0077 if s1char is not None: 0078 break 0079 # find the next common char from s2 in s1 0080 s2char=None 0081 while s2pos<len(s2): 0082 c=s2[s2pos] 0083 for i in xrange(max(0,s2pos-halflen), min(len(s1),s2pos+halflen)): 0084 if s2seenins1[i]: continue 0085 if c==s1[i]: 0086 s2char=c 0087 s2seenins1[i]=1 0088 break 0089 s2pos+=1 0090 if s2char is not None: 0091 break 0092 0093 if s1char==None and s2char==None: 0094 break 0095 if s1char!=None and s2char==None: 0096 return 0 0097 if s1char==None and s2char!=None: 0098 return 0 0099 commonlen+=1 0100 if s1char!=s2char: 0101 transpositions+=1 0102 0103 if commonlen==0: return 0 0104 transpositions/=2 0105 dist=commonlen/float(len(s1)) + commonlen/float(len(s2)) + (commonlen-transpositions)/float(commonlen) 0106 dist/=3.0 0107 0108 if winkleradjust: 0109 for common in range(min(len(s1)+1, len(s2)+1, winkleradjust)): 0110 if common>=len(s1) or common>=len(s2): break 0111 if s1[common]!=s2[common]: 0112 break 0113 dist = dist + common * 0.1 * (1-dist) 0114 0115 return dist 0116
Generated by PyXR 0.9.4