Package native :: Package strings :: Module jarowpy
[hide private]
[frames] | no frames]

Source Code for Module native.strings.jarowpy

  1  # Copyright Roger Binns <rogerb@rogerbinns.com> 
  2  # 
  3  # This file is under the BitPim license. 
  4  # 
  5  # Alternatively you can use it under any OSI certified license as listed 
  6  # at http://www.opensource.org/licenses/alphabetical 
  7  # 
  8  # Jaro/Winkler string match algorithm.  There is a native C 
  9  # implementation in this directory as well.  There is an excellent 
 10  # paper on the "quality" of various string matching algorithms at 
 11  # http://www.isi.edu/info-agents/workshops/ijcai03/papers/Cohen-p.pdf 
 12  # as well as Java implementations by the same authors at 
 13  # http://secondstring.sf.net 
 14  # 
 15  # Most implementations make copies of the source strings, scribble '*' 
 16  # all over the copies, and will compute the entire commonchars even if 
 17  # the lengths won't match early on. 
 18  # 
 19  # My version uses a bitset to track which chars have been seen when 
 20  # looking for the common ones, won't give misleading results if '*' is 
 21  # present in the string and various other goodness. 
 22  # 
 23  # The general algorithm is as follows: 
 24  # 
 25  #   - Compute the matching characters from s1 in s2.  The range 
 26  #     searched is half of the length of the shortest string on either 
 27  #     side of the current position.  Characters should only be matched 
 28  #     once (we use a bitset to mark matched chars - many 
 29  #     implementations make a copied string and replace the characters 
 30  #     with '*' 
 31  #  
 32  #   - Repeat the process with s2 in s1 
 33  # 
 34  #   - If the matching chars are zero or different length, then return 
 35  #     zero. 
 36  # 
 37  #   - Do the math as show below, which also takes into account how 
 38  #     different the two match strings are (transitions) 
 39  # 
 40  #   - The Winkler addition gives a bonus for how many characters at 
 41  #     the begining of both strings are the same since most strings 
 42  #     that are the same have misspellings later in the string. 
 43  # 
 44  # Note that if you want caseless comparison then the strings should be 
 45  # converted beforehand. It isn't efficient to do it in this code. 
 46  # 
 47  # This implementation treats all characters equally.  A refinement is 
 48  # partial scores for similar characters (such as 'i' and 'y' or 'b' 
 49  # and 'd').  That requires training on a body of strings so we don't 
 50  # bother. 
 51   
 52   
53 -def jarow(s1, s2, winkleradjust=0):
54 if len(s1)==0 or len(s2)==0: return 0 55 if s1==s2: return 1 56 halflen=min(len(s1)/2+1, len(s2)/2+1) 57 58 s1pos=0 59 s1seenins2=[0]*len(s2) 60 s2pos=0 61 s2seenins1=[0]*len(s1) 62 transpositions=0 63 commonlen=0 64 65 while s1pos<len(s1) or s2pos<len(s2): 66 # find the next common char from s1 in s2 67 s1char=None 68 while s1pos<len(s1): 69 c=s1[s1pos] 70 for i in xrange(max(0,s1pos-halflen), min(len(s2),s1pos+halflen)): 71 if s1seenins2[i]: continue 72 if c==s2[i]: 73 s1char=c 74 s1seenins2[i]=1 75 break 76 s1pos+=1 77 if s1char is not None: 78 break 79 # find the next common char from s2 in s1 80 s2char=None 81 while s2pos<len(s2): 82 c=s2[s2pos] 83 for i in xrange(max(0,s2pos-halflen), min(len(s1),s2pos+halflen)): 84 if s2seenins1[i]: continue 85 if c==s1[i]: 86 s2char=c 87 s2seenins1[i]=1 88 break 89 s2pos+=1 90 if s2char is not None: 91 break 92 93 if s1char==None and s2char==None: 94 break 95 if s1char!=None and s2char==None: 96 return 0 97 if s1char==None and s2char!=None: 98 return 0 99 commonlen+=1 100 if s1char!=s2char: 101 transpositions+=1 102 103 if commonlen==0: return 0 104 transpositions/=2 105 dist=commonlen/float(len(s1)) + commonlen/float(len(s2)) + (commonlen-transpositions)/float(commonlen) 106 dist/=3.0 107 108 if winkleradjust: 109 for common in range(min(len(s1)+1, len(s2)+1, winkleradjust)): 110 if common>=len(s1) or common>=len(s2): break 111 if s1[common]!=s2[common]: 112 break 113 dist = dist + common * 0.1 * (1-dist) 114 115 return dist
116