4419 |
27 Aug 08 |
nicklas |
1 |
/** |
4419 |
27 Aug 08 |
nicklas |
$Id$ |
4419 |
27 Aug 08 |
nicklas |
3 |
|
4419 |
27 Aug 08 |
nicklas |
Copyright (C) 2008 Nicklas Nordborg |
4419 |
27 Aug 08 |
nicklas |
5 |
|
4419 |
27 Aug 08 |
nicklas |
This file is part of BASE - BioArray Software Environment. |
4419 |
27 Aug 08 |
nicklas |
Available at http://base.thep.lu.se/ |
4419 |
27 Aug 08 |
nicklas |
8 |
|
4419 |
27 Aug 08 |
nicklas |
BASE is free software; you can redistribute it and/or |
4419 |
27 Aug 08 |
nicklas |
modify it under the terms of the GNU General Public License |
4479 |
05 Sep 08 |
jari |
as published by the Free Software Foundation; either version 3 |
4419 |
27 Aug 08 |
nicklas |
of the License, or (at your option) any later version. |
4419 |
27 Aug 08 |
nicklas |
13 |
|
4419 |
27 Aug 08 |
nicklas |
BASE is distributed in the hope that it will be useful, |
4419 |
27 Aug 08 |
nicklas |
but WITHOUT ANY WARRANTY; without even the implied warranty of |
4419 |
27 Aug 08 |
nicklas |
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
4419 |
27 Aug 08 |
nicklas |
GNU General Public License for more details. |
4419 |
27 Aug 08 |
nicklas |
18 |
|
4419 |
27 Aug 08 |
nicklas |
You should have received a copy of the GNU General Public License |
4515 |
11 Sep 08 |
jari |
along with BASE. If not, see <http://www.gnu.org/licenses/>. |
4419 |
27 Aug 08 |
nicklas |
21 |
*/ |
4419 |
27 Aug 08 |
nicklas |
22 |
package net.sf.basedb.util.fuzzy; |
4419 |
27 Aug 08 |
nicklas |
23 |
|
4419 |
27 Aug 08 |
nicklas |
24 |
import java.util.Arrays; |
4419 |
27 Aug 08 |
nicklas |
25 |
import java.util.Collection; |
4419 |
27 Aug 08 |
nicklas |
26 |
import java.util.List; |
4419 |
27 Aug 08 |
nicklas |
27 |
|
4419 |
27 Aug 08 |
nicklas |
28 |
import com.wcohen.ss.Level2JaroWinkler; |
4419 |
27 Aug 08 |
nicklas |
29 |
import com.wcohen.ss.api.StringDistance; |
4419 |
27 Aug 08 |
nicklas |
30 |
|
4419 |
27 Aug 08 |
nicklas |
31 |
/** |
4419 |
27 Aug 08 |
nicklas |
A wrapper class for fuzzy string matching using the SecondString package. This |
4419 |
27 Aug 08 |
nicklas |
class uses a given StringDistance object for calculating string similaroties. |
4419 |
27 Aug 08 |
nicklas |
Use {@link #getScore(String, String)} to get the similarity score of two strings. |
4419 |
27 Aug 08 |
nicklas |
<p> |
4419 |
27 Aug 08 |
nicklas |
Use {@link #getBestMatch(String, Collection)} to find the best match of a string among |
4419 |
27 Aug 08 |
nicklas |
strings in a given list. |
4419 |
27 Aug 08 |
nicklas |
<p> |
4419 |
27 Aug 08 |
nicklas |
Use {@link #getBestPairs(Collection, Collection)} to pair up strings in two lists. |
4419 |
27 Aug 08 |
nicklas |
40 |
|
4419 |
27 Aug 08 |
nicklas |
@see <a href="http://secondstring.sourceforge.net/">SecondString project page</a> |
4419 |
27 Aug 08 |
nicklas |
@author nicklas |
4419 |
27 Aug 08 |
nicklas |
@version 2.8 |
4419 |
27 Aug 08 |
nicklas |
@base.modified $Date$ |
4419 |
27 Aug 08 |
nicklas |
45 |
*/ |
4419 |
27 Aug 08 |
nicklas |
46 |
public class StringMatcher |
4419 |
27 Aug 08 |
nicklas |
47 |
{ |
4419 |
27 Aug 08 |
nicklas |
48 |
|
4419 |
27 Aug 08 |
nicklas |
49 |
private final StringDistance matcher; |
4419 |
27 Aug 08 |
nicklas |
50 |
|
4419 |
27 Aug 08 |
nicklas |
51 |
/** |
4419 |
27 Aug 08 |
nicklas |
Create a new matcher using the default fuzzy matching algorithm: |
4419 |
27 Aug 08 |
nicklas |
{@link Level2JaroWinkler}. |
4419 |
27 Aug 08 |
nicklas |
54 |
*/ |
4419 |
27 Aug 08 |
nicklas |
55 |
public StringMatcher() |
4419 |
27 Aug 08 |
nicklas |
56 |
{ |
4419 |
27 Aug 08 |
nicklas |
57 |
this(new Level2JaroWinkler()); |
4419 |
27 Aug 08 |
nicklas |
58 |
} |
4419 |
27 Aug 08 |
nicklas |
59 |
|
4419 |
27 Aug 08 |
nicklas |
60 |
/** |
4419 |
27 Aug 08 |
nicklas |
Create a new matcher using a specific fuzzy matching algorithm. |
4419 |
27 Aug 08 |
nicklas |
@param matcher The algorithm to use |
4419 |
27 Aug 08 |
nicklas |
63 |
*/ |
4419 |
27 Aug 08 |
nicklas |
64 |
public StringMatcher(StringDistance matcher) |
4419 |
27 Aug 08 |
nicklas |
65 |
{ |
4419 |
27 Aug 08 |
nicklas |
66 |
this.matcher = matcher; |
4419 |
27 Aug 08 |
nicklas |
67 |
} |
4419 |
27 Aug 08 |
nicklas |
68 |
|
4419 |
27 Aug 08 |
nicklas |
69 |
/** |
4419 |
27 Aug 08 |
nicklas |
Get the similarity score of two strings. The score is a value between 0 |
4419 |
27 Aug 08 |
nicklas |
and 1, where 0 is a poor match and 1 is a good match. Note! It doesn't |
4419 |
27 Aug 08 |
nicklas |
have to be a perfect match to get a score of 1. |
4419 |
27 Aug 08 |
nicklas |
@param s1 The first string |
4419 |
27 Aug 08 |
nicklas |
@param s2 The second string |
4419 |
27 Aug 08 |
nicklas |
@return The similarity score or 0 if any of the strings are null |
4419 |
27 Aug 08 |
nicklas |
76 |
*/ |
4419 |
27 Aug 08 |
nicklas |
77 |
public double getScore(String s1, String s2) |
4419 |
27 Aug 08 |
nicklas |
78 |
{ |
4419 |
27 Aug 08 |
nicklas |
79 |
if (s1 == null || s2 == null) return 0.0; |
4419 |
27 Aug 08 |
nicklas |
80 |
return matcher.score(s1, s2); |
4419 |
27 Aug 08 |
nicklas |
81 |
} |
4419 |
27 Aug 08 |
nicklas |
82 |
|
4419 |
27 Aug 08 |
nicklas |
83 |
/** |
4419 |
27 Aug 08 |
nicklas |
Find the string that is most similar to a given string in a list of |
4419 |
27 Aug 08 |
nicklas |
strings. |
4419 |
27 Aug 08 |
nicklas |
@param key The string to look for |
4419 |
27 Aug 08 |
nicklas |
@param values The list of strings to compare with the key |
4419 |
27 Aug 08 |
nicklas |
@return A {@link FuzzyMatch} result for the string in the list |
4419 |
27 Aug 08 |
nicklas |
that got the highest score; null if there are no values in the |
4419 |
27 Aug 08 |
nicklas |
list |
4419 |
27 Aug 08 |
nicklas |
91 |
*/ |
4419 |
27 Aug 08 |
nicklas |
92 |
public FuzzyMatch getBestMatch(String key, Collection<? extends String> values) |
4419 |
27 Aug 08 |
nicklas |
93 |
{ |
4419 |
27 Aug 08 |
nicklas |
94 |
if (values == null || values.size() == 0) return null; |
4419 |
27 Aug 08 |
nicklas |
95 |
double bestScore = Double.MIN_NORMAL; |
4419 |
27 Aug 08 |
nicklas |
96 |
String bestMatch = null; |
4419 |
27 Aug 08 |
nicklas |
97 |
int i = 0; |
4419 |
27 Aug 08 |
nicklas |
98 |
for (String s : values) |
4419 |
27 Aug 08 |
nicklas |
99 |
{ |
4419 |
27 Aug 08 |
nicklas |
100 |
double score = getScore(key, s); |
4419 |
27 Aug 08 |
nicklas |
101 |
if (score > bestScore) |
4419 |
27 Aug 08 |
nicklas |
102 |
{ |
4419 |
27 Aug 08 |
nicklas |
103 |
bestScore = score; |
4419 |
27 Aug 08 |
nicklas |
104 |
bestMatch = s; |
4419 |
27 Aug 08 |
nicklas |
105 |
} |
4419 |
27 Aug 08 |
nicklas |
106 |
++i; |
4419 |
27 Aug 08 |
nicklas |
107 |
} |
4419 |
27 Aug 08 |
nicklas |
108 |
return new FuzzyMatch(key, bestMatch, bestScore); |
4419 |
27 Aug 08 |
nicklas |
109 |
} |
4419 |
27 Aug 08 |
nicklas |
110 |
|
4419 |
27 Aug 08 |
nicklas |
111 |
/** |
4419 |
27 Aug 08 |
nicklas |
Match strings in two lists. The result is a paired list of strings |
4419 |
27 Aug 08 |
nicklas |
with one values from each of the lists. The matching is done so that |
4419 |
27 Aug 08 |
nicklas |
no string from any of the lists is paired more than once. |
4419 |
27 Aug 08 |
nicklas |
115 |
|
4419 |
27 Aug 08 |
nicklas |
@param keys The list with the keys |
4419 |
27 Aug 08 |
nicklas |
@param values The list with the values |
4419 |
27 Aug 08 |
nicklas |
@return A list of {@link FuzzyMatch} results. The returned list is of the |
4419 |
27 Aug 08 |
nicklas |
same size as the 'keys' collection with the elements in the same |
4419 |
27 Aug 08 |
nicklas |
order as returned by the iterator. The list may contains null elements |
4419 |
27 Aug 08 |
nicklas |
if no match could be found for a given key |
4419 |
27 Aug 08 |
nicklas |
122 |
*/ |
4419 |
27 Aug 08 |
nicklas |
123 |
public List<FuzzyMatch> getBestPairs(Collection<? extends String> keys, Collection<? extends String> values) |
4419 |
27 Aug 08 |
nicklas |
124 |
{ |
4419 |
27 Aug 08 |
nicklas |
// First, calculate all pair-wise scores |
4419 |
27 Aug 08 |
nicklas |
126 |
double[][] scores = new double[keys.size()][values.size()]; |
4419 |
27 Aug 08 |
nicklas |
127 |
String[] theKeys = new String[keys.size()]; |
4419 |
27 Aug 08 |
nicklas |
128 |
String[] theValues = new String[values.size()]; |
4419 |
27 Aug 08 |
nicklas |
129 |
int keyIdx = 0; |
4419 |
27 Aug 08 |
nicklas |
130 |
int valueIdx = 0; |
4419 |
27 Aug 08 |
nicklas |
131 |
for (String key : keys) |
4419 |
27 Aug 08 |
nicklas |
132 |
{ |
4419 |
27 Aug 08 |
nicklas |
133 |
theKeys[keyIdx] = key; |
4419 |
27 Aug 08 |
nicklas |
134 |
for (String value : values) |
4419 |
27 Aug 08 |
nicklas |
135 |
{ |
4419 |
27 Aug 08 |
nicklas |
136 |
if (keyIdx == 0) theValues[valueIdx] = value; |
4419 |
27 Aug 08 |
nicklas |
137 |
scores[keyIdx][valueIdx] = getScore(key, value); |
4419 |
27 Aug 08 |
nicklas |
138 |
valueIdx++; |
4419 |
27 Aug 08 |
nicklas |
139 |
} |
4419 |
27 Aug 08 |
nicklas |
140 |
keyIdx++; |
4419 |
27 Aug 08 |
nicklas |
141 |
valueIdx = 0; |
4419 |
27 Aug 08 |
nicklas |
142 |
} |
4419 |
27 Aug 08 |
nicklas |
143 |
|
4419 |
27 Aug 08 |
nicklas |
144 |
FuzzyMatch[] matched = new FuzzyMatch[keys.size()]; |
4419 |
27 Aug 08 |
nicklas |
// Find the position with the highest score |
4419 |
27 Aug 08 |
nicklas |
146 |
int[] bestIdx = getHighestScoreIdx(scores); |
4419 |
27 Aug 08 |
nicklas |
147 |
while (bestIdx != null) |
4419 |
27 Aug 08 |
nicklas |
148 |
{ |
4419 |
27 Aug 08 |
nicklas |
149 |
keyIdx = bestIdx[0]; |
4419 |
27 Aug 08 |
nicklas |
150 |
valueIdx = bestIdx[1]; |
4419 |
27 Aug 08 |
nicklas |
151 |
matched[keyIdx] = new FuzzyMatch(theKeys[keyIdx], theValues[valueIdx], scores[keyIdx][valueIdx]); |
4419 |
27 Aug 08 |
nicklas |
// Clear the scores for the used key/value so it is not matched again |
4419 |
27 Aug 08 |
nicklas |
153 |
for (int i = 0; i < scores[keyIdx].length; ++i) |
4419 |
27 Aug 08 |
nicklas |
154 |
{ |
4419 |
27 Aug 08 |
nicklas |
155 |
scores[keyIdx][i] = -Double.MAX_VALUE; |
4419 |
27 Aug 08 |
nicklas |
156 |
} |
4419 |
27 Aug 08 |
nicklas |
157 |
for (int i = 0; i < scores.length; ++i) |
4419 |
27 Aug 08 |
nicklas |
158 |
{ |
4419 |
27 Aug 08 |
nicklas |
159 |
scores[i][valueIdx] = -Double.MAX_VALUE; |
4419 |
27 Aug 08 |
nicklas |
160 |
} |
4419 |
27 Aug 08 |
nicklas |
// Get the next remaining position with the highest score |
4419 |
27 Aug 08 |
nicklas |
162 |
bestIdx = getHighestScoreIdx(scores); |
4419 |
27 Aug 08 |
nicklas |
163 |
} |
4419 |
27 Aug 08 |
nicklas |
164 |
return Arrays.asList(matched); |
4419 |
27 Aug 08 |
nicklas |
165 |
} |
4419 |
27 Aug 08 |
nicklas |
166 |
|
4419 |
27 Aug 08 |
nicklas |
167 |
private int[] getHighestScoreIdx(double[][] score) |
4419 |
27 Aug 08 |
nicklas |
168 |
{ |
4419 |
27 Aug 08 |
nicklas |
169 |
double maxScore = -Double.MAX_VALUE; |
4419 |
27 Aug 08 |
nicklas |
170 |
int maxKeyIdx = -1; |
4419 |
27 Aug 08 |
nicklas |
171 |
int maxValueIdx = -1; |
4419 |
27 Aug 08 |
nicklas |
172 |
int keyIdx = 0; |
4419 |
27 Aug 08 |
nicklas |
173 |
int valueIdx = 0; |
4419 |
27 Aug 08 |
nicklas |
174 |
while (keyIdx < score.length) |
4419 |
27 Aug 08 |
nicklas |
175 |
{ |
4419 |
27 Aug 08 |
nicklas |
176 |
while (valueIdx < score[keyIdx].length) |
4419 |
27 Aug 08 |
nicklas |
177 |
{ |
4419 |
27 Aug 08 |
nicklas |
178 |
if (score[keyIdx][valueIdx] > maxScore) |
4419 |
27 Aug 08 |
nicklas |
179 |
{ |
4419 |
27 Aug 08 |
nicklas |
180 |
maxScore = score[keyIdx][valueIdx]; |
4419 |
27 Aug 08 |
nicklas |
181 |
maxKeyIdx = keyIdx; |
4419 |
27 Aug 08 |
nicklas |
182 |
maxValueIdx = valueIdx; |
4419 |
27 Aug 08 |
nicklas |
183 |
} |
4419 |
27 Aug 08 |
nicklas |
184 |
valueIdx++; |
4419 |
27 Aug 08 |
nicklas |
185 |
} |
4419 |
27 Aug 08 |
nicklas |
186 |
keyIdx++; |
4419 |
27 Aug 08 |
nicklas |
187 |
valueIdx = 0; |
4419 |
27 Aug 08 |
nicklas |
188 |
} |
4419 |
27 Aug 08 |
nicklas |
189 |
return maxKeyIdx >= 0 ? new int[] { maxKeyIdx, maxValueIdx } : null; |
4419 |
27 Aug 08 |
nicklas |
190 |
} |
4419 |
27 Aug 08 |
nicklas |
191 |
|
4419 |
27 Aug 08 |
nicklas |
192 |
/** |
4419 |
27 Aug 08 |
nicklas |
Wrapper that holds information about a fuzzy match. |
4419 |
27 Aug 08 |
nicklas |
194 |
*/ |
4419 |
27 Aug 08 |
nicklas |
195 |
public static class FuzzyMatch |
4419 |
27 Aug 08 |
nicklas |
196 |
{ |
4419 |
27 Aug 08 |
nicklas |
197 |
private final String key; |
4419 |
27 Aug 08 |
nicklas |
198 |
private final String value; |
4419 |
27 Aug 08 |
nicklas |
199 |
private final double score; |
4419 |
27 Aug 08 |
nicklas |
200 |
|
4419 |
27 Aug 08 |
nicklas |
201 |
FuzzyMatch(String key, String value, double score) |
4419 |
27 Aug 08 |
nicklas |
202 |
{ |
4419 |
27 Aug 08 |
nicklas |
203 |
this.key = key; |
4419 |
27 Aug 08 |
nicklas |
204 |
this.value = value; |
4419 |
27 Aug 08 |
nicklas |
205 |
this.score = score; |
4419 |
27 Aug 08 |
nicklas |
206 |
} |
4419 |
27 Aug 08 |
nicklas |
207 |
|
4419 |
27 Aug 08 |
nicklas |
208 |
/** |
4419 |
27 Aug 08 |
nicklas |
Get the key string. |
4419 |
27 Aug 08 |
nicklas |
210 |
*/ |
4419 |
27 Aug 08 |
nicklas |
211 |
public String getKey() |
4419 |
27 Aug 08 |
nicklas |
212 |
{ |
4419 |
27 Aug 08 |
nicklas |
213 |
return key; |
4419 |
27 Aug 08 |
nicklas |
214 |
} |
4419 |
27 Aug 08 |
nicklas |
215 |
/** |
4419 |
27 Aug 08 |
nicklas |
Get the value string. |
4419 |
27 Aug 08 |
nicklas |
217 |
*/ |
4419 |
27 Aug 08 |
nicklas |
218 |
public String getValue() |
4419 |
27 Aug 08 |
nicklas |
219 |
{ |
4419 |
27 Aug 08 |
nicklas |
220 |
return value; |
4419 |
27 Aug 08 |
nicklas |
221 |
} |
4419 |
27 Aug 08 |
nicklas |
222 |
/** |
4419 |
27 Aug 08 |
nicklas |
Get the similarity score of the key/value strings. |
4419 |
27 Aug 08 |
nicklas |
@see #getScore(String, String) |
4419 |
27 Aug 08 |
nicklas |
225 |
*/ |
4419 |
27 Aug 08 |
nicklas |
226 |
public double getScore() |
4419 |
27 Aug 08 |
nicklas |
227 |
{ |
4419 |
27 Aug 08 |
nicklas |
228 |
return score; |
4419 |
27 Aug 08 |
nicklas |
229 |
} |
4419 |
27 Aug 08 |
nicklas |
230 |
} |
4419 |
27 Aug 08 |
nicklas |
231 |
|
4419 |
27 Aug 08 |
nicklas |
232 |
} |