7623 |
07 Mar 19 |
nicklas |
1 |
package net.sf.basedb.util.charset; |
7623 |
07 Mar 19 |
nicklas |
2 |
|
7623 |
07 Mar 19 |
nicklas |
3 |
import java.io.IOException; |
7623 |
07 Mar 19 |
nicklas |
4 |
|
7623 |
07 Mar 19 |
nicklas |
5 |
/** |
7623 |
07 Mar 19 |
nicklas |
A simple string detector implementation that works with two strings. |
7623 |
07 Mar 19 |
nicklas |
It is designed to be used to detect the encoding in tabular data files |
7623 |
07 Mar 19 |
nicklas |
using one of the ISO-8859-N encodings or similar that are not not possible |
7623 |
07 Mar 19 |
nicklas |
to separate techically. |
7623 |
07 Mar 19 |
nicklas |
10 |
|
7623 |
07 Mar 19 |
nicklas |
The data file is expected to contain a header line were at least one header |
7623 |
07 Mar 19 |
nicklas |
column has a name with non-ASCII characters. |
7623 |
07 Mar 19 |
nicklas |
13 |
|
7623 |
07 Mar 19 |
nicklas |
For each line of data if will first check if the 'ifFound' string |
7623 |
07 Mar 19 |
nicklas |
can be found. If not, it will return null to request more data. |
7623 |
07 Mar 19 |
nicklas |
16 |
|
7623 |
07 Mar 19 |
nicklas |
If the 'ifFound' string is found, it will continue to see if the 'thenMatch' |
7623 |
07 Mar 19 |
nicklas |
string is also present. If so, TRUE is returned to indicate a successful |
7623 |
07 Mar 19 |
nicklas |
encoding match, otherwise FALSE is return to indicate an incorrect encoding. |
7623 |
07 Mar 19 |
nicklas |
20 |
|
7623 |
07 Mar 19 |
nicklas |
Note that the two strings need to be selected wisely. The 'ifFound' string should |
7623 |
07 Mar 19 |
nicklas |
typcially be an ASCII-only string and 'thenMatch' a string with one or more non-ASCII |
7623 |
07 Mar 19 |
nicklas |
characters. |
7623 |
07 Mar 19 |
nicklas |
24 |
|
7623 |
07 Mar 19 |
nicklas |
For example, if the file header is: <code>Namn{tab}Ålder</code>, |
7623 |
07 Mar 19 |
nicklas |
we could use 'ifFound=Namn' and 'thenMatch=Ålder'. |
7623 |
07 Mar 19 |
nicklas |
27 |
|
7703 |
11 Apr 19 |
nicklas |
If the entire file is parsed without finding the 'ifFound' string, the {@link #eof(int)} method |
7623 |
07 Mar 19 |
nicklas |
will return false. |
7623 |
07 Mar 19 |
nicklas |
30 |
|
7623 |
07 Mar 19 |
nicklas |
@author nicklas |
7623 |
07 Mar 19 |
nicklas |
@since 3.15 |
7623 |
07 Mar 19 |
nicklas |
33 |
*/ |
7623 |
07 Mar 19 |
nicklas |
34 |
public class SimpleStringDetector |
7623 |
07 Mar 19 |
nicklas |
35 |
implements StringDetector |
7623 |
07 Mar 19 |
nicklas |
36 |
{ |
7623 |
07 Mar 19 |
nicklas |
37 |
|
7623 |
07 Mar 19 |
nicklas |
38 |
private final String ifFound; |
7623 |
07 Mar 19 |
nicklas |
39 |
private final String thenMatch; |
7623 |
07 Mar 19 |
nicklas |
40 |
|
7623 |
07 Mar 19 |
nicklas |
41 |
public SimpleStringDetector(String ifFound, String thenMatch) |
7623 |
07 Mar 19 |
nicklas |
42 |
{ |
7623 |
07 Mar 19 |
nicklas |
43 |
this.ifFound = ifFound; |
7623 |
07 Mar 19 |
nicklas |
44 |
this.thenMatch = thenMatch; |
7623 |
07 Mar 19 |
nicklas |
45 |
} |
7623 |
07 Mar 19 |
nicklas |
46 |
|
7623 |
07 Mar 19 |
nicklas |
47 |
@Override |
7623 |
07 Mar 19 |
nicklas |
48 |
public boolean checkLine(int lineNo, String line) |
7623 |
07 Mar 19 |
nicklas |
49 |
throws IOException |
7623 |
07 Mar 19 |
nicklas |
50 |
{ |
7623 |
07 Mar 19 |
nicklas |
51 |
if (line == null || !line.contains(ifFound)) return false; |
7623 |
07 Mar 19 |
nicklas |
52 |
if (line.contains(thenMatch)) return true; |
7623 |
07 Mar 19 |
nicklas |
53 |
throw new IOException("Found '"+ifFound+"' on line " + lineNo + " but not '" + thenMatch + "': "+line); |
7623 |
07 Mar 19 |
nicklas |
54 |
} |
7623 |
07 Mar 19 |
nicklas |
55 |
|
7623 |
07 Mar 19 |
nicklas |
56 |
@Override |
7623 |
07 Mar 19 |
nicklas |
57 |
public void eof(int parsedLines) |
7623 |
07 Mar 19 |
nicklas |
58 |
throws IOException |
7623 |
07 Mar 19 |
nicklas |
59 |
{ |
7623 |
07 Mar 19 |
nicklas |
60 |
throw new IOException("Could not find '"+ifFound + "' after " + parsedLines + " lines"); |
7623 |
07 Mar 19 |
nicklas |
61 |
} |
7623 |
07 Mar 19 |
nicklas |
62 |
|
7623 |
07 Mar 19 |
nicklas |
63 |
} |