7623 |
07 Mar 19 |
nicklas |
1 |
package net.sf.basedb.util.charset; |
7623 |
07 Mar 19 |
nicklas |
2 |
|
7623 |
07 Mar 19 |
nicklas |
3 |
import java.io.BufferedReader; |
7623 |
07 Mar 19 |
nicklas |
4 |
import java.io.IOException; |
7623 |
07 Mar 19 |
nicklas |
5 |
import java.io.InputStream; |
7623 |
07 Mar 19 |
nicklas |
6 |
import java.io.InputStreamReader; |
7623 |
07 Mar 19 |
nicklas |
7 |
import java.nio.charset.Charset; |
7623 |
07 Mar 19 |
nicklas |
8 |
import java.nio.charset.CharsetDecoder; |
7623 |
07 Mar 19 |
nicklas |
9 |
|
7623 |
07 Mar 19 |
nicklas |
10 |
import net.sf.basedb.util.FileUtil; |
7623 |
07 Mar 19 |
nicklas |
11 |
import net.sf.basedb.util.InputStreamTracker; |
7623 |
07 Mar 19 |
nicklas |
12 |
|
7623 |
07 Mar 19 |
nicklas |
13 |
/** |
7623 |
07 Mar 19 |
nicklas |
Utility class for testing if a text stream can be parsed using |
7623 |
07 Mar 19 |
nicklas |
a given character set. There are two sides of the testing: |
7623 |
07 Mar 19 |
nicklas |
16 |
|
7623 |
07 Mar 19 |
nicklas |
* The technical side which checks for invalid byte sequences, etc. This works well for |
7623 |
07 Mar 19 |
nicklas |
UTF-8 but it is, for example, not able to discriminate between different ISO-8859-? or |
7623 |
07 Mar 19 |
nicklas |
Windows-? encoding. |
7623 |
07 Mar 19 |
nicklas |
* The content side which can check that the parsed content contains some expected text |
7623 |
07 Mar 19 |
nicklas |
strings. This can be used to discriminate between different ISO-8859-? or Windows-? |
7623 |
07 Mar 19 |
nicklas |
encoding by using careful choices of text strings to look for. |
7623 |
07 Mar 19 |
nicklas |
23 |
|
7623 |
07 Mar 19 |
nicklas |
24 |
|
7623 |
07 Mar 19 |
nicklas |
@author nicklas |
7623 |
07 Mar 19 |
nicklas |
@since 3.15 |
7623 |
07 Mar 19 |
nicklas |
27 |
*/ |
7623 |
07 Mar 19 |
nicklas |
28 |
public class CharsetDetector |
7623 |
07 Mar 19 |
nicklas |
29 |
{ |
7623 |
07 Mar 19 |
nicklas |
30 |
|
7623 |
07 Mar 19 |
nicklas |
31 |
private final Charset charset; |
7623 |
07 Mar 19 |
nicklas |
32 |
private final StringDetector lineTester; |
7623 |
07 Mar 19 |
nicklas |
33 |
private IOException parsingFailure; |
7623 |
07 Mar 19 |
nicklas |
34 |
private long parsedBytes; |
7623 |
07 Mar 19 |
nicklas |
35 |
private int parsedLines; |
7623 |
07 Mar 19 |
nicklas |
36 |
|
7623 |
07 Mar 19 |
nicklas |
37 |
/** |
7623 |
07 Mar 19 |
nicklas |
Create a detector for the given character set that |
7623 |
07 Mar 19 |
nicklas |
only detects technical issues. Useful for UTF-8. |
7623 |
07 Mar 19 |
nicklas |
40 |
*/ |
7623 |
07 Mar 19 |
nicklas |
41 |
public CharsetDetector(Charset charset) |
7623 |
07 Mar 19 |
nicklas |
42 |
{ |
7623 |
07 Mar 19 |
nicklas |
43 |
this(charset, null); |
7623 |
07 Mar 19 |
nicklas |
44 |
} |
7623 |
07 Mar 19 |
nicklas |
45 |
|
7623 |
07 Mar 19 |
nicklas |
46 |
/** |
7623 |
07 Mar 19 |
nicklas |
Create a detector for the given character set that uses technical |
7623 |
07 Mar 19 |
nicklas |
an content-based detection. If no lineTester is given it will use |
7623 |
07 Mar 19 |
nicklas |
only technical detection. |
7623 |
07 Mar 19 |
nicklas |
50 |
*/ |
7623 |
07 Mar 19 |
nicklas |
51 |
public CharsetDetector(Charset charset, StringDetector lineTester) |
7623 |
07 Mar 19 |
nicklas |
52 |
{ |
7623 |
07 Mar 19 |
nicklas |
53 |
this.charset = charset; |
7623 |
07 Mar 19 |
nicklas |
54 |
this.lineTester = lineTester; |
7623 |
07 Mar 19 |
nicklas |
55 |
} |
7623 |
07 Mar 19 |
nicklas |
56 |
|
7623 |
07 Mar 19 |
nicklas |
57 |
/** |
7623 |
07 Mar 19 |
nicklas |
Get the character set this detector is configured to use. |
7623 |
07 Mar 19 |
nicklas |
59 |
*/ |
7623 |
07 Mar 19 |
nicklas |
60 |
public Charset getCharset() |
7623 |
07 Mar 19 |
nicklas |
61 |
{ |
7623 |
07 Mar 19 |
nicklas |
62 |
return charset; |
7623 |
07 Mar 19 |
nicklas |
63 |
} |
7623 |
07 Mar 19 |
nicklas |
64 |
|
7623 |
07 Mar 19 |
nicklas |
65 |
/** |
7623 |
07 Mar 19 |
nicklas |
Test if the given input stream can be parsed with the configured character set. |
7623 |
07 Mar 19 |
nicklas |
The stream is read until the end is reached or until there is a decoding failure. |
7623 |
07 Mar 19 |
nicklas |
68 |
*/ |
7623 |
07 Mar 19 |
nicklas |
69 |
public boolean testIt(InputStream in) |
7623 |
07 Mar 19 |
nicklas |
70 |
{ |
7623 |
07 Mar 19 |
nicklas |
71 |
return testIt(in, -1, -1); |
7623 |
07 Mar 19 |
nicklas |
72 |
} |
7623 |
07 Mar 19 |
nicklas |
73 |
|
7623 |
07 Mar 19 |
nicklas |
74 |
/** |
7623 |
07 Mar 19 |
nicklas |
Test if the given input stream can be parsed with the configured character set. |
7623 |
07 Mar 19 |
nicklas |
The stream is read until maxBytes bytes has been parsed or until there is a decoding failure. |
7623 |
07 Mar 19 |
nicklas |
@param maxBytes Max number of bytes to parse or -1 to not use a limit |
7623 |
07 Mar 19 |
nicklas |
@param maxLines Max number of lines to parse or -1 to not use a limit |
7623 |
07 Mar 19 |
nicklas |
79 |
*/ |
7623 |
07 Mar 19 |
nicklas |
80 |
public boolean testIt(InputStream in, long maxBytes, int maxLines) |
7623 |
07 Mar 19 |
nicklas |
81 |
{ |
7623 |
07 Mar 19 |
nicklas |
82 |
CharsetDecoder dec = charset.newDecoder(); |
7623 |
07 Mar 19 |
nicklas |
83 |
InputStreamTracker tracker = new InputStreamTracker(in); |
7623 |
07 Mar 19 |
nicklas |
84 |
BufferedReader tryit = new BufferedReader(new InputStreamReader(tracker, dec)); |
7623 |
07 Mar 19 |
nicklas |
85 |
|
7623 |
07 Mar 19 |
nicklas |
86 |
parsingFailure = null; |
7623 |
07 Mar 19 |
nicklas |
87 |
parsedBytes = 0; |
7623 |
07 Mar 19 |
nicklas |
88 |
parsedLines = 0; |
7623 |
07 Mar 19 |
nicklas |
89 |
try |
7623 |
07 Mar 19 |
nicklas |
90 |
{ |
7623 |
07 Mar 19 |
nicklas |
91 |
String line = null; |
7623 |
07 Mar 19 |
nicklas |
92 |
boolean lineCheck = false; |
7623 |
07 Mar 19 |
nicklas |
93 |
do |
7623 |
07 Mar 19 |
nicklas |
94 |
{ |
7623 |
07 Mar 19 |
nicklas |
95 |
line = tryit.readLine(); |
7623 |
07 Mar 19 |
nicklas |
96 |
parsedLines++; |
7623 |
07 Mar 19 |
nicklas |
97 |
parsedBytes = tracker.getNumRead(); |
7623 |
07 Mar 19 |
nicklas |
98 |
if (line != null && lineTester != null) |
7623 |
07 Mar 19 |
nicklas |
99 |
{ |
7623 |
07 Mar 19 |
nicklas |
100 |
lineCheck = lineTester.checkLine(parsedLines, line); |
7623 |
07 Mar 19 |
nicklas |
101 |
if (lineCheck) break; |
7623 |
07 Mar 19 |
nicklas |
102 |
} |
7623 |
07 Mar 19 |
nicklas |
103 |
} while (line != null && (maxBytes == -1 || parsedBytes < maxBytes) && (maxLines == -1 || parsedLines < maxLines)); |
7623 |
07 Mar 19 |
nicklas |
104 |
|
7623 |
07 Mar 19 |
nicklas |
// We reached the end but the lineTester never returned TRUE so we call lineTester.eof() |
7623 |
07 Mar 19 |
nicklas |
106 |
if (lineTester != null && !lineCheck) |
7623 |
07 Mar 19 |
nicklas |
107 |
{ |
7623 |
07 Mar 19 |
nicklas |
108 |
lineTester.eof(parsedLines); |
7623 |
07 Mar 19 |
nicklas |
109 |
} |
7623 |
07 Mar 19 |
nicklas |
110 |
|
7623 |
07 Mar 19 |
nicklas |
111 |
} |
7623 |
07 Mar 19 |
nicklas |
112 |
catch (IOException ex) |
7623 |
07 Mar 19 |
nicklas |
113 |
{ |
7623 |
07 Mar 19 |
nicklas |
114 |
parsingFailure = ex; |
7623 |
07 Mar 19 |
nicklas |
115 |
} |
7623 |
07 Mar 19 |
nicklas |
116 |
catch (RuntimeException ex) |
7623 |
07 Mar 19 |
nicklas |
117 |
{ |
7623 |
07 Mar 19 |
nicklas |
118 |
parsingFailure = new IOException(ex); |
7623 |
07 Mar 19 |
nicklas |
119 |
} |
7623 |
07 Mar 19 |
nicklas |
120 |
finally |
7623 |
07 Mar 19 |
nicklas |
121 |
{ |
7623 |
07 Mar 19 |
nicklas |
122 |
FileUtil.close(tryit); |
7623 |
07 Mar 19 |
nicklas |
123 |
} |
7623 |
07 Mar 19 |
nicklas |
124 |
return parsingFailure == null; |
7623 |
07 Mar 19 |
nicklas |
125 |
} |
7623 |
07 Mar 19 |
nicklas |
126 |
|
7623 |
07 Mar 19 |
nicklas |
127 |
/** |
7623 |
07 Mar 19 |
nicklas |
Get the number of bytes that the last test operation parsed. |
7623 |
07 Mar 19 |
nicklas |
129 |
*/ |
7623 |
07 Mar 19 |
nicklas |
130 |
public long getParsedBytes() |
7623 |
07 Mar 19 |
nicklas |
131 |
{ |
7623 |
07 Mar 19 |
nicklas |
132 |
return parsedBytes; |
7623 |
07 Mar 19 |
nicklas |
133 |
} |
7623 |
07 Mar 19 |
nicklas |
134 |
|
7623 |
07 Mar 19 |
nicklas |
135 |
public int getParsedLines() |
7623 |
07 Mar 19 |
nicklas |
136 |
{ |
7623 |
07 Mar 19 |
nicklas |
137 |
return parsedLines; |
7623 |
07 Mar 19 |
nicklas |
138 |
} |
7623 |
07 Mar 19 |
nicklas |
139 |
|
7623 |
07 Mar 19 |
nicklas |
140 |
/** |
7623 |
07 Mar 19 |
nicklas |
If the last test failed, get the exception that was thrown by the parser. |
7623 |
07 Mar 19 |
nicklas |
142 |
*/ |
7623 |
07 Mar 19 |
nicklas |
143 |
public IOException getParsingFailure() |
7623 |
07 Mar 19 |
nicklas |
144 |
{ |
7623 |
07 Mar 19 |
nicklas |
145 |
return parsingFailure; |
7623 |
07 Mar 19 |
nicklas |
146 |
} |
7623 |
07 Mar 19 |
nicklas |
147 |
|
7623 |
07 Mar 19 |
nicklas |
148 |
} |