3887 |
28 Apr 16 |
nicklas |
1 |
package net.sf.basedb.inca; |
3887 |
28 Apr 16 |
nicklas |
2 |
|
3887 |
28 Apr 16 |
nicklas |
3 |
import java.io.PrintWriter; |
3887 |
28 Apr 16 |
nicklas |
4 |
import java.util.ArrayList; |
3887 |
28 Apr 16 |
nicklas |
5 |
import java.util.List; |
3887 |
28 Apr 16 |
nicklas |
6 |
import java.util.Map; |
3887 |
28 Apr 16 |
nicklas |
7 |
import java.util.Set; |
3887 |
28 Apr 16 |
nicklas |
8 |
|
3887 |
28 Apr 16 |
nicklas |
9 |
/** |
3887 |
28 Apr 16 |
nicklas |
Implementation for writing the INCA CSV file. |
3887 |
28 Apr 16 |
nicklas |
11 |
|
7374 |
06 Oct 23 |
nicklas |
Since version 1.4 we support either using a whitelist or blacklist to handle |
7374 |
06 Oct 23 |
nicklas |
data for |
7374 |
06 Oct 23 |
nicklas |
14 |
|
3887 |
28 Apr 16 |
nicklas |
@author nicklas |
3887 |
28 Apr 16 |
nicklas |
@since 1.0 |
3887 |
28 Apr 16 |
nicklas |
17 |
*/ |
3887 |
28 Apr 16 |
nicklas |
18 |
public class IncaCsvWriter |
3887 |
28 Apr 16 |
nicklas |
19 |
{ |
3887 |
28 Apr 16 |
nicklas |
20 |
|
3887 |
28 Apr 16 |
nicklas |
21 |
private final ScanBParser scanb; |
7060 |
14 Mar 23 |
nicklas |
22 |
private final Set<String> whitelist; |
7374 |
06 Oct 23 |
nicklas |
23 |
private final Map<String, Masker> blacklist; |
3887 |
28 Apr 16 |
nicklas |
24 |
private final PrintWriter out; |
3887 |
28 Apr 16 |
nicklas |
25 |
private final List<Header> headers; |
3887 |
28 Apr 16 |
nicklas |
26 |
|
3890 |
28 Apr 16 |
nicklas |
27 |
private int numRows; |
3890 |
28 Apr 16 |
nicklas |
28 |
private int numMasked; |
7060 |
14 Mar 23 |
nicklas |
29 |
private int numSkipped; |
3890 |
28 Apr 16 |
nicklas |
30 |
|
3887 |
28 Apr 16 |
nicklas |
31 |
/** |
3887 |
28 Apr 16 |
nicklas |
Write data to the 'out' file. The 'scanb' parser is used to |
3887 |
28 Apr 16 |
nicklas |
check which rows that can be written in full and which rows |
7374 |
06 Oct 23 |
nicklas |
that must be masked. All columns except those in the whitelist |
7374 |
06 Oct 23 |
nicklas |
are masked. |
3887 |
28 Apr 16 |
nicklas |
36 |
|
7374 |
06 Oct 23 |
nicklas |
@param whitelist Column names that are allowed to be unmasked if the |
3887 |
28 Apr 16 |
nicklas |
row is not accepted by the scanb parser |
3887 |
28 Apr 16 |
nicklas |
39 |
*/ |
7060 |
14 Mar 23 |
nicklas |
40 |
public IncaCsvWriter(PrintWriter out, ScanBParser scanb, Set<String> whitelist) |
3887 |
28 Apr 16 |
nicklas |
41 |
{ |
3887 |
28 Apr 16 |
nicklas |
42 |
this.out = out; |
3887 |
28 Apr 16 |
nicklas |
43 |
this.scanb = scanb; |
7060 |
14 Mar 23 |
nicklas |
44 |
this.whitelist = whitelist; |
7374 |
06 Oct 23 |
nicklas |
45 |
this.blacklist = null; |
3887 |
28 Apr 16 |
nicklas |
46 |
this.headers = new ArrayList<>(); |
3887 |
28 Apr 16 |
nicklas |
47 |
} |
3887 |
28 Apr 16 |
nicklas |
48 |
|
3887 |
28 Apr 16 |
nicklas |
49 |
/** |
7374 |
06 Oct 23 |
nicklas |
Write data to the 'out' file. The 'scanb' parser is used to |
7374 |
06 Oct 23 |
nicklas |
check which rows that can be written in full and which rows |
7374 |
06 Oct 23 |
nicklas |
that must use the blacklist. Only columns in the blacklist are |
7374 |
06 Oct 23 |
nicklas |
masked. |
7374 |
06 Oct 23 |
nicklas |
54 |
|
7374 |
06 Oct 23 |
nicklas |
@param blacklist A map with column names that should be masked if the |
7374 |
06 Oct 23 |
nicklas |
row is not accepted by the scanb parser |
7374 |
06 Oct 23 |
nicklas |
57 |
*/ |
7374 |
06 Oct 23 |
nicklas |
58 |
public IncaCsvWriter(PrintWriter out, ScanBParser scanb, Map<String, Masker> blacklist) |
7374 |
06 Oct 23 |
nicklas |
59 |
{ |
7374 |
06 Oct 23 |
nicklas |
60 |
this.out = out; |
7374 |
06 Oct 23 |
nicklas |
61 |
this.scanb = scanb; |
7374 |
06 Oct 23 |
nicklas |
62 |
this.whitelist = null; |
7374 |
06 Oct 23 |
nicklas |
63 |
this.blacklist = blacklist; |
7374 |
06 Oct 23 |
nicklas |
64 |
this.headers = new ArrayList<>(); |
7374 |
06 Oct 23 |
nicklas |
65 |
} |
7374 |
06 Oct 23 |
nicklas |
66 |
|
7374 |
06 Oct 23 |
nicklas |
67 |
/** |
3887 |
28 Apr 16 |
nicklas |
Write the headers. Do not call more than once. |
3887 |
28 Apr 16 |
nicklas |
We make a copy of the headers into a list to make sure that |
3887 |
28 Apr 16 |
nicklas |
the {@link #writeRow(Map)} writes the data in the same order. |
3887 |
28 Apr 16 |
nicklas |
71 |
*/ |
3887 |
28 Apr 16 |
nicklas |
72 |
public void writeHeaders(Set<String> all) |
3887 |
28 Apr 16 |
nicklas |
73 |
{ |
3887 |
28 Apr 16 |
nicklas |
74 |
if (headers.size() > 0) |
3887 |
28 Apr 16 |
nicklas |
75 |
{ |
3887 |
28 Apr 16 |
nicklas |
76 |
throw new RuntimeException("Headers have already been written"); |
3887 |
28 Apr 16 |
nicklas |
77 |
} |
3887 |
28 Apr 16 |
nicklas |
78 |
|
3887 |
28 Apr 16 |
nicklas |
79 |
boolean tab = false; |
3887 |
28 Apr 16 |
nicklas |
80 |
for (String h : all) |
3887 |
28 Apr 16 |
nicklas |
81 |
{ |
7374 |
06 Oct 23 |
nicklas |
82 |
Masker m = blacklist != null ? blacklist.get(h) : (whitelist.contains(h) ? null : ConstMasker.EMPTY_STRING); |
7374 |
06 Oct 23 |
nicklas |
83 |
headers.add(new Header(h, m)); |
3887 |
28 Apr 16 |
nicklas |
84 |
if (tab) out.print('\t'); |
3887 |
28 Apr 16 |
nicklas |
85 |
out.print(h); |
3887 |
28 Apr 16 |
nicklas |
86 |
tab = true; |
3887 |
28 Apr 16 |
nicklas |
87 |
} |
3887 |
28 Apr 16 |
nicklas |
88 |
out.print('\n'); |
3887 |
28 Apr 16 |
nicklas |
89 |
} |
3887 |
28 Apr 16 |
nicklas |
90 |
|
3887 |
28 Apr 16 |
nicklas |
91 |
/** |
3887 |
28 Apr 16 |
nicklas |
Write a data row. The input is a map with INCA variable name |
3891 |
28 Apr 16 |
nicklas |
as key and the data as values. If the "PersonalNo" is null nothing |
3887 |
28 Apr 16 |
nicklas |
is written. If the {@link ScanBParser#isAccpted(String)} accepts |
7060 |
14 Mar 23 |
nicklas |
the personal number the entire row is written, otherwise |
7374 |
06 Oct 23 |
nicklas |
only the columns in the whitelist or the masked values from the blacklist. |
3887 |
28 Apr 16 |
nicklas |
97 |
*/ |
3887 |
28 Apr 16 |
nicklas |
98 |
public void writeRow(Map<String, String> row) |
3887 |
28 Apr 16 |
nicklas |
99 |
{ |
3891 |
28 Apr 16 |
nicklas |
100 |
String pnr = row.get("PersonalNo"); |
7060 |
14 Mar 23 |
nicklas |
101 |
if (pnr == null) |
7060 |
14 Mar 23 |
nicklas |
102 |
{ |
7060 |
14 Mar 23 |
nicklas |
103 |
numSkipped++; |
7060 |
14 Mar 23 |
nicklas |
104 |
return; |
7060 |
14 Mar 23 |
nicklas |
105 |
} |
3887 |
28 Apr 16 |
nicklas |
106 |
|
7060 |
14 Mar 23 |
nicklas |
107 |
boolean mask = !scanb.isAccpted(pnr); |
3887 |
28 Apr 16 |
nicklas |
108 |
boolean tab = false; |
3887 |
28 Apr 16 |
nicklas |
109 |
for (Header h : headers) |
3887 |
28 Apr 16 |
nicklas |
110 |
{ |
3887 |
28 Apr 16 |
nicklas |
111 |
if (tab) out.print('\t'); |
4191 |
31 Oct 16 |
nicklas |
112 |
String value = row.get(h.name); |
7060 |
14 Mar 23 |
nicklas |
113 |
if (h.masker != null && mask) value = h.masker.getMaskedValue(value); |
4191 |
31 Oct 16 |
nicklas |
114 |
out.print(encode(value)); |
3887 |
28 Apr 16 |
nicklas |
115 |
tab = true; |
3887 |
28 Apr 16 |
nicklas |
116 |
} |
3887 |
28 Apr 16 |
nicklas |
117 |
out.print('\n'); |
3890 |
28 Apr 16 |
nicklas |
118 |
|
3890 |
28 Apr 16 |
nicklas |
119 |
numRows++; |
7060 |
14 Mar 23 |
nicklas |
120 |
if (mask) numMasked++; |
3887 |
28 Apr 16 |
nicklas |
121 |
} |
3887 |
28 Apr 16 |
nicklas |
122 |
|
3887 |
28 Apr 16 |
nicklas |
123 |
/** |
3890 |
28 Apr 16 |
nicklas |
The number of rows that was written (excluding the header row). |
3890 |
28 Apr 16 |
nicklas |
125 |
*/ |
3890 |
28 Apr 16 |
nicklas |
126 |
public int getRows() |
3890 |
28 Apr 16 |
nicklas |
127 |
{ |
3890 |
28 Apr 16 |
nicklas |
128 |
return numRows; |
3890 |
28 Apr 16 |
nicklas |
129 |
} |
3890 |
28 Apr 16 |
nicklas |
130 |
|
3890 |
28 Apr 16 |
nicklas |
131 |
/** |
7060 |
14 Mar 23 |
nicklas |
Get the number of rows that was skipped. |
7060 |
14 Mar 23 |
nicklas |
@since 1.3 |
7060 |
14 Mar 23 |
nicklas |
134 |
*/ |
7060 |
14 Mar 23 |
nicklas |
135 |
public int getSkipped() |
7060 |
14 Mar 23 |
nicklas |
136 |
{ |
7060 |
14 Mar 23 |
nicklas |
137 |
return numSkipped; |
7060 |
14 Mar 23 |
nicklas |
138 |
} |
7060 |
14 Mar 23 |
nicklas |
139 |
|
7060 |
14 Mar 23 |
nicklas |
140 |
/** |
3890 |
28 Apr 16 |
nicklas |
The number of masked rows. |
3890 |
28 Apr 16 |
nicklas |
142 |
*/ |
3890 |
28 Apr 16 |
nicklas |
143 |
public int getMasked() |
3890 |
28 Apr 16 |
nicklas |
144 |
{ |
3890 |
28 Apr 16 |
nicklas |
145 |
return numMasked; |
3890 |
28 Apr 16 |
nicklas |
146 |
} |
3890 |
28 Apr 16 |
nicklas |
147 |
|
3890 |
28 Apr 16 |
nicklas |
148 |
/** |
7060 |
14 Mar 23 |
nicklas |
The number of full rows. |
7060 |
14 Mar 23 |
nicklas |
150 |
*/ |
7060 |
14 Mar 23 |
nicklas |
151 |
public int getFullRows() |
7060 |
14 Mar 23 |
nicklas |
152 |
{ |
7060 |
14 Mar 23 |
nicklas |
153 |
return numRows - numMasked; |
7060 |
14 Mar 23 |
nicklas |
154 |
} |
7060 |
14 Mar 23 |
nicklas |
155 |
|
7060 |
14 Mar 23 |
nicklas |
156 |
/** |
3890 |
28 Apr 16 |
nicklas |
The number of columns in the CSV. |
3890 |
28 Apr 16 |
nicklas |
158 |
*/ |
3890 |
28 Apr 16 |
nicklas |
159 |
public int getColumns() |
3890 |
28 Apr 16 |
nicklas |
160 |
{ |
3890 |
28 Apr 16 |
nicklas |
161 |
return headers.size(); |
3890 |
28 Apr 16 |
nicklas |
162 |
} |
3890 |
28 Apr 16 |
nicklas |
163 |
|
3890 |
28 Apr 16 |
nicklas |
164 |
/** |
3887 |
28 Apr 16 |
nicklas |
Replaces newline, carriage return, tab and backslash with \n, \r, \t and \\. |
3887 |
28 Apr 16 |
nicklas |
Null is converted to the empty string. |
3887 |
28 Apr 16 |
nicklas |
(Copied from net.sf.basedb.util.encode.TabCrLfEncoderDecoder in BASE) |
3887 |
28 Apr 16 |
nicklas |
168 |
*/ |
3887 |
28 Apr 16 |
nicklas |
169 |
public String encode(String s) |
3887 |
28 Apr 16 |
nicklas |
170 |
{ |
3887 |
28 Apr 16 |
nicklas |
171 |
if (s == null) return ""; |
3887 |
28 Apr 16 |
nicklas |
172 |
boolean neededEncode = false; |
3887 |
28 Apr 16 |
nicklas |
173 |
StringBuilder sb = new StringBuilder(s.length()); |
3887 |
28 Apr 16 |
nicklas |
174 |
for (int i = 0; i < s.length(); ++i) |
3887 |
28 Apr 16 |
nicklas |
175 |
{ |
3887 |
28 Apr 16 |
nicklas |
176 |
char c = s.charAt(i); |
3887 |
28 Apr 16 |
nicklas |
177 |
if (c == '\t') |
3887 |
28 Apr 16 |
nicklas |
178 |
{ |
3887 |
28 Apr 16 |
nicklas |
179 |
sb.append("\\t"); |
3887 |
28 Apr 16 |
nicklas |
180 |
neededEncode = true; |
3887 |
28 Apr 16 |
nicklas |
181 |
} |
3887 |
28 Apr 16 |
nicklas |
182 |
else if (c == '\n') |
3887 |
28 Apr 16 |
nicklas |
183 |
{ |
3887 |
28 Apr 16 |
nicklas |
184 |
sb.append("\\n"); |
3887 |
28 Apr 16 |
nicklas |
185 |
neededEncode = true; |
3887 |
28 Apr 16 |
nicklas |
186 |
} |
3887 |
28 Apr 16 |
nicklas |
187 |
else if (c == '\r') |
3887 |
28 Apr 16 |
nicklas |
188 |
{ |
3887 |
28 Apr 16 |
nicklas |
189 |
sb.append("\\r"); |
3887 |
28 Apr 16 |
nicklas |
190 |
neededEncode = true; |
3887 |
28 Apr 16 |
nicklas |
191 |
} |
3887 |
28 Apr 16 |
nicklas |
192 |
else if (c == '\\') |
3887 |
28 Apr 16 |
nicklas |
193 |
{ |
3887 |
28 Apr 16 |
nicklas |
194 |
sb.append("\\\\"); |
3887 |
28 Apr 16 |
nicklas |
195 |
neededEncode = true; |
3887 |
28 Apr 16 |
nicklas |
196 |
} |
3887 |
28 Apr 16 |
nicklas |
197 |
else |
3887 |
28 Apr 16 |
nicklas |
198 |
{ |
3887 |
28 Apr 16 |
nicklas |
199 |
sb.append(c); |
3887 |
28 Apr 16 |
nicklas |
200 |
} |
3887 |
28 Apr 16 |
nicklas |
201 |
} |
3887 |
28 Apr 16 |
nicklas |
202 |
return neededEncode ? sb.toString() : s; |
3887 |
28 Apr 16 |
nicklas |
203 |
} |
3887 |
28 Apr 16 |
nicklas |
204 |
|
3887 |
28 Apr 16 |
nicklas |
205 |
/** |
3887 |
28 Apr 16 |
nicklas |
The name of a INCA header and a flag if is blacklisted |
3887 |
28 Apr 16 |
nicklas |
or not. |
3887 |
28 Apr 16 |
nicklas |
208 |
*/ |
3887 |
28 Apr 16 |
nicklas |
209 |
static class Header |
3887 |
28 Apr 16 |
nicklas |
210 |
{ |
3887 |
28 Apr 16 |
nicklas |
211 |
final String name; |
4191 |
31 Oct 16 |
nicklas |
212 |
final Masker masker; |
3887 |
28 Apr 16 |
nicklas |
213 |
|
4191 |
31 Oct 16 |
nicklas |
214 |
Header(String name, Masker masker) |
3887 |
28 Apr 16 |
nicklas |
215 |
{ |
3887 |
28 Apr 16 |
nicklas |
216 |
this.name = name; |
4191 |
31 Oct 16 |
nicklas |
217 |
this.masker = masker; |
3887 |
28 Apr 16 |
nicklas |
218 |
} |
3887 |
28 Apr 16 |
nicklas |
219 |
} |
3887 |
28 Apr 16 |
nicklas |
220 |
|
3887 |
28 Apr 16 |
nicklas |
221 |
} |