3885 |
27 Apr 16 |
nicklas |
1 |
package net.sf.basedb.inca; |
3885 |
27 Apr 16 |
nicklas |
2 |
|
3885 |
27 Apr 16 |
nicklas |
3 |
import java.io.IOException; |
3885 |
27 Apr 16 |
nicklas |
4 |
import java.io.InputStream; |
7373 |
06 Oct 23 |
nicklas |
5 |
import java.util.Arrays; |
3885 |
27 Apr 16 |
nicklas |
6 |
import java.util.HashMap; |
7373 |
06 Oct 23 |
nicklas |
7 |
import java.util.HashSet; |
3885 |
27 Apr 16 |
nicklas |
8 |
import java.util.LinkedHashSet; |
3885 |
27 Apr 16 |
nicklas |
9 |
import java.util.Map; |
3885 |
27 Apr 16 |
nicklas |
10 |
import java.util.Set; |
3885 |
27 Apr 16 |
nicklas |
11 |
|
7373 |
06 Oct 23 |
nicklas |
12 |
import javax.swing.JOptionPane; |
3885 |
27 Apr 16 |
nicklas |
13 |
import javax.xml.parsers.ParserConfigurationException; |
3885 |
27 Apr 16 |
nicklas |
14 |
import javax.xml.parsers.SAXParser; |
3885 |
27 Apr 16 |
nicklas |
15 |
import javax.xml.parsers.SAXParserFactory; |
3885 |
27 Apr 16 |
nicklas |
16 |
|
3885 |
27 Apr 16 |
nicklas |
17 |
import org.xml.sax.Attributes; |
3885 |
27 Apr 16 |
nicklas |
18 |
import org.xml.sax.Locator; |
3885 |
27 Apr 16 |
nicklas |
19 |
import org.xml.sax.SAXException; |
3885 |
27 Apr 16 |
nicklas |
20 |
import org.xml.sax.helpers.DefaultHandler; |
3885 |
27 Apr 16 |
nicklas |
21 |
import org.xml.sax.helpers.LocatorImpl; |
3885 |
27 Apr 16 |
nicklas |
22 |
|
3885 |
27 Apr 16 |
nicklas |
23 |
/** |
3885 |
27 Apr 16 |
nicklas |
Parses the XML file from the INCA export. |
3885 |
27 Apr 16 |
nicklas |
The structure is very simple. It contains a <table> |
3885 |
27 Apr 16 |
nicklas |
root element with lots of <row> elements. Each <row> |
3885 |
27 Apr 16 |
nicklas |
represents a single entry (=patient+laterality) from the |
3885 |
27 Apr 16 |
nicklas |
INCA database. Each <row> has a number of tags that are named |
3885 |
27 Apr 16 |
nicklas |
after the INCA variables. |
3885 |
27 Apr 16 |
nicklas |
30 |
|
3885 |
27 Apr 16 |
nicklas |
When parsing we only care about the <row> tags and the INCA variable |
3885 |
27 Apr 16 |
nicklas |
tags. The first <row> we find is used as a template. The INCA variables |
3885 |
27 Apr 16 |
nicklas |
in this entry define the number of columns in the output CSV file. |
3885 |
27 Apr 16 |
nicklas |
The variables are sorted alphabetically and no duplicates are allowed. |
3885 |
27 Apr 16 |
nicklas |
35 |
|
3885 |
27 Apr 16 |
nicklas |
For each subsequent <row> entry we check that the INCA variable has |
3885 |
27 Apr 16 |
nicklas |
been defined. At the end of the row, we use the value of the "PERSNR" |
3885 |
27 Apr 16 |
nicklas |
to check if the entry should be allowed in full or if we need to mask |
3885 |
27 Apr 16 |
nicklas |
blacklisted variables. |
3885 |
27 Apr 16 |
nicklas |
40 |
|
3885 |
27 Apr 16 |
nicklas |
@author nicklas |
3885 |
27 Apr 16 |
nicklas |
@since 1.0 |
3885 |
27 Apr 16 |
nicklas |
43 |
*/ |
3885 |
27 Apr 16 |
nicklas |
44 |
public class IncaXmlParser |
3885 |
27 Apr 16 |
nicklas |
45 |
extends DefaultHandler |
3885 |
27 Apr 16 |
nicklas |
46 |
{ |
3885 |
27 Apr 16 |
nicklas |
// Contains the list of allowed personal numbers |
3887 |
28 Apr 16 |
nicklas |
48 |
private final IncaCsvWriter writer; |
3885 |
27 Apr 16 |
nicklas |
49 |
|
3885 |
27 Apr 16 |
nicklas |
// The filename we are currently parsing |
3885 |
27 Apr 16 |
nicklas |
51 |
private String filename; |
3885 |
27 Apr 16 |
nicklas |
52 |
|
3885 |
27 Apr 16 |
nicklas |
// Headers are defined by the first row |
3885 |
27 Apr 16 |
nicklas |
54 |
private final Set<String> headers; |
3885 |
27 Apr 16 |
nicklas |
// Data for a single row |
3885 |
27 Apr 16 |
nicklas |
56 |
private final Map<String, String> row; |
3885 |
27 Apr 16 |
nicklas |
57 |
|
3885 |
27 Apr 16 |
nicklas |
// A flag indicating if we have found the <table> element or not |
3885 |
27 Apr 16 |
nicklas |
59 |
private boolean hasStarted; |
3885 |
27 Apr 16 |
nicklas |
60 |
|
3885 |
27 Apr 16 |
nicklas |
// The current <row> entry number |
3885 |
27 Apr 16 |
nicklas |
62 |
private int rowNo; |
3885 |
27 Apr 16 |
nicklas |
// Flag to indicate if we are inside a <row> tag or not |
3885 |
27 Apr 16 |
nicklas |
64 |
private boolean inRow; |
3885 |
27 Apr 16 |
nicklas |
65 |
|
3885 |
27 Apr 16 |
nicklas |
// The name of the current INCA variable |
3885 |
27 Apr 16 |
nicklas |
67 |
private String currentTag; |
3885 |
27 Apr 16 |
nicklas |
68 |
|
3885 |
27 Apr 16 |
nicklas |
// Used for collecting data for the current INCA variable |
3885 |
27 Apr 16 |
nicklas |
70 |
private StringBuilder currentValue; |
3885 |
27 Apr 16 |
nicklas |
71 |
|
3885 |
27 Apr 16 |
nicklas |
// We get this from the SAX parser and use it for error handling |
3885 |
27 Apr 16 |
nicklas |
73 |
private Locator locator; |
3885 |
27 Apr 16 |
nicklas |
74 |
|
7373 |
06 Oct 23 |
nicklas |
// Known root tags for different versions |
7373 |
06 Oct 23 |
nicklas |
76 |
private final Set<String> rootTags; |
7373 |
06 Oct 23 |
nicklas |
77 |
|
3885 |
27 Apr 16 |
nicklas |
78 |
/** |
3885 |
27 Apr 16 |
nicklas |
Create a new parser. Use the {@link #parse(InputStream, String)} |
3885 |
27 Apr 16 |
nicklas |
method to start parsing. |
3885 |
27 Apr 16 |
nicklas |
81 |
*/ |
3887 |
28 Apr 16 |
nicklas |
82 |
public IncaXmlParser(IncaCsvWriter writer) |
3885 |
27 Apr 16 |
nicklas |
83 |
{ |
3887 |
28 Apr 16 |
nicklas |
84 |
this.writer = writer; |
3885 |
27 Apr 16 |
nicklas |
85 |
this.headers = new LinkedHashSet<>(); |
3885 |
27 Apr 16 |
nicklas |
86 |
this.row = new HashMap<>(); |
3885 |
27 Apr 16 |
nicklas |
87 |
this.locator = new LocatorImpl(); |
7373 |
06 Oct 23 |
nicklas |
88 |
this.rootTags = new HashSet<>(Arrays.asList("table", "document")); |
3891 |
28 Apr 16 |
nicklas |
89 |
|
3891 |
28 Apr 16 |
nicklas |
// Extra header that is PERSNR with '-' removed |
3891 |
28 Apr 16 |
nicklas |
91 |
headers.add("PersonalNo"); |
3885 |
27 Apr 16 |
nicklas |
92 |
} |
3885 |
27 Apr 16 |
nicklas |
93 |
|
3885 |
27 Apr 16 |
nicklas |
94 |
/** |
3885 |
27 Apr 16 |
nicklas |
Parse the given INCA XML file. |
3885 |
27 Apr 16 |
nicklas |
96 |
*/ |
3885 |
27 Apr 16 |
nicklas |
97 |
public void parse(InputStream in, String filename) |
3885 |
27 Apr 16 |
nicklas |
98 |
{ |
3885 |
27 Apr 16 |
nicklas |
99 |
this.filename = filename; |
3885 |
27 Apr 16 |
nicklas |
100 |
try |
3885 |
27 Apr 16 |
nicklas |
101 |
{ |
3885 |
27 Apr 16 |
nicklas |
102 |
SAXParserFactory factory = SAXParserFactory.newInstance(); |
3885 |
27 Apr 16 |
nicklas |
103 |
SAXParser sax = factory.newSAXParser(); |
3885 |
27 Apr 16 |
nicklas |
104 |
sax.parse(in, this); |
3885 |
27 Apr 16 |
nicklas |
105 |
} |
3885 |
27 Apr 16 |
nicklas |
106 |
catch (IOException | SAXException | ParserConfigurationException ex) |
3885 |
27 Apr 16 |
nicklas |
107 |
{ |
3885 |
27 Apr 16 |
nicklas |
108 |
throw new RuntimeException(ex); |
3885 |
27 Apr 16 |
nicklas |
109 |
} |
3885 |
27 Apr 16 |
nicklas |
//System.out.println("Parsed " + lineNo + " lines; " + personalNumbers.size() + " unique personal numbers."); |
3885 |
27 Apr 16 |
nicklas |
111 |
} |
3885 |
27 Apr 16 |
nicklas |
112 |
|
3885 |
27 Apr 16 |
nicklas |
113 |
/** |
3885 |
27 Apr 16 |
nicklas |
Handler method that is called when a start tag is found. We react to |
3885 |
27 Apr 16 |
nicklas |
the <table> tag, <row> tags and to INCA variable tags (when inside a <row> tag). |
3885 |
27 Apr 16 |
nicklas |
116 |
*/ |
3885 |
27 Apr 16 |
nicklas |
117 |
@Override |
3885 |
27 Apr 16 |
nicklas |
118 |
public void startElement(String uri, String localName, String qName, Attributes attributes) |
3885 |
27 Apr 16 |
nicklas |
119 |
{ |
7373 |
06 Oct 23 |
nicklas |
120 |
if (!hasStarted && !rootTags.contains(qName)) |
3885 |
27 Apr 16 |
nicklas |
121 |
{ |
7373 |
06 Oct 23 |
nicklas |
122 |
String msg = "File: "+filename+"\n"; |
7373 |
06 Oct 23 |
nicklas |
123 |
msg += "At line "+locator.getLineNumber()+": <"+qName+">\n\n"; |
7373 |
06 Oct 23 |
nicklas |
124 |
msg += "Expected one of:\n"; |
7373 |
06 Oct 23 |
nicklas |
125 |
for (String tag : rootTags) |
7373 |
06 Oct 23 |
nicklas |
126 |
{ |
7373 |
06 Oct 23 |
nicklas |
127 |
msg += " • <"+tag+">\n"; |
7373 |
06 Oct 23 |
nicklas |
128 |
} |
7373 |
06 Oct 23 |
nicklas |
129 |
msg += "\nContinue anyway?"; |
7373 |
06 Oct 23 |
nicklas |
130 |
int answer = JOptionPane.showConfirmDialog(null, msg, "Unexpected root tag: <"+qName+">", JOptionPane.YES_NO_OPTION, JOptionPane.QUESTION_MESSAGE); |
7373 |
06 Oct 23 |
nicklas |
131 |
if (answer != JOptionPane.YES_OPTION) |
7373 |
06 Oct 23 |
nicklas |
132 |
{ |
7373 |
06 Oct 23 |
nicklas |
133 |
throw new ImportException(filename, locator.getLineNumber(), "XML parsing aborted"); |
7373 |
06 Oct 23 |
nicklas |
134 |
} |
3885 |
27 Apr 16 |
nicklas |
135 |
} |
3885 |
27 Apr 16 |
nicklas |
136 |
hasStarted = true; |
3885 |
27 Apr 16 |
nicklas |
137 |
if ("row".equals(qName)) |
3885 |
27 Apr 16 |
nicklas |
138 |
{ |
3885 |
27 Apr 16 |
nicklas |
// Start a new row |
3885 |
27 Apr 16 |
nicklas |
140 |
row.clear(); |
3885 |
27 Apr 16 |
nicklas |
141 |
inRow = true; |
3885 |
27 Apr 16 |
nicklas |
142 |
rowNo++; |
3885 |
27 Apr 16 |
nicklas |
143 |
|
3885 |
27 Apr 16 |
nicklas |
144 |
} |
3885 |
27 Apr 16 |
nicklas |
145 |
else if (inRow) |
3885 |
27 Apr 16 |
nicklas |
146 |
{ |
3885 |
27 Apr 16 |
nicklas |
147 |
if (rowNo == 1) |
3885 |
27 Apr 16 |
nicklas |
148 |
{ |
3885 |
27 Apr 16 |
nicklas |
// First <row> entry -- add headers |
3885 |
27 Apr 16 |
nicklas |
150 |
headers.add(qName); |
3885 |
27 Apr 16 |
nicklas |
151 |
} |
3885 |
27 Apr 16 |
nicklas |
152 |
else |
3885 |
27 Apr 16 |
nicklas |
153 |
{ |
3885 |
27 Apr 16 |
nicklas |
// Verify that the header exists |
3885 |
27 Apr 16 |
nicklas |
155 |
if (!headers.contains(qName)) |
3885 |
27 Apr 16 |
nicklas |
156 |
{ |
3885 |
27 Apr 16 |
nicklas |
157 |
throw new ImportException(filename, locator.getLineNumber(), rowNo, "Element not found in headers: <" + qName + ">"); |
3885 |
27 Apr 16 |
nicklas |
158 |
} |
3885 |
27 Apr 16 |
nicklas |
159 |
} |
3885 |
27 Apr 16 |
nicklas |
160 |
|
3885 |
27 Apr 16 |
nicklas |
// Duplicates are not allowed |
3885 |
27 Apr 16 |
nicklas |
162 |
if (row.containsKey(qName)) |
3885 |
27 Apr 16 |
nicklas |
163 |
{ |
3885 |
27 Apr 16 |
nicklas |
164 |
throw new ImportException(filename, locator.getLineNumber(), rowNo, "Duplicate element: <" + qName + ">"); |
3885 |
27 Apr 16 |
nicklas |
165 |
} |
3885 |
27 Apr 16 |
nicklas |
166 |
|
3885 |
27 Apr 16 |
nicklas |
// Initialize the current tag/value |
3885 |
27 Apr 16 |
nicklas |
168 |
currentTag = qName; |
3885 |
27 Apr 16 |
nicklas |
169 |
currentValue = new StringBuilder(); |
3885 |
27 Apr 16 |
nicklas |
170 |
} |
3885 |
27 Apr 16 |
nicklas |
171 |
} |
3885 |
27 Apr 16 |
nicklas |
172 |
|
3885 |
27 Apr 16 |
nicklas |
173 |
/** |
3885 |
27 Apr 16 |
nicklas |
Handler method that is called when an end tag is found. We react to |
3885 |
27 Apr 16 |
nicklas |
<row> tags and to INCA variable tags (when inside a <row> tag). |
3885 |
27 Apr 16 |
nicklas |
176 |
*/ |
3885 |
27 Apr 16 |
nicklas |
177 |
@Override |
3885 |
27 Apr 16 |
nicklas |
178 |
public void endElement(String uri, String localName, String qName) |
3885 |
27 Apr 16 |
nicklas |
179 |
{ |
3885 |
27 Apr 16 |
nicklas |
180 |
if ("row".equals(qName)) |
3885 |
27 Apr 16 |
nicklas |
181 |
{ |
3885 |
27 Apr 16 |
nicklas |
// End a row |
3885 |
27 Apr 16 |
nicklas |
183 |
if (rowNo == 1) |
3885 |
27 Apr 16 |
nicklas |
184 |
{ |
3887 |
28 Apr 16 |
nicklas |
// First entry -- write headers |
3887 |
28 Apr 16 |
nicklas |
186 |
writer.writeHeaders(headers); |
3885 |
27 Apr 16 |
nicklas |
187 |
} |
3885 |
27 Apr 16 |
nicklas |
188 |
|
3891 |
28 Apr 16 |
nicklas |
// Remove '-' from PERSNR and place the value in PersonalNo |
3891 |
28 Apr 16 |
nicklas |
190 |
String pnr = row.get("PERSNR"); |
3891 |
28 Apr 16 |
nicklas |
191 |
if (pnr != null) row.put("PersonalNo", pnr.replace("-", "")); |
3891 |
28 Apr 16 |
nicklas |
192 |
|
3887 |
28 Apr 16 |
nicklas |
// Write row data and clean up |
3887 |
28 Apr 16 |
nicklas |
194 |
writer.writeRow(row); |
3885 |
27 Apr 16 |
nicklas |
195 |
row.clear(); |
3885 |
27 Apr 16 |
nicklas |
196 |
inRow = false; |
3885 |
27 Apr 16 |
nicklas |
197 |
} |
3885 |
27 Apr 16 |
nicklas |
198 |
else if (inRow) |
3885 |
27 Apr 16 |
nicklas |
199 |
{ |
3885 |
27 Apr 16 |
nicklas |
// End an INCA variable entry |
3885 |
27 Apr 16 |
nicklas |
201 |
if (!qName.equals(currentTag)) |
3885 |
27 Apr 16 |
nicklas |
202 |
{ |
3885 |
27 Apr 16 |
nicklas |
// This is probably catched by the SAX parser, but just in case... |
3885 |
27 Apr 16 |
nicklas |
204 |
throw new ImportException(filename, locator.getLineNumber(), rowNo, "Unexpected end tag: </" + qName + ">:" + currentTag); |
3885 |
27 Apr 16 |
nicklas |
205 |
} |
3885 |
27 Apr 16 |
nicklas |
206 |
|
3887 |
28 Apr 16 |
nicklas |
// Store the value and cleanup |
3885 |
27 Apr 16 |
nicklas |
208 |
row.put(qName, currentValue.toString()); |
3885 |
27 Apr 16 |
nicklas |
209 |
currentTag = null; |
3885 |
27 Apr 16 |
nicklas |
210 |
currentValue = null; |
3885 |
27 Apr 16 |
nicklas |
211 |
} |
3885 |
27 Apr 16 |
nicklas |
212 |
} |
3885 |
27 Apr 16 |
nicklas |
213 |
|
3885 |
27 Apr 16 |
nicklas |
214 |
/** |
3885 |
27 Apr 16 |
nicklas |
Handler method that is called when character data is found. |
3885 |
27 Apr 16 |
nicklas |
We react when inside a <row> and append to the current INCA |
3885 |
27 Apr 16 |
nicklas |
variable. |
3885 |
27 Apr 16 |
nicklas |
218 |
*/ |
3885 |
27 Apr 16 |
nicklas |
219 |
@Override |
3885 |
27 Apr 16 |
nicklas |
220 |
public void characters(char[] ch, int start, int length) |
3885 |
27 Apr 16 |
nicklas |
221 |
throws SAXException |
3885 |
27 Apr 16 |
nicklas |
222 |
{ |
3885 |
27 Apr 16 |
nicklas |
223 |
if (inRow && currentValue != null) |
3885 |
27 Apr 16 |
nicklas |
224 |
{ |
3885 |
27 Apr 16 |
nicklas |
225 |
currentValue.append(ch, start, length); |
3885 |
27 Apr 16 |
nicklas |
226 |
} |
3885 |
27 Apr 16 |
nicklas |
227 |
} |
3885 |
27 Apr 16 |
nicklas |
228 |
|
3885 |
27 Apr 16 |
nicklas |
229 |
/** |
3885 |
27 Apr 16 |
nicklas |
We get this from the SAX parser. |
3885 |
27 Apr 16 |
nicklas |
231 |
*/ |
3885 |
27 Apr 16 |
nicklas |
232 |
@Override |
3885 |
27 Apr 16 |
nicklas |
233 |
public void setDocumentLocator(Locator locator) |
3885 |
27 Apr 16 |
nicklas |
234 |
{ |
3885 |
27 Apr 16 |
nicklas |
235 |
this.locator = locator; |
3885 |
27 Apr 16 |
nicklas |
236 |
} |
3885 |
27 Apr 16 |
nicklas |
237 |
|
3890 |
28 Apr 16 |
nicklas |
238 |
/** |
3890 |
28 Apr 16 |
nicklas |
The number of <row> tags parsed. |
3890 |
28 Apr 16 |
nicklas |
240 |
*/ |
3890 |
28 Apr 16 |
nicklas |
241 |
public int getRows() |
3890 |
28 Apr 16 |
nicklas |
242 |
{ |
3890 |
28 Apr 16 |
nicklas |
243 |
return rowNo; |
3890 |
28 Apr 16 |
nicklas |
244 |
} |
3885 |
27 Apr 16 |
nicklas |
245 |
|
3890 |
28 Apr 16 |
nicklas |
246 |
/** |
3890 |
28 Apr 16 |
nicklas |
The number of INCA variables in each <row> tag. |
3890 |
28 Apr 16 |
nicklas |
248 |
*/ |
3890 |
28 Apr 16 |
nicklas |
249 |
public int getHeaders() |
3890 |
28 Apr 16 |
nicklas |
250 |
{ |
3890 |
28 Apr 16 |
nicklas |
251 |
return headers.size(); |
3890 |
28 Apr 16 |
nicklas |
252 |
} |
3885 |
27 Apr 16 |
nicklas |
253 |
|
3885 |
27 Apr 16 |
nicklas |
254 |
} |