1 /**
2 * Copyright 2010, CSIRO Australia.
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16 package au.csiro.netcdf.util;
17
18 import java.util.ArrayList;
19 import java.util.List;
20 import java.util.StringTokenizer;
21
/**
 * Specialised StringTokenizer proxy to handle lines from a CSV file.
 * <p>
 * The underlying {@link StringTokenizer} splits the input on the double quote, line feed,
 * carriage return and the major delimiter, returning each of them as a raw token. This class
 * reassembles those raw tokens into CSV columns:
 * <ul>
 * <li>text between double quotes may contain the major delimiter, line feeds and carriage
 * returns;</li>
 * <li>inside a quoted block two consecutive double quotes stand for one literal double quote;</li>
 * <li>outside quotes a column ends at the major delimiter, at a line feed, or at a carriage
 * return that is the last character of the input; a carriage return directly followed by a line
 * feed is swallowed and the line feed ends the column;</li>
 * <li>a carriage return outside quotes that is followed by anything else is kept as part of the
 * column text.</li>
 * </ul>
 * Note that a trailing empty column is not reported: {@code "a,b,"} yields the columns
 * {@code a} and {@code b} only.
 *
 * Copyright 2010, CSIRO Australia
 * All rights reserved.
 *
 * @author James Dempsey on 10/06/2010
 * @version $Revision: 78 $ $Date: 2010-07-24 16:23:13 +1000 (Sat, 24 Jul 2010) $
 */
public class CSVTokenizer extends StringTokenizer
{
    /** The quote character as a raw token. */
    private static final String QUOTE = "\"";
    /** The line feed character as a raw token. */
    private static final String LINE_FEED = "\n";
    /** The carriage return character as a raw token. */
    private static final String CARRIAGE_RETURN = "\015";

    /** flag indicating whether to return the delimiters as tokens. */
    private final boolean returnTokens;
    /** The major delimiter of the string to be parsed. */
    private final String majorDelimiter;

    /** A delimiter that is being held for the next request. Null if none is held. */
    private String cachedToken = null;
    /** A raw token that was read ahead and must be processed next. Null if none is pending. */
    private String pushedBackToken = null;
    /** Did the most recently parsed column end at a line terminator? */
    private boolean atEOL = false;


    /**
     * Constructs a CSV string tokenizer for the specified string, using a comma as the major
     * delimiter. Delimiter characters themselves will not be treated as tokens.
     *
     * @param str a string to be parsed.
     */
    public CSVTokenizer(String str)
    {
        this(str, ',', false);
    }


    /**
     * Constructs a CSV string tokenizer for the specified string, using a comma as the major
     * delimiter.
     * <p>
     * If the returnTokens flag is true, then the delimiter that ended a column is also returned
     * as a token of length one by the following call. If the flag is false, the delimiter
     * characters are skipped and only serve as separators between tokens. NB: The quotes around
     * some strings will never come out as delimiters.
     *
     * @param str a string to be parsed.
     * @param returnTokens flag indicating whether to return the delimiters as tokens.
     */
    public CSVTokenizer(String str, boolean returnTokens)
    {
        this(str, ',', returnTokens);
    }


    /**
     * Constructs a CSV string tokenizer for the specified string.
     * <p>
     * The majorDelimiter value can be used to change the major delimiter used. Normally this will
     * be a comma, but a tab could be supplied for a tab delimited file.
     * <p>
     * If the returnTokens flag is true, then the delimiter that ended a column is also returned
     * as a token of length one by the following call. If the flag is false, the delimiter
     * characters are skipped and only serve as separators between tokens. NB: The quotes around
     * some strings will never come out as delimiters.
     *
     * @param str a string to be parsed.
     * @param majorDelimiter The major delimiter of the string to be parsed.
     * @param returnTokens flag indicating whether to return the delimiters as tokens.
     */
    public CSVTokenizer(String str, char majorDelimiter, boolean returnTokens)
    {
        // The parent must always hand back the delimiters so that quoting can be resolved here.
        super(str, "\"\n\015" + majorDelimiter, true);

        this.returnTokens = returnTokens;
        this.majorDelimiter = String.valueOf(majorDelimiter);
    }

    /**
     * Returns all remaining columns for this tokenizer as a string array. This is
     * similar to the split function for a string. When the tokenizer was created with
     * returnTokens set, the delimiters appear in the array as well.
     *
     * @return An array of all remaining columns as strings.
     */
    public String[] getAllColumns()
    {
        List<String> columns = new ArrayList<String>();
        while (hasMoreElements())
        {
            columns.add(nextToken());
        }

        return columns.toArray(new String[0]);
    }

    /**
     * We have deliberately excluded the count token method as it would require caching.
     *
     * @return 0
     */
    @Override
    public int countTokens()
    {
        return 0;
    }

    /**
     * Same as {@link #hasMoreTokens()}.
     *
     * @return true if another call to {@link #nextElement()} has something to return.
     */
    @Override
    public boolean hasMoreElements()
    {
        return super.hasMoreElements() || cachedToken != null || pushedBackToken != null;
    }

    /**
     * Tests whether there is unread input, a delimiter waiting to be returned, or a raw token
     * that was read ahead.
     *
     * @return true if another call to {@link #nextToken()} has something to return.
     */
    @Override
    public boolean hasMoreTokens()
    {
        return super.hasMoreTokens() || cachedToken != null || pushedBackToken != null;
    }

    /**
     * Tests whether the column most recently returned was the last one of its line.
     *
     * @return true if the last column ended at a line terminator or nothing is left to read.
     */
    public boolean atEOL()
    {
        return atEOL || !hasMoreTokens();
    }

    /**
     * Same as {@link #nextToken()}, typed as Object for the Enumeration interface.
     *
     * @return the next column (or delimiter) as a String.
     */
    @Override
    public Object nextElement()
    {
        return nextToken();
    }

    /**
     * Returns the next column of the input with surrounding quotes removed and doubled quotes
     * collapsed. If the tokenizer returns delimiters and the previous column was ended by one,
     * that delimiter is returned instead.
     * <p>
     * Unlike {@link StringTokenizer#nextToken()} this method does not throw when the input is
     * exhausted; it returns an empty string.
     *
     * @return the next column, or the delimiter that ended the previous column.
     */
    @Override
    public String nextToken()
    {
        // If we held back the delimiter last time, return it this time
        if (returnTokens && cachedToken != null)
        {
            String delimiter = cachedToken;
            cachedToken = null;
            return delimiter;
        }

        boolean inQuoteBlock = false;
        boolean wordFinished = false;
        StringBuilder currWord = new StringBuilder();
        // Only a token that really ended the column may later be reported as a delimiter.
        String terminator = null;
        atEOL = false;

        while (!wordFinished && hasRawToken())
        {
            String token = nextRawToken();

            if (token.equals(QUOTE))
            {
                if (!inQuoteBlock)
                {
                    inQuoteBlock = true;
                }
                else if (!super.hasMoreTokens())
                {
                    // Closing quote is the last character of the input.
                    inQuoteBlock = false;
                    wordFinished = true;
                }
                else
                {
                    String following = super.nextToken();
                    if (following.equals(QUOTE))
                    {
                        // A doubled quote inside a quoted block is one literal quote.
                        currWord.append(QUOTE);
                    }
                    else
                    {
                        inQuoteBlock = false;
                        pushedBackToken = following;
                    }
                }
            }
            else if (inQuoteBlock)
            {
                // Delimiters and line breaks are plain text while quoted.
                currWord.append(token);
            }
            else if (token.equals(majorDelimiter))
            {
                wordFinished = true;
                terminator = token;
            }
            else if (token.equals(LINE_FEED))
            {
                wordFinished = true;
                atEOL = true;
                terminator = token;
            }
            else if (token.equals(CARRIAGE_RETURN))
            {
                if (super.hasMoreTokens())
                {
                    String following = super.nextToken();
                    if (!following.equals(LINE_FEED))
                    {
                        // A lone carriage return in the middle of a line is ordinary text.
                        currWord.append(CARRIAGE_RETURN);
                    }
                    pushedBackToken = following;
                }
                else
                {
                    // A carriage return that ends the input terminates the line.
                    wordFinished = true;
                    atEOL = true;
                    terminator = token;
                }
            }
            else
            {
                currWord.append(token);
            }
        }

        if (returnTokens)
        {
            cachedToken = terminator;
        }
        return currWord.toString();
    }

    /** Tests whether a raw token is available, either pushed back or from the parent. */
    private boolean hasRawToken()
    {
        return pushedBackToken != null || super.hasMoreTokens();
    }

    /** Returns the pushed back raw token if there is one, otherwise the parent's next token. */
    private String nextRawToken()
    {
        if (pushedBackToken != null)
        {
            String token = pushedBackToken;
            pushedBackToken = null;
            return token;
        }
        return super.nextToken();
    }
}