CSVTokenizer xref

View Javadoc
1   /**
2    * Copyright 2010, CSIRO Australia.
3    *
4    * Licensed under the Apache License, Version 2.0 (the "License");
5    * you may not use this file except in compliance with the License.
6    * You may obtain a copy of the License at
7    *
8    *         http://www.apache.org/licenses/LICENSE-2.0
9    *
10   * Unless required by applicable law or agreed to in writing, software
11   * distributed under the License is distributed on an "AS IS" BASIS,
12   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13   * See the License for the specific language governing permissions and
14   * limitations under the License.
15   */
16  package au.csiro.netcdf.util;
17  
18  import java.util.ArrayList;
19  import java.util.List;
20  import java.util.StringTokenizer;
21  
/**
 * Specialised StringTokenizer proxy to handle lines from a CSV file.
 * <p>
 * The underlying {@link StringTokenizer} splits the input on the double quote, line feed,
 * carriage return and the major delimiter, and hands every one of those back as a raw token.
 * This class stitches the raw tokens back together into CSV columns:
 * <ul>
 * <li>a section wrapped in double quotes may contain the major delimiter, line feeds and
 *     carriage returns, all of which become part of the column;</li>
 * <li>two consecutive double quotes inside a quoted section yield one literal double quote;</li>
 * <li>an unquoted major delimiter or line feed ends the current column;</li>
 * <li>an unquoted carriage return directly followed by a line feed is dropped, so CR LF acts
 *     as a single line ending.</li>
 * </ul>
 * Known behaviour that callers should be aware of: a major delimiter at the very end of the
 * input does not produce a final empty column, and {@link #nextToken()} returns an empty string
 * (rather than throwing) when called with no input left.
 * <p>
 * Copyright 2010, CSIRO Australia
 * All rights reserved.
 *
 * @author      James Dempsey on 10/06/2010
 * @version     $Revision: 78 $  $Date: 2010-07-24 16:23:13 +1000 (Sat, 24 Jul 2010) $
 */
public class CSVTokenizer extends StringTokenizer
{
    /** The quote character that brackets a column containing delimiters. */
    private static final String QUOTE = "\"";
    /** Line feed; ends a line when it appears outside quotes. */
    private static final String LINE_FEED = "\n";
    /** Carriage return; ignored outside quotes when directly followed by a line feed. */
    private static final String CARRIAGE_RETURN = "\015";

    /** Flag indicating whether to return the delimiters as tokens. */
    private final boolean returnTokens;
    /** The major delimiter of the string to be parsed. */
    private final String majorDelimiter;

    /** A delimiter that is being held for the next request. */
    private String cachedToken = null;
    /** Do we have a cached token? */
    private boolean haveCachedToken = false;
    /** A raw token read ahead of time that still has to be processed. Null if none. */
    private String pushedBackToken = null;
    /** Did the most recent column end on an unquoted line feed? */
    private boolean atEOL = false;

    /**
     * Constructs a comma separated tokenizer for the specified string. Delimiter characters
     * themselves will not be treated as tokens.
     *
     * @param str a string to be parsed.
     */
    public CSVTokenizer(String str)
    {
        this(str, ',', false);
    }

    /**
     * Constructs a comma separated tokenizer for the specified string.
     * <p>
     * If the returnTokens flag is true, then the delimiter that ended a column is also returned,
     * as a string of length one, by the following call to {@link #nextToken()}. If the flag is
     * false, the delimiters are skipped and only serve as separators between columns. NB: The
     * quotes around some strings will never come out as delimiters.
     *
     * @param str a string to be parsed.
     * @param returnTokens flag indicating whether to return the delimiters as tokens.
     */
    public CSVTokenizer(String str, boolean returnTokens)
    {
        this(str, ',', returnTokens);
    }

    /**
     * Constructs a tokenizer for the specified string using a caller supplied major delimiter.
     * <p>
     * The majorDelimiter value can be used to change the major delimiter used. Normally this
     * will be a comma, but a tab could be supplied for a tab delimited file.
     * <p>
     * If the returnTokens flag is true, then the delimiter that ended a column is also returned,
     * as a string of length one, by the following call to {@link #nextToken()}. If the flag is
     * false, the delimiters are skipped and only serve as separators between columns. NB: The
     * quotes around some strings will never come out as delimiters.
     *
     * @param str a string to be parsed.
     * @param majorDelimiter The major delimiter of the string to be parsed.
     * @param returnTokens flag indicating whether to return the delimiters as tokens.
     */
    public CSVTokenizer(String str, char majorDelimiter, boolean returnTokens)
    {
        // The superclass must always return delimiters: this class needs to see every quote,
        // line break and major delimiter in order to rebuild the columns itself.
        super(str, QUOTE + LINE_FEED + CARRIAGE_RETURN + majorDelimiter, true);

        this.returnTokens = returnTokens;
        this.majorDelimiter = String.valueOf(majorDelimiter);
    }

    /**
     * Returns all remaining tokens for this tokenizer as an array. This is similar to the split
     * function for a string. When the tokenizer was created with returnTokens set, the returned
     * delimiters are part of the array as well.
     *
     * @return An array of all remaining tokens as strings; empty if nothing is left.
     */
    public String[] getAllColumns()
    {
        List<String> columns = new ArrayList<String>();
        while (hasMoreTokens())
        {
            columns.add(nextToken());
        }

        return columns.toArray(new String[0]);
    }

    /**
     * We have deliberately excluded the count token method as it would require caching.
     *
     * @return 0, always, regardless of how many tokens remain.
     */
    @Override
    public int countTokens()
    {
        return 0;
    }

    /**
     * Same test as {@link #hasMoreTokens()}.
     *
     * @return true if another call to {@link #nextToken()} has something to process.
     */
    @Override
    public boolean hasMoreElements()
    {
        return hasMoreTokens();
    }

    /**
     * Tests whether anything is left: unread input, a delimiter held for return, or a raw token
     * that was read ahead.
     *
     * @return true if another call to {@link #nextToken()} has something to process.
     */
    @Override
    public boolean hasMoreTokens()
    {
        return super.hasMoreTokens() || haveCachedToken || pushedBackToken != null;
    }

    /**
     * Tests whether the tokenizer sits at the end of a line.
     *
     * @return true if the most recent column was ended by an unquoted line feed, or if nothing
     *         is left to read.
     */
    public boolean atEOL()
    {
        return atEOL || !hasMoreTokens();
    }

    /**
     * Same as {@link #nextToken()}, typed as Object to satisfy the Enumeration interface.
     *
     * @return the next token.
     */
    @Override
    public Object nextElement()
    {
        return nextToken();
    }

    /**
     * Returns the next column with surrounding quotes removed and doubled quotes collapsed, or,
     * when returnTokens is set and the previous call ended on a delimiter, that delimiter.
     *
     * @return the next column or delimiter; an empty string for an empty column and also when
     *         no input is left.
     */
    @Override
    public String nextToken()
    {
        // If the previous call ended on a delimiter that has to be reported, report it now.
        if (returnTokens && haveCachedToken)
        {
            String held = cachedToken;
            cachedToken = null;
            haveCachedToken = false;
            return held;
        }

        boolean inQuoteBlock = false;
        boolean wordFinished = false;
        StringBuilder currWord = new StringBuilder();
        // Only an unquoted delimiter that really ends the column is recorded here. Quotes and
        // delimiters swallowed by a quoted section must never be reported to the caller.
        String delimiter = null;
        atEOL = false;

        while (!wordFinished && (pushedBackToken != null || super.hasMoreTokens()))
        {
            String token;
            if (pushedBackToken != null)
            {
                token = pushedBackToken;
                pushedBackToken = null;
            }
            else
            {
                token = super.nextToken();
            }

            if (QUOTE.equals(token))
            {
                if (!inQuoteBlock)
                {
                    inQuoteBlock = true;
                }
                else if (!super.hasMoreTokens())
                {
                    // Closing quote is the last thing in the input.
                    inQuoteBlock = false;
                    wordFinished = true;
                }
                else
                {
                    // Look one raw token ahead to tell an escaped quote from a closing quote.
                    String following = super.nextToken();
                    if (QUOTE.equals(following))
                    {
                        currWord.append(QUOTE);
                    }
                    else
                    {
                        inQuoteBlock = false;
                        pushedBackToken = following;
                    }
                }
            }
            else if (inQuoteBlock)
            {
                // Inside quotes every delimiter is ordinary column text.
                currWord.append(token);
            }
            else if (token.equals(majorDelimiter))
            {
                wordFinished = true;
                delimiter = token;
            }
            else if (LINE_FEED.equals(token))
            {
                wordFinished = true;
                atEOL = true;
                delimiter = token;
            }
            else if (CARRIAGE_RETURN.equals(token))
            {
                if (super.hasMoreTokens())
                {
                    // CR LF: drop the CR and let the LF end the line on the next pass.
                    // CR followed by anything else: keep the CR as column text.
                    String following = super.nextToken();
                    if (!LINE_FEED.equals(following))
                    {
                        currWord.append(CARRIAGE_RETURN);
                    }
                    pushedBackToken = following;
                }
                else
                {
                    // A lone CR at the very end of the input is not column text; it is
                    // reported as a delimiter when delimiters are being returned.
                    delimiter = token;
                }
            }
            else
            {
                currWord.append(token);
            }
        }

        if (returnTokens)
        {
            cachedToken = delimiter;
            haveCachedToken = delimiter != null;
        }
        else
        {
            cachedToken = null;
        }
        return currWord.toString();
    }
}