/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.commons.lang3.text;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.List;
import java.util.ListIterator;
import java.util.NoSuchElementException;

import org.apache.commons.lang3.ArrayUtils;
import org.apache.commons.lang3.StringUtils;

/**
 * Tokenizes a string based on delimiters (separators),
 * with support for quoting and ignored characters.
 * <p>
 * This class can split a String into many smaller strings. It aims
 * to do a similar job to {@link java.util.StringTokenizer StringTokenizer},
 * however it offers much more control and flexibility including implementing
 * the {@code ListIterator} interface. By default, it is set up
 * like {@code StringTokenizer}.
 * <p>
 * The input String is split into a number of <i>tokens</i>.
 * Each token is separated from the next by a <i>delimiter</i>.
 * One or more delimiter characters must be specified.
 * <p>
 * Each token may be surrounded by quotes.
 * The <i>quote</i> matcher specifies the quote character(s).
 * A quote may be escaped within a quoted section by duplicating itself.
 * <p>
 * Between each token and the delimiter are potentially characters that need trimming.
 * The <i>trimmer</i> matcher specifies these characters.
 * One usage might be to trim whitespace characters.
 * <p>
 * At any point outside the quotes there might potentially be invalid characters.
 * The <i>ignored</i> matcher specifies these characters to be removed.
 * One usage might be to remove new line characters.
 * <p>
 * Empty tokens may be removed or returned as null.
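 * <p>
 * As an illustrative sketch of the escaped-quote rule (this example assumes a
 * double-quote quote character has been set, for example via {@link #getCSVInstance()}):
 * <pre>
 * "a ""b"" c"     - One token: a "b" c          (a doubled quote inside a quoted section becomes a literal quote)
 * </pre>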
 * <pre>
 * "a,b,c"         - Three tokens "a","b","c"   (comma delimiter)
 * " a, b , c "    - Three tokens "a","b","c"   (default CSV processing trims whitespace)
 * "a, ", b ,", c" - Three tokens "a, " , " b ", ", c" (quoted text untouched)
 * </pre>
 *
 * <table>
 * <caption>StrTokenizer properties and options</caption>
 * <tr>
 * <th>Property</th><th>Type</th><th>Default</th>
 * </tr>
 * <tr>
 * <td>delim</td><td>CharSetMatcher</td><td>{ \t\n\r\f}</td>
 * </tr>
 * <tr>
 * <td>quote</td><td>NoneMatcher</td><td>{}</td>
 * </tr>
 * <tr>
 * <td>ignore</td><td>NoneMatcher</td><td>{}</td>
 * </tr>
 * <tr>
 * <td>emptyTokenAsNull</td><td>boolean</td><td>false</td>
 * </tr>
 * <tr>
 * <td>ignoreEmptyTokens</td><td>boolean</td><td>true</td>
 * </tr>
 * </table>
 *
 * @since 2.2
 * @deprecated as of 3.6, use commons-text
 * <a href="https://commons.apache.org/proper/commons-text/javadocs/api-release/org/apache/commons/text/StringTokenizer.html">
 * StringTokenizer</a> instead
 */
@Deprecated
public class StrTokenizer implements ListIterator<String>, Cloneable {

    private static final StrTokenizer CSV_TOKENIZER_PROTOTYPE;
    private static final StrTokenizer TSV_TOKENIZER_PROTOTYPE;
    static {
        CSV_TOKENIZER_PROTOTYPE = new StrTokenizer();
        CSV_TOKENIZER_PROTOTYPE.setDelimiterMatcher(StrMatcher.commaMatcher());
        CSV_TOKENIZER_PROTOTYPE.setQuoteMatcher(StrMatcher.doubleQuoteMatcher());
        CSV_TOKENIZER_PROTOTYPE.setIgnoredMatcher(StrMatcher.noneMatcher());
        CSV_TOKENIZER_PROTOTYPE.setTrimmerMatcher(StrMatcher.trimMatcher());
        CSV_TOKENIZER_PROTOTYPE.setEmptyTokenAsNull(false);
        CSV_TOKENIZER_PROTOTYPE.setIgnoreEmptyTokens(false);

        TSV_TOKENIZER_PROTOTYPE = new StrTokenizer();
        TSV_TOKENIZER_PROTOTYPE.setDelimiterMatcher(StrMatcher.tabMatcher());
        TSV_TOKENIZER_PROTOTYPE.setQuoteMatcher(StrMatcher.doubleQuoteMatcher());
        TSV_TOKENIZER_PROTOTYPE.setIgnoredMatcher(StrMatcher.noneMatcher());
        TSV_TOKENIZER_PROTOTYPE.setTrimmerMatcher(StrMatcher.trimMatcher());
        TSV_TOKENIZER_PROTOTYPE.setEmptyTokenAsNull(false);
        TSV_TOKENIZER_PROTOTYPE.setIgnoreEmptyTokens(false);
    }

    /** The text to work on. */
    private char[] chars;
    /** The parsed tokens */
    private String[] tokens;
    /** The current iteration position */
    private int tokenPos;

    /** The delimiter matcher */
    private StrMatcher delimMatcher = StrMatcher.splitMatcher();
    /** The quote matcher */
    private StrMatcher quoteMatcher = StrMatcher.noneMatcher();
    /** The ignored matcher */
    private StrMatcher ignoredMatcher = StrMatcher.noneMatcher();
    /** The trimmer matcher */
    private StrMatcher trimmerMatcher = StrMatcher.noneMatcher();

    /** Whether to return empty tokens as null */
    private boolean emptyAsNull;
    /** Whether to ignore empty tokens */
    private boolean ignoreEmptyTokens = true;

    //-----------------------------------------------------------------------

    /**
     * Returns a clone of {@code CSV_TOKENIZER_PROTOTYPE}.
     *
     * @return a clone of {@code CSV_TOKENIZER_PROTOTYPE}.
     */
    private static StrTokenizer getCSVClone() {
        return (StrTokenizer) CSV_TOKENIZER_PROTOTYPE.clone();
    }

    /**
     * Gets a new tokenizer instance which parses Comma Separated Value strings.
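     * <p>
     * A minimal usage sketch (the input line here is illustrative only):
     * <pre>
     * StrTokenizer tok = StrTokenizer.getCSVInstance();
     * tok.reset("a, \"b,c\", d");
     * List&lt;String&gt; tokens = tok.getTokenList(); // ["a", "b,c", "d"]
     * </pre>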
     * The default for CSV processing will be to trim whitespace from both ends
     * (which can be overridden with the setTrimmer method).
     * <p>
     * You must call a "reset" method to set the string which you want to parse.
     * @return a new tokenizer instance which parses Comma Separated Value strings
     */
    public static StrTokenizer getCSVInstance() {
        return getCSVClone();
    }

    /**
     * Gets a new tokenizer instance which parses Comma Separated Value strings,
     * initializing it with the given input. The default for CSV processing
     * will be to trim whitespace from both ends (which can be overridden with
     * the setTrimmer method).
     *
     * @param input the text to parse
     * @return a new tokenizer instance which parses Comma Separated Value strings
     */
    public static StrTokenizer getCSVInstance(final String input) {
        final StrTokenizer tok = getCSVClone();
        tok.reset(input);
        return tok;
    }

    /**
     * Gets a new tokenizer instance which parses Comma Separated Value strings,
     * initializing it with the given input. The default for CSV processing
     * will be to trim whitespace from both ends (which can be overridden with
     * the setTrimmer method).
     *
     * @param input the text to parse
     * @return a new tokenizer instance which parses Comma Separated Value strings
     */
    public static StrTokenizer getCSVInstance(final char[] input) {
        final StrTokenizer tok = getCSVClone();
        tok.reset(input);
        return tok;
    }

    /**
     * Returns a clone of {@code TSV_TOKENIZER_PROTOTYPE}.
     *
     * @return a clone of {@code TSV_TOKENIZER_PROTOTYPE}.
     */
    private static StrTokenizer getTSVClone() {
        return (StrTokenizer) TSV_TOKENIZER_PROTOTYPE.clone();
    }

    /**
     * Gets a new tokenizer instance which parses Tab Separated Value strings.
     * The default for TSV processing will be to trim whitespace from both ends
     * (which can be overridden with the setTrimmer method).
     * <p>
     * You must call a "reset" method to set the string which you want to parse.
     * @return a new tokenizer instance which parses Tab Separated Value strings.
     */
    public static StrTokenizer getTSVInstance() {
        return getTSVClone();
    }

    /**
     * Gets a new tokenizer instance which parses Tab Separated Value strings.
     * The default for TSV processing will be to trim whitespace from both ends
     * (which can be overridden with the setTrimmer method).
     * @param input the string to parse
     * @return a new tokenizer instance which parses Tab Separated Value strings.
     */
    public static StrTokenizer getTSVInstance(final String input) {
        final StrTokenizer tok = getTSVClone();
        tok.reset(input);
        return tok;
    }

    /**
     * Gets a new tokenizer instance which parses Tab Separated Value strings.
     * The default for TSV processing will be to trim whitespace from both ends
     * (which can be overridden with the setTrimmer method).
     * @param input the string to parse
     * @return a new tokenizer instance which parses Tab Separated Value strings.
     */
    public static StrTokenizer getTSVInstance(final char[] input) {
        final StrTokenizer tok = getTSVClone();
        tok.reset(input);
        return tok;
    }

    //-----------------------------------------------------------------------
    /**
     * Constructs a tokenizer splitting on space, tab, newline and formfeed
     * as per StringTokenizer, but with no text to tokenize.
     * <p>
     * This constructor is normally used with {@link #reset(String)}.
     */
    public StrTokenizer() {
        this.chars = null;
    }

    /**
     * Constructs a tokenizer splitting on space, tab, newline and formfeed
     * as per StringTokenizer.
     *
     * @param input the string which is to be parsed
     */
    public StrTokenizer(final String input) {
        if (input != null) {
            chars = input.toCharArray();
        } else {
            chars = null;
        }
    }

    /**
     * Constructs a tokenizer splitting on the specified delimiter character.
     *
     * @param input the string which is to be parsed
     * @param delim the field delimiter character
     */
    public StrTokenizer(final String input, final char delim) {
        this(input);
        setDelimiterChar(delim);
    }

    /**
     * Constructs a tokenizer splitting on the specified delimiter string.
     *
     * @param input the string which is to be parsed
     * @param delim the field delimiter string
     */
    public StrTokenizer(final String input, final String delim) {
        this(input);
        setDelimiterString(delim);
    }

    /**
     * Constructs a tokenizer splitting using the specified delimiter matcher.
     *
     * @param input the string which is to be parsed
     * @param delim the field delimiter matcher
     */
    public StrTokenizer(final String input, final StrMatcher delim) {
        this(input);
        setDelimiterMatcher(delim);
    }

    /**
     * Constructs a tokenizer splitting on the specified delimiter character
     * and handling quotes using the specified quote character.
     *
     * @param input the string which is to be parsed
     * @param delim the field delimiter character
     * @param quote the field quoted string character
     */
    public StrTokenizer(final String input, final char delim, final char quote) {
        this(input, delim);
        setQuoteChar(quote);
    }

    /**
     * Constructs a tokenizer splitting using the specified delimiter matcher
     * and handling quotes using the specified quote matcher.
     *
     * @param input the string which is to be parsed
     * @param delim the field delimiter matcher
     * @param quote the field quoted string matcher
     */
    public StrTokenizer(final String input, final StrMatcher delim, final StrMatcher quote) {
        this(input, delim);
        setQuoteMatcher(quote);
    }

    /**
     * Constructs a tokenizer splitting on space, tab, newline and formfeed
     * as per StringTokenizer.
     *
     * @param input the character array which is to be parsed; the array is cloned
     */
    public StrTokenizer(final char[] input) {
        this.chars = ArrayUtils.clone(input);
    }

    /**
     * Constructs a tokenizer splitting on the specified character.
     *
     * @param input the character array which is to be parsed; the array is cloned
     * @param delim the field delimiter character
     */
    public StrTokenizer(final char[] input, final char delim) {
        this(input);
        setDelimiterChar(delim);
    }

    /**
     * Constructs a tokenizer splitting on the specified string.
     *
     * @param input the character array which is to be parsed; the array is cloned
     * @param delim the field delimiter string
     */
    public StrTokenizer(final char[] input, final String delim) {
        this(input);
        setDelimiterString(delim);
    }

    /**
     * Constructs a tokenizer splitting using the specified delimiter matcher.
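     * <p>
     * For example (illustrative):
     * {@code new StrTokenizer("a-b_c".toCharArray(), StrMatcher.charSetMatcher("-_"))}
     * yields the tokens "a", "b" and "c".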
     *
     * @param input the character array which is to be parsed; the array is cloned
     * @param delim the field delimiter matcher
     */
    public StrTokenizer(final char[] input, final StrMatcher delim) {
        this(input);
        setDelimiterMatcher(delim);
    }

    /**
     * Constructs a tokenizer splitting on the specified delimiter character
     * and handling quotes using the specified quote character.
     *
     * @param input the character array which is to be parsed; the array is cloned
     * @param delim the field delimiter character
     * @param quote the field quoted string character
     */
    public StrTokenizer(final char[] input, final char delim, final char quote) {
        this(input, delim);
        setQuoteChar(quote);
    }

    /**
     * Constructs a tokenizer splitting using the specified delimiter matcher
     * and handling quotes using the specified quote matcher.
     *
     * @param input the character array which is to be parsed; the array is cloned
     * @param delim the field delimiter matcher
     * @param quote the field quoted string matcher
     */
    public StrTokenizer(final char[] input, final StrMatcher delim, final StrMatcher quote) {
        this(input, delim);
        setQuoteMatcher(quote);
    }

    // API
    //-----------------------------------------------------------------------
    /**
     * Gets the number of tokens found in the String.
     *
     * @return the number of matched tokens
     */
    public int size() {
        checkTokenized();
        return tokens.length;
    }

    /**
     * Gets the next token from the String.
     * Equivalent to {@link #next()} except it returns null rather than
     * throwing {@link NoSuchElementException} when no tokens remain.
     *
     * @return the next sequential token, or null when no more tokens are found
     */
    public String nextToken() {
        if (hasNext()) {
            return tokens[tokenPos++];
        }
        return null;
    }

    /**
     * Gets the previous token from the String.
     *
     * @return the previous sequential token, or null when no more tokens are found
     */
    public String previousToken() {
        if (hasPrevious()) {
            return tokens[--tokenPos];
        }
        return null;
    }

    /**
     * Gets a copy of the full token list as an independent modifiable array.
     *
     * @return the tokens as a String array
     */
    public String[] getTokenArray() {
        checkTokenized();
        return tokens.clone();
    }

    /**
     * Gets a copy of the full token list as an independent modifiable list.
     *
     * @return the tokens as a String list
     */
    public List<String> getTokenList() {
        checkTokenized();
        final List<String> list = new ArrayList<>(tokens.length);
        list.addAll(Arrays.asList(tokens));
        return list;
    }

    /**
     * Resets this tokenizer, forgetting all parsing and iteration already completed.
     * <p>
     * This method allows the same tokenizer to be reused for the same String.
     *
     * @return this, to enable chaining
     */
    public StrTokenizer reset() {
        tokenPos = 0;
        tokens = null;
        return this;
    }

    /**
     * Resets this tokenizer, giving it a new input string to parse.
     * In this manner you can re-use a tokenizer with the same settings
     * on multiple input lines.
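     * <p>
     * For example (a sketch; {@code lines} is an assumed collection of input strings):
     * <pre>
     * StrTokenizer tok = StrTokenizer.getCSVInstance();
     * for (final String line : lines) {
     *     final String[] fields = tok.reset(line).getTokenArray();
     *     // process fields ...
     * }
     * </pre>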
     *
     * @param input the new string to tokenize, null sets no text to parse
     * @return this, to enable chaining
     */
    public StrTokenizer reset(final String input) {
        reset();
        if (input != null) {
            this.chars = input.toCharArray();
        } else {
            this.chars = null;
        }
        return this;
    }

    /**
     * Resets this tokenizer, giving it a new input string to parse.
     * In this manner you can re-use a tokenizer with the same settings
     * on multiple input lines.
     *
     * @param input the new character array to tokenize; the array is cloned, null sets no text to parse
     * @return this, to enable chaining
     */
    public StrTokenizer reset(final char[] input) {
        reset();
        this.chars = ArrayUtils.clone(input);
        return this;
    }

    // ListIterator
    //-----------------------------------------------------------------------
    /**
     * Checks whether there are any more tokens.
     *
     * @return true if there are more tokens
     */
    @Override
    public boolean hasNext() {
        checkTokenized();
        return tokenPos < tokens.length;
    }

    /**
     * Gets the next token.
     *
     * @return the next String token
     * @throws NoSuchElementException if there are no more elements
     */
    @Override
    public String next() {
        if (hasNext()) {
            return tokens[tokenPos++];
        }
        throw new NoSuchElementException();
    }

    /**
     * Gets the index of the next token to return.
     *
     * @return the next token index
     */
    @Override
    public int nextIndex() {
        return tokenPos;
    }

    /**
     * Checks whether there are any previous tokens that can be iterated to.
     *
     * @return true if there are previous tokens
     */
    @Override
    public boolean hasPrevious() {
        checkTokenized();
        return tokenPos > 0;
    }

    /**
     * Gets the token previous to the last returned token.
     *
     * @return the previous token
     */
    @Override
    public String previous() {
        if (hasPrevious()) {
            return tokens[--tokenPos];
        }
        throw new NoSuchElementException();
    }

    /**
     * Gets the index of the previous token.
     *
     * @return the previous token index
     */
    @Override
    public int previousIndex() {
        return tokenPos - 1;
    }

    /**
     * Unsupported ListIterator operation.
     *
     * @throws UnsupportedOperationException always
     */
    @Override
    public void remove() {
        throw new UnsupportedOperationException("remove() is unsupported");
    }

    /**
     * Unsupported ListIterator operation.
     * @param obj this parameter ignored.
     * @throws UnsupportedOperationException always
     */
    @Override
    public void set(final String obj) {
        throw new UnsupportedOperationException("set() is unsupported");
    }

    /**
     * Unsupported ListIterator operation.
     * @param obj this parameter ignored.
     * @throws UnsupportedOperationException always
     */
    @Override
    public void add(final String obj) {
        throw new UnsupportedOperationException("add() is unsupported");
    }

    // Implementation
    //-----------------------------------------------------------------------
    /**
     * Checks if tokenization has been done, and if not then does it.
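     * <p>
     * Tokenization is performed lazily: the parsed tokens are cached in
     * {@code tokens} until {@link #reset()} clears them.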
     */
    private void checkTokenized() {
        if (tokens == null) {
            if (chars == null) {
                // still call tokenize as subclass may do some work
                final List<String> split = tokenize(null, 0, 0);
                tokens = split.toArray(ArrayUtils.EMPTY_STRING_ARRAY);
            } else {
                final List<String> split = tokenize(chars, 0, chars.length);
                tokens = split.toArray(ArrayUtils.EMPTY_STRING_ARRAY);
            }
        }
    }

    /**
     * Internal method to perform the tokenization.
     * <p>
     * Most users of this class do not need to call this method. This method
     * will be called automatically by other (public) methods when required.
     * <p>
     * This method exists to allow subclasses to add code before or after the
     * tokenization. For example, a subclass could alter the character array,
     * offset or count to be parsed, or call the tokenizer multiple times on
     * multiple strings. It is also possible to filter the results.
     * <p>
     * {@code StrTokenizer} will always pass a zero offset and a count
     * equal to the length of the array to this method, however a subclass
     * may pass other values, or even an entirely different array.
     *
     * @param srcChars the character array being tokenized, may be null
     * @param offset the start position within the character array, must be valid
     * @param count the number of characters to tokenize, must be valid
     * @return the modifiable list of String tokens, unmodifiable if null array or zero count
     */
    protected List<String> tokenize(final char[] srcChars, final int offset, final int count) {
        if (srcChars == null || count == 0) {
            return Collections.emptyList();
        }
        final StrBuilder buf = new StrBuilder();
        final List<String> tokenList = new ArrayList<>();
        int pos = offset;

        // loop around the entire buffer
        while (pos >= 0 && pos < count) {
            // find next token
            pos = readNextToken(srcChars, pos, count, buf, tokenList);

            // handle case where end of string is a delimiter
            if (pos >= count) {
                addToken(tokenList, StringUtils.EMPTY);
            }
        }
        return tokenList;
    }

    /**
     * Adds a token to a list, paying attention to the parameters we've set.
     *
     * @param list the list to add to
     * @param tok the token to add
     */
    private void addToken(final List<String> list, String tok) {
        if (StringUtils.isEmpty(tok)) {
            if (isIgnoreEmptyTokens()) {
                return;
            }
            if (isEmptyTokenAsNull()) {
                tok = null;
            }
        }
        list.add(tok);
    }

    /**
     * Reads character by character through the String to get the next token.
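     * <p>
     * Leading ignored and trimmed characters are skipped first; the token itself
     * is then read by {@code readWithQuotes}, which stops at the first
     * unquoted delimiter.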
     *
     * @param srcChars the character array being tokenized
     * @param start the first character of the field
     * @param len the length of the character array being tokenized
     * @param workArea a temporary work area
     * @param tokenList the list of parsed tokens
     * @return the starting position of the next field (the character
     *  immediately after the delimiter), or -1 if end of string found
     */
    private int readNextToken(final char[] srcChars, int start, final int len, final StrBuilder workArea, final List<String> tokenList) {
        // skip all leading whitespace, unless it is the
        // field delimiter or the quote character
        while (start < len) {
            final int removeLen = Math.max(
                    getIgnoredMatcher().isMatch(srcChars, start, start, len),
                    getTrimmerMatcher().isMatch(srcChars, start, start, len));
            if (removeLen == 0 ||
                    getDelimiterMatcher().isMatch(srcChars, start, start, len) > 0 ||
                    getQuoteMatcher().isMatch(srcChars, start, start, len) > 0) {
                break;
            }
            start += removeLen;
        }

        // handle reaching end
        if (start >= len) {
            addToken(tokenList, StringUtils.EMPTY);
            return -1;
        }

        // handle empty token
        final int delimLen = getDelimiterMatcher().isMatch(srcChars, start, start, len);
        if (delimLen > 0) {
            addToken(tokenList, StringUtils.EMPTY);
            return start + delimLen;
        }

        // handle found token
        final int quoteLen = getQuoteMatcher().isMatch(srcChars, start, start, len);
        if (quoteLen > 0) {
            return readWithQuotes(srcChars, start + quoteLen, len, workArea, tokenList, start, quoteLen);
        }
        return readWithQuotes(srcChars, start, len, workArea, tokenList, 0, 0);
    }

    /**
     * Reads a possibly quoted string token.
     *
     * @param srcChars the character array being tokenized
     * @param start the first character of the field
     * @param len the length of the character array being tokenized
     * @param workArea a temporary work area
     * @param tokenList the list of parsed tokens
     * @param quoteStart the start position of the matched quote, 0 if no quoting
     * @param quoteLen the length of the matched quote, 0 if no quoting
     * @return the starting position of the next field (the character
     *  immediately after the delimiter), or -1 if end of string found
     */
    private int readWithQuotes(final char[] srcChars, final int start, final int len, final StrBuilder workArea,
                               final List<String> tokenList, final int quoteStart, final int quoteLen) {
        // Loop until we've found the end of the quoted
        // string or the end of the input
        workArea.clear();
        int pos = start;
        boolean quoting = quoteLen > 0;
        int trimStart = 0;

        while (pos < len) {
            // quoting mode can occur several times throughout a string
            // we must switch between quoting and non-quoting until we
            // encounter a non-quoted delimiter, or end of string
            if (quoting) {
                // In quoting mode

                // If we've found a quote character, see if it's
                // followed by a second quote. If so, then we need
                // to actually put the quote character into the token
                // rather than end the token.
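                // (For example, with " as the quote character, the quoted
                // text  a ""b"" c  contributes the literal text  a "b" c
                // to the token.)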
                if (isQuote(srcChars, pos, len, quoteStart, quoteLen)) {
                    if (isQuote(srcChars, pos + quoteLen, len, quoteStart, quoteLen)) {
                        // matched pair of quotes, thus an escaped quote
                        workArea.append(srcChars, pos, quoteLen);
                        pos += quoteLen * 2;
                        trimStart = workArea.size();
                        continue;
                    }

                    // end of quoting
                    quoting = false;
                    pos += quoteLen;
                    continue;
                }

                // copy regular character from inside quotes
                workArea.append(srcChars[pos++]);
                trimStart = workArea.size();

            } else {
                // Not in quoting mode

                // check for delimiter, and thus end of token
                final int delimLen = getDelimiterMatcher().isMatch(srcChars, pos, start, len);
                if (delimLen > 0) {
                    // return condition when end of token found
                    addToken(tokenList, workArea.substring(0, trimStart));
                    return pos + delimLen;
                }

                // check for quote, and thus back into quoting mode
                if (quoteLen > 0 && isQuote(srcChars, pos, len, quoteStart, quoteLen)) {
                    quoting = true;
                    pos += quoteLen;
                    continue;
                }

                // check for ignored (outside quotes), and ignore
                final int ignoredLen = getIgnoredMatcher().isMatch(srcChars, pos, start, len);
                if (ignoredLen > 0) {
                    pos += ignoredLen;
                    continue;
                }

                // check for trimmed character
                // don't yet know if it's at the end, so copy to workArea
                // use trimStart to keep track of trim at the end
                final int trimmedLen = getTrimmerMatcher().isMatch(srcChars, pos, start, len);
                if (trimmedLen > 0) {
                    workArea.append(srcChars, pos, trimmedLen);
                    pos += trimmedLen;
                    continue;
                }

                // copy regular character from outside quotes
                workArea.append(srcChars[pos++]);
                trimStart = workArea.size();
            }
        }

        // return condition when end of string found
        addToken(tokenList, workArea.substring(0, trimStart));
        return -1;
    }

    /**
     * Checks if the characters at the index specified match the quote
     * already matched in readNextToken().
     *
     * @param srcChars the character array being tokenized
     * @param pos the position to check for a quote
     * @param len the length of the character array being tokenized
     * @param quoteStart the start position of the matched quote, 0 if no quoting
     * @param quoteLen the length of the matched quote, 0 if no quoting
     * @return true if a quote is matched
     */
    private boolean isQuote(final char[] srcChars, final int pos, final int len, final int quoteStart, final int quoteLen) {
        for (int i = 0; i < quoteLen; i++) {
            if (pos + i >= len || srcChars[pos + i] != srcChars[quoteStart + i]) {
                return false;
            }
        }
        return true;
    }

    // Delimiter
    //-----------------------------------------------------------------------
    /**
     * Gets the field delimiter matcher.
     *
     * @return the delimiter matcher in use
     */
    public StrMatcher getDelimiterMatcher() {
        return this.delimMatcher;
    }

    /**
     * Sets the field delimiter matcher.
     * <p>
     * The delimiter is used to separate one token from another.
     *
     * @param delim the delimiter matcher to use
     * @return this, to enable chaining
     */
    public StrTokenizer setDelimiterMatcher(final StrMatcher delim) {
        if (delim == null) {
            this.delimMatcher = StrMatcher.noneMatcher();
        } else {
            this.delimMatcher = delim;
        }
        return this;
    }

    /**
     * Sets the field delimiter character.
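     * <p>
     * For example (illustrative): {@code new StrTokenizer("a:b:c").setDelimiterChar(':')}
     * yields the tokens "a", "b" and "c".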
     *
     * @param delim the delimiter character to use
     * @return this, to enable chaining
     */
    public StrTokenizer setDelimiterChar(final char delim) {
        return setDelimiterMatcher(StrMatcher.charMatcher(delim));
    }

    /**
     * Sets the field delimiter string.
     *
     * @param delim the delimiter string to use
     * @return this, to enable chaining
     */
    public StrTokenizer setDelimiterString(final String delim) {
        return setDelimiterMatcher(StrMatcher.stringMatcher(delim));
    }

    // Quote
    //-----------------------------------------------------------------------
    /**
     * Gets the quote matcher currently in use.
     * <p>
     * The quote character is used to wrap data between the tokens.
     * This enables delimiters to be entered as data.
     * The default is not to use quoting (the none matcher).
     *
     * @return the quote matcher in use
     */
    public StrMatcher getQuoteMatcher() {
        return quoteMatcher;
    }

    /**
     * Sets the quote matcher to use.
     * <p>
     * The quote character is used to wrap data between the tokens.
     * This enables delimiters to be entered as data.
     *
     * @param quote the quote matcher to use, null ignored
     * @return this, to enable chaining
     */
    public StrTokenizer setQuoteMatcher(final StrMatcher quote) {
        if (quote != null) {
            this.quoteMatcher = quote;
        }
        return this;
    }

    /**
     * Sets the quote character to use.
     * <p>
     * The quote character is used to wrap data between the tokens.
     * This enables delimiters to be entered as data.
     *
     * @param quote the quote character to use
     * @return this, to enable chaining
     */
    public StrTokenizer setQuoteChar(final char quote) {
        return setQuoteMatcher(StrMatcher.charMatcher(quote));
    }

    // Ignored
    //-----------------------------------------------------------------------
    /**
     * Gets the ignored character matcher.
     * <p>
     * These characters are ignored when parsing the String, unless they are
     * within a quoted region.
     * The default value is not to ignore anything.
     *
     * @return the ignored matcher in use
     */
    public StrMatcher getIgnoredMatcher() {
        return ignoredMatcher;
    }

    /**
     * Sets the matcher for characters to ignore.
     * <p>
     * These characters are ignored when parsing the String, unless they are
     * within a quoted region.
     *
     * @param ignored the ignored matcher to use, null ignored
     * @return this, to enable chaining
     */
    public StrTokenizer setIgnoredMatcher(final StrMatcher ignored) {
        if (ignored != null) {
            this.ignoredMatcher = ignored;
        }
        return this;
    }

    /**
     * Sets the character to ignore.
     * <p>
     * This character is ignored when parsing the String, unless it is
     * within a quoted region.
     *
     * @param ignored the ignored character to use
     * @return this, to enable chaining
     */
    public StrTokenizer setIgnoredChar(final char ignored) {
        return setIgnoredMatcher(StrMatcher.charMatcher(ignored));
    }

    // Trimmer
    //-----------------------------------------------------------------------
    /**
     * Gets the trimmer character matcher.
     * <p>
     * These characters are trimmed off on each side of the delimiter
     * until the token or quote is found.
     * The default value is not to trim anything.
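     * <p>
     * For example (illustrative), {@link #getCSVInstance()} uses
     * {@code StrMatcher.trimMatcher()} as the trimmer, so the input
     * {@code " a , b "} parses to the tokens "a" and "b".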
     *
     * @return the trimmer matcher in use
     */
    public StrMatcher getTrimmerMatcher() {
        return trimmerMatcher;
    }

    /**
     * Sets the matcher for characters to trim.
     * <p>
     * These characters are trimmed off on each side of the delimiter
     * until the token or quote is found.
     *
     * @param trimmer the trimmer matcher to use, null ignored
     * @return this, to enable chaining
     */
    public StrTokenizer setTrimmerMatcher(final StrMatcher trimmer) {
        if (trimmer != null) {
            this.trimmerMatcher = trimmer;
        }
        return this;
    }

    //-----------------------------------------------------------------------
    /**
     * Gets whether the tokenizer currently returns empty tokens as null.
     * The default for this property is false.
     *
     * @return true if empty tokens are returned as null
     */
    public boolean isEmptyTokenAsNull() {
        return this.emptyAsNull;
    }

    /**
     * Sets whether the tokenizer should return empty tokens as null.
     * The default for this property is false.
     *
     * @param emptyAsNull whether empty tokens are returned as null
     * @return this, to enable chaining
     */
    public StrTokenizer setEmptyTokenAsNull(final boolean emptyAsNull) {
        this.emptyAsNull = emptyAsNull;
        return this;
    }

    //-----------------------------------------------------------------------
    /**
     * Gets whether the tokenizer currently ignores empty tokens.
     * The default for this property is true.
     *
     * @return true if empty tokens are not returned
     */
    public boolean isIgnoreEmptyTokens() {
        return ignoreEmptyTokens;
    }

    /**
     * Sets whether the tokenizer should ignore and not return empty tokens.
     * The default for this property is true.
     *
     * @param ignoreEmptyTokens whether empty tokens are not returned
     * @return this, to enable chaining
     */
    public StrTokenizer setIgnoreEmptyTokens(final boolean ignoreEmptyTokens) {
        this.ignoreEmptyTokens = ignoreEmptyTokens;
        return this;
    }

    //-----------------------------------------------------------------------
    /**
     * Gets the String content that the tokenizer is parsing.
     *
     * @return the string content being parsed
     */
    public String getContent() {
        if (chars == null) {
            return null;
        }
        return new String(chars);
    }

    //-----------------------------------------------------------------------
    /**
     * Creates a new instance of this Tokenizer. The new instance is reset so
     * that it will be at the start of the token list.
     * If a {@link CloneNotSupportedException} is caught, return {@code null}.
     *
     * @return a new instance of this Tokenizer which has been reset.
     */
    @Override
    public Object clone() {
        try {
            return cloneReset();
        } catch (final CloneNotSupportedException ex) {
            return null;
        }
    }

    /**
     * Creates a new instance of this Tokenizer. The new instance is reset so that
     * it will be at the start of the token list.
     *
     * @return a new instance of this Tokenizer which has been reset.
     * @throws CloneNotSupportedException if there is a problem cloning
     */
    Object cloneReset() throws CloneNotSupportedException {
        // this method exists to enable 100% test coverage
        final StrTokenizer cloned = (StrTokenizer) super.clone();
        if (cloned.chars != null) {
            cloned.chars = cloned.chars.clone();
        }
        cloned.reset();
        return cloned;
    }

    //-----------------------------------------------------------------------
    /**
     * Returns a string representation of this tokenizer, including the list of
     * parsed tokens if tokenization has already been performed.
     *
     * @return a string representation of this tokenizer
     */
    @Override
    public String toString() {
        if (tokens == null) {
            return "StrTokenizer[not tokenized yet]";
        }
        return "StrTokenizer" + getTokenList();
    }

}