1 /* 2 * Licensed to the Apache Software Foundation (ASF) under one or more 3 * contributor license agreements. See the NOTICE file distributed with 4 * this work for additional information regarding copyright ownership. 5 * The ASF licenses this file to You under the Apache License, Version 2.0 6 * (the "License"); you may not use this file except in compliance with 7 * the License. You may obtain a copy of the License at 8 * 9 * http://www.apache.org/licenses/LICENSE-2.0 10 * 11 * Unless required by applicable law or agreed to in writing, software 12 * distributed under the License is distributed on an "AS IS" BASIS, 13 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 * See the License for the specific language governing permissions and 15 * limitations under the License. 16 */ 17 18 package org.apache.xerces.impl.xpath.regex; 19 20 import java.util.Vector; 21 import java.util.Hashtable; 22 23 /** 24 * This class represents a node in parse tree. 25 * 26 * @xerces.internal 27 * 28 * @version $Id: Token.java 572108 2007-09-02 18:48:31Z mrglavas $ 29 */ 30 class Token implements java.io.Serializable { 31 32 private static final long serialVersionUID = 8484976002585487481L; 33 34 static final boolean COUNTTOKENS = true; 35 static int tokens = 0; 36 37 static final int CHAR = 0; // Literal char 38 static final int DOT = 11; // . 39 static final int CONCAT = 1; // XY 40 static final int UNION = 2; // X|Y|Z 41 static final int CLOSURE = 3; // X* 42 static final int RANGE = 4; // [a-zA-Z] etc. 43 static final int NRANGE = 5; // [^a-zA-Z] etc. 44 static final int PAREN = 6; // (X) or (?:X) 45 static final int EMPTY = 7; // 46 static final int ANCHOR = 8; // ^ $ \b \B \< \> \A \Z \z 47 static final int NONGREEDYCLOSURE = 9; // *? +? 48 static final int STRING = 10; // strings 49 static final int BACKREFERENCE = 12; // back references 50 static final int LOOKAHEAD = 20; // (?=...) 51 static final int NEGATIVELOOKAHEAD = 21; // (?!...) 52 static final int LOOKBEHIND = 22; // (?<=...) 53 static final int NEGATIVELOOKBEHIND = 23; // (?<!...) 54 static final int INDEPENDENT = 24; // (?>...) 55 static final int MODIFIERGROUP = 25; // (?ims-ims:...) 56 static final int CONDITION = 26; // (?(...)yes|no) 57 58 static final int UTF16_MAX = 0x10ffff; 59 60 final int type; 61 62 static Token token_dot; 63 static Token token_0to9; 64 static Token token_wordchars; 65 static Token token_not_0to9; 66 static Token token_not_wordchars; 67 static Token token_spaces; 68 static Token token_not_spaces; 69 static Token token_empty; 70 static Token token_linebeginning; 71 static Token token_linebeginning2; 72 static Token token_lineend; 73 static Token token_stringbeginning; 74 static Token token_stringend; 75 static Token token_stringend2; 76 static Token token_wordedge; 77 static Token token_not_wordedge; 78 static Token token_wordbeginning; 79 static Token token_wordend; 80 static { 81 Token.token_empty = new Token(Token.EMPTY); 82 83 Token.token_linebeginning = Token.createAnchor('^'); 84 Token.token_linebeginning2 = Token.createAnchor('@'); 85 Token.token_lineend = Token.createAnchor('$'); 86 Token.token_stringbeginning = Token.createAnchor('A'); 87 Token.token_stringend = Token.createAnchor('z'); 88 Token.token_stringend2 = Token.createAnchor('Z'); 89 Token.token_wordedge = Token.createAnchor('b'); 90 Token.token_not_wordedge = Token.createAnchor('B'); 91 Token.token_wordbeginning = Token.createAnchor('<'); 92 Token.token_wordend = Token.createAnchor('>'); 93 94 Token.token_dot = new Token(Token.DOT); 95 96 Token.token_0to9 = Token.createRange(); 97 Token.token_0to9.addRange('0', '9'); 98 Token.token_wordchars = Token.createRange(); 99 Token.token_wordchars.addRange('0', '9'); 100 Token.token_wordchars.addRange('A', 'Z'); 101 Token.token_wordchars.addRange('_', '_'); 102 Token.token_wordchars.addRange('a', 'z'); 103 Token.token_spaces = Token.createRange(); 104 Token.token_spaces.addRange('\t', '\t'); 105 Token.token_spaces.addRange('\n', '\n'); 106 Token.token_spaces.addRange('\f', '\f'); 107 Token.token_spaces.addRange('\r', '\r'); 108 Token.token_spaces.addRange(' ', ' '); 109 110 Token.token_not_0to9 = Token.complementRanges(Token.token_0to9); 111 Token.token_not_wordchars = Token.complementRanges(Token.token_wordchars); 112 Token.token_not_spaces = Token.complementRanges(Token.token_spaces); 113 } 114 115 static Token.ParenToken createLook(int type, Token child) { 116 if (COUNTTOKENS) Token.tokens ++; 117 return new Token.ParenToken(type, child, 0); 118 } 119 static Token.ParenToken createParen(Token child, int pnumber) { 120 if (COUNTTOKENS) Token.tokens ++; 121 return new Token.ParenToken(Token.PAREN, child, pnumber); 122 } 123 static Token.ClosureToken createClosure(Token tok) { 124 if (COUNTTOKENS) Token.tokens ++; 125 return new Token.ClosureToken(Token.CLOSURE, tok); 126 } 127 static Token.ClosureToken createNGClosure(Token tok) { 128 if (COUNTTOKENS) Token.tokens ++; 129 return new Token.ClosureToken(Token.NONGREEDYCLOSURE, tok); 130 } 131 static Token.ConcatToken createConcat(Token tok1, Token tok2) { 132 if (COUNTTOKENS) Token.tokens ++; 133 return new Token.ConcatToken(tok1, tok2); 134 } 135 static Token.UnionToken createConcat() { 136 if (COUNTTOKENS) Token.tokens ++; 137 return new Token.UnionToken(Token.CONCAT); // *** It is not a bug. 138 } 139 static Token.UnionToken createUnion() { 140 if (COUNTTOKENS) Token.tokens ++; 141 return new Token.UnionToken(Token.UNION); 142 } 143 static Token createEmpty() { 144 return Token.token_empty; 145 } 146 static RangeToken createRange() { 147 if (COUNTTOKENS) Token.tokens ++; 148 return new RangeToken(Token.RANGE); 149 } 150 static RangeToken createNRange() { 151 if (COUNTTOKENS) Token.tokens ++; 152 return new RangeToken(Token.NRANGE); 153 } 154 static Token.CharToken createChar(int ch) { 155 if (COUNTTOKENS) Token.tokens ++; 156 return new Token.CharToken(Token.CHAR, ch); 157 } 158 static private Token.CharToken createAnchor(int ch) { 159 if (COUNTTOKENS) Token.tokens ++; 160 return new Token.CharToken(Token.ANCHOR, ch); 161 } 162 static Token.StringToken createBackReference(int refno) { 163 if (COUNTTOKENS) Token.tokens ++; 164 return new Token.StringToken(Token.BACKREFERENCE, null, refno); 165 } 166 static Token.StringToken createString(String str) { 167 if (COUNTTOKENS) Token.tokens ++; 168 return new Token.StringToken(Token.STRING, str, 0); 169 } 170 static Token.ModifierToken createModifierGroup(Token child, int add, int mask) { 171 if (COUNTTOKENS) Token.tokens ++; 172 return new Token.ModifierToken(child, add, mask); 173 } 174 static Token.ConditionToken createCondition(int refno, Token condition, 175 Token yespat, Token nopat) { 176 if (COUNTTOKENS) Token.tokens ++; 177 return new Token.ConditionToken(refno, condition, yespat, nopat); 178 } 179 180 protected Token(int type) { 181 this.type = type; 182 } 183 184 /** 185 * A number of children. 186 */ 187 int size() { 188 return 0; 189 } 190 Token getChild(int index) { 191 return null; 192 } 193 void addChild(Token tok) { 194 throw new RuntimeException("Not supported."); 195 } 196 197 // for RANGE or NRANGE 198 protected void addRange(int start, int end) { 199 throw new RuntimeException("Not supported."); 200 } 201 protected void sortRanges() { 202 throw new RuntimeException("Not supported."); 203 } 204 protected void compactRanges() { 205 throw new RuntimeException("Not supported."); 206 } 207 protected void mergeRanges(Token tok) { 208 throw new RuntimeException("Not supported."); 209 } 210 protected void subtractRanges(Token tok) { 211 throw new RuntimeException("Not supported."); 212 } 213 protected void intersectRanges(Token tok) { 214 throw new RuntimeException("Not supported."); 215 } 216 static Token complementRanges(Token tok) { 217 return RangeToken.complementRanges(tok); 218 } 219 220 221 void setMin(int min) { // for CLOSURE 222 } 223 void setMax(int max) { // for CLOSURE 224 } 225 int getMin() { // for CLOSURE 226 return -1; 227 } 228 int getMax() { // for CLOSURE 229 return -1; 230 } 231 int getReferenceNumber() { // for STRING 232 return 0; 233 } 234 String getString() { // for STRING 235 return null; 236 } 237 238 int getParenNumber() { 239 return 0; 240 } 241 int getChar() { 242 return -1; 243 } 244 245 public String toString() { 246 return this.toString(0); 247 } 248 public String toString(int options) { 249 return this.type == Token.DOT ? "." : ""; 250 } 251 252 /** 253 * How many characters are needed? 254 */ 255 final int getMinLength() { 256 switch (this.type) { 257 case CONCAT: 258 int sum = 0; 259 for (int i = 0; i < this.size(); i ++) 260 sum += this.getChild(i).getMinLength(); 261 return sum; 262 263 case CONDITION: 264 case UNION: 265 if (this.size() == 0) 266 return 0; 267 int ret = this.getChild(0).getMinLength(); 268 for (int i = 1; i < this.size(); i ++) { 269 int min = this.getChild(i).getMinLength(); 270 if (min < ret) ret = min; 271 } 272 return ret; 273 274 case CLOSURE: 275 case NONGREEDYCLOSURE: 276 if (this.getMin() >= 0) 277 return this.getMin() * this.getChild(0).getMinLength(); 278 return 0; 279 280 case EMPTY: 281 case ANCHOR: 282 return 0; 283 284 case DOT: 285 case CHAR: 286 case RANGE: 287 case NRANGE: 288 return 1; 289 290 case INDEPENDENT: 291 case PAREN: 292 case MODIFIERGROUP: 293 return this.getChild(0).getMinLength(); 294 295 case BACKREFERENCE: 296 return 0; // ******* 297 298 case STRING: 299 return this.getString().length(); 300 301 case LOOKAHEAD: 302 case NEGATIVELOOKAHEAD: 303 case LOOKBEHIND: 304 case NEGATIVELOOKBEHIND: 305 return 0; // ***** Really? 306 307 default: 308 throw new RuntimeException("Token#getMinLength(): Invalid Type: "+this.type); 309 } 310 } 311 312 final int getMaxLength() { 313 switch (this.type) { 314 case CONCAT: 315 int sum = 0; 316 for (int i = 0; i < this.size(); i ++) { 317 int d = this.getChild(i).getMaxLength(); 318 if (d < 0) return -1; 319 sum += d; 320 } 321 return sum; 322 323 case CONDITION: 324 case UNION: 325 if (this.size() == 0) 326 return 0; 327 int ret = this.getChild(0).getMaxLength(); 328 for (int i = 1; ret >= 0 && i < this.size(); i ++) { 329 int max = this.getChild(i).getMaxLength(); 330 if (max < 0) { // infinity 331 ret = -1; 332 break; 333 } 334 if (max > ret) ret = max; 335 } 336 return ret; 337 338 case CLOSURE: 339 case NONGREEDYCLOSURE: 340 if (this.getMax() >= 0) 341 // When this.child.getMaxLength() < 0, 342 // this returns minus value 343 return this.getMax() * this.getChild(0).getMaxLength(); 344 return -1; 345 346 case EMPTY: 347 case ANCHOR: 348 return 0; 349 350 case CHAR: 351 return 1; 352 case DOT: 353 case RANGE: 354 case NRANGE: 355 return 2; 356 357 case INDEPENDENT: 358 case PAREN: 359 case MODIFIERGROUP: 360 return this.getChild(0).getMaxLength(); 361 362 case BACKREFERENCE: 363 return -1; // ****** 364 365 case STRING: 366 return this.getString().length(); 367 368 case LOOKAHEAD: 369 case NEGATIVELOOKAHEAD: 370 case LOOKBEHIND: 371 case NEGATIVELOOKBEHIND: 372 return 0; // ***** Really? 373 374 default: 375 throw new RuntimeException("Token#getMaxLength(): Invalid Type: "+this.type); 376 } 377 } 378 379 static final int FC_CONTINUE = 0; 380 static final int FC_TERMINAL = 1; 381 static final int FC_ANY = 2; 382 private static final boolean isSet(int options, int flag) { 383 return (options & flag) == flag; 384 } 385 final int analyzeFirstCharacter(RangeToken result, int options) { 386 switch (this.type) { 387 case CONCAT: 388 int ret = FC_CONTINUE; 389 for (int i = 0; i < this.size(); i ++) 390 if ((ret = this.getChild(i).analyzeFirstCharacter(result, options)) != FC_CONTINUE) 391 break; 392 return ret; 393 394 case UNION: 395 if (this.size() == 0) 396 return FC_CONTINUE; 397 /* 398 * a|b|c -> FC_TERMINAL 399 * a|.|c -> FC_ANY 400 * a|b| -> FC_CONTINUE 401 */ 402 int ret2 = FC_CONTINUE; 403 boolean hasEmpty = false; 404 for (int i = 0; i < this.size(); i ++) { 405 ret2 = this.getChild(i).analyzeFirstCharacter(result, options); 406 if (ret2 == FC_ANY) 407 break; 408 else if (ret2 == FC_CONTINUE) 409 hasEmpty = true; 410 } 411 return hasEmpty ? FC_CONTINUE : ret2; 412 413 case CONDITION: 414 int ret3 = this.getChild(0).analyzeFirstCharacter(result, options); 415 if (this.size() == 1) return FC_CONTINUE; 416 if (ret3 == FC_ANY) return ret3; 417 int ret4 = this.getChild(1).analyzeFirstCharacter(result, options); 418 if (ret4 == FC_ANY) return ret4; 419 return ret3 == FC_CONTINUE || ret4 == FC_CONTINUE ? FC_CONTINUE : FC_TERMINAL; 420 421 case CLOSURE: 422 case NONGREEDYCLOSURE: 423 this.getChild(0).analyzeFirstCharacter(result, options); 424 return FC_CONTINUE; 425 426 case EMPTY: 427 case ANCHOR: 428 return FC_CONTINUE; 429 430 case CHAR: 431 int ch = this.getChar(); 432 result.addRange(ch, ch); 433 if (ch < 0x10000 && isSet(options, RegularExpression.IGNORE_CASE)) { 434 ch = Character.toUpperCase((char)ch); 435 result.addRange(ch, ch); 436 ch = Character.toLowerCase((char)ch); 437 result.addRange(ch, ch); 438 } 439 return FC_TERMINAL; 440 441 case DOT: 442 return FC_ANY; 443 444 case RANGE: 445 if (isSet(options, RegularExpression.IGNORE_CASE)) { 446 result.mergeRanges(((RangeToken)this).getCaseInsensitiveToken()); 447 } else { 448 result.mergeRanges(this); 449 } 450 return FC_TERMINAL; 451 452 case NRANGE: // **** 453 if (isSet(options, RegularExpression.IGNORE_CASE)) { 454 result.mergeRanges(Token.complementRanges(((RangeToken)this).getCaseInsensitiveToken())); 455 } else { 456 result.mergeRanges(Token.complementRanges(this)); 457 } 458 return FC_TERMINAL; 459 460 case INDEPENDENT: 461 case PAREN: 462 return this.getChild(0).analyzeFirstCharacter(result, options); 463 464 case MODIFIERGROUP: 465 options |= ((ModifierToken)this).getOptions(); 466 options &= ~((ModifierToken)this).getOptionsMask(); 467 return this.getChild(0).analyzeFirstCharacter(result, options); 468 469 case BACKREFERENCE: 470 result.addRange(0, UTF16_MAX); // **** We can not optimize. 471 return FC_ANY; 472 473 case STRING: 474 int cha = this.getString().charAt(0); 475 int ch2; 476 if (REUtil.isHighSurrogate(cha) 477 && this.getString().length() >= 2 478 && REUtil.isLowSurrogate((ch2 = this.getString().charAt(1)))) 479 cha = REUtil.composeFromSurrogates(cha, ch2); 480 result.addRange(cha, cha); 481 if (cha < 0x10000 && isSet(options, RegularExpression.IGNORE_CASE)) { 482 cha = Character.toUpperCase((char)cha); 483 result.addRange(cha, cha); 484 cha = Character.toLowerCase((char)cha); 485 result.addRange(cha, cha); 486 } 487 return FC_TERMINAL; 488 489 case LOOKAHEAD: 490 case NEGATIVELOOKAHEAD: 491 case LOOKBEHIND: 492 case NEGATIVELOOKBEHIND: 493 return FC_CONTINUE; 494 495 default: 496 throw new RuntimeException("Token#analyzeHeadCharacter(): Invalid Type: "+this.type); 497 } 498 } 499 500 private final boolean isShorterThan(Token tok) { 501 if (tok == null) return false; 502 /* 503 int mylength; 504 if (this.type == STRING) mylength = this.getString().length(); 505 else if (this.type == CHAR) mylength = this.getChar() >= 0x10000 ? 2 : 1; 506 else throw new RuntimeException("Internal Error: Illegal type: "+this.type); 507 int otherlength; 508 if (tok.type == STRING) otherlength = tok.getString().length(); 509 else if (tok.type == CHAR) otherlength = tok.getChar() >= 0x10000 ? 2 : 1; 510 else throw new RuntimeException("Internal Error: Illegal type: "+tok.type); 511 */ 512 int mylength; 513 if (this.type == STRING) mylength = this.getString().length(); 514 else throw new RuntimeException("Internal Error: Illegal type: "+this.type); 515 int otherlength; 516 if (tok.type == STRING) otherlength = tok.getString().length(); 517 else throw new RuntimeException("Internal Error: Illegal type: "+tok.type); 518 return mylength < otherlength; 519 } 520 521 static class FixedStringContainer { 522 Token token = null; 523 int options = 0; 524 FixedStringContainer() { 525 } 526 } 527 528 final void findFixedString(FixedStringContainer container, int options) { 529 switch (this.type) { 530 case CONCAT: 531 Token prevToken = null; 532 int prevOptions = 0; 533 for (int i = 0; i < this.size(); i ++) { 534 this.getChild(i).findFixedString(container, options); 535 if (prevToken == null || prevToken.isShorterThan(container.token)) { 536 prevToken = container.token; 537 prevOptions = container.options; 538 } 539 } 540 container.token = prevToken; 541 container.options = prevOptions; 542 return; 543 544 case UNION: 545 case CLOSURE: 546 case NONGREEDYCLOSURE: 547 case EMPTY: 548 case ANCHOR: 549 case RANGE: 550 case DOT: 551 case NRANGE: 552 case BACKREFERENCE: 553 case LOOKAHEAD: 554 case NEGATIVELOOKAHEAD: 555 case LOOKBEHIND: 556 case NEGATIVELOOKBEHIND: 557 case CONDITION: 558 container.token = null; 559 return; 560 561 case CHAR: // Ignore CHAR tokens. 562 container.token = null; // ** 563 return; // ** 564 565 case STRING: 566 container.token = this; 567 container.options = options; 568 return; 569 570 case INDEPENDENT: 571 case PAREN: 572 this.getChild(0).findFixedString(container, options); 573 return; 574 575 case MODIFIERGROUP: 576 options |= ((ModifierToken)this).getOptions(); 577 options &= ~((ModifierToken)this).getOptionsMask(); 578 this.getChild(0).findFixedString(container, options); 579 return; 580 581 default: 582 throw new RuntimeException("Token#findFixedString(): Invalid Type: "+this.type); 583 } 584 } 585 586 boolean match(int ch) { 587 throw new RuntimeException("NFAArrow#match(): Internal error: "+this.type); 588 } 589 590 // ------------------------------------------------------ 591 private final static Hashtable categories = new Hashtable(); 592 private final static Hashtable categories2 = new Hashtable(); 593 private static final String[] categoryNames = { 594 "Cn", "Lu", "Ll", "Lt", "Lm", "Lo", "Mn", "Me", "Mc", "Nd", 595 "Nl", "No", "Zs", "Zl", "Zp", "Cc", "Cf", null, "Co", "Cs", 596 "Pd", "Ps", "Pe", "Pc", "Po", "Sm", "Sc", "Sk", "So", // 28 597 "Pi", "Pf", // 29, 30 598 "L", "M", "N", "Z", "C", "P", "S", // 31-37 599 }; 600 601 // Schema Rec. {Datatypes} - Punctuation 602 static final int CHAR_INIT_QUOTE = 29; // Pi - initial quote 603 static final int CHAR_FINAL_QUOTE = 30; // Pf - final quote 604 static final int CHAR_LETTER = 31; 605 static final int CHAR_MARK = 32; 606 static final int CHAR_NUMBER = 33; 607 static final int CHAR_SEPARATOR = 34; 608 static final int CHAR_OTHER = 35; 609 static final int CHAR_PUNCTUATION = 36; 610 static final int CHAR_SYMBOL = 37; 611 612 //blockNames in UNICODE 3.1 that supported by XML Schema REC 613 private static final String[] blockNames = { 614 /*0000..007F;*/ "Basic Latin", 615 /*0080..00FF;*/ "Latin-1 Supplement", 616 /*0100..017F;*/ "Latin Extended-A", 617 /*0180..024F;*/ "Latin Extended-B", 618 /*0250..02AF;*/ "IPA Extensions", 619 /*02B0..02FF;*/ "Spacing Modifier Letters", 620 /*0300..036F;*/ "Combining Diacritical Marks", 621 /*0370..03FF;*/ "Greek", 622 /*0400..04FF;*/ "Cyrillic", 623 /*0530..058F;*/ "Armenian", 624 /*0590..05FF;*/ "Hebrew", 625 /*0600..06FF;*/ "Arabic", 626 /*0700..074F;*/ "Syriac", 627 /*0780..07BF;*/ "Thaana", 628 /*0900..097F;*/ "Devanagari", 629 /*0980..09FF;*/ "Bengali", 630 /*0A00..0A7F;*/ "Gurmukhi", 631 /*0A80..0AFF;*/ "Gujarati", 632 /*0B00..0B7F;*/ "Oriya", 633 /*0B80..0BFF;*/ "Tamil", 634 /*0C00..0C7F;*/ "Telugu", 635 /*0C80..0CFF;*/ "Kannada", 636 /*0D00..0D7F;*/ "Malayalam", 637 /*0D80..0DFF;*/ "Sinhala", 638 /*0E00..0E7F;*/ "Thai", 639 /*0E80..0EFF;*/ "Lao", 640 /*0F00..0FFF;*/ "Tibetan", 641 /*1000..109F;*/ "Myanmar", 642 /*10A0..10FF;*/ "Georgian", 643 /*1100..11FF;*/ "Hangul Jamo", 644 /*1200..137F;*/ "Ethiopic", 645 /*13A0..13FF;*/ "Cherokee", 646 /*1400..167F;*/ "Unified Canadian Aboriginal Syllabics", 647 /*1680..169F;*/ "Ogham", 648 /*16A0..16FF;*/ "Runic", 649 /*1780..17FF;*/ "Khmer", 650 /*1800..18AF;*/ "Mongolian", 651 /*1E00..1EFF;*/ "Latin Extended Additional", 652 /*1F00..1FFF;*/ "Greek Extended", 653 /*2000..206F;*/ "General Punctuation", 654 /*2070..209F;*/ "Superscripts and Subscripts", 655 /*20A0..20CF;*/ "Currency Symbols", 656 /*20D0..20FF;*/ "Combining Marks for Symbols", 657 /*2100..214F;*/ "Letterlike Symbols", 658 /*2150..218F;*/ "Number Forms", 659 /*2190..21FF;*/ "Arrows", 660 /*2200..22FF;*/ "Mathematical Operators", 661 /*2300..23FF;*/ "Miscellaneous Technical", 662 /*2400..243F;*/ "Control Pictures", 663 /*2440..245F;*/ "Optical Character Recognition", 664 /*2460..24FF;*/ "Enclosed Alphanumerics", 665 /*2500..257F;*/ "Box Drawing", 666 /*2580..259F;*/ "Block Elements", 667 /*25A0..25FF;*/ "Geometric Shapes", 668 /*2600..26FF;*/ "Miscellaneous Symbols", 669 /*2700..27BF;*/ "Dingbats", 670 /*2800..28FF;*/ "Braille Patterns", 671 /*2E80..2EFF;*/ "CJK Radicals Supplement", 672 /*2F00..2FDF;*/ "Kangxi Radicals", 673 /*2FF0..2FFF;*/ "Ideographic Description Characters", 674 /*3000..303F;*/ "CJK Symbols and Punctuation", 675 /*3040..309F;*/ "Hiragana", 676 /*30A0..30FF;*/ "Katakana", 677 /*3100..312F;*/ "Bopomofo", 678 /*3130..318F;*/ "Hangul Compatibility Jamo", 679 /*3190..319F;*/ "Kanbun", 680 /*31A0..31BF;*/ "Bopomofo Extended", 681 /*3200..32FF;*/ "Enclosed CJK Letters and Months", 682 /*3300..33FF;*/ "CJK Compatibility", 683 /*3400..4DB5;*/ "CJK Unified Ideographs Extension A", 684 /*4E00..9FFF;*/ "CJK Unified Ideographs", 685 /*A000..A48F;*/ "Yi Syllables", 686 /*A490..A4CF;*/ "Yi Radicals", 687 /*AC00..D7A3;*/ "Hangul Syllables", 688 /*E000..F8FF;*/ "Private Use", 689 /*F900..FAFF;*/ "CJK Compatibility Ideographs", 690 /*FB00..FB4F;*/ "Alphabetic Presentation Forms", 691 /*FB50..FDFF;*/ "Arabic Presentation Forms-A", 692 /*FE20..FE2F;*/ "Combining Half Marks", 693 /*FE30..FE4F;*/ "CJK Compatibility Forms", 694 /*FE50..FE6F;*/ "Small Form Variants", 695 /*FE70..FEFE;*/ "Arabic Presentation Forms-B", 696 /*FEFF..FEFF;*/ "Specials", 697 /*FF00..FFEF;*/ "Halfwidth and Fullwidth Forms", 698 //missing Specials add manually 699 /*10300..1032F;*/ "Old Italic", // 84 700 /*10330..1034F;*/ "Gothic", 701 /*10400..1044F;*/ "Deseret", 702 /*1D000..1D0FF;*/ "Byzantine Musical Symbols", 703 /*1D100..1D1FF;*/ "Musical Symbols", 704 /*1D400..1D7FF;*/ "Mathematical Alphanumeric Symbols", 705 /*20000..2A6D6;*/ "CJK Unified Ideographs Extension B", 706 /*2F800..2FA1F;*/ "CJK Compatibility Ideographs Supplement", 707 /*E0000..E007F;*/ "Tags", 708 //missing 2 private use add manually 709 710 }; 711 //ADD THOSE MANUALLY 712 //F0000..FFFFD; "Private Use", 713 //100000..10FFFD; "Private Use" 714 //FFF0..FFFD; "Specials", 715 static final String blockRanges = 716 "\u0000\u007F\u0080\u00FF\u0100\u017F\u0180\u024F\u0250\u02AF\u02B0\u02FF\u0300\u036F" 717 +"\u0370\u03FF\u0400\u04FF\u0530\u058F\u0590\u05FF\u0600\u06FF\u0700\u074F\u0780\u07BF" 718 +"\u0900\u097F\u0980\u09FF\u0A00\u0A7F\u0A80\u0AFF\u0B00\u0B7F\u0B80\u0BFF\u0C00\u0C7F\u0C80\u0CFF" 719 +"\u0D00\u0D7F\u0D80\u0DFF\u0E00\u0E7F\u0E80\u0EFF\u0F00\u0FFF\u1000\u109F\u10A0\u10FF\u1100\u11FF" 720 +"\u1200\u137F\u13A0\u13FF\u1400\u167F\u1680\u169F\u16A0\u16FF\u1780\u17FF\u1800\u18AF\u1E00\u1EFF" 721 +"\u1F00\u1FFF\u2000\u206F\u2070\u209F\u20A0\u20CF\u20D0\u20FF\u2100\u214F\u2150\u218F\u2190\u21FF\u2200\u22FF" 722 +"\u2300\u23FF\u2400\u243F\u2440\u245F\u2460\u24FF\u2500\u257F\u2580\u259F\u25A0\u25FF\u2600\u26FF\u2700\u27BF" 723 +"\u2800\u28FF\u2E80\u2EFF\u2F00\u2FDF\u2FF0\u2FFF\u3000\u303F\u3040\u309F\u30A0\u30FF\u3100\u312F\u3130\u318F" 724 +"\u3190\u319F\u31A0\u31BF\u3200\u32FF\u3300\u33FF\u3400\u4DB5\u4E00\u9FFF\uA000\uA48F\uA490\uA4CF" 725 +"\uAC00\uD7A3\uE000\uF8FF\uF900\uFAFF\uFB00\uFB4F\uFB50\uFDFF" 726 +"\uFE20\uFE2F\uFE30\uFE4F\uFE50\uFE6F\uFE70\uFEFE\uFEFF\uFEFF\uFF00\uFFEF"; 727 static final int[] nonBMPBlockRanges = { 728 0x10300, 0x1032F, // 84 729 0x10330, 0x1034F, 730 0x10400, 0x1044F, 731 0x1D000, 0x1D0FF, 732 0x1D100, 0x1D1FF, 733 0x1D400, 0x1D7FF, 734 0x20000, 0x2A6D6, 735 0x2F800, 0x2FA1F, 736 0xE0000, 0xE007F 737 }; 738 private static final int NONBMP_BLOCK_START = 84; 739 740 static protected RangeToken getRange(String name, boolean positive) { 741 if (Token.categories.size() == 0) { 742 synchronized (Token.categories) { 743 Token[] ranges = new Token[Token.categoryNames.length]; 744 for (int i = 0; i < ranges.length; i ++) { 745 ranges[i] = Token.createRange(); 746 } 747 int type; 748 for (int i = 0; i < 0x10000; i ++) { 749 type = Character.getType((char)i); 750 if (type == Character.START_PUNCTUATION || 751 type == Character.END_PUNCTUATION) { 752 //build table of Pi values 753 if (i == 0x00AB || i == 0x2018 || i == 0x201B || i == 0x201C || 754 i == 0x201F || i == 0x2039) { 755 type = CHAR_INIT_QUOTE; 756 } 757 //build table of Pf values 758 if (i == 0x00BB || i == 0x2019 || i == 0x201D || i == 0x203A ) { 759 type = CHAR_FINAL_QUOTE; 760 } 761 } 762 ranges[type].addRange(i, i); 763 switch (type) { 764 case Character.UPPERCASE_LETTER: 765 case Character.LOWERCASE_LETTER: 766 case Character.TITLECASE_LETTER: 767 case Character.MODIFIER_LETTER: 768 case Character.OTHER_LETTER: 769 type = CHAR_LETTER; 770 break; 771 case Character.NON_SPACING_MARK: 772 case Character.COMBINING_SPACING_MARK: 773 case Character.ENCLOSING_MARK: 774 type = CHAR_MARK; 775 break; 776 case Character.DECIMAL_DIGIT_NUMBER: 777 case Character.LETTER_NUMBER: 778 case Character.OTHER_NUMBER: 779 type = CHAR_NUMBER; 780 break; 781 case Character.SPACE_SEPARATOR: 782 case Character.LINE_SEPARATOR: 783 case Character.PARAGRAPH_SEPARATOR: 784 type = CHAR_SEPARATOR; 785 break; 786 case Character.CONTROL: 787 case Character.FORMAT: 788 case Character.SURROGATE: 789 case Character.PRIVATE_USE: 790 case Character.UNASSIGNED: 791 type = CHAR_OTHER; 792 break; 793 case Character.CONNECTOR_PUNCTUATION: 794 case Character.DASH_PUNCTUATION: 795 case Character.START_PUNCTUATION: 796 case Character.END_PUNCTUATION: 797 case CHAR_INIT_QUOTE: 798 case CHAR_FINAL_QUOTE: 799 case Character.OTHER_PUNCTUATION: 800 type = CHAR_PUNCTUATION; 801 break; 802 case Character.MATH_SYMBOL: 803 case Character.CURRENCY_SYMBOL: 804 case Character.MODIFIER_SYMBOL: 805 case Character.OTHER_SYMBOL: 806 type = CHAR_SYMBOL; 807 break; 808 default: 809 throw new RuntimeException("org.apache.xerces.utils.regex.Token#getRange(): Unknown Unicode category: "+type); 810 } 811 ranges[type].addRange(i, i); 812 } // for all characters 813 ranges[Character.UNASSIGNED].addRange(0x10000, Token.UTF16_MAX); 814 815 for (int i = 0; i < ranges.length; i ++) { 816 if (Token.categoryNames[i] != null) { 817 if (i == Character.UNASSIGNED) { // Unassigned 818 ranges[i].addRange(0x10000, Token.UTF16_MAX); 819 } 820 Token.categories.put(Token.categoryNames[i], ranges[i]); 821 Token.categories2.put(Token.categoryNames[i], 822 Token.complementRanges(ranges[i])); 823 } 824 } 825 //REVISIT: do we really need to support block names as in Unicode 3.1 826 // or we can just create all the names in IsBLOCKNAME format (XML Schema REC)? 827 // 828 StringBuffer buffer = new StringBuffer(50); 829 for (int i = 0; i < Token.blockNames.length; i ++) { 830 Token r1 = Token.createRange(); 831 int location; 832 if (i < NONBMP_BLOCK_START) { 833 location = i*2; 834 int rstart = Token.blockRanges.charAt(location); 835 int rend = Token.blockRanges.charAt(location+1); 836 //DEBUGING 837 //System.out.println(n+" " +Integer.toHexString(rstart) 838 // +"-"+ Integer.toHexString(rend)); 839 r1.addRange(rstart, rend); 840 } else { 841 location = (i - NONBMP_BLOCK_START) * 2; 842 r1.addRange(Token.nonBMPBlockRanges[location], 843 Token.nonBMPBlockRanges[location + 1]); 844 } 845 String n = Token.blockNames[i]; 846 if (n.equals("Specials")) 847 r1.addRange(0xfff0, 0xfffd); 848 if (n.equals("Private Use")) { 849 r1.addRange(0xF0000,0xFFFFD); 850 r1.addRange(0x100000,0x10FFFD); 851 } 852 Token.categories.put(n, r1); 853 Token.categories2.put(n, Token.complementRanges(r1)); 854 buffer.setLength(0); 855 buffer.append("Is"); 856 if (n.indexOf(' ') >= 0) { 857 for (int ci = 0; ci < n.length(); ci ++) 858 if (n.charAt(ci) != ' ') buffer.append((char)n.charAt(ci)); 859 } 860 else { 861 buffer.append(n); 862 } 863 Token.setAlias(buffer.toString(), n, true); 864 } 865 866 // TR#18 1.2 867 Token.setAlias("ASSIGNED", "Cn", false); 868 Token.setAlias("UNASSIGNED", "Cn", true); 869 Token all = Token.createRange(); 870 all.addRange(0, Token.UTF16_MAX); 871 Token.categories.put("ALL", all); 872 Token.categories2.put("ALL", Token.complementRanges(all)); 873 Token.registerNonXS("ASSIGNED"); 874 Token.registerNonXS("UNASSIGNED"); 875 Token.registerNonXS("ALL"); 876 877 Token isalpha = Token.createRange(); 878 isalpha.mergeRanges(ranges[Character.UPPERCASE_LETTER]); // Lu 879 isalpha.mergeRanges(ranges[Character.LOWERCASE_LETTER]); // Ll 880 isalpha.mergeRanges(ranges[Character.OTHER_LETTER]); // Lo 881 Token.categories.put("IsAlpha", isalpha); 882 Token.categories2.put("IsAlpha", Token.complementRanges(isalpha)); 883 Token.registerNonXS("IsAlpha"); 884 885 Token isalnum = Token.createRange(); 886 isalnum.mergeRanges(isalpha); // Lu Ll Lo 887 isalnum.mergeRanges(ranges[Character.DECIMAL_DIGIT_NUMBER]); // Nd 888 Token.categories.put("IsAlnum", isalnum); 889 Token.categories2.put("IsAlnum", Token.complementRanges(isalnum)); 890 Token.registerNonXS("IsAlnum"); 891 892 Token isspace = Token.createRange(); 893 isspace.mergeRanges(Token.token_spaces); 894 isspace.mergeRanges(ranges[CHAR_SEPARATOR]); // Z 895 Token.categories.put("IsSpace", isspace); 896 Token.categories2.put("IsSpace", Token.complementRanges(isspace)); 897 Token.registerNonXS("IsSpace"); 898 899 Token isword = Token.createRange(); 900 isword.mergeRanges(isalnum); // Lu Ll Lo Nd 901 isword.addRange('_', '_'); 902 Token.categories.put("IsWord", isword); 903 Token.categories2.put("IsWord", Token.complementRanges(isword)); 904 Token.registerNonXS("IsWord"); 905 906 Token isascii = Token.createRange(); 907 isascii.addRange(0, 127); 908 Token.categories.put("IsASCII", isascii); 909 Token.categories2.put("IsASCII", Token.complementRanges(isascii)); 910 Token.registerNonXS("IsASCII"); 911 912 Token isnotgraph = Token.createRange(); 913 isnotgraph.mergeRanges(ranges[CHAR_OTHER]); 914 isnotgraph.addRange(' ', ' '); 915 Token.categories.put("IsGraph", Token.complementRanges(isnotgraph)); 916 Token.categories2.put("IsGraph", isnotgraph); 917 Token.registerNonXS("IsGraph"); 918 919 Token isxdigit = Token.createRange(); 920 isxdigit.addRange('0', '9'); 921 isxdigit.addRange('A', 'F'); 922 isxdigit.addRange('a', 'f'); 923 Token.categories.put("IsXDigit", Token.complementRanges(isxdigit)); 924 Token.categories2.put("IsXDigit", isxdigit); 925 Token.registerNonXS("IsXDigit"); 926 927 Token.setAlias("IsDigit", "Nd", true); 928 Token.setAlias("IsUpper", "Lu", true); 929 Token.setAlias("IsLower", "Ll", true); 930 Token.setAlias("IsCntrl", "C", true); 931 Token.setAlias("IsPrint", "C", false); 932 Token.setAlias("IsPunct", "P", true); 933 Token.registerNonXS("IsDigit"); 934 Token.registerNonXS("IsUpper"); 935 Token.registerNonXS("IsLower"); 936 Token.registerNonXS("IsCntrl"); 937 Token.registerNonXS("IsPrint"); 938 Token.registerNonXS("IsPunct"); 939 940 Token.setAlias("alpha", "IsAlpha", true); 941 Token.setAlias("alnum", "IsAlnum", true); 942 Token.setAlias("ascii", "IsASCII", true); 943 Token.setAlias("cntrl", "IsCntrl", true); 944 Token.setAlias("digit", "IsDigit", true); 945 Token.setAlias("graph", "IsGraph", true); 946 Token.setAlias("lower", "IsLower", true); 947 Token.setAlias("print", "IsPrint", true); 948 Token.setAlias("punct", "IsPunct", true); 949 Token.setAlias("space", "IsSpace", true); 950 Token.setAlias("upper", "IsUpper", true); 951 Token.setAlias("word", "IsWord", true); // Perl extension 952 Token.setAlias("xdigit", "IsXDigit", true); 953 Token.registerNonXS("alpha"); 954 Token.registerNonXS("alnum"); 955 Token.registerNonXS("ascii"); 956 Token.registerNonXS("cntrl"); 957 Token.registerNonXS("digit"); 958 Token.registerNonXS("graph"); 959 Token.registerNonXS("lower"); 960 Token.registerNonXS("print"); 961 Token.registerNonXS("punct"); 962 Token.registerNonXS("space"); 963 Token.registerNonXS("upper"); 964 Token.registerNonXS("word"); 965 Token.registerNonXS("xdigit"); 966 } // synchronized 967 } // if null 968 RangeToken tok = positive ? (RangeToken)Token.categories.get(name) 969 : (RangeToken)Token.categories2.get(name); 970 //if (tok == null) System.out.println(name); 971 return tok; 972 } 973 static protected RangeToken getRange(String name, boolean positive, boolean xs) { 974 RangeToken range = Token.getRange(name, positive); 975 if (xs && range != null && Token.isRegisterNonXS(name)) 976 range = null; 977 return range; 978 } 979 980 static Hashtable nonxs = null; 981 /** 982 * This method is called by only getRange(). 983 * So this method need not MT-safe. 984 */ 985 static protected void registerNonXS(String name) { 986 if (Token.nonxs == null) 987 Token.nonxs = new Hashtable(); 988 Token.nonxs.put(name, name); 989 } 990 static protected boolean isRegisterNonXS(String name) { 991 if (Token.nonxs == null) 992 return false; 993 //DEBUG 994 //System.err.println("isRegisterNonXS: "+name); 995 return Token.nonxs.containsKey(name); 996 } 997 998 private static void setAlias(String newName, String name, boolean positive) { 999 Token t1 = (Token)Token.categories.get(name); 1000 Token t2 = (Token)Token.categories2.get(name); 1001 if (positive) { 1002 Token.categories.put(newName, t1); 1003 Token.categories2.put(newName, t2); 1004 } else { 1005 Token.categories2.put(newName, t1); 1006 Token.categories.put(newName, t2); 1007 } 1008 } 1009 1010 // ------------------------------------------------------ 1011 1012 static final String viramaString = 1013 "\u094D"// ;DEVANAGARI SIGN VIRAMA;Mn;9;ON;;;;;N;;;;; 1014 +"\u09CD"//;BENGALI SIGN VIRAMA;Mn;9;ON;;;;;N;;;;; 1015 +"\u0A4D"//;GURMUKHI SIGN VIRAMA;Mn;9;ON;;;;;N;;;;; 1016 +"\u0ACD"//;GUJARATI SIGN VIRAMA;Mn;9;ON;;;;;N;;;;; 1017 +"\u0B4D"//;ORIYA SIGN VIRAMA;Mn;9;ON;;;;;N;;;;; 1018 +"\u0BCD"//;TAMIL SIGN VIRAMA;Mn;9;ON;;;;;N;;;;; 1019 +"\u0C4D"//;TELUGU SIGN VIRAMA;Mn;9;ON;;;;;N;;;;; 1020 +"\u0CCD"//;KANNADA SIGN VIRAMA;Mn;9;ON;;;;;N;;;;; 1021 +"\u0D4D"//;MALAYALAM SIGN VIRAMA;Mn;9;ON;;;;;N;;;;; 1022 +"\u0E3A"//;THAI CHARACTER PHINTHU;Mn;9;ON;;;;;N;THAI VOWEL SIGN PHINTHU;;;; 1023 +"\u0F84";//;TIBETAN MARK HALANTA;Mn;9;ON;;;;;N;TIBETAN VIRAMA;;;; 1024 1025 static private Token token_grapheme = null; 1026 static synchronized Token getGraphemePattern() { 1027 if (Token.token_grapheme != null) 1028 return Token.token_grapheme; 1029 1030 Token base_char = Token.createRange(); // [{ASSIGNED}]-[{M},{C}] 1031 base_char.mergeRanges(Token.getRange("ASSIGNED", true)); 1032 base_char.subtractRanges(Token.getRange("M", true)); 1033 base_char.subtractRanges(Token.getRange("C", true)); 1034 1035 Token virama = Token.createRange(); 1036 for (int i = 0; i < Token.viramaString.length(); i++) { 1037 virama.addRange(i, i); 1038 } 1039 1040 Token combiner_wo_virama = Token.createRange(); 1041 combiner_wo_virama.mergeRanges(Token.getRange("M", true)); 1042 combiner_wo_virama.addRange(0x1160, 0x11ff); // hangul_medial and hangul_final 1043 combiner_wo_virama.addRange(0xff9e, 0xff9f); // extras 1044 1045 Token left = Token.createUnion(); // base_char? 1046 left.addChild(base_char); 1047 left.addChild(Token.token_empty); 1048 1049 Token foo = Token.createUnion(); 1050 foo.addChild(Token.createConcat(virama, Token.getRange("L", true))); 1051 foo.addChild(combiner_wo_virama); 1052 1053 foo = Token.createClosure(foo); 1054 1055 foo = Token.createConcat(left, foo); 1056 1057 Token.token_grapheme = foo; 1058 return Token.token_grapheme; 1059 } 1060 1061 /** 1062 * Combing Character Sequence in Perl 5.6. 1063 */ 1064 static private Token token_ccs = null; 1065 static synchronized Token getCombiningCharacterSequence() { 1066 if (Token.token_ccs != null) 1067 return Token.token_ccs; 1068 1069 Token foo = Token.createClosure(Token.getRange("M", true)); // \pM* 1070 foo = Token.createConcat(Token.getRange("M", false), foo); // \PM + \pM* 1071 Token.token_ccs = foo; 1072 return Token.token_ccs; 1073 } 1074 1075 // ------------------------------------------------------ 1076 1077 // ------------------------------------------------------ 1078 /** 1079 * This class represents a node in parse tree. 1080 */ 1081 static class StringToken extends Token implements java.io.Serializable { 1082 1083 private static final long serialVersionUID = -4614366944218504172L; 1084 1085 String string; 1086 final int refNumber; 1087 1088 StringToken(int type, String str, int n) { 1089 super(type); 1090 this.string = str; 1091 this.refNumber = n; 1092 } 1093 1094 int getReferenceNumber() { // for STRING 1095 return this.refNumber; 1096 } 1097 String getString() { // for STRING 1098 return this.string; 1099 } 1100 1101 public String toString(int options) { 1102 if (this.type == BACKREFERENCE) 1103 return "\\"+this.refNumber; 1104 else 1105 return REUtil.quoteMeta(this.string); 1106 } 1107 } 1108 1109 /** 1110 * This class represents a node in parse tree. 1111 */ 1112 static class ConcatToken extends Token implements java.io.Serializable { 1113 1114 private static final long serialVersionUID = 8717321425541346381L; 1115 1116 final Token child; 1117 final Token child2; 1118 1119 ConcatToken(Token t1, Token t2) { 1120 super(Token.CONCAT); 1121 this.child = t1; 1122 this.child2 = t2; 1123 } 1124 1125 int size() { 1126 return 2; 1127 } 1128 Token getChild(int index) { 1129 return index == 0 ? this.child : this.child2; 1130 } 1131 1132 public String toString(int options) { 1133 String ret; 1134 if (this.child2.type == CLOSURE && this.child2.getChild(0) == this.child) { 1135 ret = this.child.toString(options)+"+"; 1136 } else if (this.child2.type == NONGREEDYCLOSURE && this.child2.getChild(0) == this.child) { 1137 ret = this.child.toString(options)+"+?"; 1138 } else 1139 ret = this.child.toString(options)+this.child2.toString(options); 1140 return ret; 1141 } 1142 } 1143 1144 /** 1145 * This class represents a node in parse tree. 1146 */ 1147 static class CharToken extends Token implements java.io.Serializable { 1148 1149 private static final long serialVersionUID = -4394272816279496989L; 1150 1151 final int chardata; 1152 1153 CharToken(int type, int ch) { 1154 super(type); 1155 this.chardata = ch; 1156 } 1157 1158 int getChar() { 1159 return this.chardata; 1160 } 1161 1162 public String toString(int options) { 1163 String ret; 1164 switch (this.type) { 1165 case CHAR: 1166 switch (this.chardata) { 1167 case '|': case '*': case '+': case '?': 1168 case '(': case ')': case '.': case '[': 1169 case '{': case '\\': 1170 ret = "\\"+(char)this.chardata; 1171 break; 1172 case '\f': ret = "\\f"; break; 1173 case '\n': ret = "\\n"; break; 1174 case '\r': ret = "\\r"; break; 1175 case '\t': ret = "\\t"; break; 1176 case 0x1b: ret = "\\e"; break; 1177 //case 0x0b: ret = "\\v"; break; 1178 default: 1179 if (this.chardata >= 0x10000) { 1180 String pre = "0"+Integer.toHexString(this.chardata); 1181 ret = "\\v"+pre.substring(pre.length()-6, pre.length()); 1182 } else 1183 ret = ""+(char)this.chardata; 1184 } 1185 break; 1186 1187 case ANCHOR: 1188 if (this == Token.token_linebeginning || this == Token.token_lineend) 1189 ret = ""+(char)this.chardata; 1190 else 1191 ret = "\\"+(char)this.chardata; 1192 break; 1193 1194 default: 1195 ret = null; 1196 } 1197 return ret; 1198 } 1199 1200 boolean match(int ch) { 1201 if (this.type == CHAR) { 1202 return ch == this.chardata; 1203 } else 1204 throw new RuntimeException("NFAArrow#match(): Internal error: "+this.type); 1205 } 1206 } 1207 1208 /** 1209 * This class represents a node in parse tree. 1210 */ 1211 static class ClosureToken extends Token implements java.io.Serializable { 1212 1213 private static final long serialVersionUID = 1308971930673997452L; 1214 1215 int min; 1216 int max; 1217 final Token child; 1218 1219 ClosureToken(int type, Token tok) { 1220 super(type); 1221 this.child = tok; 1222 this.setMin(-1); 1223 this.setMax(-1); 1224 } 1225 1226 int size() { 1227 return 1; 1228 } 1229 Token getChild(int index) { 1230 return this.child; 1231 } 1232 1233 final void setMin(int min) { 1234 this.min = min; 1235 } 1236 final void setMax(int max) { 1237 this.max = max; 1238 } 1239 final int getMin() { 1240 return this.min; 1241 } 1242 final int getMax() { 1243 return this.max; 1244 } 1245 1246 public String toString(int options) { 1247 String ret; 1248 if (this.type == CLOSURE) { 1249 if (this.getMin() < 0 && this.getMax() < 0) { 1250 ret = this.child.toString(options)+"*"; 1251 } else if (this.getMin() == this.getMax()) { 1252 ret = this.child.toString(options)+"{"+this.getMin()+"}"; 1253 } else if (this.getMin() >= 0 && this.getMax() >= 0) { 1254 ret = this.child.toString(options)+"{"+this.getMin()+","+this.getMax()+"}"; 1255 } else if (this.getMin() >= 0 && this.getMax() < 0) { 1256 ret = this.child.toString(options)+"{"+this.getMin()+",}"; 1257 } else 1258 throw new RuntimeException("Token#toString(): CLOSURE " 1259 +this.getMin()+", "+this.getMax()); 1260 } else { 1261 if (this.getMin() < 0 && this.getMax() < 0) { 1262 ret = this.child.toString(options)+"*?"; 1263 } else if (this.getMin() == this.getMax()) { 1264 ret = this.child.toString(options)+"{"+this.getMin()+"}?"; 1265 } else if (this.getMin() >= 0 && this.getMax() >= 0) { 1266 ret = this.child.toString(options)+"{"+this.getMin()+","+this.getMax()+"}?"; 1267 } else if (this.getMin() >= 0 && this.getMax() < 0) { 1268 ret = this.child.toString(options)+"{"+this.getMin()+",}?"; 1269 } else 1270 throw new RuntimeException("Token#toString(): NONGREEDYCLOSURE " 1271 +this.getMin()+", "+this.getMax()); 1272 } 1273 return ret; 1274 } 1275 } 1276 1277 /** 1278 * This class represents a node in parse tree. 1279 */ 1280 static class ParenToken extends Token implements java.io.Serializable { 1281 1282 private static final long serialVersionUID = -5938014719827987704L; 1283 1284 final Token child; 1285 final int parennumber; 1286 1287 ParenToken(int type, Token tok, int paren) { 1288 super(type); 1289 this.child = tok; 1290 this.parennumber = paren; 1291 } 1292 1293 int size() { 1294 return 1; 1295 } 1296 Token getChild(int index) { 1297 return this.child; 1298 } 1299 1300 int getParenNumber() { 1301 return this.parennumber; 1302 } 1303 1304 public String toString(int options) { 1305 String ret = null; 1306 switch (this.type) { 1307 case PAREN: 1308 if (this.parennumber == 0) { 1309 ret = "(?:"+this.child.toString(options)+")"; 1310 } else { 1311 ret = "("+this.child.toString(options)+")"; 1312 } 1313 break; 1314 1315 case LOOKAHEAD: 1316 ret = "(?="+this.child.toString(options)+")"; 1317 break; 1318 case NEGATIVELOOKAHEAD: 1319 ret = "(?!"+this.child.toString(options)+")"; 1320 break; 1321 case LOOKBEHIND: 1322 ret = "(?<="+this.child.toString(options)+")"; 1323 break; 1324 case NEGATIVELOOKBEHIND: 1325 ret = "(?<!"+this.child.toString(options)+")"; 1326 break; 1327 case INDEPENDENT: 1328 ret = "(?>"+this.child.toString(options)+")"; 1329 break; 1330 } 1331 return ret; 1332 } 1333 } 1334 1335 /** 1336 * (?(condition)yes-pattern|no-pattern) 1337 */ 1338 static class ConditionToken extends Token implements java.io.Serializable { 1339 1340 private static final long serialVersionUID = 4353765277910594411L; 1341 1342 final int refNumber; 1343 final Token condition; 1344 final Token yes; 1345 final Token no; 1346 ConditionToken(int refno, Token cond, Token yespat, Token nopat) { 1347 super(Token.CONDITION); 1348 this.refNumber = refno; 1349 this.condition = cond; 1350 this.yes = yespat; 1351 this.no = nopat; 1352 } 1353 int size() { 1354 return this.no == null ? 1 : 2; 1355 } 1356 Token getChild(int index) { 1357 if (index == 0) return this.yes; 1358 if (index == 1) return this.no; 1359 throw new RuntimeException("Internal Error: "+index); 1360 } 1361 1362 public String toString(int options) { 1363 String ret; 1364 if (refNumber > 0) { 1365 ret = "(?("+refNumber+")"; 1366 } else if (this.condition.type == Token.ANCHOR) { 1367 ret = "(?("+this.condition+")"; 1368 } else { 1369 ret = "(?"+this.condition; 1370 } 1371 1372 if (this.no == null) { 1373 ret += this.yes+")"; 1374 } else { 1375 ret += this.yes+"|"+this.no+")"; 1376 } 1377 return ret; 1378 } 1379 } 1380 1381 /** 1382 * (ims-ims: .... ) 1383 */ 1384 static class ModifierToken extends Token implements java.io.Serializable { 1385 1386 private static final long serialVersionUID = -9114536559696480356L; 1387 1388 final Token child; 1389 final int add; 1390 final int mask; 1391 1392 ModifierToken(Token tok, int add, int mask) { 1393 super(Token.MODIFIERGROUP); 1394 this.child = tok; 1395 this.add = add; 1396 this.mask = mask; 1397 } 1398 1399 int size() { 1400 return 1; 1401 } 1402 Token getChild(int index) { 1403 return this.child; 1404 } 1405 1406 int getOptions() { 1407 return this.add; 1408 } 1409 int getOptionsMask() { 1410 return this.mask; 1411 } 1412 1413 public String toString(int options) { 1414 return "(?" 1415 +(this.add == 0 ? "" : REUtil.createOptionString(this.add)) 1416 +(this.mask == 0 ? "" : REUtil.createOptionString(this.mask)) 1417 +":" 1418 +this.child.toString(options) 1419 +")"; 1420 } 1421 } 1422 1423 /** 1424 * This class represents a node in parse tree. 1425 * for UNION or CONCAT. 1426 */ 1427 static class UnionToken extends Token implements java.io.Serializable { 1428 1429 private static final long serialVersionUID = -2568843945989489861L; 1430 1431 Vector children; 1432 1433 UnionToken(int type) { 1434 super(type); 1435 } 1436 1437 void addChild(Token tok) { 1438 if (tok == null) return; 1439 if (this.children == null) this.children = new Vector(); 1440 if (this.type == UNION) { 1441 this.children.addElement(tok); 1442 return; 1443 } 1444 // This is CONCAT, and new child is CONCAT. 1445 if (tok.type == CONCAT) { 1446 for (int i = 0; i < tok.size(); i ++) 1447 this.addChild(tok.getChild(i)); // Recursion 1448 return; 1449 } 1450 int size = this.children.size(); 1451 if (size == 0) { 1452 this.children.addElement(tok); 1453 return; 1454 } 1455 Token previous = (Token)this.children.elementAt(size-1); 1456 if (!((previous.type == CHAR || previous.type == STRING) 1457 && (tok.type == CHAR || tok.type == STRING))) { 1458 this.children.addElement(tok); 1459 return; 1460 } 1461 1462 //System.err.println("Merge '"+previous+"' and '"+tok+"'."); 1463 1464 StringBuffer buffer; 1465 int nextMaxLength = (tok.type == CHAR ? 2 : tok.getString().length()); 1466 if (previous.type == CHAR) { // Replace previous token by STRING 1467 buffer = new StringBuffer(2 + nextMaxLength); 1468 int ch = previous.getChar(); 1469 if (ch >= 0x10000) 1470 buffer.append(REUtil.decomposeToSurrogates(ch)); 1471 else 1472 buffer.append((char)ch); 1473 previous = Token.createString(null); 1474 this.children.setElementAt(previous, size-1); 1475 } else { // STRING 1476 buffer = new StringBuffer(previous.getString().length() + nextMaxLength); 1477 buffer.append(previous.getString()); 1478 } 1479 1480 if (tok.type == CHAR) { 1481 int ch = tok.getChar(); 1482 if (ch >= 0x10000) 1483 buffer.append(REUtil.decomposeToSurrogates(ch)); 1484 else 1485 buffer.append((char)ch); 1486 } else { 1487 buffer.append(tok.getString()); 1488 } 1489 1490 ((StringToken)previous).string = new String(buffer); 1491 } 1492 1493 int size() { 1494 return this.children == null ? 0 : this.children.size(); 1495 } 1496 Token getChild(int index) { 1497 return (Token)this.children.elementAt(index); 1498 } 1499 1500 public String toString(int options) { 1501 String ret; 1502 if (this.type == CONCAT) { 1503 if (this.children.size() == 2) { 1504 Token ch = this.getChild(0); 1505 Token ch2 = this.getChild(1); 1506 if (ch2.type == CLOSURE && ch2.getChild(0) == ch) { 1507 ret = ch.toString(options)+"+"; 1508 } else if (ch2.type == NONGREEDYCLOSURE && ch2.getChild(0) == ch) { 1509 ret = ch.toString(options)+"+?"; 1510 } else 1511 ret = ch.toString(options)+ch2.toString(options); 1512 } else { 1513 StringBuffer sb = new StringBuffer(); 1514 for (int i = 0; i < this.children.size(); i ++) { 1515 sb.append(((Token)this.children.elementAt(i)).toString(options)); 1516 } 1517 ret = new String(sb); 1518 } 1519 return ret; 1520 } 1521 if (this.children.size() == 2 && this.getChild(1).type == EMPTY) { 1522 ret = this.getChild(0).toString(options)+"?"; 1523 } else if (this.children.size() == 2 1524 && this.getChild(0).type == EMPTY) { 1525 ret = this.getChild(1).toString(options)+"??"; 1526 } else { 1527 StringBuffer sb = new StringBuffer(); 1528 sb.append(((Token)this.children.elementAt(0)).toString(options)); 1529 for (int i = 1; i < this.children.size(); i ++) { 1530 sb.append((char)'|'); 1531 sb.append(((Token)this.children.elementAt(i)).toString(options)); 1532 } 1533 ret = new String(sb); 1534 } 1535 return ret; 1536 } 1537 } 1538 }