/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.xerces.impl.xpath.regex;

import java.util.Locale;
import java.util.MissingResourceException;
import java.util.ResourceBundle;
import java.util.Vector;

/**
 * A Regular Expression Parser.
 *
 * <p>The parser is a hand-written recursive-descent parser. {@link #next()} is the
 * tokenizer: it reads from {@link #regex} at {@link #offset} and stores the result
 * in {@link #nexttoken} / {@link #chardata}. The <code>parseXxx</code> and
 * <code>processXxx</code> methods consume those tokens and build a {@link Token} tree.</p>
 *
 * <p>All parsing state lives in instance fields, so an instance handles one
 * {@link #parse(String, int)} call at a time (that method is synchronized).</p>
 *
 * @xerces.internal
 *
 * @version $Id: RegexParser.java 469061 2006-10-30 04:16:15Z mrglavas $
 */
class RegexParser {
    // Token types produced by next().
    static final int T_CHAR = 0;
    static final int T_EOF = 1;
    static final int T_OR = 2;                  // '|'
    static final int T_STAR = 3;                // '*'
    static final int T_PLUS = 4;                // '+'
    static final int T_QUESTION = 5;            // '?'
    static final int T_LPAREN = 6;              // '('
    static final int T_RPAREN = 7;              // ')'
    static final int T_DOT = 8;                 // '.'
    static final int T_LBRACKET = 9;            // '['
    static final int T_BACKSOLIDUS = 10;        // '\'
    static final int T_CARET = 11;              // '^'
    static final int T_DOLLAR = 12;             // '$'
    static final int T_LPAREN2 = 13;            // '(?:'
    static final int T_LOOKAHEAD = 14;          // '(?='
    static final int T_NEGATIVELOOKAHEAD = 15;  // '(?!'
    static final int T_LOOKBEHIND = 16;         // '(?<='
    static final int T_NEGATIVELOOKBEHIND = 17; // '(?<!'
    static final int T_INDEPENDENT = 18;        // '(?>'
    static final int T_SET_OPERATIONS = 19;     // '(?['
    static final int T_POSIX_CHARCLASS_START = 20; // '[:' in a character class
    static final int T_COMMENT = 21;            // '(?#'
    static final int T_MODIFIERS = 22;          // '(?' [\-,a-z,A-Z]
    static final int T_CONDITION = 23;          // '(?('
    static final int T_XMLSCHEMA_CC_SUBTRACTION = 24; // '-[' in a character class

    /**
     * Records a group reference (a back reference or a numeric condition) together
     * with the pattern position it was found at, so that {@link #parse(String, int)}
     * can verify afterwards that the referenced group exists.
     */
    static class ReferencePosition {
        int refNumber;
        int position;
        ReferencePosition(int n, int pos) {
            this.refNumber = n;
            this.position = pos;
        }
    }

    /** Index into {@link #regex} of the next character {@link #next()} will read. */
    int offset;
    /** The pattern being parsed (after optional extended-comment stripping). */
    String regex;
    /** Cached <code>regex.length()</code>. */
    int regexlen;
    /** Option flags passed to {@link #parse(String, int)}. */
    int options;
    /** Message bundle used to build {@link ParseException} messages. */
    ResourceBundle resources;
    /** Character belonging to the current token; -1 at end of input. */
    int chardata;
    /** Type of the current token (one of the <code>T_*</code> constants). */
    int nexttoken;
    static protected final int S_NORMAL = 0;
    static protected final int S_INBRACKETS = 1;
    static protected final int S_INXBRACKETS = 2;
    /** Tokenizer context; {@link #next()} tokenizes differently inside brackets. */
    int context = S_NORMAL;
    /** Number that the next capturing group will receive; starts at 1. */
    int parennumber = 1;
    /** Set once a back reference or numeric condition has been seen. */
    boolean hasBackReferences;
    /** {@link ReferencePosition}s collected during the current parse; lazily created. */
    Vector references = null;

    /** Creates a parser that reports errors in the default locale. */
    public RegexParser() {
        this.setLocale(Locale.getDefault());
    }

    /**
     * Creates a parser that reports errors in the given locale.
     *
     * @param locale locale used to look up error messages
     */
    public RegexParser(Locale locale) {
        this.setLocale(locale);
    }

    /**
     * Loads the error-message bundle for the given locale.
     *
     * @param locale locale used to look up error messages
     * @throws RuntimeException if the message bundle cannot be found; the original
     *         {@link MissingResourceException} is attached as the cause
     */
    public void setLocale(Locale locale) {
        try {
            this.resources = ResourceBundle.getBundle("org.apache.xerces.impl.xpath.regex.message", locale);
        } catch (MissingResourceException mre) {
            // Chain the cause so the missing bundle/key details are not lost.
            throw new RuntimeException("Installation Problem??? Couldn't load messages: "
                                       + mre.getMessage(), mre);
        }
    }

    /**
     * Builds a {@link ParseException} whose message is looked up in the bundle.
     *
     * @param key message key in the resource bundle
     * @param loc position in the pattern to report
     * @return the exception, ready to be thrown by the caller
     */
    final ParseException ex(String key, int loc) {
        return new ParseException(this.resources.getString(key), loc);
    }

    /** Returns true if every bit of <code>flag</code> is set in {@link #options}. */
    private final boolean isSet(int flag) {
        return (this.options & flag) == flag;
    }

    /**
     * Parses a pattern into a token tree.
     *
     * @param regex   the pattern text
     * @param options option flags (<code>RegularExpression</code> constants)
     * @return the root of the token tree
     * @throws ParseException if the pattern is malformed, has trailing input that
     *         could not be consumed, or references a group that does not exist
     */
    synchronized Token parse(String regex, int options) throws ParseException {
        this.options = options;
        this.offset = 0;
        this.setContext(S_NORMAL);
        this.parennumber = 1;
        this.hasBackReferences = false;
        // A previous parse that failed part-way leaves its references behind; drop
        // them so they cannot trigger a bogus "parser.parse.2" for this pattern.
        if (this.references != null)
            this.references.removeAllElements();
        this.regex = regex;
        if (this.isSet(RegularExpression.EXTENDED_COMMENT))
            this.regex = REUtil.stripExtendedComment(this.regex);
        this.regexlen = this.regex.length();

        this.next();
        Token ret = this.parseRegex();
        if (this.offset != this.regexlen)
            throw ex("parser.parse.1", this.offset);
        if (this.references != null) {
            // Every referenced group number must be below the next free group number.
            for (int i = 0;  i < this.references.size();  i ++) {
                ReferencePosition position = (ReferencePosition)this.references.elementAt(i);
                if (this.parennumber <= position.refNumber)
                    throw ex("parser.parse.2", position.position);
            }
            this.references.removeAllElements();
        }
        return ret;
    }

    /*
    public RegularExpression createRegex(String regex, int options) throws ParseException {
        Token tok = this.parse(regex, options);
        return new RegularExpression(regex, tok, this.parennumber, this.hasBackReferences, options);
    }
    */

    /** Switches the tokenizer context (one of the <code>S_*</code> constants). */
    protected final void setContext(int con) {
        this.context = con;
    }

    /** Returns the type of the current token without consuming it. */
    final int read() {
        return this.nexttoken;
    }

    /**
     * Advances to the next token, updating {@link #nexttoken}, {@link #chardata}
     * and {@link #offset}.
     *
     * @throws ParseException if the pattern ends inside an escape or inside a
     *         <code>(?...</code> construct, or if a <code>(?</code> prefix is unknown
     */
    final void next() {
        if (this.offset >= this.regexlen) {
            this.chardata = -1;
            this.nexttoken = T_EOF;
            return;
        }

        int ret;
        int ch = this.regex.charAt(this.offset++);
        this.chardata = ch;

        if (this.context == S_INBRACKETS) {
            // In a character class, this.chardata has one character, that is to say,
            // a pair of surrogates is composed and stored to this.chardata.
            switch (ch) {
              case '\\':
                ret = T_BACKSOLIDUS;
                if (this.offset >= this.regexlen)
                    throw ex("parser.next.1", this.offset-1);
                this.chardata = this.regex.charAt(this.offset++);
                break;

              case '-':
                if (this.isSet(RegularExpression.XMLSCHEMA_MODE)
                    && this.offset < this.regexlen && this.regex.charAt(this.offset) == '[') {
                    this.offset++;
                    ret = T_XMLSCHEMA_CC_SUBTRACTION;
                } else
                    ret = T_CHAR;
                break;

              case '[':
                if (!this.isSet(RegularExpression.XMLSCHEMA_MODE)
                    && this.offset < this.regexlen && this.regex.charAt(this.offset) == ':') {
                    this.offset++;
                    ret = T_POSIX_CHARCLASS_START;
                    break;
                } // Falls through: a plain '[' is an ordinary character here.
              default:
                if (REUtil.isHighSurrogate(ch) && this.offset < this.regexlen) {
                    int low = this.regex.charAt(this.offset);
                    if (REUtil.isLowSurrogate(low)) {
                        this.chardata = REUtil.composeFromSurrogates(ch, low);
                        this.offset ++;
                    }
                }
                ret = T_CHAR;
            }
            this.nexttoken = ret;
            return;
        }

        switch (ch) {
          case '|': ret = T_OR;             break;
          case '*': ret = T_STAR;           break;
          case '+': ret = T_PLUS;           break;
          case '?': ret = T_QUESTION;       break;
          case ')': ret = T_RPAREN;         break;
          case '.': ret = T_DOT;            break;
          case '[': ret = T_LBRACKET;       break;
          case '^':
            // In XML Schema mode '^' is an ordinary character.
            if (this.isSet(RegularExpression.XMLSCHEMA_MODE)) {
                ret = T_CHAR;
            }
            else {
                ret = T_CARET;
            }
            break;
          case '$':
            // In XML Schema mode '$' is an ordinary character.
            if (this.isSet(RegularExpression.XMLSCHEMA_MODE)) {
                ret = T_CHAR;
            }
            else {
                ret = T_DOLLAR;
            }
            break;
          case '(':
            ret = T_LPAREN;
            if (this.offset >= this.regexlen)
                break;
            if (this.regex.charAt(this.offset) != '?')
                break;
            if (++this.offset >= this.regexlen)
                throw ex("parser.next.2", this.offset-1);
            ch = this.regex.charAt(this.offset++);
            switch (ch) {
              case ':':  ret = T_LPAREN2;            break;
              case '=':  ret = T_LOOKAHEAD;          break;
              case '!':  ret = T_NEGATIVELOOKAHEAD;  break;
              case '[':  ret = T_SET_OPERATIONS;     break;
              case '>':  ret = T_INDEPENDENT;        break;
              case '<':
                if (this.offset >= this.regexlen)
                    throw ex("parser.next.2", this.offset-3);
                ch = this.regex.charAt(this.offset++);
                if (ch == '=') {
                    ret = T_LOOKBEHIND;
                } else if (ch == '!') {
                    ret = T_NEGATIVELOOKBEHIND;
                } else
                    throw ex("parser.next.3", this.offset-3);
                break;
              case '#':
                // Skip the comment body up to and including the closing ')'.
                while (this.offset < this.regexlen) {
                    ch = this.regex.charAt(this.offset++);
                    if (ch == ')')  break;
                }
                if (ch != ')')
                    throw ex("parser.next.4", this.offset-1);
                ret = T_COMMENT;
                break;
              default:
                if (ch == '-' || 'a' <= ch && ch <= 'z' || 'A' <= ch && ch <= 'Z') {// Options
                    // Un-read the option letter; processModifiers() re-reads it.
                    this.offset --;
                    ret = T_MODIFIERS;
                    break;
                } else if (ch == '(') {         // conditional
                    ret = T_CONDITION;          // this.offset points to the char after '('.
                    break;
                }
                throw ex("parser.next.2", this.offset-2);
            }
            break;

          case '\\':
            ret = T_BACKSOLIDUS;
            if (this.offset >= this.regexlen)
                throw ex("parser.next.1", this.offset-1);
            this.chardata = this.regex.charAt(this.offset++);
            break;

          default:
            ret = T_CHAR;
        }
        this.nexttoken = ret;
    }

    /**
     * regex ::= term (`|` term)*
     * term ::= factor+
     * factor ::= ('^' | '$' | '\A' | '\Z' | '\z' | '\b' | '\B' | '\<' | '\>'
     *            | atom (('*' | '+' | '?' | minmax ) '?'? )?)
     *            | '(?=' regex ')'  | '(?!' regex ')'  | '(?<=' regex ')'  | '(?<!' regex ')'
     * atom ::= char | '.' | range | '(' regex ')' | '(?:' regex ')' | '\' [0-9]
     *          | '\w' | '\W' | '\d' | '\D' | '\s' | '\S' | category-block
     *
     * @return a single term, or a union token when at least one '|' was seen
     */
    Token parseRegex() throws ParseException {
        Token tok = this.parseTerm();
        Token parent = null;
        while (this.read() == T_OR) {
            this.next();                    // '|'
            if (parent == null) {
                // First alternative seen: wrap what we have in a union.
                parent = Token.createUnion();
                parent.addChild(tok);
                tok = parent;
            }
            tok.addChild(this.parseTerm());
        }
        return tok;
    }

    /**
     * term ::= factor+
     *
     * @return an empty token when the term is empty, a single factor, or a
     *         concatenation of the factors
     */
    Token parseTerm() throws ParseException {
        int ch = this.read();
        if (ch == T_OR || ch == T_RPAREN || ch == T_EOF) {
            return Token.createEmpty();
        } else {
            Token tok = this.parseFactor();
            Token concat = null;
            while ((ch = this.read()) != T_OR && ch != T_RPAREN && ch != T_EOF) {
                if (concat == null) {
                    // Second factor seen: switch to a concatenation.
                    concat = Token.createConcat();
                    concat.addChild(tok);
                    tok = concat;
                }
                concat.addChild(this.parseFactor());
            }
            return tok;
        }
    }

    // ----------------------------------------------------------------

    /**
     * Shared body of the look-around and independent groups: consumes the opening
     * token, parses the enclosed regex, and requires and consumes the closing ')'.
     */
    private Token parseLookGroup(int lookType) throws ParseException {
        this.next();
        Token tok = Token.createLook(lookType, this.parseRegex());
        if (this.read() != T_RPAREN)  throw ex("parser.factor.1", this.offset-1);
        this.next();                            // ')'
        return tok;
    }

    /** Handles '^'. */
    Token processCaret() throws ParseException {
        this.next();
        return Token.token_linebeginning;
    }
    /** Handles '$'. */
    Token processDollar() throws ParseException {
        this.next();
        return Token.token_lineend;
    }
    /** Handles '(?=' regex ')'. */
    Token processLookahead() throws ParseException {
        return this.parseLookGroup(Token.LOOKAHEAD);
    }
    /** Handles '(?!' regex ')'. */
    Token processNegativelookahead() throws ParseException {
        return this.parseLookGroup(Token.NEGATIVELOOKAHEAD);
    }
    /** Handles '(?<=' regex ')'. */
    Token processLookbehind() throws ParseException {
        return this.parseLookGroup(Token.LOOKBEHIND);
    }
    /** Handles '(?<!' regex ')'. */
    Token processNegativelookbehind() throws ParseException {
        return this.parseLookGroup(Token.NEGATIVELOOKBEHIND);
    }
    /** Handles '\A'. */
    Token processBacksolidus_A() throws ParseException {
        this.next();
        return Token.token_stringbeginning;
    }
    /** Handles '\Z'. */
    Token processBacksolidus_Z() throws ParseException {
        this.next();
        return Token.token_stringend2;
    }
    /** Handles '\z'. */
    Token processBacksolidus_z() throws ParseException {
        this.next();
        return Token.token_stringend;
    }
    /** Handles '\b'. */
    Token processBacksolidus_b() throws ParseException {
        this.next();
        return Token.token_wordedge;
    }
    /** Handles '\B'. */
    Token processBacksolidus_B() throws ParseException {
        this.next();
        return Token.token_not_wordedge;
    }
    /** Handles '\<'. */
    Token processBacksolidus_lt() throws ParseException {
        this.next();
        return Token.token_wordbeginning;
    }
    /** Handles '\>'. */
    Token processBacksolidus_gt() throws ParseException {
        this.next();
        return Token.token_wordend;
    }
    /** Handles 'X*' and the non-greedy 'X*?'. */
    Token processStar(Token tok) throws ParseException {
        this.next();
        if (this.read() == T_QUESTION) {
            this.next();
            return Token.createNGClosure(tok);
        } else
            return Token.createClosure(tok);
    }
    /** Handles 'X+' and the non-greedy 'X+?'. */
    Token processPlus(Token tok) throws ParseException {
        // X+ -> XX*
        this.next();
        if (this.read() == T_QUESTION) {
            this.next();
            return Token.createConcat(tok, Token.createNGClosure(tok));
        } else
            return Token.createConcat(tok, Token.createClosure(tok));
    }
    /** Handles 'X?' and the non-greedy 'X??'. */
    Token processQuestion(Token tok) throws ParseException {
        // X? -> X|
        this.next();
        Token par = Token.createUnion();
        if (this.read() == T_QUESTION) {
            // Non-greedy: the empty alternative is added first.
            this.next();
            par.addChild(Token.createEmpty());
            par.addChild(tok);
        } else {
            par.addChild(tok);
            par.addChild(Token.createEmpty());
        }
        return par;
    }
    /** Returns true if the pattern has a '?' at index <code>off</code>. */
    boolean checkQuestion(int off) {
        return off < this.regexlen && this.regex.charAt(off) == '?';
    }
    /** Handles a capturing group '(' regex ')'. */
    Token processParen() throws ParseException {
        this.next();
        // Take the group number before parsing the body so that outer groups get
        // lower numbers than the groups nested inside them.
        int p = this.parennumber++;
        Token tok = Token.createParen(this.parseRegex(), p);
        if (this.read() != T_RPAREN)  throw ex("parser.factor.1", this.offset-1);
        this.next();                            // Skips ')'
        return tok;
    }
    /** Handles '(?:' regex ')'; the paren token is created with number 0. */
    Token processParen2() throws ParseException {
        this.next();
        Token tok = Token.createParen(this.parseRegex(), 0);
        if (this.read() != T_RPAREN)  throw ex("parser.factor.1", this.offset-1);
        this.next();                            // Skips ')'
        return tok;
    }
    /**
     * Handles '(?(' condition ')' yes-pattern ('|' no-pattern)? ')'. The condition
     * is either a single digit 1-9 naming a group, or a look-around / anchor factor.
     */
    Token processCondition() throws ParseException {
                                                // this.offset points to the char after '('
        if (this.offset+1 >= this.regexlen)  throw ex("parser.factor.4", this.offset);
                                                // Parses a condition.
        int refno = -1;
        Token condition = null;
        int ch = this.regex.charAt(this.offset);
        if ('1' <= ch && ch <= '9') {
            refno = ch-'0';
            this.hasBackReferences = true;
            if (this.references == null)  this.references = new Vector();
            this.references.addElement(new ReferencePosition(refno, this.offset));
            this.offset ++;
            // Safe: the guard above ensured at least two characters remain.
            if (this.regex.charAt(this.offset) != ')')  throw ex("parser.factor.1", this.offset);
            this.offset ++;
        } else {
            if (ch == '?')  this.offset --; // Points '('.
            this.next();
            condition = this.parseFactor();
            switch (condition.type) {
              case Token.LOOKAHEAD:
              case Token.NEGATIVELOOKAHEAD:
              case Token.LOOKBEHIND:
              case Token.NEGATIVELOOKBEHIND:
                break;
              case Token.ANCHOR:
                if (this.read() != T_RPAREN)  throw ex("parser.factor.1", this.offset-1);
                break;
              default:
                throw ex("parser.factor.5", this.offset);
            }
        }
                                                // Parses yes/no-patterns.
        this.next();
        Token yesPattern = this.parseRegex();
        Token noPattern = null;
        if (yesPattern.type == Token.UNION) {
            if (yesPattern.size() != 2)  throw ex("parser.factor.6", this.offset);
            noPattern = yesPattern.getChild(1);
            yesPattern = yesPattern.getChild(0);
        }
        if (this.read() != T_RPAREN)  throw ex("parser.factor.1", this.offset-1);
        this.next();
        return Token.createCondition(refno, condition, yesPattern, noPattern);
    }
    /**
     * Handles '(?' modifiers ':' regex ')' and '(?' modifiers ')' regex.
     */
    Token processModifiers() throws ParseException {
                                                // this.offset points to the char after '?'.
                                                // modifiers ::= [imsw]* ('-' [imsw]*)? ':'
        int add = 0, mask = 0, ch = -1;
        while (this.offset < this.regexlen) {
            ch = this.regex.charAt(this.offset);
            int v = REUtil.getOptionValue(ch);
            if (v == 0)  break;                 // '-' or ':'?
            add |= v;
            this.offset ++;
        }
        if (this.offset >= this.regexlen)  throw ex("parser.factor.2", this.offset-1);
        if (ch == '-') {
            this.offset ++;
            while (this.offset < this.regexlen) {
                ch = this.regex.charAt(this.offset);
                int v = REUtil.getOptionValue(ch);
                if (v == 0)  break;             // ':'?
                mask |= v;
                this.offset ++;
            }
            if (this.offset >= this.regexlen)  throw ex("parser.factor.2", this.offset-1);
        }
        Token tok;
        if (ch == ':') {
            this.offset ++;
            this.next();
            tok = Token.createModifierGroup(this.parseRegex(), add, mask);
            if (this.read() != T_RPAREN)  throw ex("parser.factor.1", this.offset-1);
            this.next();
        } else if (ch == ')') {                 // such as (?-i)
            // The modifiers apply to whatever follows; no closing ')' is consumed here.
            this.offset ++;
            this.next();
            tok = Token.createModifierGroup(this.parseRegex(), add, mask);
        } else
            throw ex("parser.factor.3", this.offset);

        return tok;
    }
    /** Handles '(?>' regex ')'. */
    Token processIndependent() throws ParseException {
        return this.parseLookGroup(Token.INDEPENDENT);
    }
    /** Handles '\c' followed by a character in 0x0040-0x005f. */
    Token processBacksolidus_c() throws ParseException {
        int ch2;                                // Must be in 0x0040-0x005f
        if (this.offset >= this.regexlen
            || ((ch2 = this.regex.charAt(this.offset++)) & 0xffe0) != 0x0040)
            throw ex("parser.atom.1", this.offset-1);
        this.next();
        return Token.createChar(ch2-0x40);
    }
    /** '\C' is rejected by this parser. */
    Token processBacksolidus_C() throws ParseException {
        throw ex("parser.process.1", this.offset);
    }
    /** Handles '\i' as the literal character 'i'. */
    Token processBacksolidus_i() throws ParseException {
        Token tok = Token.createChar('i');
        this.next();
        return tok;
    }
    /** '\I' is rejected by this parser. */
    Token processBacksolidus_I() throws ParseException {
        throw ex("parser.process.1", this.offset);
    }
    /** Handles '\g'. */
    Token processBacksolidus_g() throws ParseException {
        this.next();
        return Token.getGraphemePattern();
    }
    /** Handles '\X'. */
    Token processBacksolidus_X() throws ParseException {
        this.next();
        return Token.getCombiningCharacterSequence();
    }
    /** Handles a back reference '\' [1-9] and records it for validation in parse(). */
    Token processBackreference() throws ParseException {
        int refnum = this.chardata-'0';
        Token tok = Token.createBackReference(refnum);
        this.hasBackReferences = true;
        if (this.references == null)  this.references = new Vector();
        // offset-2 is the position of the backslash that started the reference.
        this.references.addElement(new ReferencePosition(refnum, this.offset-2));
        this.next();
        return tok;
    }

    // ----------------------------------------------------------------

    /**
     * factor ::= ('^' | '$' | '\A' | '\Z' | '\z' | '\b' | '\B' | '\<' | '\>'
     *            | atom (('*' | '+' | '?' | minmax ) '?'? )?)
     *            | '(?=' regex ')'  | '(?!' regex ')'  | '(?<=' regex ')'  | '(?<!' regex ')'
     *            | '(?#' [^)]* ')'
     * minmax ::= '{' min (',' max?)? '}'
     * min ::= [0-9]+
     * max ::= [0-9]+
     */
    Token parseFactor() throws ParseException {
        int ch = this.read();
        Token tok;
        switch (ch) {
          case T_CARET:         return this.processCaret();
          case T_DOLLAR:        return this.processDollar();
          case T_LOOKAHEAD:     return this.processLookahead();
          case T_NEGATIVELOOKAHEAD: return this.processNegativelookahead();
          case T_LOOKBEHIND:    return this.processLookbehind();
          case T_NEGATIVELOOKBEHIND: return this.processNegativelookbehind();

          case T_COMMENT:
            this.next();
            return Token.createEmpty();

          case T_BACKSOLIDUS:
            switch (this.chardata) {
              case 'A': return this.processBacksolidus_A();
              case 'Z': return this.processBacksolidus_Z();
              case 'z': return this.processBacksolidus_z();
              case 'b': return this.processBacksolidus_b();
              case 'B': return this.processBacksolidus_B();
              case '<': return this.processBacksolidus_lt();
              case '>': return this.processBacksolidus_gt();
            }
                                                // Falls through: other escapes are atoms.
        }
        tok = this.parseAtom();
        ch = this.read();
        switch (ch) {
          case T_STAR:  return this.processStar(tok);
          case T_PLUS:  return this.processPlus(tok);
          case T_QUESTION: return this.processQuestion(tok);
          case T_CHAR:
            if (this.chardata == '{' && this.offset < this.regexlen) {
                tok = this.parseMinMaxQuantifier(tok);
            }
        }
        return tok;
    }

    /**
     * Parses the remainder of a '{min}', '{min,}' or '{min,max}' quantifier. The
     * current token is the '{' and this.offset points to the character after it;
     * the caller guarantees that at least one more character exists.
     *
     * @param tok the atom being quantified
     * @return a (possibly non-greedy) closure over <code>tok</code> with min/max set;
     *         max is -1 for the open-ended '{min,}' form
     */
    private Token parseMinMaxQuantifier(Token tok) throws ParseException {
        int off = this.offset;                  // this.offset -> next of '{'
        int min = 0, max = -1;
        int ch;

        if ((ch = this.regex.charAt(off++)) >= '0' && ch <= '9') {
            min = ch -'0';
            while (off < this.regexlen
                   && (ch = this.regex.charAt(off++)) >= '0' && ch <= '9') {
                min = this.appendQuantifierDigit(min, ch-'0');
            }
        }
        else {
            throw ex("parser.quantifier.1", this.offset);
        }

        max = min;
        if (ch == ',') {
            if (off >= this.regexlen) {
                throw ex("parser.quantifier.3", this.offset);
            }
            else if ((ch = this.regex.charAt(off++)) >= '0' && ch <= '9') {
                max = ch -'0';                  // {min,max}
                while (off < this.regexlen
                       && (ch = this.regex.charAt(off++)) >= '0'
                       && ch <= '9') {
                    max = this.appendQuantifierDigit(max, ch-'0');
                }

                if (min > max)
                    throw ex("parser.quantifier.4", this.offset);
            }
            else {                              // assume {min,}
                max = -1;
            }
        }

        if (ch != '}')
            throw ex("parser.quantifier.2", this.offset);

        Token closure;
        if (this.checkQuestion(off)) {          // off -> next of '}'
            closure = Token.createNGClosure(tok);
            this.offset = off+1;
        } else {
            closure = Token.createClosure(tok);
            this.offset = off;
        }

        closure.setMin(min);
        closure.setMax(max);
        this.next();
        return closure;
    }

    /**
     * Returns <code>value*10 + digit</code>, rejecting results that do not fit in an
     * int. The bound is checked before multiplying because a wrapped product is not
     * necessarily negative, so testing the sign afterwards misses some overflows.
     */
    private int appendQuantifierDigit(int value, int digit) throws ParseException {
        if (value > (Integer.MAX_VALUE - digit) / 10)
            throw ex("parser.quantifier.5", this.offset);
        return value*10 + digit;
    }

    /**
     * atom ::= char | '.' | char-class | '(' regex ')' | '(?:' regex ')' | '\' [0-9]
     *          | '\w' | '\W' | '\d' | '\D' | '\s' | '\S' | category-block
     *          | '(?>' regex ')'
     * char ::= '\\' | '\' [efnrt] | bmp-code | character-1
     */
    Token parseAtom() throws ParseException {
        int ch = this.read();
        Token tok = null;
        switch (ch) {
          case T_LPAREN:        return this.processParen();
          case T_LPAREN2:       return this.processParen2(); // '(?:'
          case T_CONDITION:     return this.processCondition(); // '(?('
          case T_MODIFIERS:     return this.processModifiers(); // (?modifiers ... )
          case T_INDEPENDENT:   return this.processIndependent();
          case T_DOT:
            this.next();                    // Skips '.'
            tok = Token.token_dot;
            break;

            /**
             * char-class ::= '[' ( '^'? range ','?)+ ']'
             * range ::= '\d' | '\w' | '\s' | category-block | range-char
             *           | range-char '-' range-char
             * range-char ::= '\[' | '\]' | '\\' | '\' [,-efnrtv] | bmp-code | character-2
             * bmp-code ::= '\' 'u' [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F]
             */
          case T_LBRACKET:      return this.parseCharacterClass(true);
          case T_SET_OPERATIONS: return this.parseSetOperations();

          case T_BACKSOLIDUS:
            switch (this.chardata) {
              case 'd':  case 'D':
              case 'w':  case 'W':
              case 's':  case 'S':
                tok = this.getTokenForShorthand(this.chardata);
                this.next();
                return tok;

              case 'e':  case 'f':  case 'n':  case 'r':
              case 't':  case 'u':  case 'v':  case 'x':
                {
                    int ch2 = this.decodeEscaped();
                    if (ch2 < 0x10000) {
                        tok = Token.createChar(ch2);
                    } else {
                        // Supplementary code point: represent it as a surrogate pair.
                        tok = Token.createString(REUtil.decomposeToSurrogates(ch2));
                    }
                }
                break;

              case 'c': return this.processBacksolidus_c();
              case 'C': return this.processBacksolidus_C();
              case 'i': return this.processBacksolidus_i();
              case 'I': return this.processBacksolidus_I();
              case 'g': return this.processBacksolidus_g();
              case 'X': return this.processBacksolidus_X();
              case '1':  case '2':  case '3':  case '4':
              case '5':  case '6':  case '7':  case '8':  case '9':
                return this.processBackreference();

              case 'P':
              case 'p':
                int pstart = this.offset;
                tok = processBacksolidus_pP(this.chardata);
                if (tok == null)  throw this.ex("parser.atom.5", pstart);
                break;

              default:
                tok = Token.createChar(this.chardata);
            }
            this.next();
            break;

          case T_CHAR:
            if (this.chardata == ']' || this.chardata == '{' || this.chardata == '}')
                throw this.ex("parser.atom.4", this.offset-1);
            tok = Token.createChar(this.chardata);
            int high = this.chardata;
            this.next();
            if (REUtil.isHighSurrogate(high)
                && this.read() == T_CHAR && REUtil.isLowSurrogate(this.chardata)) {
                // Group the surrogate pair into one unit, so that a following
                // quantifier applies to the pair rather than to the low surrogate.
                char[] sur = new char[2];
                sur[0] = (char)high;
                sur[1] = (char)this.chardata;
                tok = Token.createParen(Token.createString(new String(sur)), 0);
                this.next();
            }
            break;

          default:
            throw this.ex("parser.atom.4", this.offset-1);
        }
        return tok;
    }

    /**
     * Handles a category escape '\p{name}' or '\P{name}'. On return this.offset
     * points to the character after '}'; the caller advances the tokenizer.
     *
     * @param c 'p' for a positive range; any other value yields the complement
     * @return the result of <code>Token.getRange</code> for the name (callers in this
     *         class treat null as an unknown name)
     */
    protected RangeToken processBacksolidus_pP(int c) throws ParseException {

        this.next();
        if (this.read() != T_CHAR || this.chardata != '{')
            throw this.ex("parser.atom.2", this.offset-1);

        // handle category escape
        boolean positive = c == 'p';
        int namestart = this.offset;
        int nameend = this.regex.indexOf('}', namestart);

        if (nameend < 0)
            throw this.ex("parser.atom.3", this.offset);

        String pname = this.regex.substring(namestart, nameend);
        this.offset = nameend+1;

        return Token.getRange(pname, positive, this.isSet(RegularExpression.XMLSCHEMA_MODE));
    }

    /**
     * Handles '\c', '\C', '\i' and '\I' inside a character class. This
     * implementation ignores both arguments and decodes the escape as a plain
     * escaped character.
     *
     * @return the decoded character; the caller treats a negative value as
     *         "nothing further to add"
     */
    int processCIinCharacterClass(RangeToken tok, int c) {
        return this.decodeEscaped();
    }

    /**
     * char-class ::= '[' ( '^'? range ','?)+ ']'
     * range ::= '\d' | '\w' | '\s' | category-block | range-char
     *           | range-char '-' range-char
     * range-char ::= '\[' | '\]' | '\\' | '\' [,-efnrtv] | bmp-code | character-2
     * bmp-code ::= '\' 'u' [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F]
     *
     * @param useNrange if true, a negated class is returned as an NRange token;
     *        if false, it is computed by subtracting from the full range
     */
    protected RangeToken parseCharacterClass(boolean useNrange) throws ParseException {
        this.setContext(S_INBRACKETS);
        this.next();                            // '['
        boolean nrange = false;
        RangeToken base = null;
        RangeToken tok;
        if (this.read() == T_CHAR && this.chardata == '^') {
            nrange = true;
            this.next();                        // '^'
            if (useNrange) {
                tok = Token.createNRange();
            } else {
                base = Token.createRange();
                base.addRange(0, Token.UTF16_MAX);
                tok = Token.createRange();
            }
        } else {
            tok = Token.createRange();
        }
        int type;
        boolean firstloop = true;
        while ((type = this.read()) != T_EOF) {
            // A ']' in first position is a literal, not the end of the class.
            if (type == T_CHAR && this.chardata == ']' && !firstloop)
                break;
            firstloop = false;
            int c = this.chardata;
            boolean end = false;                // true when this item cannot start a range
            if (type == T_BACKSOLIDUS) {
                switch (c) {
                  case 'd':  case 'D':
                  case 'w':  case 'W':
                  case 's':  case 'S':
                    tok.mergeRanges(this.getTokenForShorthand(c));
                    end = true;
                    break;

                  case 'i':  case 'I':
                  case 'c':  case 'C':
                    c = this.processCIinCharacterClass(tok, c);
                    if (c < 0)  end = true;
                    break;

                  case 'p':
                  case 'P':
                    int pstart = this.offset;
                    RangeToken tok2 = this.processBacksolidus_pP(c);
                    if (tok2 == null)  throw this.ex("parser.atom.5", pstart);
                    tok.mergeRanges(tok2);
                    end = true;
                    break;

                  default:
                    c = this.decodeEscaped();
                } // \ + c
            } // backsolidus
                                                // POSIX Character class such as [:alnum:]
            else if (type == T_POSIX_CHARCLASS_START) {
                int nameend = this.regex.indexOf(':', this.offset);
                if (nameend < 0) throw this.ex("parser.cc.1", this.offset);
                boolean positive = true;
                if (this.regex.charAt(this.offset) == '^') {
                    this.offset ++;
                    positive = false;
                }
                String name = this.regex.substring(this.offset, nameend);
                RangeToken range = Token.getRange(name, positive,
                                                  this.isSet(RegularExpression.XMLSCHEMA_MODE));
                if (range == null)  throw this.ex("parser.cc.3", this.offset);
                tok.mergeRanges(range);
                end = true;
                if (nameend+1 >= this.regexlen || this.regex.charAt(nameend+1) != ']')
                    throw this.ex("parser.cc.1", nameend);
                this.offset = nameend+2;
            }
            this.next();
            if (!end) {                         // if not shorthands...
                if (this.read() != T_CHAR || this.chardata != '-') { // Here is no '-'.
                    tok.addRange(c, c);
                } else {
                    this.next(); // Skips '-'
                    if ((type = this.read()) == T_EOF)  throw this.ex("parser.cc.2", this.offset);
                    if (type == T_CHAR && this.chardata == ']') {
                        // Trailing '-' right before ']' is a literal hyphen.
                        tok.addRange(c, c);
                        tok.addRange('-', '-');
                    } else {
                        int rangeend = this.chardata;
                        if (type == T_BACKSOLIDUS)
                            rangeend = this.decodeEscaped();
                        this.next();
                        tok.addRange(c, rangeend);
                    }
                }
            }
            if (this.isSet(RegularExpression.SPECIAL_COMMA)
                && this.read() == T_CHAR && this.chardata == ',')
                this.next();
        }
        if (this.read() == T_EOF)
            throw this.ex("parser.cc.2", this.offset);
        if (!useNrange && nrange) {
            base.subtractRanges(tok);
            tok = base;
        }
        tok.sortRanges();
        tok.compactRanges();
        /*
        if (this.isSet(RegularExpression.IGNORE_CASE))
            tok = RangeToken.createCaseInsensitiveToken(tok);
        */
        this.setContext(S_NORMAL);
        this.next();                    // Skips ']'

        return tok;
    }

    /**
     * '(?[' ... ']' (('-' | '+' | '&') '[' ... ']')? ')'
     *
     * <p>Each operator combines the following class into the result so far:
     * '+' merges, '-' subtracts and '&amp;' intersects.</p>
     */
    protected RangeToken parseSetOperations() throws ParseException {
        RangeToken tok = this.parseCharacterClass(false);
        int type;
        while ((type = this.read()) != T_RPAREN) {
            int ch = this.chardata;
            if (type == T_CHAR && (ch == '-' || ch == '&')
                || type == T_PLUS) {
                this.next();
                if (this.read() != T_LBRACKET) throw ex("parser.ope.1", this.offset-1);
                RangeToken t2 = this.parseCharacterClass(false);
                if (type == T_PLUS)
                    tok.mergeRanges(t2);
                else if (ch == '-')
                    tok.subtractRanges(t2);
                else if (ch == '&')
                    tok.intersectRanges(t2);
                else
                    throw new RuntimeException("ASSERT");
            } else {
                throw ex("parser.ope.2", this.offset-1);
            }
        }
        this.next();
        return tok;
    }

    /**
     * Returns the range token for one of the shorthands d, D, w, W, s, S.
     * With USE_UNICODE_CATEGORY set the range is looked up by name; otherwise a
     * predefined token is used.
     *
     * @param ch the shorthand letter
     * @throws RuntimeException if <code>ch</code> is not one of the six shorthands
     */
    Token getTokenForShorthand(int ch) {
        boolean unicode = this.isSet(RegularExpression.USE_UNICODE_CATEGORY);
        Token tok;
        switch (ch) {
          case 'd':
            tok = unicode ? Token.getRange("Nd", true) : Token.token_0to9;
            break;
          case 'D':
            tok = unicode ? Token.getRange("Nd", false) : Token.token_not_0to9;
            break;
          case 'w':
            tok = unicode ? Token.getRange("IsWord", true) : Token.token_wordchars;
            break;
          case 'W':
            tok = unicode ? Token.getRange("IsWord", false) : Token.token_not_wordchars;
            break;
          case 's':
            tok = unicode ? Token.getRange("IsSpace", true) : Token.token_spaces;
            break;
          case 'S':
            tok = unicode ? Token.getRange("IsSpace", false) : Token.token_not_spaces;
            break;

          default:
            throw new RuntimeException("Internal Error: shorthands: \\u"+Integer.toString(ch, 16));
        }
        return tok;
    }

    /**
     * Decodes the escape at the current token and returns its character value.
     * The current token must be T_BACKSOLIDUS. For the multi-character forms
     * (x, u, v) the tokenizer is left on the last character of the escape; the
     * caller is responsible for advancing past it.
     *
     * <ul>
     *   <li>e, f, n, r, t: the corresponding control character</li>
     *   <li>x + two hex digits, or x{hex digits}: a code point</li>
     *   <li>u + four hex digits</li>
     *   <li>v + six hex digits</li>
     *   <li>A, Z, z: rejected with "parser.descape.5"</li>
     *   <li>anything else: the escaped character itself</li>
     * </ul>
     *
     * @throws ParseException if the escape is malformed or out of range
     */
    int decodeEscaped() throws ParseException {
        if (this.read() != T_BACKSOLIDUS)  throw ex("parser.next.1", this.offset-1);
        int c = this.chardata;
        switch (c) {
          case 'e':  c = 0x1b;  break; // ESCAPE U+001B
          case 'f':  c = '\f';  break; // FORM FEED U+000C
          case 'n':  c = '\n';  break; // LINE FEED U+000A
          case 'r':  c = '\r';  break; // CARRIAGE RETURN U+000D
          case 't':  c = '\t';  break; // HORIZONTAL TABULATION U+0009
          //case 'v':  c = 0x0b;  break; // VERTICAL TABULATION U+000B
          case 'x':
            this.next();
            if (this.read() != T_CHAR)  throw ex("parser.descape.1", this.offset-1);
            if (this.chardata == '{') {
                int v1 = 0;
                int uv = 0;
                do {
                    this.next();
                    if (this.read() != T_CHAR)  throw ex("parser.descape.1", this.offset-1);
                    if ((v1 = hexChar(this.chardata)) < 0)
                        break;
                    if (uv > uv*16) throw ex("parser.descape.2", this.offset-1);
                    uv = uv*16+v1;
                } while (true);
                if (this.chardata != '}')  throw ex("parser.descape.3", this.offset-1);
                if (uv > Token.UTF16_MAX)  throw ex("parser.descape.4", this.offset-1);
                c = uv;
            } else {
                // The first hex digit is already the current token.
                int uv = this.currentHexDigit();
                uv = uv*16 + this.nextHexDigit();
                c = uv;
            }
            break;

          case 'u':
            c = this.readHexDigits(4);
            break;

          case 'v':
            {
                int uv = this.readHexDigits(6);
                // Use the same message key as the x{...} form above.
                if (uv > Token.UTF16_MAX)  throw ex("parser.descape.4", this.offset-1);
                c = uv;
            }
            break;
          case 'A':
          case 'Z':
          case 'z':
            throw ex("parser.descape.5", this.offset-2);
          default:
        }
        return c;
    }

    /** Returns the value of the current token as a hex digit, or fails. */
    private int currentHexDigit() throws ParseException {
        int v;
        if (this.read() != T_CHAR || (v = hexChar(this.chardata)) < 0)
            throw ex("parser.descape.1", this.offset-1);
        return v;
    }

    /** Advances one token and returns its value as a hex digit, or fails. */
    private int nextHexDigit() throws ParseException {
        this.next();
        return this.currentHexDigit();
    }

    /** Reads exactly <code>count</code> hex digits following the current token. */
    private int readHexDigits(int count) throws ParseException {
        int value = 0;
        for (int i = 0;  i < count;  i ++) {
            value = value*16 + this.nextHexDigit();
        }
        return value;
    }

    /**
     * Returns the numeric value of a hexadecimal digit character, or -1 if
     * <code>ch</code> is not one of 0-9, A-F, a-f.
     */
    static private final int hexChar(int ch) {
        if (ch < '0')  return -1;
        if (ch > 'f')  return -1;
        if (ch <= '9')  return ch-'0';
        if (ch < 'A')  return -1;
        if (ch <= 'F')  return ch-'A'+10;
        if (ch < 'a')  return -1;
        return ch-'a'+10;
    }
}