Home » Xerces-J-src.2.9.1 » org.apache.xerces » impl » xpath » regex » [javadoc | source]

    1   /*
    2    * Licensed to the Apache Software Foundation (ASF) under one or more
    3    * contributor license agreements.  See the NOTICE file distributed with
    4    * this work for additional information regarding copyright ownership.
    5    * The ASF licenses this file to You under the Apache License, Version 2.0
    6    * (the "License"); you may not use this file except in compliance with
    7    * the License.  You may obtain a copy of the License at
    8    * 
    9    *      http://www.apache.org/licenses/LICENSE-2.0
   10    * 
   11    * Unless required by applicable law or agreed to in writing, software
   12    * distributed under the License is distributed on an "AS IS" BASIS,
   13    * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   14    * See the License for the specific language governing permissions and
   15    * limitations under the License.
   16    */
   17   
   18   package org.apache.xerces.impl.xpath.regex;
   19   
   20   import java.util.Locale;
   21   import java.util.MissingResourceException;
   22   import java.util.ResourceBundle;
   23   import java.util.Vector;
   24   
   25   /**
   26    * A Regular Expression Parser.
   27    * 
   28    * @xerces.internal
   29    *
   30    * @version $Id: RegexParser.java 469061 2006-10-30 04:16:15Z mrglavas $
   31    */
   32   class RegexParser {
    // Token types: next() stores one of these in nexttoken and read() returns it.
    static final int T_CHAR = 0;                // an ordinary character; its value is in chardata
    static final int T_EOF = 1;                 // end of the pattern has been reached
    static final int T_OR = 2;                  // '|'
    static final int T_STAR = 3;                // '*'
    static final int T_PLUS = 4;                // '+'
    static final int T_QUESTION = 5;            // '?'
    static final int T_LPAREN = 6;              // '('
    static final int T_RPAREN = 7;              // ')'
    static final int T_DOT = 8;                 // '.'
    static final int T_LBRACKET = 9;            // '['
    static final int T_BACKSOLIDUS = 10;        // '\'
    static final int T_CARET = 11;              // '^'
    static final int T_DOLLAR = 12;             // '$'
    static final int T_LPAREN2 = 13;            // '(?:'
    static final int T_LOOKAHEAD = 14;          // '(?='
    static final int T_NEGATIVELOOKAHEAD = 15;  // '(?!'
    static final int T_LOOKBEHIND = 16;         // '(?<='
    static final int T_NEGATIVELOOKBEHIND = 17; // '(?<!'
    static final int T_INDEPENDENT = 18;        // '(?>'
    static final int T_SET_OPERATIONS = 19;     // '(?['
    static final int T_POSIX_CHARCLASS_START = 20; // '[:' in a character class
    static final int T_COMMENT = 21;            // '(?#'
    static final int T_MODIFIERS = 22;          // '(?' [\-,a-z,A-Z]
    static final int T_CONDITION = 23;          // '(?('
    static final int T_XMLSCHEMA_CC_SUBTRACTION = 24; // '-[' in a character class
   58   
   59       static class ReferencePosition {
   60           int refNumber;
   61           int position;
   62           ReferencePosition(int n, int pos) {
   63               this.refNumber = n;
   64               this.position = pos;
   65           }
   66       }
   67   
    // Parser state. parse() re-initialises offset, regex, regexlen, options,
    // context, parennumber and hasBackReferences at the start of every run.
    int offset;                 // index in regex of the next character next() will read
    String regex;               // pattern being parsed (after extended-comment stripping, when enabled)
    int regexlen;               // regex.length(), cached by parse()
    int options;                // option flags given to parse(); tested through isSet()
    ResourceBundle resources;   // message bundle loaded by setLocale(), used by ex()
    int chardata;               // character carried by the current token; -1 at end of input
    int nexttoken;              // type (one of T_*) of the current token, set by next()
    static protected final int S_NORMAL = 0;      // default tokenizer context
    static protected final int S_INBRACKETS = 1;  // tokenizer context used inside a character class
    static protected final int S_INXBRACKETS = 2; // NOTE(review): not used in the visible part of this file
    int context = S_NORMAL;     // current tokenizer context (one of S_*)
    int parennumber = 1;        // number that the next capturing group will receive
    boolean hasBackReferences;  // set once a back reference or a numbered condition has been parsed
    Vector references = null;   // ReferencePosition entries checked at the end of parse(); created lazily
   82   
   83       public RegexParser() {
   84           this.setLocale(Locale.getDefault());
   85       }
   86       public RegexParser(Locale locale) {
   87           this.setLocale(locale);
   88       }
   89   
   90       public void setLocale(Locale locale) {
   91           try {
   92               this.resources = ResourceBundle.getBundle("org.apache.xerces.impl.xpath.regex.message", locale);
   93           } catch (MissingResourceException mre) {
   94               throw new RuntimeException("Installation Problem???  Couldn't load messages: "
   95                                          +mre.getMessage());
   96           }
   97       }
   98   
   99       final ParseException ex(String key, int loc) {
  100           return new ParseException(this.resources.getString(key), loc);
  101       }
  102   
  103       private final boolean isSet(int flag) {
  104           return (this.options & flag) == flag;
  105       }
  106   
  107       synchronized Token parse(String regex, int options) throws ParseException {
  108           this.options = options;
  109           this.offset = 0;
  110           this.setContext(S_NORMAL);
  111           this.parennumber = 1;
  112           this.hasBackReferences = false;
  113           this.regex = regex;
  114           if (this.isSet(RegularExpression.EXTENDED_COMMENT))
  115               this.regex = REUtil.stripExtendedComment(this.regex);
  116           this.regexlen = this.regex.length();
  117   
  118   
  119           this.next();
  120           Token ret = this.parseRegex();
  121           if (this.offset != this.regexlen)
  122               throw ex("parser.parse.1", this.offset);
  123           if (this.references != null) {
  124               for (int i = 0;  i < this.references.size();  i ++) {
  125                   ReferencePosition position = (ReferencePosition)this.references.elementAt(i);
  126                   if (this.parennumber <= position.refNumber)
  127                       throw ex("parser.parse.2", position.position);
  128               }
  129               this.references.removeAllElements();
  130           }
  131           return ret;
  132       }
  133   
  134       /*
  135       public RegularExpression createRegex(String regex, int options) throws ParseException {
  136           Token tok = this.parse(regex, options);
  137           return new RegularExpression(regex, tok, this.parennumber, this.hasBackReferences, options);
  138       }
  139       */
  140   
  141       protected final void setContext(int con) {
  142           this.context = con;
  143       }
  144   
  145       final int read() {
  146           return this.nexttoken;
  147       }
  148   
    /**
     * Reads the next token from the pattern: stores its type in nexttoken and
     * its character (where there is one) in chardata, and advances offset past
     * everything that belongs to the token.
     *
     * At the end of the pattern the token is T_EOF and chardata is -1.
     * A ParseException built by ex() is thrown for a dangling backslash and
     * for malformed '(?' constructs; the method has no throws clause, so
     * ParseException is evidently an unchecked exception.
     */
    final void next() {
        if (this.offset >= this.regexlen) {
            this.chardata = -1;
            this.nexttoken = T_EOF;
            return;
        }

        int ret;
        int ch = this.regex.charAt(this.offset++);
        this.chardata = ch;

        if (this.context == S_INBRACKETS) {
            // In a character class, this.chardata has one character, that is to say,
            // a pair of surrogates is composed and stored to this.chardata.
            switch (ch) {
              case '\\':
                ret = T_BACKSOLIDUS;
                if (this.offset >= this.regexlen)
                    throw ex("parser.next.1", this.offset-1);
                // chardata becomes the escaped character, not the backslash.
                this.chardata = this.regex.charAt(this.offset++);
                break;

              case '-':
                // '-[' is one token, but only in XML Schema mode.
                if (this.isSet(RegularExpression.XMLSCHEMA_MODE)
                    && this.offset < this.regexlen && this.regex.charAt(this.offset) == '[') {
                    this.offset++;
                    ret = T_XMLSCHEMA_CC_SUBTRACTION;
                } else
                    ret = T_CHAR;
                break;

              case '[':
                // '[:' is one token, but only outside XML Schema mode.
                if (!this.isSet(RegularExpression.XMLSCHEMA_MODE)
                    && this.offset < this.regexlen && this.regex.charAt(this.offset) == ':') {
                    this.offset++;
                    ret = T_POSIX_CHARCLASS_START;
                    break;
                } // otherwise falls through to default
              default:
                // Merge a high/low surrogate pair into one code point.
                if (REUtil.isHighSurrogate(ch) && this.offset < this.regexlen) {
                    int low = this.regex.charAt(this.offset);
                    if (REUtil.isLowSurrogate(low)) {
                        this.chardata = REUtil.composeFromSurrogates(ch, low);
                        this.offset ++;
                    }
                }
                ret = T_CHAR;
            }
            this.nexttoken = ret;
            return;
        }

        // Any context other than S_INBRACKETS.
        switch (ch) {
          case '|': ret = T_OR;             break;
          case '*': ret = T_STAR;           break;
          case '+': ret = T_PLUS;           break;
          case '?': ret = T_QUESTION;       break;
          case ')': ret = T_RPAREN;         break;
          case '.': ret = T_DOT;            break;
          case '[': ret = T_LBRACKET;       break;
          case '^':
              // In XML Schema mode '^' is an ordinary character.
              if (this.isSet(RegularExpression.XMLSCHEMA_MODE)) {
                  ret = T_CHAR;
              }
              else {
                  ret = T_CARET;
              }
              break;
          case '$':
              // In XML Schema mode '$' is an ordinary character.
              if (this.isSet(RegularExpression.XMLSCHEMA_MODE)) {
                  ret = T_CHAR;
              }
              else {
                  ret = T_DOLLAR;
              }
              break;
          case '(':
            ret = T_LPAREN;
            // A plain '(' unless it is directly followed by '?'.
            if (this.offset >= this.regexlen)
                break;
            if (this.regex.charAt(this.offset) != '?')
                break;
            if (++this.offset >= this.regexlen)
                throw ex("parser.next.2", this.offset-1);
            // ch is now the character after '(?' and selects the construct.
            ch = this.regex.charAt(this.offset++);
            switch (ch) {
              case ':':  ret = T_LPAREN2;            break;
              case '=':  ret = T_LOOKAHEAD;          break;
              case '!':  ret = T_NEGATIVELOOKAHEAD;  break;
              case '[':  ret = T_SET_OPERATIONS;     break;
              case '>':  ret = T_INDEPENDENT;        break;
              case '<':
                if (this.offset >= this.regexlen)
                    throw ex("parser.next.2", this.offset-3);
                ch = this.regex.charAt(this.offset++);
                if (ch == '=') {
                    ret = T_LOOKBEHIND;
                } else if (ch == '!') {
                    ret = T_NEGATIVELOOKBEHIND;
                } else
                    throw ex("parser.next.3", this.offset-3);
                break;
              case '#':
                // '(?#': skip the comment text up to and including ')'.
                while (this.offset < this.regexlen) {
                    ch = this.regex.charAt(this.offset++);
                    if (ch == ')')  break;
                }
                // The input ended before a ')' was found.
                if (ch != ')')
                    throw ex("parser.next.4", this.offset-1);
                ret = T_COMMENT;
                break;
              default:
                if (ch == '-' || 'a' <= ch && ch <= 'z' || 'A' <= ch && ch <= 'Z') {// Options
                    // Step back so that offset points just past the '?'.
                    this.offset --;
                    ret = T_MODIFIERS;
                    break;
                } else if (ch == '(') {         // conditional
                    ret = T_CONDITION;          // this.offset now points just past the second '('.
                    break;
                }
                throw ex("parser.next.2", this.offset-2);
            }
            break;

          case '\\':
            ret = T_BACKSOLIDUS;
            if (this.offset >= this.regexlen)
                throw ex("parser.next.1", this.offset-1);
            // chardata becomes the escaped character, not the backslash.
            this.chardata = this.regex.charAt(this.offset++);
            break;

          default:
            ret = T_CHAR;
        }
        this.nexttoken = ret;
    }
  285   
  286       /**
  287        * regex ::= term (`|` term)*
  288        * term ::= factor+
  289        * factor ::= ('^' | '$' | '\A' | '\Z' | '\z' | '\b' | '\B' | '\<' | '\>'
  290        *            | atom (('*' | '+' | '?' | minmax ) '?'? )?)
  291        *            | '(?=' regex ')'  | '(?!' regex ')'  | '(?&lt;=' regex ')'  | '(?&lt;!' regex ')'
  292        * atom ::= char | '.' | range | '(' regex ')' | '(?:' regex ')' | '\' [0-9]
  293        *          | '\w' | '\W' | '\d' | '\D' | '\s' | '\S' | category-block 
  294        */
  295       Token parseRegex() throws ParseException {
  296           Token tok = this.parseTerm();
  297           Token parent = null;
  298           while (this.read() == T_OR) {
  299               this.next();                    // '|'
  300               if (parent == null) {
  301                   parent = Token.createUnion();
  302                   parent.addChild(tok);
  303                   tok = parent;
  304               }
  305               tok.addChild(this.parseTerm());
  306           }
  307           return tok;
  308       }
  309   
  310       /**
  311        * term ::= factor+
  312        */
  313       Token parseTerm() throws ParseException {
  314           int ch = this.read();
  315           if (ch == T_OR || ch == T_RPAREN || ch == T_EOF) {
  316               return Token.createEmpty();
  317           } else {
  318               Token tok = this.parseFactor();
  319               Token concat = null;
  320               while ((ch = this.read()) != T_OR && ch != T_RPAREN && ch != T_EOF) {
  321                   if (concat == null) {
  322                       concat = Token.createConcat();
  323                       concat.addChild(tok);
  324                       tok = concat;
  325                   }
  326                   concat.addChild(this.parseFactor());
  327                   //tok = Token.createConcat(tok, this.parseFactor());
  328               }
  329               return tok;
  330           }
  331       }
  332   
  333       // ----------------------------------------------------------------
  334   
  335       Token processCaret() throws ParseException {
  336           this.next();
  337           return Token.token_linebeginning;
  338       }
  339       Token processDollar() throws ParseException {
  340           this.next();
  341           return Token.token_lineend;
  342       }
  343       Token processLookahead() throws ParseException {
  344           this.next();
  345           Token tok = Token.createLook(Token.LOOKAHEAD, this.parseRegex());
  346           if (this.read() != T_RPAREN)  throw ex("parser.factor.1", this.offset-1);
  347           this.next();                            // ')'
  348           return tok;
  349       }
  350       Token processNegativelookahead() throws ParseException {
  351           this.next();
  352           Token tok = Token.createLook(Token.NEGATIVELOOKAHEAD, this.parseRegex());
  353           if (this.read() != T_RPAREN)  throw ex("parser.factor.1", this.offset-1);
  354           this.next();                            // ')'
  355           return tok;
  356       }
  357       Token processLookbehind() throws ParseException {
  358           this.next();
  359           Token tok = Token.createLook(Token.LOOKBEHIND, this.parseRegex());
  360           if (this.read() != T_RPAREN)  throw ex("parser.factor.1", this.offset-1);
  361           this.next();                            // ')'
  362           return tok;
  363       }
  364       Token processNegativelookbehind() throws ParseException {
  365           this.next();
  366           Token tok = Token.createLook(Token.NEGATIVELOOKBEHIND, this.parseRegex());
  367           if (this.read() != T_RPAREN)  throw ex("parser.factor.1", this.offset-1);
  368           this.next();                    // ')'
  369           return tok;
  370       }
  371       Token processBacksolidus_A() throws ParseException {
  372           this.next();
  373           return Token.token_stringbeginning;
  374       }
  375       Token processBacksolidus_Z() throws ParseException {
  376           this.next();
  377           return Token.token_stringend2;
  378       }
  379       Token processBacksolidus_z() throws ParseException {
  380           this.next();
  381           return Token.token_stringend;
  382       }
  383       Token processBacksolidus_b() throws ParseException {
  384           this.next();
  385           return Token.token_wordedge;
  386       }
  387       Token processBacksolidus_B() throws ParseException {
  388           this.next();
  389           return Token.token_not_wordedge;
  390       }
  391       Token processBacksolidus_lt() throws ParseException {
  392           this.next();
  393           return Token.token_wordbeginning;
  394       }
  395       Token processBacksolidus_gt() throws ParseException {
  396           this.next();
  397           return Token.token_wordend;
  398       }
  399       Token processStar(Token tok) throws ParseException {
  400           this.next();
  401           if (this.read() == T_QUESTION) {
  402               this.next();
  403               return Token.createNGClosure(tok);
  404           } else
  405               return Token.createClosure(tok);
  406       }
  407       Token processPlus(Token tok) throws ParseException {
  408           // X+ -> XX*
  409           this.next();
  410           if (this.read() == T_QUESTION) {
  411               this.next();
  412               return Token.createConcat(tok, Token.createNGClosure(tok));
  413           } else
  414               return Token.createConcat(tok, Token.createClosure(tok));
  415       }
  416       Token processQuestion(Token tok) throws ParseException {
  417           // X? -> X|
  418           this.next();
  419           Token par = Token.createUnion();
  420           if (this.read() == T_QUESTION) {
  421               this.next();
  422               par.addChild(Token.createEmpty());
  423               par.addChild(tok);
  424           } else {
  425               par.addChild(tok);
  426               par.addChild(Token.createEmpty());
  427           }
  428           return par;
  429       }
  430       boolean checkQuestion(int off) {
  431           return off < this.regexlen && this.regex.charAt(off) == '?';
  432       }
  433       Token processParen() throws ParseException {
  434           this.next();
  435           int p = this.parennumber++;
  436           Token tok = Token.createParen(this.parseRegex(), p);
  437           if (this.read() != T_RPAREN)  throw ex("parser.factor.1", this.offset-1);
  438           this.next();                            // Skips ')'
  439           return tok;
  440       }
  441       Token processParen2() throws ParseException {
  442           this.next();
  443           Token tok = Token.createParen(this.parseRegex(), 0);
  444           if (this.read() != T_RPAREN)  throw ex("parser.factor.1", this.offset-1);
  445           this.next();                            // Skips ')'
  446           return tok;
  447       }
  448       Token processCondition() throws ParseException {
  449                                                   // this.offset points the next of '('
  450           if (this.offset+1 >= this.regexlen)  throw ex("parser.factor.4", this.offset);
  451                                                   // Parses a condition.
  452           int refno = -1;
  453           Token condition = null;
  454           int ch = this.regex.charAt(this.offset);
  455           if ('1' <= ch && ch <= '9') {
  456               refno = ch-'0';
  457               this.hasBackReferences = true;
  458               if (this.references == null)  this.references = new Vector();
  459               this.references.addElement(new ReferencePosition(refno, this.offset));
  460               this.offset ++;
  461               if (this.regex.charAt(this.offset) != ')')  throw ex("parser.factor.1", this.offset);
  462               this.offset ++;
  463           } else {
  464               if (ch == '?')  this.offset --; // Points '('.
  465               this.next();
  466               condition = this.parseFactor();
  467               switch (condition.type) {
  468                 case Token.LOOKAHEAD:
  469                 case Token.NEGATIVELOOKAHEAD:
  470                 case Token.LOOKBEHIND:
  471                 case Token.NEGATIVELOOKBEHIND:
  472                   break;
  473                 case Token.ANCHOR:
  474                   if (this.read() != T_RPAREN)  throw ex("parser.factor.1", this.offset-1);
  475                   break;
  476                 default:
  477                   throw ex("parser.factor.5", this.offset);
  478               }
  479           }
  480                                                   // Parses yes/no-patterns.
  481           this.next();
  482           Token yesPattern = this.parseRegex();
  483           Token noPattern = null;
  484           if (yesPattern.type == Token.UNION) {
  485               if (yesPattern.size() != 2)  throw ex("parser.factor.6", this.offset);
  486               noPattern = yesPattern.getChild(1);
  487               yesPattern = yesPattern.getChild(0);
  488           }
  489           if (this.read() != T_RPAREN)  throw ex("parser.factor.1", this.offset-1);
  490           this.next();
  491           return Token.createCondition(refno, condition, yesPattern, noPattern);
  492       }
  493       Token processModifiers() throws ParseException {
  494                                                   // this.offset points the next of '?'.
  495                                                   // modifiers ::= [imsw]* ('-' [imsw]*)? ':'
  496           int add = 0, mask = 0, ch = -1;
  497           while (this.offset < this.regexlen) {
  498               ch = this.regex.charAt(this.offset);
  499               int v = REUtil.getOptionValue(ch);
  500               if (v == 0)  break;                 // '-' or ':'?
  501               add |= v;
  502               this.offset ++;
  503           }
  504           if (this.offset >= this.regexlen)  throw ex("parser.factor.2", this.offset-1);
  505           if (ch == '-') {
  506               this.offset ++;
  507               while (this.offset < this.regexlen) {
  508                   ch = this.regex.charAt(this.offset);
  509                   int v = REUtil.getOptionValue(ch);
  510                   if (v == 0)  break;             // ':'?
  511                   mask |= v;
  512                   this.offset ++;
  513               }
  514               if (this.offset >= this.regexlen)  throw ex("parser.factor.2", this.offset-1);
  515           }
  516           Token tok;
  517           if (ch == ':') {
  518               this.offset ++;
  519               this.next();
  520               tok = Token.createModifierGroup(this.parseRegex(), add, mask);
  521               if (this.read() != T_RPAREN)  throw ex("parser.factor.1", this.offset-1);
  522               this.next();
  523           } else if (ch == ')') {                 // such as (?-i)
  524               this.offset ++;
  525               this.next();
  526               tok = Token.createModifierGroup(this.parseRegex(), add, mask);
  527           } else
  528               throw ex("parser.factor.3", this.offset);
  529   
  530           return tok;
  531       }
  532       Token processIndependent() throws ParseException {
  533           this.next();
  534           Token tok = Token.createLook(Token.INDEPENDENT, this.parseRegex());
  535           if (this.read() != T_RPAREN)  throw ex("parser.factor.1", this.offset-1);
  536           this.next();                            // Skips ')'
  537           return tok;
  538       }
  539       Token processBacksolidus_c() throws ParseException {
  540           int ch2;                                // Must be in 0x0040-0x005f
  541           if (this.offset >= this.regexlen
  542               || ((ch2 = this.regex.charAt(this.offset++)) & 0xffe0) != 0x0040)
  543               throw ex("parser.atom.1", this.offset-1);
  544           this.next();
  545           return Token.createChar(ch2-0x40);
  546       }
  547       Token processBacksolidus_C() throws ParseException {
  548           throw ex("parser.process.1", this.offset);
  549       }
  550       Token processBacksolidus_i() throws ParseException {
  551           Token tok = Token.createChar('i');
  552           this.next();
  553           return tok;
  554       }
  555       Token processBacksolidus_I() throws ParseException {
  556           throw ex("parser.process.1", this.offset);
  557       }
  558       Token processBacksolidus_g() throws ParseException {
  559           this.next();
  560           return Token.getGraphemePattern();
  561       }
  562       Token processBacksolidus_X() throws ParseException {
  563           this.next();
  564           return Token.getCombiningCharacterSequence();
  565       }
  566       Token processBackreference() throws ParseException {
  567           int refnum = this.chardata-'0';
  568           Token tok = Token.createBackReference(refnum);
  569           this.hasBackReferences = true;
  570           if (this.references == null)  this.references = new Vector();
  571           this.references.addElement(new ReferencePosition(refnum, this.offset-2));
  572           this.next();
  573           return tok;
  574       }
  575   
  576       // ----------------------------------------------------------------
  577   
  578       /**
  579        * factor ::= ('^' | '$' | '\A' | '\Z' | '\z' | '\b' | '\B' | '\<' | '\>'
  580        *            | atom (('*' | '+' | '?' | minmax ) '?'? )?)
  581        *            | '(?=' regex ')'  | '(?!' regex ')'  | '(?&lt;=' regex ')'  | '(?&lt;!' regex ')'
  582        *            | '(?#' [^)]* ')'
  583        * minmax ::= '{' min (',' max?)? '}'
  584        * min ::= [0-9]+
  585        * max ::= [0-9]+
  586        */
  587       Token parseFactor() throws ParseException {        
  588           int ch = this.read();
  589           Token tok;
  590           switch (ch) {
  591             case T_CARET:         return this.processCaret();
  592             case T_DOLLAR:        return this.processDollar();
  593             case T_LOOKAHEAD:     return this.processLookahead();
  594             case T_NEGATIVELOOKAHEAD: return this.processNegativelookahead();
  595             case T_LOOKBEHIND:    return this.processLookbehind();
  596             case T_NEGATIVELOOKBEHIND: return this.processNegativelookbehind();
  597   
  598             case T_COMMENT:
  599               this.next();
  600               return Token.createEmpty();
  601   
  602             case T_BACKSOLIDUS:
  603               switch (this.chardata) {
  604                 case 'A': return this.processBacksolidus_A();
  605                 case 'Z': return this.processBacksolidus_Z();
  606                 case 'z': return this.processBacksolidus_z();
  607                 case 'b': return this.processBacksolidus_b();
  608                 case 'B': return this.processBacksolidus_B();
  609                 case '<': return this.processBacksolidus_lt();
  610                 case '>': return this.processBacksolidus_gt();
  611               }
  612                                                   // through down
  613           }
  614           tok = this.parseAtom();
  615           ch = this.read();
  616           switch (ch) {
  617             case T_STAR:  return this.processStar(tok);
  618             case T_PLUS:  return this.processPlus(tok);
  619             case T_QUESTION: return this.processQuestion(tok);
  620             case T_CHAR:
  621               if (this.chardata == '{' && this.offset < this.regexlen) {
  622   
  623                   int off = this.offset;          // this.offset -> next of '{'
  624                   int min = 0, max = -1;
  625   
  626                   if ((ch = this.regex.charAt(off++)) >= '0' && ch <= '9') {
  627   
  628                       min = ch -'0';
  629                       while (off < this.regexlen
  630                              && (ch = this.regex.charAt(off++)) >= '0' && ch <= '9') {
  631                           min = min*10 +ch-'0';
  632                           if (min < 0)
  633                               throw ex("parser.quantifier.5", this.offset);
  634                       }
  635                   }
  636                   else {
  637                       throw ex("parser.quantifier.1", this.offset);
  638                   }
  639   
  640                   max = min;
  641                   if (ch == ',') {
  642   
  643                      if (off >= this.regexlen) {
  644                          throw ex("parser.quantifier.3", this.offset);
  645                      }
  646                      else if ((ch = this.regex.charAt(off++)) >= '0' && ch <= '9') {                       
  647   
  648                           max = ch -'0';       // {min,max}
  649                           while (off < this.regexlen
  650                                  && (ch = this.regex.charAt(off++)) >= '0'
  651                                  && ch <= '9') {
  652                               max = max*10 +ch-'0';
  653                               if (max < 0)
  654                                   throw ex("parser.quantifier.5", this.offset);
  655                           }
  656   
  657                           if (min > max)
  658                               throw ex("parser.quantifier.4", this.offset);
  659                      }
  660                      else { // assume {min,}
  661                           max = -1;           
  662                       }
  663                   }
  664   
  665                  if (ch != '}')
  666                      throw ex("parser.quantifier.2", this.offset);
  667   
  668                  if (this.checkQuestion(off)) {  // off -> next of '}'
  669                       tok = Token.createNGClosure(tok);
  670                       this.offset = off+1;
  671                   } else {
  672                       tok = Token.createClosure(tok);
  673                       this.offset = off;
  674                   }
  675   
  676                   tok.setMin(min);
  677                   tok.setMax(max);
  678                   //System.err.println("CLOSURE: "+min+", "+max);
  679                   this.next();
  680               }
  681           }
  682           return tok;
  683       }
  684   
  685       /**
  686        * atom ::= char | '.' | char-class | '(' regex ')' | '(?:' regex ')' | '\' [0-9]
  687        *          | '\w' | '\W' | '\d' | '\D' | '\s' | '\S' | category-block
  688        *          | '(?>' regex ')'
  689        * char ::= '\\' | '\' [efnrt] | bmp-code | character-1
  690        */
  691       Token parseAtom() throws ParseException {
  692           int ch = this.read();
  693           Token tok = null;
  694           switch (ch) {
  695             case T_LPAREN:        return this.processParen();
  696             case T_LPAREN2:       return this.processParen2(); // '(?:'
  697             case T_CONDITION:     return this.processCondition(); // '(?('
  698             case T_MODIFIERS:     return this.processModifiers(); // (?modifiers ... )
  699             case T_INDEPENDENT:   return this.processIndependent();
  700             case T_DOT:
  701               this.next();                    // Skips '.'
  702               tok = Token.token_dot;
  703               break;
  704   
  705               /**
  706                * char-class ::= '[' ( '^'? range ','?)+ ']'
  707                * range ::= '\d' | '\w' | '\s' | category-block | range-char
  708                *           | range-char '-' range-char
  709                * range-char ::= '\[' | '\]' | '\\' | '\' [,-efnrtv] | bmp-code | character-2
  710                * bmp-char ::= '\' 'u' [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F]
  711                */
  712             case T_LBRACKET:      return this.parseCharacterClass(true);
  713             case T_SET_OPERATIONS: return this.parseSetOperations();
  714   
  715             case T_BACKSOLIDUS:
  716               switch (this.chardata) {
  717                 case 'd':  case 'D':
  718                 case 'w':  case 'W':
  719                 case 's':  case 'S':
  720                   tok = this.getTokenForShorthand(this.chardata);
  721                   this.next();
  722                   return tok;
  723   
  724                 case 'e':  case 'f':  case 'n':  case 'r':
  725                 case 't':  case 'u':  case 'v':  case 'x':
  726                   {
  727                       int ch2 = this.decodeEscaped();
  728                       if (ch2 < 0x10000) {
  729                           tok = Token.createChar(ch2);
  730                       } else {
  731                           tok = Token.createString(REUtil.decomposeToSurrogates(ch2));
  732                       }
  733                   }
  734                   break;
  735   
  736                 case 'c': return this.processBacksolidus_c();
  737                 case 'C': return this.processBacksolidus_C();
  738                 case 'i': return this.processBacksolidus_i();
  739                 case 'I': return this.processBacksolidus_I();
  740                 case 'g': return this.processBacksolidus_g();
  741                 case 'X': return this.processBacksolidus_X();
  742                 case '1':  case '2':  case '3':  case '4':
  743                 case '5':  case '6':  case '7':  case '8':  case '9':
  744                   return this.processBackreference();
  745   
  746                 case 'P':
  747                 case 'p':
  748                   int pstart = this.offset;
  749                   tok = processBacksolidus_pP(this.chardata);
  750                   if (tok == null)  throw this.ex("parser.atom.5", pstart);
  751                   break;
  752   
  753                 default:
  754                   tok = Token.createChar(this.chardata);
  755               }
  756               this.next();
  757               break;
  758   
  759             case T_CHAR:
  760               if (this.chardata == ']' || this.chardata == '{' || this.chardata == '}')
  761                   throw this.ex("parser.atom.4", this.offset-1);
  762               tok = Token.createChar(this.chardata);
  763               int high = this.chardata;
  764               this.next();
  765               if (REUtil.isHighSurrogate(high)
  766                   && this.read() == T_CHAR && REUtil.isLowSurrogate(this.chardata)) {
  767                   char[] sur = new char[2];
  768                   sur[0] = (char)high;
  769                   sur[1] = (char)this.chardata;
  770                   tok = Token.createParen(Token.createString(new String(sur)), 0);
  771                   this.next();
  772               }
  773               break;
  774   
  775             default:
  776               throw this.ex("parser.atom.4", this.offset-1);
  777           }
  778           return tok;
  779       }
  780   
  781       protected RangeToken processBacksolidus_pP(int c) throws ParseException {
  782   
  783           this.next();
  784           if (this.read() != T_CHAR || this.chardata != '{')
  785               throw this.ex("parser.atom.2", this.offset-1);
  786   
  787           // handle category escape
  788           boolean positive = c == 'p';
  789           int namestart = this.offset;
  790           int nameend = this.regex.indexOf('}', namestart);
  791   
  792           if (nameend < 0)
  793               throw this.ex("parser.atom.3", this.offset);
  794   
  795           String pname = this.regex.substring(namestart, nameend);
  796           this.offset = nameend+1;
  797   
  798           return Token.getRange(pname, positive, this.isSet(RegularExpression.XMLSCHEMA_MODE));
  799       }
  800   
  801       int processCIinCharacterClass(RangeToken tok, int c) {
  802           return this.decodeEscaped();
  803       }
  804   
  805       /**
  806        * char-class ::= '[' ( '^'? range ','?)+ ']'
  807        * range ::= '\d' | '\w' | '\s' | category-block | range-char
  808        *           | range-char '-' range-char
  809        * range-char ::= '\[' | '\]' | '\\' | '\' [,-efnrtv] | bmp-code | character-2
  810        * bmp-code ::= '\' 'u' [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F]
  811        */
  812       protected RangeToken parseCharacterClass(boolean useNrange) throws ParseException {
  813           this.setContext(S_INBRACKETS);
  814           this.next();                            // '['
  815           boolean nrange = false;
  816           RangeToken base = null;
  817           RangeToken tok;
  818           if (this.read() == T_CHAR && this.chardata == '^') {
  819               nrange = true;
  820               this.next();                        // '^'
  821               if (useNrange) {
  822                   tok = Token.createNRange();
  823               } else {
  824                   base = Token.createRange();
  825                   base.addRange(0, Token.UTF16_MAX);
  826                   tok = Token.createRange();
  827               }
  828           } else {
  829               tok = Token.createRange();
  830           }
  831           int type;
  832           boolean firstloop = true;
  833           while ((type = this.read()) != T_EOF) {
  834               if (type == T_CHAR && this.chardata == ']' && !firstloop)
  835                   break;
  836               firstloop = false;
  837               int c = this.chardata;
  838               boolean end = false;
  839               if (type == T_BACKSOLIDUS) {
  840                   switch (c) {
  841                     case 'd':  case 'D':
  842                     case 'w':  case 'W':
  843                     case 's':  case 'S':
  844                       tok.mergeRanges(this.getTokenForShorthand(c));
  845                       end = true;
  846                       break;
  847   
  848                     case 'i':  case 'I':
  849                     case 'c':  case 'C':
  850                       c = this.processCIinCharacterClass(tok, c);
  851                       if (c < 0)  end = true;
  852                       break;
  853                       
  854                     case 'p':
  855                     case 'P':
  856                       int pstart = this.offset;
  857                       RangeToken tok2 = this.processBacksolidus_pP(c);
  858                       if (tok2 == null)  throw this.ex("parser.atom.5", pstart);
  859                       tok.mergeRanges(tok2);
  860                       end = true;
  861                       break;
  862   
  863                     default:
  864                       c = this.decodeEscaped();
  865                   } // \ + c
  866               } // backsolidus
  867                                                   // POSIX Character class such as [:alnum:]
  868               else if (type == T_POSIX_CHARCLASS_START) {
  869                   int nameend = this.regex.indexOf(':', this.offset);
  870                   if (nameend < 0) throw this.ex("parser.cc.1", this.offset);
  871                   boolean positive = true;
  872                   if (this.regex.charAt(this.offset) == '^') {
  873                       this.offset ++;
  874                       positive = false;
  875                   }
  876                   String name = this.regex.substring(this.offset, nameend);
  877                   RangeToken range = Token.getRange(name, positive,
  878                                                     this.isSet(RegularExpression.XMLSCHEMA_MODE));
  879                   if (range == null)  throw this.ex("parser.cc.3", this.offset);
  880                   tok.mergeRanges(range);
  881                   end = true;
  882                   if (nameend+1 >= this.regexlen || this.regex.charAt(nameend+1) != ']')
  883                       throw this.ex("parser.cc.1", nameend);
  884                   this.offset = nameend+2;
  885               }
  886               this.next();
  887               if (!end) {                         // if not shorthands...
  888                   if (this.read() != T_CHAR || this.chardata != '-') { // Here is no '-'.
  889                       tok.addRange(c, c);
  890                   } else {
  891                       this.next(); // Skips '-'
  892                       if ((type = this.read()) == T_EOF)  throw this.ex("parser.cc.2", this.offset);
  893                       if (type == T_CHAR && this.chardata == ']') {
  894                           tok.addRange(c, c);
  895                           tok.addRange('-', '-');
  896                       } else {
  897                           int rangeend = this.chardata;
  898                           if (type == T_BACKSOLIDUS)
  899                               rangeend = this.decodeEscaped();
  900                           this.next();
  901                           tok.addRange(c, rangeend);
  902                       }
  903                   }
  904               }
  905               if (this.isSet(RegularExpression.SPECIAL_COMMA)
  906                   && this.read() == T_CHAR && this.chardata == ',')
  907                   this.next();
  908           }
  909           if (this.read() == T_EOF)
  910               throw this.ex("parser.cc.2", this.offset);
  911           if (!useNrange && nrange) {
  912               base.subtractRanges(tok);
  913               tok = base;
  914           }
  915           tok.sortRanges();
  916           tok.compactRanges();
  917           //tok.dumpRanges();
  918           /*
  919           if (this.isSet(RegularExpression.IGNORE_CASE))
  920               tok = RangeToken.createCaseInsensitiveToken(tok);
  921           */
  922           this.setContext(S_NORMAL);
  923           this.next();                    // Skips ']'
  924   
  925           return tok;
  926       }
  927   
  928       /**
  929        * '(?[' ... ']' (('-' | '+' | '&') '[' ... ']')? ')'
  930        */
  931       protected RangeToken parseSetOperations() throws ParseException {
  932           RangeToken tok = this.parseCharacterClass(false);
  933           int type;
  934           while ((type = this.read()) != T_RPAREN) {
  935               int ch = this.chardata;
  936               if (type == T_CHAR && (ch == '-' || ch == '&')
  937                   || type == T_PLUS) {
  938                   this.next();
  939                   if (this.read() != T_LBRACKET) throw ex("parser.ope.1", this.offset-1);
  940                   RangeToken t2 = this.parseCharacterClass(false);
  941                   if (type == T_PLUS)
  942                       tok.mergeRanges(t2);
  943                   else if (ch == '-')
  944                       tok.subtractRanges(t2);
  945                   else if (ch == '&')
  946                       tok.intersectRanges(t2);
  947                   else
  948                       throw new RuntimeException("ASSERT");
  949               } else {
  950                   throw ex("parser.ope.2", this.offset-1);
  951               }
  952           }
  953           this.next();
  954           return tok;
  955       }
  956   
  957       Token getTokenForShorthand(int ch) {
  958           Token tok;
  959           switch (ch) {
  960             case 'd':
  961               tok = this.isSet(RegularExpression.USE_UNICODE_CATEGORY)
  962                   ? Token.getRange("Nd", true) : Token.token_0to9;
  963               break;
  964             case 'D':
  965               tok = this.isSet(RegularExpression.USE_UNICODE_CATEGORY)
  966                   ? Token.getRange("Nd", false) : Token.token_not_0to9;
  967               break;
  968             case 'w':
  969               tok = this.isSet(RegularExpression.USE_UNICODE_CATEGORY)
  970                   ? Token.getRange("IsWord", true) : Token.token_wordchars;
  971               break;
  972             case 'W':
  973               tok = this.isSet(RegularExpression.USE_UNICODE_CATEGORY)
  974                   ? Token.getRange("IsWord", false) : Token.token_not_wordchars;
  975               break;
  976             case 's':
  977               tok = this.isSet(RegularExpression.USE_UNICODE_CATEGORY)
  978                   ? Token.getRange("IsSpace", true) : Token.token_spaces;
  979               break;
  980             case 'S':
  981               tok = this.isSet(RegularExpression.USE_UNICODE_CATEGORY)
  982                   ? Token.getRange("IsSpace", false) : Token.token_not_spaces;
  983               break;
  984   
  985             default:
  986               throw new RuntimeException("Internal Error: shorthands: \\u"+Integer.toString(ch, 16));
  987           }
  988           return tok;
  989       }
  990   
  991       /**
  992        */
  993       int decodeEscaped() throws ParseException {
  994           if (this.read() != T_BACKSOLIDUS)  throw ex("parser.next.1", this.offset-1);
  995           int c = this.chardata;
  996           switch (c) {
  997             case 'e':  c = 0x1b;  break; // ESCAPE U+001B
  998             case 'f':  c = '\f';  break; // FORM FEED U+000C
  999             case 'n':  c = '\n';  break; // LINE FEED U+000A
 1000             case 'r':  c = '\r';  break; // CRRIAGE RETURN U+000D
 1001             case 't':  c = '\t';  break; // HORIZONTAL TABULATION U+0009
 1002             //case 'v':  c = 0x0b;  break; // VERTICAL TABULATION U+000B
 1003             case 'x':
 1004               this.next();
 1005               if (this.read() != T_CHAR)  throw ex("parser.descape.1", this.offset-1);
 1006               if (this.chardata == '{') {
 1007                   int v1 = 0;
 1008                   int uv = 0;
 1009                   do {
 1010                       this.next();
 1011                       if (this.read() != T_CHAR)  throw ex("parser.descape.1", this.offset-1);
 1012                       if ((v1 = hexChar(this.chardata)) < 0)
 1013                           break;
 1014                       if (uv > uv*16) throw ex("parser.descape.2", this.offset-1);
 1015                       uv = uv*16+v1;
 1016                   } while (true);
 1017                   if (this.chardata != '}')  throw ex("parser.descape.3", this.offset-1);
 1018                   if (uv > Token.UTF16_MAX)  throw ex("parser.descape.4", this.offset-1);
 1019                   c = uv;
 1020               } else {
 1021                   int v1 = 0;
 1022                   if (this.read() != T_CHAR || (v1 = hexChar(this.chardata)) < 0)
 1023                       throw ex("parser.descape.1", this.offset-1);
 1024                   int uv = v1;
 1025                   this.next();
 1026                   if (this.read() != T_CHAR || (v1 = hexChar(this.chardata)) < 0)
 1027                       throw ex("parser.descape.1", this.offset-1);
 1028                   uv = uv*16+v1;
 1029                   c = uv;
 1030               }
 1031               break;
 1032   
 1033             case 'u':
 1034               int v1 = 0;
 1035               this.next();
 1036               if (this.read() != T_CHAR || (v1 = hexChar(this.chardata)) < 0)
 1037                   throw ex("parser.descape.1", this.offset-1);
 1038               int uv = v1;
 1039               this.next();
 1040               if (this.read() != T_CHAR || (v1 = hexChar(this.chardata)) < 0)
 1041                   throw ex("parser.descape.1", this.offset-1);
 1042               uv = uv*16+v1;
 1043               this.next();
 1044               if (this.read() != T_CHAR || (v1 = hexChar(this.chardata)) < 0)
 1045                   throw ex("parser.descape.1", this.offset-1);
 1046               uv = uv*16+v1;
 1047               this.next();
 1048               if (this.read() != T_CHAR || (v1 = hexChar(this.chardata)) < 0)
 1049                   throw ex("parser.descape.1", this.offset-1);
 1050               uv = uv*16+v1;
 1051               c = uv;
 1052               break;
 1053   
 1054             case 'v':
 1055               this.next();
 1056               if (this.read() != T_CHAR || (v1 = hexChar(this.chardata)) < 0)
 1057                   throw ex("parser.descape.1", this.offset-1);
 1058               uv = v1;
 1059               this.next();
 1060               if (this.read() != T_CHAR || (v1 = hexChar(this.chardata)) < 0)
 1061                   throw ex("parser.descape.1", this.offset-1);
 1062               uv = uv*16+v1;
 1063               this.next();
 1064               if (this.read() != T_CHAR || (v1 = hexChar(this.chardata)) < 0)
 1065                   throw ex("parser.descape.1", this.offset-1);
 1066               uv = uv*16+v1;
 1067               this.next();
 1068               if (this.read() != T_CHAR || (v1 = hexChar(this.chardata)) < 0)
 1069                   throw ex("parser.descape.1", this.offset-1);
 1070               uv = uv*16+v1;
 1071               this.next();
 1072               if (this.read() != T_CHAR || (v1 = hexChar(this.chardata)) < 0)
 1073                   throw ex("parser.descape.1", this.offset-1);
 1074               uv = uv*16+v1;
 1075               this.next();
 1076               if (this.read() != T_CHAR || (v1 = hexChar(this.chardata)) < 0)
 1077                   throw ex("parser.descape.1", this.offset-1);
 1078               uv = uv*16+v1;
 1079               if (uv > Token.UTF16_MAX)  throw ex("parser.descappe.4", this.offset-1);
 1080               c = uv;
 1081               break;
 1082             case 'A':
 1083             case 'Z':
 1084             case 'z':
 1085               throw ex("parser.descape.5", this.offset-2);
 1086             default:
 1087           }
 1088           return c;
 1089       }
 1090   
 1091       static private final int hexChar(int ch) {
 1092           if (ch < '0')  return -1;
 1093           if (ch > 'f')  return -1;
 1094           if (ch <= '9')  return ch-'0';
 1095           if (ch < 'A')  return -1;
 1096           if (ch <= 'F')  return ch-'A'+10;
 1097           if (ch < 'a')  return -1;
 1098           return ch-'a'+10;
 1099       }
 1100   }

Home » Xerces-J-src.2.9.1 » org.apache.xerces » impl » xpath » regex » [javadoc | source]