Home » Xerces-J-src.2.9.1 » org.apache.xerces » impl » xpath » regex » [javadoc | source]

    1   /*
    2    * Licensed to the Apache Software Foundation (ASF) under one or more
    3    * contributor license agreements.  See the NOTICE file distributed with
    4    * this work for additional information regarding copyright ownership.
    5    * The ASF licenses this file to You under the Apache License, Version 2.0
    6    * (the "License"); you may not use this file except in compliance with
    7    * the License.  You may obtain a copy of the License at
    8    * 
    9    *      http://www.apache.org/licenses/LICENSE-2.0
   10    * 
   11    * Unless required by applicable law or agreed to in writing, software
   12    * distributed under the License is distributed on an "AS IS" BASIS,
   13    * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   14    * See the License for the specific language governing permissions and
   15    * limitations under the License.
   16    */
   17   
   18   package org.apache.xerces.impl.xpath.regex;
   19   
   20   import java.util.Vector;
   21   import java.util.Hashtable;
   22   
   23   /**
   24    * This class represents a node in parse tree.
   25    * 
   26    * @xerces.internal
   27    *
   28    * @version $Id: Token.java 572108 2007-09-02 18:48:31Z mrglavas $
   29    */
   30   class Token implements java.io.Serializable {
   31   
   32       private static final long serialVersionUID = 8484976002585487481L;
   33   
   34       static final boolean COUNTTOKENS = true;
   35       static int tokens = 0;
   36   
   37       static final int CHAR = 0;                  // Literal char
   38       static final int DOT = 11;                  // .
   39       static final int CONCAT = 1;                // XY
   40       static final int UNION = 2;                 // X|Y|Z
   41       static final int CLOSURE = 3;               // X*
   42       static final int RANGE = 4;                 // [a-zA-Z] etc.
   43       static final int NRANGE = 5;                // [^a-zA-Z] etc.
   44       static final int PAREN = 6;                 // (X) or (?:X)
   45       static final int EMPTY = 7;                 //
   46       static final int ANCHOR = 8;                // ^ $ \b \B \< \> \A \Z \z
   47       static final int NONGREEDYCLOSURE = 9;      // *? +?
   48       static final int STRING = 10;               // strings
   49       static final int BACKREFERENCE = 12;        // back references
   50       static final int LOOKAHEAD = 20;            // (?=...)
   51       static final int NEGATIVELOOKAHEAD = 21;    // (?!...)
   52       static final int LOOKBEHIND = 22;           // (?<=...)
   53       static final int NEGATIVELOOKBEHIND = 23;   // (?<!...)
   54       static final int INDEPENDENT = 24;          // (?>...)
   55       static final int MODIFIERGROUP = 25;        // (?ims-ims:...)
   56       static final int CONDITION = 26;            // (?(...)yes|no)
   57   
   58       static final int UTF16_MAX = 0x10ffff;
   59   
   60       final int type;
   61   
   62       static Token token_dot;
   63       static Token token_0to9;
   64       static Token token_wordchars;
   65       static Token token_not_0to9;
   66       static Token token_not_wordchars;
   67       static Token token_spaces;
   68       static Token token_not_spaces;
   69       static Token token_empty;
   70       static Token token_linebeginning;
   71       static Token token_linebeginning2;
   72       static Token token_lineend;
   73       static Token token_stringbeginning;
   74       static Token token_stringend;
   75       static Token token_stringend2;
   76       static Token token_wordedge;
   77       static Token token_not_wordedge;
   78       static Token token_wordbeginning;
   79       static Token token_wordend;
   80       static {
   81           Token.token_empty = new Token(Token.EMPTY);
   82   
   83           Token.token_linebeginning = Token.createAnchor('^');
   84           Token.token_linebeginning2 = Token.createAnchor('@');
   85           Token.token_lineend = Token.createAnchor('$');
   86           Token.token_stringbeginning = Token.createAnchor('A');
   87           Token.token_stringend = Token.createAnchor('z');
   88           Token.token_stringend2 = Token.createAnchor('Z');
   89           Token.token_wordedge = Token.createAnchor('b');
   90           Token.token_not_wordedge = Token.createAnchor('B');
   91           Token.token_wordbeginning = Token.createAnchor('<');
   92           Token.token_wordend = Token.createAnchor('>');
   93   
   94           Token.token_dot = new Token(Token.DOT);
   95   
   96           Token.token_0to9 = Token.createRange();
   97           Token.token_0to9.addRange('0', '9');
   98           Token.token_wordchars = Token.createRange();
   99           Token.token_wordchars.addRange('0', '9');
  100           Token.token_wordchars.addRange('A', 'Z');
  101           Token.token_wordchars.addRange('_', '_');
  102           Token.token_wordchars.addRange('a', 'z');
  103           Token.token_spaces = Token.createRange();
  104           Token.token_spaces.addRange('\t', '\t');
  105           Token.token_spaces.addRange('\n', '\n');
  106           Token.token_spaces.addRange('\f', '\f');
  107           Token.token_spaces.addRange('\r', '\r');
  108           Token.token_spaces.addRange(' ', ' ');
  109   
  110           Token.token_not_0to9 = Token.complementRanges(Token.token_0to9);
  111           Token.token_not_wordchars = Token.complementRanges(Token.token_wordchars);
  112           Token.token_not_spaces = Token.complementRanges(Token.token_spaces);
  113       }
  114   
  115       static Token.ParenToken createLook(int type, Token child) {
  116           if (COUNTTOKENS)  Token.tokens ++;
  117           return new Token.ParenToken(type, child, 0);
  118       }
  119       static Token.ParenToken createParen(Token child, int pnumber) {
  120           if (COUNTTOKENS)  Token.tokens ++;
  121           return new Token.ParenToken(Token.PAREN, child, pnumber);
  122       }
  123       static Token.ClosureToken createClosure(Token tok) {
  124           if (COUNTTOKENS)  Token.tokens ++;
  125           return new Token.ClosureToken(Token.CLOSURE, tok);
  126       }
  127       static Token.ClosureToken createNGClosure(Token tok) {
  128           if (COUNTTOKENS)  Token.tokens ++;
  129           return new Token.ClosureToken(Token.NONGREEDYCLOSURE, tok);
  130       }
  131       static Token.ConcatToken createConcat(Token tok1, Token tok2) {
  132           if (COUNTTOKENS)  Token.tokens ++;
  133           return new Token.ConcatToken(tok1, tok2);
  134       }
  135       static Token.UnionToken createConcat() {
  136           if (COUNTTOKENS)  Token.tokens ++;
  137           return new Token.UnionToken(Token.CONCAT); // *** It is not a bug.
  138       }
  139       static Token.UnionToken createUnion() {
  140           if (COUNTTOKENS)  Token.tokens ++;
  141           return new Token.UnionToken(Token.UNION);
  142       }
  143       static Token createEmpty() {
  144           return Token.token_empty;
  145       }
  146       static RangeToken createRange() {
  147           if (COUNTTOKENS)  Token.tokens ++;
  148           return new RangeToken(Token.RANGE);
  149       }
  150       static RangeToken createNRange() {
  151           if (COUNTTOKENS)  Token.tokens ++;
  152           return new RangeToken(Token.NRANGE);
  153       }
  154       static Token.CharToken createChar(int ch) {
  155           if (COUNTTOKENS)  Token.tokens ++;
  156           return new Token.CharToken(Token.CHAR, ch);
  157       }
  158       static private Token.CharToken createAnchor(int ch) {
  159           if (COUNTTOKENS)  Token.tokens ++;
  160           return new Token.CharToken(Token.ANCHOR, ch);
  161       }
  162       static Token.StringToken createBackReference(int refno) {
  163           if (COUNTTOKENS)  Token.tokens ++;
  164           return new Token.StringToken(Token.BACKREFERENCE, null, refno);
  165       }
  166       static Token.StringToken createString(String str) {
  167           if (COUNTTOKENS)  Token.tokens ++;
  168           return new Token.StringToken(Token.STRING, str, 0);
  169       }
  170       static Token.ModifierToken createModifierGroup(Token child, int add, int mask) {
  171           if (COUNTTOKENS)  Token.tokens ++;
  172           return new Token.ModifierToken(child, add, mask);
  173       }
  174       static Token.ConditionToken createCondition(int refno, Token condition,
  175                                                   Token yespat, Token nopat) {
  176           if (COUNTTOKENS)  Token.tokens ++;
  177           return new Token.ConditionToken(refno, condition, yespat, nopat);
  178       }
  179   
  180       protected Token(int type) {
  181           this.type = type;
  182       }
  183   
  184       /**
  185        * A number of children.
  186        */
  187       int size() {
  188           return 0;
  189       }
  190       Token getChild(int index) {
  191           return null;
  192       }
  193       void addChild(Token tok) {
  194           throw new RuntimeException("Not supported.");
  195       }
  196   
  197                                                   // for RANGE or NRANGE
  198       protected void addRange(int start, int end) {
  199           throw new RuntimeException("Not supported.");
  200       }
  201       protected void sortRanges() {
  202           throw new RuntimeException("Not supported.");
  203       }
  204       protected void compactRanges() {
  205           throw new RuntimeException("Not supported.");
  206       }
  207       protected void mergeRanges(Token tok) {
  208           throw new RuntimeException("Not supported.");
  209       }
  210       protected void subtractRanges(Token tok) {
  211           throw new RuntimeException("Not supported.");
  212       }
  213       protected void intersectRanges(Token tok) {
  214           throw new RuntimeException("Not supported.");
  215       }
  216       static Token complementRanges(Token tok) {
  217           return RangeToken.complementRanges(tok);
  218       }
  219   
  220   
  221       void setMin(int min) {                      // for CLOSURE
  222       }
  223       void setMax(int max) {                      // for CLOSURE
  224       }
  225       int getMin() {                              // for CLOSURE
  226           return -1;
  227       }
  228       int getMax() {                              // for CLOSURE
  229           return -1;
  230       }
  231       int getReferenceNumber() {                  // for STRING
  232           return 0;
  233       }
  234       String getString() {                        // for STRING
  235           return null;
  236       }
  237   
  238       int getParenNumber() {
  239           return 0;
  240       }
  241       int getChar() {
  242           return -1;
  243       }
  244   
  245       public String toString() {
  246           return this.toString(0);
  247       }
  248       public String toString(int options) {
  249           return this.type == Token.DOT ? "." : "";
  250       }
  251   
  252       /**
  253        * How many characters are needed?
  254        */
  255       final int getMinLength() {
  256           switch (this.type) {
  257             case CONCAT:
  258               int sum = 0;
  259               for (int i = 0;  i < this.size();  i ++)
  260                   sum += this.getChild(i).getMinLength();
  261               return sum;
  262   
  263             case CONDITION:
  264             case UNION:
  265               if (this.size() == 0)
  266                   return 0;
  267               int ret = this.getChild(0).getMinLength();
  268               for (int i = 1;  i < this.size();  i ++) {
  269                   int min = this.getChild(i).getMinLength();
  270                   if (min < ret)  ret = min;
  271               }
  272               return ret;
  273   
  274             case CLOSURE:
  275             case NONGREEDYCLOSURE:
  276               if (this.getMin() >= 0)
  277                   return this.getMin() * this.getChild(0).getMinLength();
  278               return 0;
  279   
  280             case EMPTY:
  281             case ANCHOR:
  282               return 0;
  283   
  284             case DOT:
  285             case CHAR:
  286             case RANGE:
  287             case NRANGE:
  288               return 1;
  289   
  290             case INDEPENDENT:
  291             case PAREN:
  292             case MODIFIERGROUP:
  293               return this.getChild(0).getMinLength();
  294   
  295             case BACKREFERENCE:
  296               return 0;                           // *******
  297   
  298             case STRING:
  299               return this.getString().length();
  300   
  301             case LOOKAHEAD:
  302             case NEGATIVELOOKAHEAD:
  303             case LOOKBEHIND:
  304             case NEGATIVELOOKBEHIND:
  305               return 0;                           // ***** Really?
  306   
  307             default:
  308               throw new RuntimeException("Token#getMinLength(): Invalid Type: "+this.type);
  309           }
  310       }
  311   
  312       final int getMaxLength() {
  313           switch (this.type) {
  314             case CONCAT:
  315               int sum = 0;
  316               for (int i = 0;  i < this.size();  i ++) {
  317                   int d = this.getChild(i).getMaxLength();
  318                   if (d < 0)  return -1;
  319                   sum += d;
  320               }
  321               return sum;
  322   
  323             case CONDITION:
  324             case UNION:
  325               if (this.size() == 0)
  326                   return 0;
  327               int ret = this.getChild(0).getMaxLength();
  328               for (int i = 1;  ret >= 0 && i < this.size();  i ++) {
  329                   int max = this.getChild(i).getMaxLength();
  330                   if (max < 0) {                  // infinity
  331                       ret = -1;
  332                       break;
  333                   }
  334                   if (max > ret)  ret = max;
  335               }
  336               return ret;
  337   
  338             case CLOSURE:
  339             case NONGREEDYCLOSURE:
  340               if (this.getMax() >= 0)
  341                                                   // When this.child.getMaxLength() < 0,
  342                                                   // this returns minus value
  343                   return this.getMax() * this.getChild(0).getMaxLength();
  344               return -1;
  345   
  346             case EMPTY:
  347             case ANCHOR:
  348               return 0;
  349   
  350             case CHAR:
  351               return 1;
  352             case DOT:
  353             case RANGE:
  354             case NRANGE:
  355               return 2;
  356   
  357             case INDEPENDENT:
  358             case PAREN:
  359             case MODIFIERGROUP:
  360               return this.getChild(0).getMaxLength();
  361   
  362             case BACKREFERENCE:
  363               return -1;                          // ******
  364   
  365             case STRING:
  366               return this.getString().length();
  367   
  368             case LOOKAHEAD:
  369             case NEGATIVELOOKAHEAD:
  370             case LOOKBEHIND:
  371             case NEGATIVELOOKBEHIND:
  372               return 0;                           // ***** Really?
  373   
  374             default:
  375               throw new RuntimeException("Token#getMaxLength(): Invalid Type: "+this.type);
  376           }
  377       }
  378   
  379       static final int FC_CONTINUE = 0;
  380       static final int FC_TERMINAL = 1;
  381       static final int FC_ANY = 2;
  382       private static final boolean isSet(int options, int flag) {
  383           return (options & flag) == flag;
  384       }
  385       final int analyzeFirstCharacter(RangeToken result, int options) {
  386           switch (this.type) {
  387             case CONCAT:
  388               int ret = FC_CONTINUE;
  389               for (int i = 0;  i < this.size();  i ++)
  390                   if ((ret = this.getChild(i).analyzeFirstCharacter(result, options)) != FC_CONTINUE)
  391                       break;
  392               return ret;
  393   
  394             case UNION:
  395               if (this.size() == 0)
  396                   return FC_CONTINUE;
  397               /*
  398                *  a|b|c -> FC_TERMINAL
  399                *  a|.|c -> FC_ANY
  400                *  a|b|  -> FC_CONTINUE
  401                */
  402               int ret2 = FC_CONTINUE;
  403               boolean hasEmpty = false;
  404               for (int i = 0;  i < this.size();  i ++) {
  405                   ret2 = this.getChild(i).analyzeFirstCharacter(result, options);
  406                   if (ret2 == FC_ANY)
  407                       break;
  408                   else if (ret2 == FC_CONTINUE)
  409                       hasEmpty = true;
  410               }
  411               return hasEmpty ? FC_CONTINUE : ret2;
  412   
  413             case CONDITION:
  414               int ret3 = this.getChild(0).analyzeFirstCharacter(result, options);
  415               if (this.size() == 1)  return FC_CONTINUE;
  416               if (ret3 == FC_ANY)  return ret3;
  417               int ret4 = this.getChild(1).analyzeFirstCharacter(result, options);
  418               if (ret4 == FC_ANY)  return ret4;
  419               return ret3 == FC_CONTINUE || ret4 == FC_CONTINUE ? FC_CONTINUE : FC_TERMINAL;
  420   
  421             case CLOSURE:
  422             case NONGREEDYCLOSURE:
  423               this.getChild(0).analyzeFirstCharacter(result, options);
  424               return FC_CONTINUE;
  425   
  426             case EMPTY:
  427             case ANCHOR:
  428               return FC_CONTINUE;
  429   
  430             case CHAR:
  431               int ch = this.getChar();
  432               result.addRange(ch, ch);
  433               if (ch < 0x10000 && isSet(options, RegularExpression.IGNORE_CASE)) {
  434                   ch = Character.toUpperCase((char)ch);
  435                   result.addRange(ch, ch);
  436                   ch = Character.toLowerCase((char)ch);
  437                   result.addRange(ch, ch);
  438               }
  439               return FC_TERMINAL;
  440   
  441             case DOT:
  442                 return FC_ANY;
  443   
  444             case RANGE:
  445               if (isSet(options, RegularExpression.IGNORE_CASE)) {
  446                   result.mergeRanges(((RangeToken)this).getCaseInsensitiveToken());
  447               } else {
  448                   result.mergeRanges(this);
  449               }
  450               return FC_TERMINAL;
  451   
  452             case NRANGE:                          // ****
  453               if (isSet(options, RegularExpression.IGNORE_CASE)) {
  454                   result.mergeRanges(Token.complementRanges(((RangeToken)this).getCaseInsensitiveToken()));
  455               } else {
  456                   result.mergeRanges(Token.complementRanges(this));
  457               }
  458               return FC_TERMINAL;
  459   
  460             case INDEPENDENT:
  461             case PAREN:
  462               return this.getChild(0).analyzeFirstCharacter(result, options);
  463   
  464             case MODIFIERGROUP:
  465               options |= ((ModifierToken)this).getOptions();
  466               options &= ~((ModifierToken)this).getOptionsMask();
  467               return this.getChild(0).analyzeFirstCharacter(result, options);
  468   
  469             case BACKREFERENCE:
  470               result.addRange(0, UTF16_MAX);  // **** We can not optimize.
  471               return FC_ANY;
  472   
  473             case STRING:
  474               int cha = this.getString().charAt(0);
  475               int ch2;
  476               if (REUtil.isHighSurrogate(cha)
  477                   && this.getString().length() >= 2
  478                   && REUtil.isLowSurrogate((ch2 = this.getString().charAt(1))))
  479                   cha = REUtil.composeFromSurrogates(cha, ch2);
  480               result.addRange(cha, cha);
  481               if (cha < 0x10000 && isSet(options, RegularExpression.IGNORE_CASE)) {
  482                   cha = Character.toUpperCase((char)cha);
  483                   result.addRange(cha, cha);
  484                   cha = Character.toLowerCase((char)cha);
  485                   result.addRange(cha, cha);
  486               }
  487               return FC_TERMINAL;
  488   
  489             case LOOKAHEAD:
  490             case NEGATIVELOOKAHEAD:
  491             case LOOKBEHIND:
  492             case NEGATIVELOOKBEHIND:
  493               return FC_CONTINUE;
  494   
  495             default:
  496               throw new RuntimeException("Token#analyzeHeadCharacter(): Invalid Type: "+this.type);
  497           }
  498       }
  499   
  500       private final boolean isShorterThan(Token tok) {
  501           if (tok == null)  return false;
  502           /*
  503           int mylength;
  504           if (this.type == STRING)  mylength = this.getString().length();
  505           else if (this.type == CHAR)  mylength = this.getChar() >= 0x10000 ? 2 : 1;
  506           else throw new RuntimeException("Internal Error: Illegal type: "+this.type);
  507           int otherlength;
  508           if (tok.type == STRING)  otherlength = tok.getString().length();
  509           else if (tok.type == CHAR)  otherlength = tok.getChar() >= 0x10000 ? 2 : 1;
  510           else throw new RuntimeException("Internal Error: Illegal type: "+tok.type);
  511           */
  512           int mylength;
  513           if (this.type == STRING)  mylength = this.getString().length();
  514           else throw new RuntimeException("Internal Error: Illegal type: "+this.type);
  515           int otherlength;
  516           if (tok.type == STRING)  otherlength = tok.getString().length();
  517           else throw new RuntimeException("Internal Error: Illegal type: "+tok.type);
  518           return mylength < otherlength;
  519       }
  520   
  521       static class FixedStringContainer {
  522           Token token = null;
  523           int options = 0;
  524           FixedStringContainer() {
  525           }
  526       }
  527   
  528       final void findFixedString(FixedStringContainer container, int options) {
  529           switch (this.type) {
  530             case CONCAT:
  531               Token prevToken = null;
  532               int prevOptions = 0;
  533               for (int i = 0;  i < this.size();  i ++) {
  534                   this.getChild(i).findFixedString(container, options);
  535                   if (prevToken == null || prevToken.isShorterThan(container.token)) {
  536                       prevToken = container.token;
  537                       prevOptions = container.options;
  538                   }
  539               }
  540               container.token = prevToken;
  541               container.options = prevOptions;
  542               return;
  543   
  544             case UNION:
  545             case CLOSURE:
  546             case NONGREEDYCLOSURE:
  547             case EMPTY:
  548             case ANCHOR:
  549             case RANGE:
  550             case DOT:
  551             case NRANGE:
  552             case BACKREFERENCE:
  553             case LOOKAHEAD:
  554             case NEGATIVELOOKAHEAD:
  555             case LOOKBEHIND:
  556             case NEGATIVELOOKBEHIND:
  557             case CONDITION:
  558               container.token = null;
  559               return;
  560   
  561             case CHAR:                            // Ignore CHAR tokens.
  562               container.token = null;             // **
  563               return;                             // **
  564   
  565             case STRING:
  566               container.token = this;
  567               container.options = options;
  568               return;
  569   
  570             case INDEPENDENT:
  571             case PAREN:
  572               this.getChild(0).findFixedString(container, options);
  573               return;
  574   
  575             case MODIFIERGROUP:
  576               options |= ((ModifierToken)this).getOptions();
  577               options &= ~((ModifierToken)this).getOptionsMask();
  578               this.getChild(0).findFixedString(container, options);
  579               return;
  580   
  581             default:
  582               throw new RuntimeException("Token#findFixedString(): Invalid Type: "+this.type);
  583           }
  584       }
  585   
  586       boolean match(int ch) {
  587           throw new RuntimeException("NFAArrow#match(): Internal error: "+this.type);
  588       }
  589   
  590       // ------------------------------------------------------
  591       private final static Hashtable categories = new Hashtable();
  592       private final static Hashtable categories2 = new Hashtable();
  593       private static final String[] categoryNames = {
  594           "Cn", "Lu", "Ll", "Lt", "Lm", "Lo", "Mn", "Me", "Mc", "Nd",
  595           "Nl", "No", "Zs", "Zl", "Zp", "Cc", "Cf", null, "Co", "Cs",
  596           "Pd", "Ps", "Pe", "Pc", "Po", "Sm", "Sc", "Sk", "So", // 28
  597           "Pi", "Pf",  // 29, 30
  598           "L", "M", "N", "Z", "C", "P", "S",      // 31-37
  599       };
  600   
  601       // Schema Rec. {Datatypes} - Punctuation 
  602       static final int CHAR_INIT_QUOTE  = 29;     // Pi - initial quote
  603       static final int CHAR_FINAL_QUOTE = 30;     // Pf - final quote
  604       static final int CHAR_LETTER = 31;
  605       static final int CHAR_MARK = 32;
  606       static final int CHAR_NUMBER = 33;
  607       static final int CHAR_SEPARATOR = 34;
  608       static final int CHAR_OTHER = 35;
  609       static final int CHAR_PUNCTUATION = 36;
  610       static final int CHAR_SYMBOL = 37;
  611       
  612       // Names of the Unicode 3.1 blocks that are supported by the XML Schema REC.
            // The note in front of each name gives that block's first and last code point.
            // Entries 0-83 are listed in the same order as the start/end pairs in
            // blockRanges; the entries from "Old Italic" (index 84) onwards follow the
            // order of the pairs in nonBMPBlockRanges.
  613       private static final String[] blockNames = {
  614           /*0000..007F;*/ "Basic Latin",
  615           /*0080..00FF;*/ "Latin-1 Supplement",
  616           /*0100..017F;*/ "Latin Extended-A",
  617           /*0180..024F;*/ "Latin Extended-B",
  618           /*0250..02AF;*/ "IPA Extensions",
  619           /*02B0..02FF;*/ "Spacing Modifier Letters",
  620           /*0300..036F;*/ "Combining Diacritical Marks",
  621           /*0370..03FF;*/ "Greek",
  622           /*0400..04FF;*/ "Cyrillic",
  623           /*0530..058F;*/ "Armenian",
  624           /*0590..05FF;*/ "Hebrew",
  625           /*0600..06FF;*/ "Arabic",
  626           /*0700..074F;*/ "Syriac",  
  627           /*0780..07BF;*/ "Thaana",
  628           /*0900..097F;*/ "Devanagari",
  629           /*0980..09FF;*/ "Bengali",
  630           /*0A00..0A7F;*/ "Gurmukhi",
  631           /*0A80..0AFF;*/ "Gujarati",
  632           /*0B00..0B7F;*/ "Oriya",
  633           /*0B80..0BFF;*/ "Tamil",
  634           /*0C00..0C7F;*/ "Telugu",
  635           /*0C80..0CFF;*/ "Kannada",
  636           /*0D00..0D7F;*/ "Malayalam",
  637           /*0D80..0DFF;*/ "Sinhala",
  638           /*0E00..0E7F;*/ "Thai",
  639           /*0E80..0EFF;*/ "Lao",
  640           /*0F00..0FFF;*/ "Tibetan",
  641           /*1000..109F;*/ "Myanmar", 
  642           /*10A0..10FF;*/ "Georgian",
  643           /*1100..11FF;*/ "Hangul Jamo",
  644           /*1200..137F;*/ "Ethiopic",
  645           /*13A0..13FF;*/ "Cherokee",
  646           /*1400..167F;*/ "Unified Canadian Aboriginal Syllabics",
  647           /*1680..169F;*/ "Ogham",
  648           /*16A0..16FF;*/ "Runic",
  649           /*1780..17FF;*/ "Khmer",
  650           /*1800..18AF;*/ "Mongolian",
  651           /*1E00..1EFF;*/ "Latin Extended Additional",
  652           /*1F00..1FFF;*/ "Greek Extended",
  653           /*2000..206F;*/ "General Punctuation",
  654           /*2070..209F;*/ "Superscripts and Subscripts",
  655           /*20A0..20CF;*/ "Currency Symbols",
  656           /*20D0..20FF;*/ "Combining Marks for Symbols",
  657           /*2100..214F;*/ "Letterlike Symbols",
  658           /*2150..218F;*/ "Number Forms",
  659           /*2190..21FF;*/ "Arrows",
  660           /*2200..22FF;*/ "Mathematical Operators",
  661           /*2300..23FF;*/ "Miscellaneous Technical",
  662           /*2400..243F;*/ "Control Pictures",
  663           /*2440..245F;*/ "Optical Character Recognition",
  664           /*2460..24FF;*/ "Enclosed Alphanumerics",
  665           /*2500..257F;*/ "Box Drawing",
  666           /*2580..259F;*/ "Block Elements",
  667           /*25A0..25FF;*/ "Geometric Shapes",
  668           /*2600..26FF;*/ "Miscellaneous Symbols",
  669           /*2700..27BF;*/ "Dingbats",
  670           /*2800..28FF;*/ "Braille Patterns",
  671           /*2E80..2EFF;*/ "CJK Radicals Supplement",
  672           /*2F00..2FDF;*/ "Kangxi Radicals",
  673           /*2FF0..2FFF;*/ "Ideographic Description Characters",
  674           /*3000..303F;*/ "CJK Symbols and Punctuation",
  675           /*3040..309F;*/ "Hiragana",
  676           /*30A0..30FF;*/ "Katakana",
  677           /*3100..312F;*/ "Bopomofo",
  678           /*3130..318F;*/ "Hangul Compatibility Jamo",
  679           /*3190..319F;*/ "Kanbun",
  680           /*31A0..31BF;*/ "Bopomofo Extended",
  681           /*3200..32FF;*/ "Enclosed CJK Letters and Months",
  682           /*3300..33FF;*/ "CJK Compatibility",
  683           /*3400..4DB5;*/ "CJK Unified Ideographs Extension A",
  684           /*4E00..9FFF;*/ "CJK Unified Ideographs",
  685           /*A000..A48F;*/ "Yi Syllables",
  686           /*A490..A4CF;*/ "Yi Radicals",
  687           /*AC00..D7A3;*/ "Hangul Syllables",
  688           /*E000..F8FF;*/ "Private Use",
  689           /*F900..FAFF;*/ "CJK Compatibility Ideographs",
  690           /*FB00..FB4F;*/ "Alphabetic Presentation Forms",
  691           /*FB50..FDFF;*/ "Arabic Presentation Forms-A",
  692           /*FE20..FE2F;*/ "Combining Half Marks",
  693           /*FE30..FE4F;*/ "CJK Compatibility Forms",
  694           /*FE50..FE6F;*/ "Small Form Variants",
  695           /*FE70..FEFE;*/ "Arabic Presentation Forms-B",
  696           /*FEFF..FEFF;*/ "Specials",
  697           /*FF00..FFEF;*/ "Halfwidth and Fullwidth Forms",
  698            //missing Specials add manually
  699           /*10300..1032F;*/ "Old Italic",		// 84
  700           /*10330..1034F;*/ "Gothic",
  701           /*10400..1044F;*/ "Deseret",
  702           /*1D000..1D0FF;*/ "Byzantine Musical Symbols",
  703           /*1D100..1D1FF;*/ "Musical Symbols",
  704           /*1D400..1D7FF;*/ "Mathematical Alphanumeric Symbols",
  705           /*20000..2A6D6;*/ "CJK Unified Ideographs Extension B",
  706           /*2F800..2FA1F;*/ "CJK Compatibility Ideographs Supplement",
  707           /*E0000..E007F;*/ "Tags",
  708           //missing 2 private use add manually
  709   
  710       };
  711       // The blocks below are NOT part of the tables and have to be added manually:
  712       //   F0000..FFFFD;   "Private Use",
  713       //   100000..10FFFD; "Private Use"
  714       //   FFF0..FFFD;     "Specials",
            //
            // blockRanges holds the first and last character of every BMP block as two
            // consecutive chars, in the same order as the first 84 entries of blockNames
            // (so the pair for entry i sits at positions 2*i and 2*i+1).
  715       static final String blockRanges = 
  716          "\u0000\u007F\u0080\u00FF\u0100\u017F\u0180\u024F\u0250\u02AF\u02B0\u02FF\u0300\u036F"
  717           +"\u0370\u03FF\u0400\u04FF\u0530\u058F\u0590\u05FF\u0600\u06FF\u0700\u074F\u0780\u07BF"
  718           +"\u0900\u097F\u0980\u09FF\u0A00\u0A7F\u0A80\u0AFF\u0B00\u0B7F\u0B80\u0BFF\u0C00\u0C7F\u0C80\u0CFF"
  719           +"\u0D00\u0D7F\u0D80\u0DFF\u0E00\u0E7F\u0E80\u0EFF\u0F00\u0FFF\u1000\u109F\u10A0\u10FF\u1100\u11FF"
  720           +"\u1200\u137F\u13A0\u13FF\u1400\u167F\u1680\u169F\u16A0\u16FF\u1780\u17FF\u1800\u18AF\u1E00\u1EFF"
  721           +"\u1F00\u1FFF\u2000\u206F\u2070\u209F\u20A0\u20CF\u20D0\u20FF\u2100\u214F\u2150\u218F\u2190\u21FF\u2200\u22FF"
  722           +"\u2300\u23FF\u2400\u243F\u2440\u245F\u2460\u24FF\u2500\u257F\u2580\u259F\u25A0\u25FF\u2600\u26FF\u2700\u27BF"
  723           +"\u2800\u28FF\u2E80\u2EFF\u2F00\u2FDF\u2FF0\u2FFF\u3000\u303F\u3040\u309F\u30A0\u30FF\u3100\u312F\u3130\u318F"
  724           +"\u3190\u319F\u31A0\u31BF\u3200\u32FF\u3300\u33FF\u3400\u4DB5\u4E00\u9FFF\uA000\uA48F\uA490\uA4CF"
  725           +"\uAC00\uD7A3\uE000\uF8FF\uF900\uFAFF\uFB00\uFB4F\uFB50\uFDFF"
  726           +"\uFE20\uFE2F\uFE30\uFE4F\uFE50\uFE6F\uFE70\uFEFE\uFEFF\uFEFF\uFF00\uFFEF";
  727       static final int[] nonBMPBlockRanges = {
  728           0x10300, 0x1032F,       // 84
  729           0x10330, 0x1034F,
  730           0x10400, 0x1044F,
  731           0x1D000, 0x1D0FF,
  732           0x1D100, 0x1D1FF,
  733           0x1D400, 0x1D7FF,
  734           0x20000, 0x2A6D6,
  735           0x2F800, 0x2FA1F,
  736           0xE0000, 0xE007F
  737       };
  738       private static final int NONBMP_BLOCK_START = 84;
  739   
  740       static protected RangeToken getRange(String name, boolean positive) {
  741           if (Token.categories.size() == 0) {
  742               synchronized (Token.categories) {
  743                   Token[] ranges = new Token[Token.categoryNames.length];
  744                   for (int i = 0;  i < ranges.length;  i ++) {
  745                       ranges[i] = Token.createRange();
  746                   }
  747                   int type;
  748                   for (int i = 0;  i < 0x10000;  i ++) {
  749                       type = Character.getType((char)i);
  750                       if (type == Character.START_PUNCTUATION || 
  751                           type == Character.END_PUNCTUATION) {
  752                           //build table of Pi values
  753                           if (i == 0x00AB || i == 0x2018 || i == 0x201B || i == 0x201C ||
  754                               i == 0x201F || i == 0x2039) {
  755                               type = CHAR_INIT_QUOTE;
  756                           }
  757                           //build table of Pf values
  758                           if (i == 0x00BB || i == 0x2019 || i == 0x201D || i == 0x203A ) {
  759                               type = CHAR_FINAL_QUOTE;
  760                           }
  761                       }
  762                       ranges[type].addRange(i, i);
  763                       switch (type) {
  764                         case Character.UPPERCASE_LETTER:
  765                         case Character.LOWERCASE_LETTER:
  766                         case Character.TITLECASE_LETTER:
  767                         case Character.MODIFIER_LETTER:
  768                         case Character.OTHER_LETTER:
  769                           type = CHAR_LETTER;
  770                           break;
  771                         case Character.NON_SPACING_MARK:
  772                         case Character.COMBINING_SPACING_MARK:
  773                         case Character.ENCLOSING_MARK:
  774                           type = CHAR_MARK;
  775                           break;
  776                         case Character.DECIMAL_DIGIT_NUMBER:
  777                         case Character.LETTER_NUMBER:
  778                         case Character.OTHER_NUMBER:
  779                           type = CHAR_NUMBER;
  780                           break;
  781                         case Character.SPACE_SEPARATOR:
  782                         case Character.LINE_SEPARATOR:
  783                         case Character.PARAGRAPH_SEPARATOR:
  784                           type = CHAR_SEPARATOR;
  785                           break;
  786                         case Character.CONTROL:
  787                         case Character.FORMAT:
  788                         case Character.SURROGATE:
  789                         case Character.PRIVATE_USE:
  790                         case Character.UNASSIGNED:
  791                           type = CHAR_OTHER;
  792                           break;
  793                         case Character.CONNECTOR_PUNCTUATION:
  794                         case Character.DASH_PUNCTUATION:
  795                         case Character.START_PUNCTUATION:
  796                         case Character.END_PUNCTUATION:
  797                         case CHAR_INIT_QUOTE:
  798                         case CHAR_FINAL_QUOTE:
  799                         case Character.OTHER_PUNCTUATION:
  800                           type = CHAR_PUNCTUATION;
  801                           break;
  802                         case Character.MATH_SYMBOL:
  803                         case Character.CURRENCY_SYMBOL:
  804                         case Character.MODIFIER_SYMBOL:
  805                         case Character.OTHER_SYMBOL:
  806                           type = CHAR_SYMBOL;
  807                           break;
  808                         default:
  809                           throw new RuntimeException("org.apache.xerces.utils.regex.Token#getRange(): Unknown Unicode category: "+type);
  810                       }
  811                       ranges[type].addRange(i, i);
  812                   } // for all characters
  813                   ranges[Character.UNASSIGNED].addRange(0x10000, Token.UTF16_MAX);
  814   
  815                   for (int i = 0;  i < ranges.length;  i ++) {
  816                       if (Token.categoryNames[i] != null) {
  817                           if (i == Character.UNASSIGNED) { // Unassigned
  818                               ranges[i].addRange(0x10000, Token.UTF16_MAX);
  819                           }
  820                           Token.categories.put(Token.categoryNames[i], ranges[i]);
  821                           Token.categories2.put(Token.categoryNames[i],
  822                                                 Token.complementRanges(ranges[i]));
  823                       }
  824                   }
  825                   //REVISIT: do we really need to support block names as in Unicode 3.1
  826                   //         or we can just create all the names in IsBLOCKNAME format (XML Schema REC)?
  827                   //
  828                   StringBuffer buffer = new StringBuffer(50);
  829                   for (int i = 0;  i < Token.blockNames.length;  i ++) {
  830                       Token r1 = Token.createRange();
  831                       int location;
  832                       if (i < NONBMP_BLOCK_START) {
  833                           location = i*2;
  834                           int rstart = Token.blockRanges.charAt(location);
  835                           int rend = Token.blockRanges.charAt(location+1);
  836                           //DEBUGING
  837                           //System.out.println(n+" " +Integer.toHexString(rstart)
  838                           //                     +"-"+ Integer.toHexString(rend));
  839                           r1.addRange(rstart, rend);
  840                       } else {
  841                           location = (i - NONBMP_BLOCK_START) * 2;
  842                           r1.addRange(Token.nonBMPBlockRanges[location],
  843                                       Token.nonBMPBlockRanges[location + 1]);
  844                       }
  845                       String n = Token.blockNames[i];
  846                       if (n.equals("Specials"))
  847                           r1.addRange(0xfff0, 0xfffd);
  848                       if (n.equals("Private Use")) {
  849                           r1.addRange(0xF0000,0xFFFFD);
  850                           r1.addRange(0x100000,0x10FFFD);
  851                       }
  852                       Token.categories.put(n, r1);
  853                       Token.categories2.put(n, Token.complementRanges(r1));
  854                       buffer.setLength(0);
  855                       buffer.append("Is");
  856                       if (n.indexOf(' ') >= 0) {
  857                           for (int ci = 0;  ci < n.length();  ci ++)
  858                               if (n.charAt(ci) != ' ')  buffer.append((char)n.charAt(ci));
  859                       }
  860                       else {
  861                           buffer.append(n);
  862                       }
  863                       Token.setAlias(buffer.toString(), n, true);
  864                   }
  865   
  866                   // TR#18 1.2
  867                   Token.setAlias("ASSIGNED", "Cn", false);
  868                   Token.setAlias("UNASSIGNED", "Cn", true);
  869                   Token all = Token.createRange();
  870                   all.addRange(0, Token.UTF16_MAX);
  871                   Token.categories.put("ALL", all);
  872                   Token.categories2.put("ALL", Token.complementRanges(all));
  873                   Token.registerNonXS("ASSIGNED");
  874                   Token.registerNonXS("UNASSIGNED");
  875                   Token.registerNonXS("ALL");
  876   
  877                   Token isalpha = Token.createRange();
  878                   isalpha.mergeRanges(ranges[Character.UPPERCASE_LETTER]); // Lu
  879                   isalpha.mergeRanges(ranges[Character.LOWERCASE_LETTER]); // Ll
  880                   isalpha.mergeRanges(ranges[Character.OTHER_LETTER]); // Lo
  881                   Token.categories.put("IsAlpha", isalpha);
  882                   Token.categories2.put("IsAlpha", Token.complementRanges(isalpha));
  883                   Token.registerNonXS("IsAlpha");
  884   
  885                   Token isalnum = Token.createRange();
  886                   isalnum.mergeRanges(isalpha);   // Lu Ll Lo
  887                   isalnum.mergeRanges(ranges[Character.DECIMAL_DIGIT_NUMBER]); // Nd
  888                   Token.categories.put("IsAlnum", isalnum);
  889                   Token.categories2.put("IsAlnum", Token.complementRanges(isalnum));
  890                   Token.registerNonXS("IsAlnum");
  891   
  892                   Token isspace = Token.createRange();
  893                   isspace.mergeRanges(Token.token_spaces);
  894                   isspace.mergeRanges(ranges[CHAR_SEPARATOR]); // Z
  895                   Token.categories.put("IsSpace", isspace);
  896                   Token.categories2.put("IsSpace", Token.complementRanges(isspace));
  897                   Token.registerNonXS("IsSpace");
  898   
  899                   Token isword = Token.createRange();
  900                   isword.mergeRanges(isalnum);     // Lu Ll Lo Nd
  901                   isword.addRange('_', '_');
  902                   Token.categories.put("IsWord", isword);
  903                   Token.categories2.put("IsWord", Token.complementRanges(isword));
  904                   Token.registerNonXS("IsWord");
  905   
  906                   Token isascii = Token.createRange();
  907                   isascii.addRange(0, 127);
  908                   Token.categories.put("IsASCII", isascii);
  909                   Token.categories2.put("IsASCII", Token.complementRanges(isascii));
  910                   Token.registerNonXS("IsASCII");
  911   
  912                   Token isnotgraph = Token.createRange();
  913                   isnotgraph.mergeRanges(ranges[CHAR_OTHER]);
  914                   isnotgraph.addRange(' ', ' ');
  915                   Token.categories.put("IsGraph", Token.complementRanges(isnotgraph));
  916                   Token.categories2.put("IsGraph", isnotgraph);
  917                   Token.registerNonXS("IsGraph");
  918   
  919                   Token isxdigit = Token.createRange();
  920                   isxdigit.addRange('0', '9');
  921                   isxdigit.addRange('A', 'F');
  922                   isxdigit.addRange('a', 'f');
  923                   Token.categories.put("IsXDigit", Token.complementRanges(isxdigit));
  924                   Token.categories2.put("IsXDigit", isxdigit);
  925                   Token.registerNonXS("IsXDigit");
  926   
  927                   Token.setAlias("IsDigit", "Nd", true);
  928                   Token.setAlias("IsUpper", "Lu", true);
  929                   Token.setAlias("IsLower", "Ll", true);
  930                   Token.setAlias("IsCntrl", "C", true);
  931                   Token.setAlias("IsPrint", "C", false);
  932                   Token.setAlias("IsPunct", "P", true);
  933                   Token.registerNonXS("IsDigit");
  934                   Token.registerNonXS("IsUpper");
  935                   Token.registerNonXS("IsLower");
  936                   Token.registerNonXS("IsCntrl");
  937                   Token.registerNonXS("IsPrint");
  938                   Token.registerNonXS("IsPunct");
  939   
  940                   Token.setAlias("alpha", "IsAlpha", true);
  941                   Token.setAlias("alnum", "IsAlnum", true);
  942                   Token.setAlias("ascii", "IsASCII", true);
  943                   Token.setAlias("cntrl", "IsCntrl", true);
  944                   Token.setAlias("digit", "IsDigit", true);
  945                   Token.setAlias("graph", "IsGraph", true);
  946                   Token.setAlias("lower", "IsLower", true);
  947                   Token.setAlias("print", "IsPrint", true);
  948                   Token.setAlias("punct", "IsPunct", true);
  949                   Token.setAlias("space", "IsSpace", true);
  950                   Token.setAlias("upper", "IsUpper", true);
  951                   Token.setAlias("word", "IsWord", true); // Perl extension
  952                   Token.setAlias("xdigit", "IsXDigit", true);
  953                   Token.registerNonXS("alpha");
  954                   Token.registerNonXS("alnum");
  955                   Token.registerNonXS("ascii");
  956                   Token.registerNonXS("cntrl");
  957                   Token.registerNonXS("digit");
  958                   Token.registerNonXS("graph");
  959                   Token.registerNonXS("lower");
  960                   Token.registerNonXS("print");
  961                   Token.registerNonXS("punct");
  962                   Token.registerNonXS("space");
  963                   Token.registerNonXS("upper");
  964                   Token.registerNonXS("word");
  965                   Token.registerNonXS("xdigit");
  966               } // synchronized
  967           } // if null
  968           RangeToken tok = positive ? (RangeToken)Token.categories.get(name)
  969               : (RangeToken)Token.categories2.get(name);
  970           //if (tok == null) System.out.println(name);
  971           return tok;
  972       }
  973       static protected RangeToken getRange(String name, boolean positive, boolean xs) {
  974           RangeToken range = Token.getRange(name, positive);
  975           if (xs && range != null && Token.isRegisterNonXS(name))
  976               range = null;
  977           return range;
  978       }
  979   
  980       static Hashtable nonxs = null;
  981       /**
  982        * This method is called by only getRange().
  983        * So this method need not MT-safe.
  984        */
  985       static protected void registerNonXS(String name) {
  986           if (Token.nonxs == null)
  987               Token.nonxs = new Hashtable();
  988           Token.nonxs.put(name, name);
  989       }
  990       static protected boolean isRegisterNonXS(String name) {
  991           if (Token.nonxs == null)
  992               return false;
  993           //DEBUG
  994           //System.err.println("isRegisterNonXS: "+name);
  995           return Token.nonxs.containsKey(name);
  996       }
  997   
  998       private static void setAlias(String newName, String name, boolean positive) {
  999           Token t1 = (Token)Token.categories.get(name);
 1000           Token t2 = (Token)Token.categories2.get(name);
 1001           if (positive) {
 1002               Token.categories.put(newName, t1);
 1003               Token.categories2.put(newName, t2);
 1004           } else {
 1005               Token.categories2.put(newName, t1);
 1006               Token.categories.put(newName, t2);
 1007           }
 1008       }
 1009   
 1010       // ------------------------------------------------------
 1011   
            /**
             * The virama-type signs listed below, one character each; the trailing
             * comment on every line names the character. getGraphemePattern() loops
             * over this string while it builds its virama range.
             */
 1012       static final String viramaString =
 1013       "\u094D"// ;DEVANAGARI SIGN VIRAMA;Mn;9;ON;;;;;N;;;;;
 1014       +"\u09CD"//;BENGALI SIGN VIRAMA;Mn;9;ON;;;;;N;;;;;
 1015       +"\u0A4D"//;GURMUKHI SIGN VIRAMA;Mn;9;ON;;;;;N;;;;;
 1016       +"\u0ACD"//;GUJARATI SIGN VIRAMA;Mn;9;ON;;;;;N;;;;;
 1017       +"\u0B4D"//;ORIYA SIGN VIRAMA;Mn;9;ON;;;;;N;;;;;
 1018       +"\u0BCD"//;TAMIL SIGN VIRAMA;Mn;9;ON;;;;;N;;;;;
 1019       +"\u0C4D"//;TELUGU SIGN VIRAMA;Mn;9;ON;;;;;N;;;;;
 1020       +"\u0CCD"//;KANNADA SIGN VIRAMA;Mn;9;ON;;;;;N;;;;;
 1021       +"\u0D4D"//;MALAYALAM SIGN VIRAMA;Mn;9;ON;;;;;N;;;;;
 1022       +"\u0E3A"//;THAI CHARACTER PHINTHU;Mn;9;ON;;;;;N;THAI VOWEL SIGN PHINTHU;;;;
 1023       +"\u0F84";//;TIBETAN MARK HALANTA;Mn;9;ON;;;;;N;TIBETAN VIRAMA;;;;
 1024   
 1025       static private Token token_grapheme = null;
 1026       static synchronized Token getGraphemePattern() {
 1027           if (Token.token_grapheme != null)
 1028               return Token.token_grapheme;
 1029   
 1030           Token base_char = Token.createRange();  // [{ASSIGNED}]-[{M},{C}]
 1031           base_char.mergeRanges(Token.getRange("ASSIGNED", true));
 1032           base_char.subtractRanges(Token.getRange("M", true));
 1033           base_char.subtractRanges(Token.getRange("C", true));
 1034   
 1035           Token virama = Token.createRange();
 1036           for (int i = 0;  i < Token.viramaString.length(); i++) {
 1037               virama.addRange(i, i);
 1038           }
 1039   
 1040           Token combiner_wo_virama = Token.createRange();
 1041           combiner_wo_virama.mergeRanges(Token.getRange("M", true));
 1042           combiner_wo_virama.addRange(0x1160, 0x11ff); // hangul_medial and hangul_final
 1043           combiner_wo_virama.addRange(0xff9e, 0xff9f); // extras
 1044   
 1045           Token left = Token.createUnion();       // base_char?
 1046           left.addChild(base_char);
 1047           left.addChild(Token.token_empty);
 1048   
 1049           Token foo = Token.createUnion();
 1050           foo.addChild(Token.createConcat(virama, Token.getRange("L", true)));
 1051           foo.addChild(combiner_wo_virama);
 1052   
 1053           foo = Token.createClosure(foo);
 1054   
 1055           foo = Token.createConcat(left, foo);
 1056   
 1057           Token.token_grapheme = foo;
 1058           return Token.token_grapheme;
 1059       }
 1060   
 1061       /**
 1062        * Combing Character Sequence in Perl 5.6.
 1063        */
 1064       static private Token token_ccs = null;
 1065       static synchronized Token getCombiningCharacterSequence() {
 1066           if (Token.token_ccs != null)
 1067               return Token.token_ccs;
 1068   
 1069           Token foo = Token.createClosure(Token.getRange("M", true)); // \pM*
 1070           foo = Token.createConcat(Token.getRange("M", false), foo); // \PM + \pM*
 1071           Token.token_ccs = foo;
 1072           return Token.token_ccs;
 1073       }
 1074   
 1075       // ------------------------------------------------------
 1076   
 1077       // ------------------------------------------------------
 1078       /**
 1079        * This class represents a node in parse tree.
 1080        */
 1081       static class StringToken extends Token implements java.io.Serializable {
 1082   
 1083           private static final long serialVersionUID = -4614366944218504172L;
 1084           
 1085           String string;
 1086           final int refNumber;
 1087   
 1088           StringToken(int type, String str, int n) {
 1089               super(type);
 1090               this.string = str;
 1091               this.refNumber = n;
 1092           }
 1093   
 1094           int getReferenceNumber() {              // for STRING
 1095               return this.refNumber;
 1096           }
 1097           String getString() {                    // for STRING
 1098               return this.string;
 1099           }
 1100           
 1101           public String toString(int options) {
 1102               if (this.type == BACKREFERENCE)
 1103                   return "\\"+this.refNumber;
 1104               else
 1105                   return REUtil.quoteMeta(this.string);
 1106           }
 1107       }
 1108   
 1109       /**
 1110        * This class represents a node in parse tree.
 1111        */
 1112       static class ConcatToken extends Token implements java.io.Serializable {
 1113   
 1114           private static final long serialVersionUID = 8717321425541346381L;
 1115           
 1116           final Token child;
 1117           final Token child2;
 1118           
 1119           ConcatToken(Token t1, Token t2) {
 1120               super(Token.CONCAT);
 1121               this.child = t1;
 1122               this.child2 = t2;
 1123           }
 1124   
 1125           int size() {
 1126               return 2;
 1127           }
 1128           Token getChild(int index) {
 1129               return index == 0 ? this.child : this.child2;
 1130           }
 1131   
 1132           public String toString(int options) {
 1133               String ret;
 1134               if (this.child2.type == CLOSURE && this.child2.getChild(0) == this.child) {
 1135                   ret = this.child.toString(options)+"+";
 1136               } else if (this.child2.type == NONGREEDYCLOSURE && this.child2.getChild(0) == this.child) {
 1137                   ret = this.child.toString(options)+"+?";
 1138               } else
 1139                   ret = this.child.toString(options)+this.child2.toString(options);
 1140               return ret;
 1141           }
 1142       }
 1143   
 1144       /**
 1145        * This class represents a node in parse tree.
 1146        */
 1147       static class CharToken extends Token implements java.io.Serializable {
 1148   
 1149           private static final long serialVersionUID = -4394272816279496989L;
 1150           
 1151           final int chardata;
 1152   
 1153           CharToken(int type, int ch) {
 1154               super(type);
 1155               this.chardata = ch;
 1156           }
 1157   
 1158           int getChar() {
 1159               return this.chardata;
 1160           }
 1161   
 1162           public String toString(int options) {
 1163               String ret;
 1164               switch (this.type) {
 1165                 case CHAR:
 1166                   switch (this.chardata) {
 1167                     case '|':  case '*':  case '+':  case '?':
 1168                     case '(':  case ')':  case '.':  case '[':
 1169                     case '{':  case '\\':
 1170                       ret = "\\"+(char)this.chardata;
 1171                       break;
 1172                     case '\f':  ret = "\\f";  break;
 1173                     case '\n':  ret = "\\n";  break;
 1174                     case '\r':  ret = "\\r";  break;
 1175                     case '\t':  ret = "\\t";  break;
 1176                     case 0x1b:  ret = "\\e";  break;
 1177                       //case 0x0b:  ret = "\\v";  break;
 1178                     default:
 1179                       if (this.chardata >= 0x10000) {
 1180                           String pre = "0"+Integer.toHexString(this.chardata);
 1181                           ret = "\\v"+pre.substring(pre.length()-6, pre.length());
 1182                       } else
 1183                           ret = ""+(char)this.chardata;
 1184                   }
 1185                   break;
 1186   
 1187                 case ANCHOR:
 1188                   if (this == Token.token_linebeginning || this == Token.token_lineend)
 1189                       ret = ""+(char)this.chardata;
 1190                   else 
 1191                       ret = "\\"+(char)this.chardata;
 1192                   break;
 1193   
 1194                 default:
 1195                   ret = null;
 1196               }
 1197               return ret;
 1198           }
 1199   
 1200           boolean match(int ch) {
 1201               if (this.type == CHAR) {
 1202                   return ch == this.chardata;
 1203               } else
 1204                   throw new RuntimeException("NFAArrow#match(): Internal error: "+this.type);
 1205           }
 1206       }
 1207   
 1208       /**
 1209        * This class represents a node in parse tree.
 1210        */
 1211       static class ClosureToken extends Token implements java.io.Serializable {
 1212   
 1213           private static final long serialVersionUID = 1308971930673997452L;
 1214           
 1215           int min;
 1216           int max;
 1217           final Token child;
 1218   
 1219           ClosureToken(int type, Token tok) {
 1220               super(type);
 1221               this.child = tok;
 1222               this.setMin(-1);
 1223               this.setMax(-1);
 1224           }
 1225   
 1226           int size() {
 1227               return 1;
 1228           }
 1229           Token getChild(int index) {
 1230               return this.child;
 1231           }
 1232   
 1233           final void setMin(int min) {
 1234               this.min = min;
 1235           }
 1236           final void setMax(int max) {
 1237               this.max = max;
 1238           }
 1239           final int getMin() {
 1240               return this.min;
 1241           }
 1242           final int getMax() {
 1243               return this.max;
 1244           }
 1245   
 1246           public String toString(int options) {
 1247               String ret;
 1248               if (this.type == CLOSURE) {
 1249                   if (this.getMin() < 0 && this.getMax() < 0) {
 1250                       ret = this.child.toString(options)+"*";
 1251                   } else if (this.getMin() == this.getMax()) {
 1252                       ret = this.child.toString(options)+"{"+this.getMin()+"}";
 1253                   } else if (this.getMin() >= 0 && this.getMax() >= 0) {
 1254                       ret = this.child.toString(options)+"{"+this.getMin()+","+this.getMax()+"}";
 1255                   } else if (this.getMin() >= 0 && this.getMax() < 0) {
 1256                       ret = this.child.toString(options)+"{"+this.getMin()+",}";
 1257                   } else
 1258                       throw new RuntimeException("Token#toString(): CLOSURE "
 1259                                                  +this.getMin()+", "+this.getMax());
 1260               } else {
 1261                   if (this.getMin() < 0 && this.getMax() < 0) {
 1262                       ret = this.child.toString(options)+"*?";
 1263                   } else if (this.getMin() == this.getMax()) {
 1264                       ret = this.child.toString(options)+"{"+this.getMin()+"}?";
 1265                   } else if (this.getMin() >= 0 && this.getMax() >= 0) {
 1266                       ret = this.child.toString(options)+"{"+this.getMin()+","+this.getMax()+"}?";
 1267                   } else if (this.getMin() >= 0 && this.getMax() < 0) {
 1268                       ret = this.child.toString(options)+"{"+this.getMin()+",}?";
 1269                   } else
 1270                       throw new RuntimeException("Token#toString(): NONGREEDYCLOSURE "
 1271                                                  +this.getMin()+", "+this.getMax());
 1272               }
 1273               return ret;
 1274           }
 1275       }
 1276   
 1277       /**
 1278        * This class represents a node in parse tree.
 1279        */
 1280       static class ParenToken extends Token implements java.io.Serializable {
 1281   
 1282           private static final long serialVersionUID = -5938014719827987704L;
 1283           
 1284           final Token child;
 1285           final int parennumber;
 1286   
 1287           ParenToken(int type, Token tok, int paren) {
 1288               super(type);
 1289               this.child = tok;
 1290               this.parennumber = paren;
 1291           }
 1292   
 1293           int size() {
 1294               return 1;
 1295           }
 1296           Token getChild(int index) {
 1297               return this.child;
 1298           }
 1299   
 1300           int getParenNumber() {
 1301               return this.parennumber;
 1302           }
 1303   
 1304           public String toString(int options) {
 1305               String ret = null;
 1306               switch (this.type) {
 1307                 case PAREN:
 1308                   if (this.parennumber == 0) {
 1309                       ret = "(?:"+this.child.toString(options)+")";
 1310                   } else {
 1311                       ret = "("+this.child.toString(options)+")";
 1312                   }
 1313                   break;
 1314   
 1315                 case LOOKAHEAD:
 1316                   ret = "(?="+this.child.toString(options)+")";
 1317                   break;
 1318                 case NEGATIVELOOKAHEAD:
 1319                   ret = "(?!"+this.child.toString(options)+")";
 1320                   break;
 1321                 case LOOKBEHIND:
 1322                   ret = "(?<="+this.child.toString(options)+")";
 1323                   break;
 1324                 case NEGATIVELOOKBEHIND:
 1325                   ret = "(?<!"+this.child.toString(options)+")";
 1326                   break;
 1327                 case INDEPENDENT:
 1328                   ret = "(?>"+this.child.toString(options)+")";
 1329                   break;
 1330               }
 1331               return ret;
 1332           }
 1333       }
 1334   
 1335       /**
 1336        * (?(condition)yes-pattern|no-pattern)
 1337        */
 1338       static class ConditionToken extends Token implements java.io.Serializable {
 1339   
 1340           private static final long serialVersionUID = 4353765277910594411L;
 1341           
 1342           final int refNumber;
 1343           final Token condition;
 1344           final Token yes;
 1345           final Token no;
 1346           ConditionToken(int refno, Token cond, Token yespat, Token nopat) {
 1347               super(Token.CONDITION);
 1348               this.refNumber = refno;
 1349               this.condition = cond;
 1350               this.yes = yespat;
 1351               this.no = nopat;
 1352           }
 1353           int size() {
 1354               return this.no == null ? 1 : 2;
 1355           }
 1356           Token getChild(int index) {
 1357               if (index == 0)  return this.yes;
 1358               if (index == 1)  return this.no;
 1359               throw new RuntimeException("Internal Error: "+index);
 1360           }
 1361   
 1362           public String toString(int options) {
 1363               String ret;
 1364               if (refNumber > 0) {
 1365                   ret = "(?("+refNumber+")";
 1366               } else if (this.condition.type == Token.ANCHOR) {
 1367                   ret = "(?("+this.condition+")";
 1368               } else {
 1369                   ret = "(?"+this.condition;
 1370               }
 1371   
 1372               if (this.no == null) {
 1373                   ret += this.yes+")";
 1374               } else {
 1375                   ret += this.yes+"|"+this.no+")";
 1376               }
 1377               return ret;
 1378           }
 1379       }
 1380   
 1381       /**
 1382        * (ims-ims: .... )
 1383        */
 1384       static class ModifierToken extends Token implements java.io.Serializable {
 1385   
 1386           private static final long serialVersionUID = -9114536559696480356L;
 1387           
 1388           final Token child;
 1389           final int add;
 1390           final int mask;
 1391   
 1392           ModifierToken(Token tok, int add, int mask) {
 1393               super(Token.MODIFIERGROUP);
 1394               this.child = tok;
 1395               this.add = add;
 1396               this.mask = mask;
 1397           }
 1398   
 1399           int size() {
 1400               return 1;
 1401           }
 1402           Token getChild(int index) {
 1403               return this.child;
 1404           }
 1405   
 1406           int getOptions() {
 1407               return this.add;
 1408           }
 1409           int getOptionsMask() {
 1410               return this.mask;
 1411           }
 1412   
 1413           public String toString(int options) {
 1414               return "(?"
 1415                   +(this.add == 0 ? "" : REUtil.createOptionString(this.add))
 1416                   +(this.mask == 0 ? "" : REUtil.createOptionString(this.mask))
 1417                   +":"
 1418                   +this.child.toString(options)
 1419                   +")";
 1420           }
 1421       }
 1422   
 1423       /**
 1424        * This class represents a node in parse tree.
 1425        * for UNION or CONCAT.
 1426        */
 1427       static class UnionToken extends Token implements java.io.Serializable {
 1428   
 1429           private static final long serialVersionUID = -2568843945989489861L;
 1430           
 1431           Vector children;
 1432   
 1433           UnionToken(int type) {
 1434               super(type);
 1435           }
 1436   
 1437           void addChild(Token tok) {
 1438               if (tok == null)  return;
 1439               if (this.children == null)  this.children = new Vector();
 1440               if (this.type == UNION) {
 1441                   this.children.addElement(tok);
 1442                   return;
 1443               }
 1444                                                   // This is CONCAT, and new child is CONCAT.
 1445               if (tok.type == CONCAT) {
 1446                   for (int i = 0;  i < tok.size();  i ++)
 1447                       this.addChild(tok.getChild(i)); // Recursion
 1448                   return;
 1449               }
 1450               int size = this.children.size();
 1451               if (size == 0) {
 1452                   this.children.addElement(tok);
 1453                   return;
 1454               }
 1455               Token previous = (Token)this.children.elementAt(size-1);
 1456               if (!((previous.type == CHAR || previous.type == STRING)
 1457                     && (tok.type == CHAR || tok.type == STRING))) {
 1458                   this.children.addElement(tok);
 1459                   return;
 1460               }
 1461               
 1462               //System.err.println("Merge '"+previous+"' and '"+tok+"'.");
 1463   
 1464               StringBuffer buffer;
 1465               int nextMaxLength = (tok.type == CHAR ? 2 : tok.getString().length());
 1466               if (previous.type == CHAR) {        // Replace previous token by STRING
 1467                   buffer = new StringBuffer(2 + nextMaxLength);
 1468                   int ch = previous.getChar();
 1469                   if (ch >= 0x10000)
 1470                       buffer.append(REUtil.decomposeToSurrogates(ch));
 1471                   else
 1472                       buffer.append((char)ch);
 1473                   previous = Token.createString(null);
 1474                   this.children.setElementAt(previous, size-1);
 1475               } else {                            // STRING
 1476                   buffer = new StringBuffer(previous.getString().length() + nextMaxLength);
 1477                   buffer.append(previous.getString());
 1478               }
 1479   
 1480               if (tok.type == CHAR) {
 1481                   int ch = tok.getChar();
 1482                   if (ch >= 0x10000)
 1483                       buffer.append(REUtil.decomposeToSurrogates(ch));
 1484                   else
 1485                       buffer.append((char)ch);
 1486               } else {
 1487                   buffer.append(tok.getString());
 1488               }
 1489   
 1490               ((StringToken)previous).string = new String(buffer);
 1491           }
 1492   
 1493           int size() {
 1494               return this.children == null ? 0 : this.children.size();
 1495           }
 1496           Token getChild(int index) {
 1497               return (Token)this.children.elementAt(index);
 1498           }
 1499   
 1500           public String toString(int options) {
 1501               String ret;
 1502               if (this.type == CONCAT) {
 1503                   if (this.children.size() == 2) {
 1504                       Token ch = this.getChild(0);
 1505                       Token ch2 = this.getChild(1);
 1506                       if (ch2.type == CLOSURE && ch2.getChild(0) == ch) {
 1507                           ret = ch.toString(options)+"+";
 1508                       } else if (ch2.type == NONGREEDYCLOSURE && ch2.getChild(0) == ch) {
 1509                           ret = ch.toString(options)+"+?";
 1510                       } else
 1511                           ret = ch.toString(options)+ch2.toString(options);
 1512                   } else {
 1513                       StringBuffer sb = new StringBuffer();
 1514                       for (int i = 0;  i < this.children.size();  i ++) {
 1515                           sb.append(((Token)this.children.elementAt(i)).toString(options));
 1516                       }
 1517                       ret = new String(sb);
 1518                   }
 1519                   return ret;
 1520               }
 1521               if (this.children.size() == 2 && this.getChild(1).type == EMPTY) {
 1522                   ret = this.getChild(0).toString(options)+"?";
 1523               } else if (this.children.size() == 2
 1524                          && this.getChild(0).type == EMPTY) {
 1525                   ret = this.getChild(1).toString(options)+"??";
 1526               } else {
 1527                   StringBuffer sb = new StringBuffer();
 1528                   sb.append(((Token)this.children.elementAt(0)).toString(options));
 1529                   for (int i = 1;  i < this.children.size();  i ++) {
 1530                       sb.append((char)'|');
 1531                       sb.append(((Token)this.children.elementAt(i)).toString(options));
 1532                   }
 1533                   ret = new String(sb);
 1534               }
 1535               return ret;
 1536           }
 1537       }
 1538   }

Home » Xerces-J-src.2.9.1 » org.apache.xerces » impl » xpath » regex » [javadoc | source]