Home » Xerces-J-src.2.9.1 » org.apache.xerces » impl » xpath » regex » [javadoc | source]

    1   /*
    2    * Licensed to the Apache Software Foundation (ASF) under one or more
    3    * contributor license agreements.  See the NOTICE file distributed with
    4    * this work for additional information regarding copyright ownership.
    5    * The ASF licenses this file to You under the Apache License, Version 2.0
    6    * (the "License"); you may not use this file except in compliance with
    7    * the License.  You may obtain a copy of the License at
    8    * 
    9    *      http://www.apache.org/licenses/LICENSE-2.0
   10    * 
   11    * Unless required by applicable law or agreed to in writing, software
   12    * distributed under the License is distributed on an "AS IS" BASIS,
   13    * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   14    * See the License for the specific language governing permissions and
   15    * limitations under the License.
   16    */
   17   
   18   package org.apache.xerces.impl.xpath.regex;
   19   
   20   /**
   21    * This class represents a character class such as [a-z] or a period.
   22    * 
   23    * @xerces.internal
   24    *
   25    * @version $Id: RangeToken.java 504431 2007-02-07 04:25:14Z mrglavas $
   26    */
   27   final class RangeToken extends Token implements java.io.Serializable {
   28   
   29       private static final long serialVersionUID = -553983121197679934L;
   30       
   31       int[] ranges;
   32       boolean sorted;
   33       boolean compacted;
   34       RangeToken icaseCache = null;
   35       int[] map = null;
   36       int nonMapIndex;
   37   
   38       RangeToken(int type) {
   39           super(type);
   40           this.setSorted(false);
   41       }
   42   
   43                                                   // for RANGE or NRANGE
   44       protected void addRange(int start, int end) {
   45           this.icaseCache = null;
   46           //System.err.println("Token#addRange(): "+start+" "+end);
   47           int r1, r2;
   48           if (start <= end) {
   49               r1 = start;
   50               r2 = end;
   51           } else {
   52               r1 = end;
   53               r2 = start;
   54           }
   55   
   56           int pos = 0;
   57           if (this.ranges == null) {
   58               this.ranges = new int[2];
   59               this.ranges[0] = r1;
   60               this.ranges[1] = r2;
   61               this.setSorted(true);
   62           } else {
   63               pos = this.ranges.length;
   64               if (this.ranges[pos-1]+1 == r1) {
   65                   this.ranges[pos-1] = r2;
   66                   return;
   67               }
   68               int[] temp = new int[pos+2];
   69               System.arraycopy(this.ranges, 0, temp, 0, pos);
   70               this.ranges = temp;
   71               if (this.ranges[pos-1] >= r1)
   72                   this.setSorted(false);
   73               this.ranges[pos++] = r1;
   74               this.ranges[pos] = r2;
   75               if (!this.sorted)
   76                   this.sortRanges();
   77           }
   78       }
   79   
   80       private final boolean isSorted() {
   81           return this.sorted;
   82       }
   83       private final void setSorted(boolean sort) {
   84           this.sorted = sort;
   85           if (!sort)  this.compacted = false;
   86       }
   87       private final boolean isCompacted() {
   88           return this.compacted;
   89       }
   90       private final void setCompacted() {
   91           this.compacted = true;
   92       }
   93   
   94       protected void sortRanges() {
   95           if (this.isSorted())
   96               return;
   97           if (this.ranges == null)
   98               return;
   99           //System.err.println("Do sorting: "+this.ranges.length);
  100   
  101                                                   // Bubble sort
  102                                                   // Why? -- In many cases,
  103                                                   //         this.ranges has few elements.
  104           for (int i = this.ranges.length-4;  i >= 0;  i -= 2) {
  105               for (int j = 0;  j <= i;  j += 2) {
  106                   if (this.ranges[j] > this.ranges[j+2]
  107                       || this.ranges[j] == this.ranges[j+2] && this.ranges[j+1] > this.ranges[j+3]) {
  108                       int tmp;
  109                       tmp = this.ranges[j+2];
  110                       this.ranges[j+2] = this.ranges[j];
  111                       this.ranges[j] = tmp;
  112                       tmp = this.ranges[j+3];
  113                       this.ranges[j+3] = this.ranges[j+1];
  114                       this.ranges[j+1] = tmp;
  115                   }
  116               }
  117           }
  118           this.setSorted(true);
  119       }
  120   
  121       /**
  122        * this.ranges is sorted.
  123        */
  124       protected void compactRanges() {
  125           boolean DEBUG = false;
  126           if (this.ranges == null || this.ranges.length <= 2)
  127               return;
  128           if (this.isCompacted())
  129               return;
  130           int base = 0;                           // Index of writing point
  131           int target = 0;                         // Index of processing point
  132   
  133           while (target < this.ranges.length) {
  134               if (base != target) {
  135                   this.ranges[base] = this.ranges[target++];
  136                   this.ranges[base+1] = this.ranges[target++];
  137               } else
  138                   target += 2;
  139               int baseend = this.ranges[base+1];
  140               while (target < this.ranges.length) {
  141                   if (baseend+1 < this.ranges[target])
  142                       break;
  143                   if (baseend+1 == this.ranges[target]) {
  144                       if (DEBUG)
  145                           System.err.println("Token#compactRanges(): Compaction: ["+this.ranges[base]
  146                                              +", "+this.ranges[base+1]
  147                                              +"], ["+this.ranges[target]
  148                                              +", "+this.ranges[target+1]
  149                                              +"] -> ["+this.ranges[base]
  150                                              +", "+this.ranges[target+1]
  151                                              +"]");
  152                       this.ranges[base+1] = this.ranges[target+1];
  153                       baseend = this.ranges[base+1];
  154                       target += 2;
  155                   } else if (baseend >= this.ranges[target+1]) {
  156                       if (DEBUG)
  157                           System.err.println("Token#compactRanges(): Compaction: ["+this.ranges[base]
  158                                              +", "+this.ranges[base+1]
  159                                              +"], ["+this.ranges[target]
  160                                              +", "+this.ranges[target+1]
  161                                              +"] -> ["+this.ranges[base]
  162                                              +", "+this.ranges[base+1]
  163                                              +"]");
  164                       target += 2;
  165                   } else if (baseend < this.ranges[target+1]) {
  166                       if (DEBUG)
  167                           System.err.println("Token#compactRanges(): Compaction: ["+this.ranges[base]
  168                                              +", "+this.ranges[base+1]
  169                                              +"], ["+this.ranges[target]
  170                                              +", "+this.ranges[target+1]
  171                                              +"] -> ["+this.ranges[base]
  172                                              +", "+this.ranges[target+1]
  173                                              +"]");
  174                       this.ranges[base+1] = this.ranges[target+1];
  175                       baseend = this.ranges[base+1];
  176                       target += 2;
  177                   } else {
  178                       throw new RuntimeException("Token#compactRanges(): Internel Error: ["
  179                                                  +this.ranges[base]
  180                                                  +","+this.ranges[base+1]
  181                                                  +"] ["+this.ranges[target]
  182                                                  +","+this.ranges[target+1]+"]");
  183                   }
  184               } // while
  185               base += 2;
  186           }
  187   
  188           if (base != this.ranges.length) {
  189               int[] result = new int[base];
  190               System.arraycopy(this.ranges, 0, result, 0, base);
  191               this.ranges = result;
  192           }
  193           this.setCompacted();
  194       }
  195   
  196       protected void mergeRanges(Token token) {
  197           RangeToken tok = (RangeToken)token;
  198           this.sortRanges();
  199           tok.sortRanges();
  200           if (tok.ranges == null)
  201               return;
  202           this.icaseCache = null;
  203           this.setSorted(true);
  204           if (this.ranges == null) {
  205               this.ranges = new int[tok.ranges.length];
  206               System.arraycopy(tok.ranges, 0, this.ranges, 0, tok.ranges.length);
  207               return;
  208           }
  209           int[] result = new int[this.ranges.length+tok.ranges.length];
  210           for (int i = 0, j = 0, k = 0;  i < this.ranges.length || j < tok.ranges.length;) {
  211               if (i >= this.ranges.length) {
  212                   result[k++] = tok.ranges[j++];
  213                   result[k++] = tok.ranges[j++];
  214               } else if (j >= tok.ranges.length) {
  215                   result[k++] = this.ranges[i++];
  216                   result[k++] = this.ranges[i++];
  217               } else if (tok.ranges[j] < this.ranges[i]
  218                          || tok.ranges[j] == this.ranges[i] && tok.ranges[j+1] < this.ranges[i+1]) {
  219                   result[k++] = tok.ranges[j++];
  220                   result[k++] = tok.ranges[j++];
  221               } else {
  222                   result[k++] = this.ranges[i++];
  223                   result[k++] = this.ranges[i++];
  224               }
  225           }
  226           this.ranges = result;
  227       }
  228   
  229       protected void subtractRanges(Token token) {
  230           if (token.type == NRANGE) {
  231               this.intersectRanges(token);
  232               return;
  233           }
  234           RangeToken tok = (RangeToken)token;
  235           if (tok.ranges == null || this.ranges == null)
  236               return;
  237           this.icaseCache = null;
  238           this.sortRanges();
  239           this.compactRanges();
  240           tok.sortRanges();
  241           tok.compactRanges();
  242   
  243           //System.err.println("Token#substractRanges(): Entry: "+this.ranges.length+", "+tok.ranges.length);
  244   
  245           int[] result = new int[this.ranges.length+tok.ranges.length];
  246           int wp = 0, src = 0, sub = 0;
  247           while (src < this.ranges.length && sub < tok.ranges.length) {
  248               int srcbegin = this.ranges[src];
  249               int srcend = this.ranges[src+1];
  250               int subbegin = tok.ranges[sub];
  251               int subend = tok.ranges[sub+1];
  252               if (srcend < subbegin) {            // Not overlapped
  253                                                   // src: o-----o
  254                                                   // sub:         o-----o
  255                                                   // res: o-----o
  256                                                   // Reuse sub
  257                   result[wp++] = this.ranges[src++];
  258                   result[wp++] = this.ranges[src++];
  259               } else if (srcend >= subbegin
  260                          && srcbegin <= subend) { // Overlapped
  261                                                   // src:    o--------o
  262                                                   // sub:  o----o
  263                                                   // sub:      o----o
  264                                                   // sub:          o----o
  265                                                   // sub:  o------------o
  266                   if (subbegin <= srcbegin && srcend <= subend) {
  267                                                   // src:    o--------o
  268                                                   // sub:  o------------o
  269                                                   // res: empty
  270                                                   // Reuse sub
  271                       src += 2;
  272                   } else if (subbegin <= srcbegin) {
  273                                                   // src:    o--------o
  274                                                   // sub:  o----o
  275                                                   // res:       o-----o
  276                                                   // Reuse src(=res)
  277                       this.ranges[src] = subend+1;
  278                       sub += 2;
  279                   } else if (srcend <= subend) {
  280                                                   // src:    o--------o
  281                                                   // sub:          o----o
  282                                                   // res:    o-----o
  283                                                   // Reuse sub
  284                       result[wp++] = srcbegin;
  285                       result[wp++] = subbegin-1;
  286                       src += 2;
  287                   } else {
  288                                                   // src:    o--------o
  289                                                   // sub:      o----o
  290                                                   // res:    o-o    o-o
  291                                                   // Reuse src(=right res)
  292                       result[wp++] = srcbegin;
  293                       result[wp++] = subbegin-1;
  294                       this.ranges[src] = subend+1;
  295                       sub += 2;
  296                   }
  297               } else if (subend < srcbegin) {
  298                                                   // Not overlapped
  299                                                   // src:          o-----o
  300                                                   // sub: o----o
  301                   sub += 2;
  302               } else {
  303                   throw new RuntimeException("Token#subtractRanges(): Internal Error: ["+this.ranges[src]
  304                                              +","+this.ranges[src+1]
  305                                              +"] - ["+tok.ranges[sub]
  306                                              +","+tok.ranges[sub+1]
  307                                              +"]");
  308               }
  309           }
  310           while (src < this.ranges.length) {
  311               result[wp++] = this.ranges[src++];
  312               result[wp++] = this.ranges[src++];
  313           }
  314           this.ranges = new int[wp];
  315           System.arraycopy(result, 0, this.ranges, 0, wp);
  316                                                   // this.ranges is sorted and compacted.
  317       }
  318   
  319       /**
  320        * @param tok Ignore whether it is NRANGE or not.
  321        */
  322       protected void intersectRanges(Token token) {
  323           RangeToken tok = (RangeToken)token;
  324           if (tok.ranges == null || this.ranges == null)
  325               return;
  326           this.icaseCache = null;
  327           this.sortRanges();
  328           this.compactRanges();
  329           tok.sortRanges();
  330           tok.compactRanges();
  331   
  332           int[] result = new int[this.ranges.length+tok.ranges.length];
  333           int wp = 0, src1 = 0, src2 = 0;
  334           while (src1 < this.ranges.length && src2 < tok.ranges.length) {
  335               int src1begin = this.ranges[src1];
  336               int src1end = this.ranges[src1+1];
  337               int src2begin = tok.ranges[src2];
  338               int src2end = tok.ranges[src2+1];
  339               if (src1end < src2begin) {          // Not overlapped
  340                                                   // src1: o-----o
  341                                                   // src2:         o-----o
  342                                                   // res:  empty
  343                                                   // Reuse src2
  344                   src1 += 2;
  345               } else if (src1end >= src2begin
  346                          && src1begin <= src2end) { // Overlapped
  347                                                   // src1:    o--------o
  348                                                   // src2:  o----o
  349                                                   // src2:      o----o
  350                                                   // src2:          o----o
  351                                                   // src2:  o------------o
  352                   if (src2begin <= src1begin && src1end <= src2end) {
  353                                                   // src1:    o--------o
  354                                                   // src2:  o------------o
  355                                                   // res:     o--------o
  356                                                   // Reuse src2
  357                       result[wp++] = src1begin;
  358                       result[wp++] = src1end;
  359                       src1 += 2;
  360                   } else if (src2begin <= src1begin) {
  361                                                   // src1:    o--------o
  362                                                   // src2:  o----o
  363                                                   // res:     o--o
  364                                                   // Reuse the rest of src1
  365                       result[wp++] = src1begin;
  366                       result[wp++] = src2end;
  367                       this.ranges[src1] = src2end+1;
  368                       src2 += 2;
  369                   } else if (src1end <= src2end) {
  370                                                   // src1:    o--------o
  371                                                   // src2:          o----o
  372                                                   // res:           o--o
  373                                                   // Reuse src2
  374                       result[wp++] = src2begin;
  375                       result[wp++] = src1end;
  376                       src1 += 2;
  377                   } else {
  378                                                   // src1:    o--------o
  379                                                   // src2:      o----o
  380                                                   // res:       o----o
  381                                                   // Reuse the rest of src1
  382                       result[wp++] = src2begin;
  383                       result[wp++] = src2end;
  384                       this.ranges[src1] = src2end+1;
  385                   }
  386               } else if (src2end < src1begin) {
  387                                                   // Not overlapped
  388                                                   // src1:          o-----o
  389                                                   // src2: o----o
  390                   src2 += 2;
  391               } else {
  392                   throw new RuntimeException("Token#intersectRanges(): Internal Error: ["
  393                                              +this.ranges[src1]
  394                                              +","+this.ranges[src1+1]
  395                                              +"] & ["+tok.ranges[src2]
  396                                              +","+tok.ranges[src2+1]
  397                                              +"]");
  398               }
  399           }
  400           while (src1 < this.ranges.length) {
  401               result[wp++] = this.ranges[src1++];
  402               result[wp++] = this.ranges[src1++];
  403           }
  404           this.ranges = new int[wp];
  405           System.arraycopy(result, 0, this.ranges, 0, wp);
  406                                                   // this.ranges is sorted and compacted.
  407       }
  408   
  409       /**
  410        * for RANGE: Creates complement.
  411        * for NRANGE: Creates the same meaning RANGE.
  412        */
  413       static Token complementRanges(Token token) {
  414           if (token.type != RANGE && token.type != NRANGE)
  415               throw new IllegalArgumentException("Token#complementRanges(): must be RANGE: "+token.type);
  416           RangeToken tok = (RangeToken)token;
  417           tok.sortRanges();
  418           tok.compactRanges();
  419           int len = tok.ranges.length+2;
  420           if (tok.ranges[0] == 0)
  421               len -= 2;
  422           int last = tok.ranges[tok.ranges.length-1];
  423           if (last == UTF16_MAX)
  424               len -= 2;
  425           RangeToken ret = Token.createRange();
  426           ret.ranges = new int[len];
  427           int wp = 0;
  428           if (tok.ranges[0] > 0) {
  429               ret.ranges[wp++] = 0;
  430               ret.ranges[wp++] = tok.ranges[0]-1;
  431           }
  432           for (int i = 1;  i < tok.ranges.length-2;  i += 2) {
  433               ret.ranges[wp++] = tok.ranges[i]+1;
  434               ret.ranges[wp++] = tok.ranges[i+1]-1;
  435           }
  436           if (last != UTF16_MAX) {
  437               ret.ranges[wp++] = last+1;
  438               ret.ranges[wp] = UTF16_MAX;
  439           }
  440           ret.setCompacted();
  441           return ret;
  442       }
  443   
  444       synchronized RangeToken getCaseInsensitiveToken() {
  445           if (this.icaseCache != null)
  446               return this.icaseCache;
  447               
  448           RangeToken uppers = this.type == Token.RANGE ? Token.createRange() : Token.createNRange();
  449           for (int i = 0;  i < this.ranges.length;  i += 2) {
  450               for (int ch = this.ranges[i];  ch <= this.ranges[i+1];  ch ++) {
  451                   if (ch > 0xffff)
  452                       uppers.addRange(ch, ch);
  453                   else {
  454                       char uch = Character.toUpperCase((char)ch);
  455                       uppers.addRange(uch, uch);
  456                   }
  457               }
  458           }
  459           RangeToken lowers = this.type == Token.RANGE ? Token.createRange() : Token.createNRange();
  460           for (int i = 0;  i < uppers.ranges.length;  i += 2) {
  461               for (int ch = uppers.ranges[i];  ch <= uppers.ranges[i+1];  ch ++) {
  462                   if (ch > 0xffff)
  463                       lowers.addRange(ch, ch);
  464                   else {
  465                       char uch = Character.toUpperCase((char)ch);
  466                       lowers.addRange(uch, uch);
  467                   }
  468               }
  469           }
  470           lowers.mergeRanges(uppers);
  471           lowers.mergeRanges(this);
  472           lowers.compactRanges();
  473   
  474           this.icaseCache = lowers;
  475           return lowers;
  476       }
  477   
  478       void dumpRanges() {
  479           System.err.print("RANGE: ");
  480           if (this.ranges == null)
  481               System.err.println(" NULL");
  482           for (int i = 0;  i < this.ranges.length;  i += 2) {
  483               System.err.print("["+this.ranges[i]+","+this.ranges[i+1]+"] ");
  484           }
  485           System.err.println("");
  486       }
  487   
  488       boolean match(int ch) {
  489           if (this.map == null)  this.createMap();
  490           boolean ret;
  491           if (this.type == RANGE) {
  492               if (ch < MAPSIZE)
  493                   return (this.map[ch/32] & (1<<(ch&0x1f))) != 0;
  494               ret = false;
  495               for (int i = this.nonMapIndex;  i < this.ranges.length;  i += 2) {
  496                   if (this.ranges[i] <= ch && ch <= this.ranges[i+1])
  497                       return true;
  498               }
  499           } else {
  500               if (ch < MAPSIZE)
  501                   return (this.map[ch/32] & (1<<(ch&0x1f))) == 0;
  502               ret = true;
  503               for (int i = this.nonMapIndex;  i < this.ranges.length;  i += 2) {
  504                   if (this.ranges[i] <= ch && ch <= this.ranges[i+1])
  505                       return false;
  506               }
  507           }
  508           return ret;
  509       }
  510   
  511       private static final int MAPSIZE = 256;
  512       private void createMap() {
  513           int asize = MAPSIZE/32;                 // 32 is the number of bits in `int'.
  514           int [] map = new int[asize];
  515           int nonMapIndex = this.ranges.length;
  516           for (int i = 0; i < asize; ++i) {
  517               map[i] = 0;
  518           }
  519           for (int i = 0; i < this.ranges.length;  i += 2) {
  520               int s = this.ranges[i];
  521               int e = this.ranges[i+1];
  522               if (s < MAPSIZE) {
  523                   for (int j = s; j <= e && j < MAPSIZE; j++) {
  524                       map[j/32] |= 1<<(j&0x1f); // s&0x1f : 0-31
  525                   }
  526               } 
  527               else {
  528                   nonMapIndex = i;
  529                   break;
  530               }
  531               if (e >= MAPSIZE) {
  532                   nonMapIndex = i;
  533                   break;
  534               }
  535           }
  536           this.map = map;
  537           this.nonMapIndex = nonMapIndex;
  538           //for (int i = 0;  i < asize;  i ++)  System.err.println("Map: "+Integer.toString(this.map[i], 16));
  539       }
  540   
  541       public String toString(int options) {
  542           String ret;
  543           if (this.type == RANGE) {
  544               if (this == Token.token_dot)
  545                   ret = ".";
  546               else if (this == Token.token_0to9)
  547                   ret = "\\d";
  548               else if (this == Token.token_wordchars)
  549                   ret = "\\w";
  550               else if (this == Token.token_spaces)
  551                   ret = "\\s";
  552               else {
  553                   StringBuffer sb = new StringBuffer();
  554                   sb.append("[");
  555                   for (int i = 0;  i < this.ranges.length;  i += 2) {
  556                       if ((options & RegularExpression.SPECIAL_COMMA) != 0 && i > 0)  sb.append(",");
  557                       if (this.ranges[i] == this.ranges[i+1]) {
  558                           sb.append(escapeCharInCharClass(this.ranges[i]));
  559                       } else {
  560                           sb.append(escapeCharInCharClass(this.ranges[i]));
  561                           sb.append((char)'-');
  562                           sb.append(escapeCharInCharClass(this.ranges[i+1]));
  563                       }
  564                   }
  565                   sb.append("]");
  566                   ret = sb.toString();
  567               }
  568           } else {
  569               if (this == Token.token_not_0to9)
  570                   ret = "\\D";
  571               else if (this == Token.token_not_wordchars)
  572                   ret = "\\W";
  573               else if (this == Token.token_not_spaces)
  574                   ret = "\\S";
  575               else {
  576                   StringBuffer sb = new StringBuffer();
  577                   sb.append("[^");
  578                   for (int i = 0;  i < this.ranges.length;  i += 2) {
  579                       if ((options & RegularExpression.SPECIAL_COMMA) != 0 && i > 0)  sb.append(",");
  580                       if (this.ranges[i] == this.ranges[i+1]) {
  581                           sb.append(escapeCharInCharClass(this.ranges[i]));
  582                       } else {
  583                           sb.append(escapeCharInCharClass(this.ranges[i]));
  584                           sb.append('-');
  585                           sb.append(escapeCharInCharClass(this.ranges[i+1]));
  586                       }
  587                   }
  588                   sb.append("]");
  589                   ret = sb.toString();
  590               }
  591           }
  592           return ret;
  593       }
  594   
  595       private static String escapeCharInCharClass(int ch) {
  596           String ret;
  597           switch (ch) {
  598             case '[':  case ']':  case '-':  case '^':
  599             case ',':  case '\\':
  600               ret = "\\"+(char)ch;
  601               break;
  602             case '\f':  ret = "\\f";  break;
  603             case '\n':  ret = "\\n";  break;
  604             case '\r':  ret = "\\r";  break;
  605             case '\t':  ret = "\\t";  break;
  606             case 0x1b:  ret = "\\e";  break;
  607             //case 0x0b:  ret = "\\v";  break;
  608             default:
  609               if (ch < 0x20) {
  610                   String pre = "0"+Integer.toHexString(ch);
  611                   ret = "\\x"+pre.substring(pre.length()-2, pre.length());
  612               } else if (ch >= 0x10000) {
  613                   String pre = "0"+Integer.toHexString(ch);
  614                   ret = "\\v"+pre.substring(pre.length()-6, pre.length());
  615               } else
  616                   ret = ""+(char)ch;
  617           }
  618           return ret;
  619       }
  620   
  621   }

Home » Xerces-J-src.2.9.1 » org.apache.xerces » impl » xpath » regex » [javadoc | source]