1 /*
2 * Licensed to the Apache Software Foundation (ASF) under one or more
3 * contributor license agreements. See the NOTICE file distributed with
4 * this work for additional information regarding copyright ownership.
5 * The ASF licenses this file to You under the Apache License, Version 2.0
6 * (the "License"); you may not use this file except in compliance with
7 * the License. You may obtain a copy of the License at
8 *
9 * http://www.apache.org/licenses/LICENSE-2.0
10 *
11 * Unless required by applicable law or agreed to in writing, software
12 * distributed under the License is distributed on an "AS IS" BASIS,
13 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 * See the License for the specific language governing permissions and
15 * limitations under the License.
16 */
17
18 package org.apache.xerces.impl.xpath.regex;
19
20 /**
21 * This class represents a character class such as [a-z] or a period.
22 *
23 * @xerces.internal
24 *
25 * @version $Id: RangeToken.java 504431 2007-02-07 04:25:14Z mrglavas $
26 */
27 final class RangeToken extends Token implements java.io.Serializable {
28
29 private static final long serialVersionUID = -553983121197679934L;
30
31 int[] ranges;
32 boolean sorted;
33 boolean compacted;
34 RangeToken icaseCache = null;
35 int[] map = null;
36 int nonMapIndex;
37
38 RangeToken(int type) {
39 super(type);
40 this.setSorted(false);
41 }
42
43 // for RANGE or NRANGE
44 protected void addRange(int start, int end) {
45 this.icaseCache = null;
46 //System.err.println("Token#addRange(): "+start+" "+end);
47 int r1, r2;
48 if (start <= end) {
49 r1 = start;
50 r2 = end;
51 } else {
52 r1 = end;
53 r2 = start;
54 }
55
56 int pos = 0;
57 if (this.ranges == null) {
58 this.ranges = new int[2];
59 this.ranges[0] = r1;
60 this.ranges[1] = r2;
61 this.setSorted(true);
62 } else {
63 pos = this.ranges.length;
64 if (this.ranges[pos-1]+1 == r1) {
65 this.ranges[pos-1] = r2;
66 return;
67 }
68 int[] temp = new int[pos+2];
69 System.arraycopy(this.ranges, 0, temp, 0, pos);
70 this.ranges = temp;
71 if (this.ranges[pos-1] >= r1)
72 this.setSorted(false);
73 this.ranges[pos++] = r1;
74 this.ranges[pos] = r2;
75 if (!this.sorted)
76 this.sortRanges();
77 }
78 }
79
80 private final boolean isSorted() {
81 return this.sorted;
82 }
83 private final void setSorted(boolean sort) {
84 this.sorted = sort;
85 if (!sort) this.compacted = false;
86 }
87 private final boolean isCompacted() {
88 return this.compacted;
89 }
90 private final void setCompacted() {
91 this.compacted = true;
92 }
93
94 protected void sortRanges() {
95 if (this.isSorted())
96 return;
97 if (this.ranges == null)
98 return;
99 //System.err.println("Do sorting: "+this.ranges.length);
100
101 // Bubble sort
102 // Why? -- In many cases,
103 // this.ranges has few elements.
104 for (int i = this.ranges.length-4; i >= 0; i -= 2) {
105 for (int j = 0; j <= i; j += 2) {
106 if (this.ranges[j] > this.ranges[j+2]
107 || this.ranges[j] == this.ranges[j+2] && this.ranges[j+1] > this.ranges[j+3]) {
108 int tmp;
109 tmp = this.ranges[j+2];
110 this.ranges[j+2] = this.ranges[j];
111 this.ranges[j] = tmp;
112 tmp = this.ranges[j+3];
113 this.ranges[j+3] = this.ranges[j+1];
114 this.ranges[j+1] = tmp;
115 }
116 }
117 }
118 this.setSorted(true);
119 }
120
121 /**
122 * this.ranges is sorted.
123 */
124 protected void compactRanges() {
125 boolean DEBUG = false;
126 if (this.ranges == null || this.ranges.length <= 2)
127 return;
128 if (this.isCompacted())
129 return;
130 int base = 0; // Index of writing point
131 int target = 0; // Index of processing point
132
133 while (target < this.ranges.length) {
134 if (base != target) {
135 this.ranges[base] = this.ranges[target++];
136 this.ranges[base+1] = this.ranges[target++];
137 } else
138 target += 2;
139 int baseend = this.ranges[base+1];
140 while (target < this.ranges.length) {
141 if (baseend+1 < this.ranges[target])
142 break;
143 if (baseend+1 == this.ranges[target]) {
144 if (DEBUG)
145 System.err.println("Token#compactRanges(): Compaction: ["+this.ranges[base]
146 +", "+this.ranges[base+1]
147 +"], ["+this.ranges[target]
148 +", "+this.ranges[target+1]
149 +"] -> ["+this.ranges[base]
150 +", "+this.ranges[target+1]
151 +"]");
152 this.ranges[base+1] = this.ranges[target+1];
153 baseend = this.ranges[base+1];
154 target += 2;
155 } else if (baseend >= this.ranges[target+1]) {
156 if (DEBUG)
157 System.err.println("Token#compactRanges(): Compaction: ["+this.ranges[base]
158 +", "+this.ranges[base+1]
159 +"], ["+this.ranges[target]
160 +", "+this.ranges[target+1]
161 +"] -> ["+this.ranges[base]
162 +", "+this.ranges[base+1]
163 +"]");
164 target += 2;
165 } else if (baseend < this.ranges[target+1]) {
166 if (DEBUG)
167 System.err.println("Token#compactRanges(): Compaction: ["+this.ranges[base]
168 +", "+this.ranges[base+1]
169 +"], ["+this.ranges[target]
170 +", "+this.ranges[target+1]
171 +"] -> ["+this.ranges[base]
172 +", "+this.ranges[target+1]
173 +"]");
174 this.ranges[base+1] = this.ranges[target+1];
175 baseend = this.ranges[base+1];
176 target += 2;
177 } else {
178 throw new RuntimeException("Token#compactRanges(): Internel Error: ["
179 +this.ranges[base]
180 +","+this.ranges[base+1]
181 +"] ["+this.ranges[target]
182 +","+this.ranges[target+1]+"]");
183 }
184 } // while
185 base += 2;
186 }
187
188 if (base != this.ranges.length) {
189 int[] result = new int[base];
190 System.arraycopy(this.ranges, 0, result, 0, base);
191 this.ranges = result;
192 }
193 this.setCompacted();
194 }
195
196 protected void mergeRanges(Token token) {
197 RangeToken tok = (RangeToken)token;
198 this.sortRanges();
199 tok.sortRanges();
200 if (tok.ranges == null)
201 return;
202 this.icaseCache = null;
203 this.setSorted(true);
204 if (this.ranges == null) {
205 this.ranges = new int[tok.ranges.length];
206 System.arraycopy(tok.ranges, 0, this.ranges, 0, tok.ranges.length);
207 return;
208 }
209 int[] result = new int[this.ranges.length+tok.ranges.length];
210 for (int i = 0, j = 0, k = 0; i < this.ranges.length || j < tok.ranges.length;) {
211 if (i >= this.ranges.length) {
212 result[k++] = tok.ranges[j++];
213 result[k++] = tok.ranges[j++];
214 } else if (j >= tok.ranges.length) {
215 result[k++] = this.ranges[i++];
216 result[k++] = this.ranges[i++];
217 } else if (tok.ranges[j] < this.ranges[i]
218 || tok.ranges[j] == this.ranges[i] && tok.ranges[j+1] < this.ranges[i+1]) {
219 result[k++] = tok.ranges[j++];
220 result[k++] = tok.ranges[j++];
221 } else {
222 result[k++] = this.ranges[i++];
223 result[k++] = this.ranges[i++];
224 }
225 }
226 this.ranges = result;
227 }
228
229 protected void subtractRanges(Token token) {
230 if (token.type == NRANGE) {
231 this.intersectRanges(token);
232 return;
233 }
234 RangeToken tok = (RangeToken)token;
235 if (tok.ranges == null || this.ranges == null)
236 return;
237 this.icaseCache = null;
238 this.sortRanges();
239 this.compactRanges();
240 tok.sortRanges();
241 tok.compactRanges();
242
243 //System.err.println("Token#substractRanges(): Entry: "+this.ranges.length+", "+tok.ranges.length);
244
245 int[] result = new int[this.ranges.length+tok.ranges.length];
246 int wp = 0, src = 0, sub = 0;
247 while (src < this.ranges.length && sub < tok.ranges.length) {
248 int srcbegin = this.ranges[src];
249 int srcend = this.ranges[src+1];
250 int subbegin = tok.ranges[sub];
251 int subend = tok.ranges[sub+1];
252 if (srcend < subbegin) { // Not overlapped
253 // src: o-----o
254 // sub: o-----o
255 // res: o-----o
256 // Reuse sub
257 result[wp++] = this.ranges[src++];
258 result[wp++] = this.ranges[src++];
259 } else if (srcend >= subbegin
260 && srcbegin <= subend) { // Overlapped
261 // src: o--------o
262 // sub: o----o
263 // sub: o----o
264 // sub: o----o
265 // sub: o------------o
266 if (subbegin <= srcbegin && srcend <= subend) {
267 // src: o--------o
268 // sub: o------------o
269 // res: empty
270 // Reuse sub
271 src += 2;
272 } else if (subbegin <= srcbegin) {
273 // src: o--------o
274 // sub: o----o
275 // res: o-----o
276 // Reuse src(=res)
277 this.ranges[src] = subend+1;
278 sub += 2;
279 } else if (srcend <= subend) {
280 // src: o--------o
281 // sub: o----o
282 // res: o-----o
283 // Reuse sub
284 result[wp++] = srcbegin;
285 result[wp++] = subbegin-1;
286 src += 2;
287 } else {
288 // src: o--------o
289 // sub: o----o
290 // res: o-o o-o
291 // Reuse src(=right res)
292 result[wp++] = srcbegin;
293 result[wp++] = subbegin-1;
294 this.ranges[src] = subend+1;
295 sub += 2;
296 }
297 } else if (subend < srcbegin) {
298 // Not overlapped
299 // src: o-----o
300 // sub: o----o
301 sub += 2;
302 } else {
303 throw new RuntimeException("Token#subtractRanges(): Internal Error: ["+this.ranges[src]
304 +","+this.ranges[src+1]
305 +"] - ["+tok.ranges[sub]
306 +","+tok.ranges[sub+1]
307 +"]");
308 }
309 }
310 while (src < this.ranges.length) {
311 result[wp++] = this.ranges[src++];
312 result[wp++] = this.ranges[src++];
313 }
314 this.ranges = new int[wp];
315 System.arraycopy(result, 0, this.ranges, 0, wp);
316 // this.ranges is sorted and compacted.
317 }
318
319 /**
320 * @param tok Ignore whether it is NRANGE or not.
321 */
322 protected void intersectRanges(Token token) {
323 RangeToken tok = (RangeToken)token;
324 if (tok.ranges == null || this.ranges == null)
325 return;
326 this.icaseCache = null;
327 this.sortRanges();
328 this.compactRanges();
329 tok.sortRanges();
330 tok.compactRanges();
331
332 int[] result = new int[this.ranges.length+tok.ranges.length];
333 int wp = 0, src1 = 0, src2 = 0;
334 while (src1 < this.ranges.length && src2 < tok.ranges.length) {
335 int src1begin = this.ranges[src1];
336 int src1end = this.ranges[src1+1];
337 int src2begin = tok.ranges[src2];
338 int src2end = tok.ranges[src2+1];
339 if (src1end < src2begin) { // Not overlapped
340 // src1: o-----o
341 // src2: o-----o
342 // res: empty
343 // Reuse src2
344 src1 += 2;
345 } else if (src1end >= src2begin
346 && src1begin <= src2end) { // Overlapped
347 // src1: o--------o
348 // src2: o----o
349 // src2: o----o
350 // src2: o----o
351 // src2: o------------o
352 if (src2begin <= src1begin && src1end <= src2end) {
353 // src1: o--------o
354 // src2: o------------o
355 // res: o--------o
356 // Reuse src2
357 result[wp++] = src1begin;
358 result[wp++] = src1end;
359 src1 += 2;
360 } else if (src2begin <= src1begin) {
361 // src1: o--------o
362 // src2: o----o
363 // res: o--o
364 // Reuse the rest of src1
365 result[wp++] = src1begin;
366 result[wp++] = src2end;
367 this.ranges[src1] = src2end+1;
368 src2 += 2;
369 } else if (src1end <= src2end) {
370 // src1: o--------o
371 // src2: o----o
372 // res: o--o
373 // Reuse src2
374 result[wp++] = src2begin;
375 result[wp++] = src1end;
376 src1 += 2;
377 } else {
378 // src1: o--------o
379 // src2: o----o
380 // res: o----o
381 // Reuse the rest of src1
382 result[wp++] = src2begin;
383 result[wp++] = src2end;
384 this.ranges[src1] = src2end+1;
385 }
386 } else if (src2end < src1begin) {
387 // Not overlapped
388 // src1: o-----o
389 // src2: o----o
390 src2 += 2;
391 } else {
392 throw new RuntimeException("Token#intersectRanges(): Internal Error: ["
393 +this.ranges[src1]
394 +","+this.ranges[src1+1]
395 +"] & ["+tok.ranges[src2]
396 +","+tok.ranges[src2+1]
397 +"]");
398 }
399 }
400 while (src1 < this.ranges.length) {
401 result[wp++] = this.ranges[src1++];
402 result[wp++] = this.ranges[src1++];
403 }
404 this.ranges = new int[wp];
405 System.arraycopy(result, 0, this.ranges, 0, wp);
406 // this.ranges is sorted and compacted.
407 }
408
409 /**
410 * for RANGE: Creates complement.
411 * for NRANGE: Creates the same meaning RANGE.
412 */
413 static Token complementRanges(Token token) {
414 if (token.type != RANGE && token.type != NRANGE)
415 throw new IllegalArgumentException("Token#complementRanges(): must be RANGE: "+token.type);
416 RangeToken tok = (RangeToken)token;
417 tok.sortRanges();
418 tok.compactRanges();
419 int len = tok.ranges.length+2;
420 if (tok.ranges[0] == 0)
421 len -= 2;
422 int last = tok.ranges[tok.ranges.length-1];
423 if (last == UTF16_MAX)
424 len -= 2;
425 RangeToken ret = Token.createRange();
426 ret.ranges = new int[len];
427 int wp = 0;
428 if (tok.ranges[0] > 0) {
429 ret.ranges[wp++] = 0;
430 ret.ranges[wp++] = tok.ranges[0]-1;
431 }
432 for (int i = 1; i < tok.ranges.length-2; i += 2) {
433 ret.ranges[wp++] = tok.ranges[i]+1;
434 ret.ranges[wp++] = tok.ranges[i+1]-1;
435 }
436 if (last != UTF16_MAX) {
437 ret.ranges[wp++] = last+1;
438 ret.ranges[wp] = UTF16_MAX;
439 }
440 ret.setCompacted();
441 return ret;
442 }
443
444 synchronized RangeToken getCaseInsensitiveToken() {
445 if (this.icaseCache != null)
446 return this.icaseCache;
447
448 RangeToken uppers = this.type == Token.RANGE ? Token.createRange() : Token.createNRange();
449 for (int i = 0; i < this.ranges.length; i += 2) {
450 for (int ch = this.ranges[i]; ch <= this.ranges[i+1]; ch ++) {
451 if (ch > 0xffff)
452 uppers.addRange(ch, ch);
453 else {
454 char uch = Character.toUpperCase((char)ch);
455 uppers.addRange(uch, uch);
456 }
457 }
458 }
459 RangeToken lowers = this.type == Token.RANGE ? Token.createRange() : Token.createNRange();
460 for (int i = 0; i < uppers.ranges.length; i += 2) {
461 for (int ch = uppers.ranges[i]; ch <= uppers.ranges[i+1]; ch ++) {
462 if (ch > 0xffff)
463 lowers.addRange(ch, ch);
464 else {
465 char uch = Character.toUpperCase((char)ch);
466 lowers.addRange(uch, uch);
467 }
468 }
469 }
470 lowers.mergeRanges(uppers);
471 lowers.mergeRanges(this);
472 lowers.compactRanges();
473
474 this.icaseCache = lowers;
475 return lowers;
476 }
477
478 void dumpRanges() {
479 System.err.print("RANGE: ");
480 if (this.ranges == null)
481 System.err.println(" NULL");
482 for (int i = 0; i < this.ranges.length; i += 2) {
483 System.err.print("["+this.ranges[i]+","+this.ranges[i+1]+"] ");
484 }
485 System.err.println("");
486 }
487
488 boolean match(int ch) {
489 if (this.map == null) this.createMap();
490 boolean ret;
491 if (this.type == RANGE) {
492 if (ch < MAPSIZE)
493 return (this.map[ch/32] & (1<<(ch&0x1f))) != 0;
494 ret = false;
495 for (int i = this.nonMapIndex; i < this.ranges.length; i += 2) {
496 if (this.ranges[i] <= ch && ch <= this.ranges[i+1])
497 return true;
498 }
499 } else {
500 if (ch < MAPSIZE)
501 return (this.map[ch/32] & (1<<(ch&0x1f))) == 0;
502 ret = true;
503 for (int i = this.nonMapIndex; i < this.ranges.length; i += 2) {
504 if (this.ranges[i] <= ch && ch <= this.ranges[i+1])
505 return false;
506 }
507 }
508 return ret;
509 }
510
511 private static final int MAPSIZE = 256;
512 private void createMap() {
513 int asize = MAPSIZE/32; // 32 is the number of bits in `int'.
514 int [] map = new int[asize];
515 int nonMapIndex = this.ranges.length;
516 for (int i = 0; i < asize; ++i) {
517 map[i] = 0;
518 }
519 for (int i = 0; i < this.ranges.length; i += 2) {
520 int s = this.ranges[i];
521 int e = this.ranges[i+1];
522 if (s < MAPSIZE) {
523 for (int j = s; j <= e && j < MAPSIZE; j++) {
524 map[j/32] |= 1<<(j&0x1f); // s&0x1f : 0-31
525 }
526 }
527 else {
528 nonMapIndex = i;
529 break;
530 }
531 if (e >= MAPSIZE) {
532 nonMapIndex = i;
533 break;
534 }
535 }
536 this.map = map;
537 this.nonMapIndex = nonMapIndex;
538 //for (int i = 0; i < asize; i ++) System.err.println("Map: "+Integer.toString(this.map[i], 16));
539 }
540
541 public String toString(int options) {
542 String ret;
543 if (this.type == RANGE) {
544 if (this == Token.token_dot)
545 ret = ".";
546 else if (this == Token.token_0to9)
547 ret = "\\d";
548 else if (this == Token.token_wordchars)
549 ret = "\\w";
550 else if (this == Token.token_spaces)
551 ret = "\\s";
552 else {
553 StringBuffer sb = new StringBuffer();
554 sb.append("[");
555 for (int i = 0; i < this.ranges.length; i += 2) {
556 if ((options & RegularExpression.SPECIAL_COMMA) != 0 && i > 0) sb.append(",");
557 if (this.ranges[i] == this.ranges[i+1]) {
558 sb.append(escapeCharInCharClass(this.ranges[i]));
559 } else {
560 sb.append(escapeCharInCharClass(this.ranges[i]));
561 sb.append((char)'-');
562 sb.append(escapeCharInCharClass(this.ranges[i+1]));
563 }
564 }
565 sb.append("]");
566 ret = sb.toString();
567 }
568 } else {
569 if (this == Token.token_not_0to9)
570 ret = "\\D";
571 else if (this == Token.token_not_wordchars)
572 ret = "\\W";
573 else if (this == Token.token_not_spaces)
574 ret = "\\S";
575 else {
576 StringBuffer sb = new StringBuffer();
577 sb.append("[^");
578 for (int i = 0; i < this.ranges.length; i += 2) {
579 if ((options & RegularExpression.SPECIAL_COMMA) != 0 && i > 0) sb.append(",");
580 if (this.ranges[i] == this.ranges[i+1]) {
581 sb.append(escapeCharInCharClass(this.ranges[i]));
582 } else {
583 sb.append(escapeCharInCharClass(this.ranges[i]));
584 sb.append('-');
585 sb.append(escapeCharInCharClass(this.ranges[i+1]));
586 }
587 }
588 sb.append("]");
589 ret = sb.toString();
590 }
591 }
592 return ret;
593 }
594
595 private static String escapeCharInCharClass(int ch) {
596 String ret;
597 switch (ch) {
598 case '[': case ']': case '-': case '^':
599 case ',': case '\\':
600 ret = "\\"+(char)ch;
601 break;
602 case '\f': ret = "\\f"; break;
603 case '\n': ret = "\\n"; break;
604 case '\r': ret = "\\r"; break;
605 case '\t': ret = "\\t"; break;
606 case 0x1b: ret = "\\e"; break;
607 //case 0x0b: ret = "\\v"; break;
608 default:
609 if (ch < 0x20) {
610 String pre = "0"+Integer.toHexString(ch);
611 ret = "\\x"+pre.substring(pre.length()-2, pre.length());
612 } else if (ch >= 0x10000) {
613 String pre = "0"+Integer.toHexString(ch);
614 ret = "\\v"+pre.substring(pre.length()-6, pre.length());
615 } else
616 ret = ""+(char)ch;
617 }
618 return ret;
619 }
620
621 }