1 /* 2 * Licensed to the Apache Software Foundation (ASF) under one or more 3 * contributor license agreements. See the NOTICE file distributed with 4 * this work for additional information regarding copyright ownership. 5 * The ASF licenses this file to You under the Apache License, Version 2.0 6 * (the "License"); you may not use this file except in compliance with 7 * the License. You may obtain a copy of the License at 8 * 9 * http://www.apache.org/licenses/LICENSE-2.0 10 * 11 * Unless required by applicable law or agreed to in writing, software 12 * distributed under the License is distributed on an "AS IS" BASIS, 13 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 * See the License for the specific language governing permissions and 15 * limitations under the License. 16 */ 17 18 package org.apache.xerces.impl.xpath.regex; 19 20 import java.text.CharacterIterator; 21 22 /** 23 * @xerces.internal 24 * 25 * @version $Id: REUtil.java 446721 2006-09-15 20:35:34Z mrglavas $ 26 */ 27 public final class REUtil { 28 private REUtil() { 29 } 30 31 static final int composeFromSurrogates(int high, int low) { 32 return 0x10000 + ((high-0xd800)<<10) + low-0xdc00; 33 } 34 35 static final boolean isLowSurrogate(int ch) { 36 return (ch & 0xfc00) == 0xdc00; 37 } 38 39 static final boolean isHighSurrogate(int ch) { 40 return (ch & 0xfc00) == 0xd800; 41 } 42 43 static final String decomposeToSurrogates(int ch) { 44 char[] chs = new char[2]; 45 ch -= 0x10000; 46 chs[0] = (char)((ch>>10)+0xd800); 47 chs[1] = (char)((ch&0x3ff)+0xdc00); 48 return new String(chs); 49 } 50 51 static final String substring(CharacterIterator iterator, int begin, int end) { 52 char[] src = new char[end-begin]; 53 for (int i = 0; i < src.length; i ++) 54 src[i] = iterator.setIndex(i+begin); 55 return new String(src); 56 } 57 58 // ================================================================ 59 60 static final int getOptionValue(int ch) { 61 int ret = 0; 62 switch (ch) { 63 case 'i': 64 ret = RegularExpression.IGNORE_CASE; 65 break; 66 case 'm': 67 ret = RegularExpression.MULTIPLE_LINES; 68 break; 69 case 's': 70 ret = RegularExpression.SINGLE_LINE; 71 break; 72 case 'x': 73 ret = RegularExpression.EXTENDED_COMMENT; 74 break; 75 case 'u': 76 ret = RegularExpression.USE_UNICODE_CATEGORY; 77 break; 78 case 'w': 79 ret = RegularExpression.UNICODE_WORD_BOUNDARY; 80 break; 81 case 'F': 82 ret = RegularExpression.PROHIBIT_FIXED_STRING_OPTIMIZATION; 83 break; 84 case 'H': 85 ret = RegularExpression.PROHIBIT_HEAD_CHARACTER_OPTIMIZATION; 86 break; 87 case 'X': 88 ret = RegularExpression.XMLSCHEMA_MODE; 89 break; 90 case ',': 91 ret = RegularExpression.SPECIAL_COMMA; 92 break; 93 default: 94 } 95 return ret; 96 } 97 98 static final int parseOptions(String opts) throws ParseException { 99 if (opts == null) return 0; 100 int options = 0; 101 for (int i = 0; i < opts.length(); i ++) { 102 int v = getOptionValue(opts.charAt(i)); 103 if (v == 0) 104 throw new ParseException("Unknown Option: "+opts.substring(i), -1); 105 options |= v; 106 } 107 return options; 108 } 109 110 static final String createOptionString(int options) { 111 StringBuffer sb = new StringBuffer(9); 112 if ((options & RegularExpression.PROHIBIT_FIXED_STRING_OPTIMIZATION) != 0) 113 sb.append((char)'F'); 114 if ((options & RegularExpression.PROHIBIT_HEAD_CHARACTER_OPTIMIZATION) != 0) 115 sb.append((char)'H'); 116 if ((options & RegularExpression.XMLSCHEMA_MODE) != 0) 117 sb.append((char)'X'); 118 if ((options & RegularExpression.IGNORE_CASE) != 0) 119 sb.append((char)'i'); 120 if ((options & RegularExpression.MULTIPLE_LINES) != 0) 121 sb.append((char)'m'); 122 if ((options & RegularExpression.SINGLE_LINE) != 0) 123 sb.append((char)'s'); 124 if ((options & RegularExpression.USE_UNICODE_CATEGORY) != 0) 125 sb.append((char)'u'); 126 if ((options & RegularExpression.UNICODE_WORD_BOUNDARY) != 0) 127 sb.append((char)'w'); 128 if ((options & RegularExpression.EXTENDED_COMMENT) != 0) 129 sb.append((char)'x'); 130 if ((options & RegularExpression.SPECIAL_COMMA) != 0) 131 sb.append((char)','); 132 return sb.toString().intern(); 133 } 134 135 // ================================================================ 136 137 static String stripExtendedComment(String regex) { 138 int len = regex.length(); 139 StringBuffer buffer = new StringBuffer(len); 140 int offset = 0; 141 while (offset < len) { 142 int ch = regex.charAt(offset++); 143 // Skips a white space. 144 if (ch == '\t' || ch == '\n' || ch == '\f' || ch == '\r' || ch == ' ') 145 continue; 146 147 if (ch == '#') { // Skips chracters between '#' and a line end. 148 while (offset < len) { 149 ch = regex.charAt(offset++); 150 if (ch == '\r' || ch == '\n') 151 break; 152 } 153 continue; 154 } 155 156 int next; // Strips an escaped white space. 157 if (ch == '\\' && offset < len) { 158 if ((next = regex.charAt(offset)) == '#' 159 || next == '\t' || next == '\n' || next == '\f' 160 || next == '\r' || next == ' ') { 161 buffer.append((char)next); 162 offset ++; 163 } else { // Other escaped character. 164 buffer.append((char)'\\'); 165 buffer.append((char)next); 166 offset ++; 167 } 168 } else // As is. 169 buffer.append((char)ch); 170 } 171 return buffer.toString(); 172 } 173 174 // ================================================================ 175 176 /** 177 * Sample entry. 178 * <div>Usage: <KBD>org.apache.xerces.utils.regex.REUtil <regex> <string></KBD></div> 179 */ 180 public static void main(String[] argv) { 181 String pattern = null; 182 try { 183 String options = ""; 184 String target = null; 185 if( argv.length == 0 ) { 186 System.out.println( "Error:Usage: java REUtil -i|-m|-s|-u|-w|-X regularExpression String" ); 187 System.exit( 0 ); 188 } 189 for (int i = 0; i < argv.length; i ++) { 190 if (argv[i].length() == 0 || argv[i].charAt(0) != '-') { 191 if (pattern == null) 192 pattern = argv[i]; 193 else if (target == null) 194 target = argv[i]; 195 else 196 System.err.println("Unnecessary: "+argv[i]); 197 } else if (argv[i].equals("-i")) { 198 options += "i"; 199 } else if (argv[i].equals("-m")) { 200 options += "m"; 201 } else if (argv[i].equals("-s")) { 202 options += "s"; 203 } else if (argv[i].equals("-u")) { 204 options += "u"; 205 } else if (argv[i].equals("-w")) { 206 options += "w"; 207 } else if (argv[i].equals("-X")) { 208 options += "X"; 209 } else { 210 System.err.println("Unknown option: "+argv[i]); 211 } 212 } 213 RegularExpression reg = new RegularExpression(pattern, options); 214 System.out.println("RegularExpression: "+reg); 215 Match match = new Match(); 216 reg.matches(target, match); 217 for (int i = 0; i < match.getNumberOfGroups(); i ++) { 218 if (i == 0 ) System.out.print("Matched range for the whole pattern: "); 219 else System.out.print("["+i+"]: "); 220 if (match.getBeginning(i) < 0) 221 System.out.println("-1"); 222 else { 223 System.out.print(match.getBeginning(i)+", "+match.getEnd(i)+", "); 224 System.out.println("\""+match.getCapturedText(i)+"\""); 225 } 226 } 227 } catch (ParseException pe) { 228 if (pattern == null) { 229 pe.printStackTrace(); 230 } else { 231 System.err.println("org.apache.xerces.utils.regex.ParseException: "+pe.getMessage()); 232 String indent = " "; 233 System.err.println(indent+pattern); 234 int loc = pe.getLocation(); 235 if (loc >= 0) { 236 System.err.print(indent); 237 for (int i = 0; i < loc; i ++) System.err.print("-"); 238 System.err.println("^"); 239 } 240 } 241 } catch (Exception e) { 242 e.printStackTrace(); 243 } 244 } 245 246 static final int CACHESIZE = 20; 247 static final RegularExpression[] regexCache = new RegularExpression[CACHESIZE]; 248 /** 249 * Creates a RegularExpression instance. 250 * This method caches created instances. 251 * 252 * @see RegularExpression#RegularExpression(java.lang.String, java.lang.String) 253 */ 254 public static RegularExpression createRegex(String pattern, String options) 255 throws ParseException { 256 RegularExpression re = null; 257 int intOptions = REUtil.parseOptions(options); 258 synchronized (REUtil.regexCache) { 259 int i; 260 for (i = 0; i < REUtil.CACHESIZE; i ++) { 261 RegularExpression cached = REUtil.regexCache[i]; 262 if (cached == null) { 263 i = -1; 264 break; 265 } 266 if (cached.equals(pattern, intOptions)) { 267 re = cached; 268 break; 269 } 270 } 271 if (re != null) { 272 if (i != 0) { 273 System.arraycopy(REUtil.regexCache, 0, REUtil.regexCache, 1, i); 274 REUtil.regexCache[0] = re; 275 } 276 } else { 277 re = new RegularExpression(pattern, options); 278 System.arraycopy(REUtil.regexCache, 0, REUtil.regexCache, 1, REUtil.CACHESIZE-1); 279 REUtil.regexCache[0] = re; 280 } 281 } 282 return re; 283 } 284 285 /** 286 * 287 * @see RegularExpression#matches(java.lang.String) 288 */ 289 public static boolean matches(String regex, String target) throws ParseException { 290 return REUtil.createRegex(regex, null).matches(target); 291 } 292 293 /** 294 * 295 * @see RegularExpression#matches(java.lang.String) 296 */ 297 public static boolean matches(String regex, String options, String target) throws ParseException { 298 return REUtil.createRegex(regex, options).matches(target); 299 } 300 301 // ================================================================ 302 303 /** 304 * 305 */ 306 public static String quoteMeta(String literal) { 307 int len = literal.length(); 308 StringBuffer buffer = null; 309 for (int i = 0; i < len; i ++) { 310 int ch = literal.charAt(i); 311 if (".*+?{[()|\\^$".indexOf(ch) >= 0) { 312 if (buffer == null) { 313 buffer = new StringBuffer(i+(len-i)*2); 314 if (i > 0) buffer.append(literal.substring(0, i)); 315 } 316 buffer.append((char)'\\'); 317 buffer.append((char)ch); 318 } else if (buffer != null) 319 buffer.append((char)ch); 320 } 321 return buffer != null ? buffer.toString() : literal; 322 } 323 324 // ================================================================ 325 326 static void dumpString(String v) { 327 for (int i = 0; i < v.length(); i ++) { 328 System.out.print(Integer.toHexString(v.charAt(i))); 329 System.out.print(" "); 330 } 331 System.out.println(); 332 } 333 }