1 package org.apache.lucene.analysis.ru;
2 /**
3 * Licensed to the Apache Software Foundation (ASF) under one or more
4 * contributor license agreements. See the NOTICE file distributed with
5 * this work for additional information regarding copyright ownership.
6 * The ASF licenses this file to You under the Apache License, Version 2.0
7 * (the "License"); you may not use this file except in compliance with
8 * the License. You may obtain a copy of the License at
9 *
10 * http://www.apache.org/licenses/LICENSE-2.0
11 *
12 * Unless required by applicable law or agreed to in writing, software
13 * distributed under the License is distributed on an "AS IS" BASIS,
14 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 * See the License for the specific language governing permissions and
16 * limitations under the License.
17 */
18
/**
 * RussianCharsets contains encoding schemes (charsets) and a toLowerCase() implementation
 * for Russian characters in Unicode, KOI8 and CP1251.
 * Each encoding scheme lists the lowercase letters (positions 0-31), the uppercase letters
 * (positions 32-63) and the decimal digits (positions 64-73). Within the letter ranges the
 * entries follow Russian alphabetical order, so the same index denotes the same letter in
 * every table. The letter "yo" (U+0401 / U+0451) is not part of any table.
 * One should be able to add other encoding schemes (like ISO-8859-5 or customized) by adding a new charset
 * and adding logic to toLowerCase() method for that charset.
 *
 *
 * @version $Id: RussianCharsets.java 656111 2008-05-14 05:37:45Z otis $
 */
public class RussianCharsets
{
    /**
     * Distance between the upper-case and lower-case block of the 32 tabulated letters.
     * It is the same in all three encodings; only the direction differs.
     */
    private static final int CASE_OFFSET = 32;

    /**
     * Unicode Russian charset: lowercase letters, then uppercase letters, then digits.
     * Declared final because {@link #toLowerCase(char, char[])} recognises a charset
     * by reference identity; the reference must therefore never be replaced.
     */
    public static final char[] UnicodeRussian = {
        // lower case
        '\u0430', '\u0431', '\u0432', '\u0433', '\u0434', '\u0435', '\u0436', '\u0437',
        '\u0438', '\u0439', '\u043A', '\u043B', '\u043C', '\u043D', '\u043E', '\u043F',
        '\u0440', '\u0441', '\u0442', '\u0443', '\u0444', '\u0445', '\u0446', '\u0447',
        '\u0448', '\u0449', '\u044A', '\u044B', '\u044C', '\u044D', '\u044E', '\u044F',
        // upper case
        '\u0410', '\u0411', '\u0412', '\u0413', '\u0414', '\u0415', '\u0416', '\u0417',
        '\u0418', '\u0419', '\u041A', '\u041B', '\u041C', '\u041D', '\u041E', '\u041F',
        '\u0420', '\u0421', '\u0422', '\u0423', '\u0424', '\u0425', '\u0426', '\u0427',
        '\u0428', '\u0429', '\u042A', '\u042B', '\u042C', '\u042D', '\u042E', '\u042F',
        // numbers
        '0', '1', '2', '3', '4', '5', '6', '7', '8', '9'
    };

    /**
     * KOI8 charset: lowercase letters, then uppercase letters, then digits.
     * The code values are not ascending because the table is ordered by the Russian alphabet.
     * Declared final for the same identity-dispatch reason as {@link #UnicodeRussian}.
     */
    public static final char[] KOI8 = {
        // lower case
        0xc1, 0xc2, 0xd7, 0xc7, 0xc4, 0xc5, 0xd6, 0xda,
        0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, 0xd0,
        0xd2, 0xd3, 0xd4, 0xd5, 0xc6, 0xc8, 0xc3, 0xde,
        0xdb, 0xdd, 0xdf, 0xd9, 0xd8, 0xdc, 0xc0, 0xd1,
        // upper case
        0xe1, 0xe2, 0xf7, 0xe7, 0xe4, 0xe5, 0xf6, 0xfa,
        0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef, 0xf0,
        0xf2, 0xf3, 0xf4, 0xf5, 0xe6, 0xe8, 0xe3, 0xfe,
        0xfb, 0xfd, 0xff, 0xf9, 0xf8, 0xfc, 0xe0, 0xf1,
        // numbers
        '0', '1', '2', '3', '4', '5', '6', '7', '8', '9'
    };

    /**
     * CP1251 charset: lowercase letters, then uppercase letters, then digits.
     * Declared final for the same identity-dispatch reason as {@link #UnicodeRussian}.
     */
    public static final char[] CP1251 = {
        // lower case
        0xE0, 0xE1, 0xE2, 0xE3, 0xE4, 0xE5, 0xE6, 0xE7,
        0xE8, 0xE9, 0xEA, 0xEB, 0xEC, 0xED, 0xEE, 0xEF,
        0xF0, 0xF1, 0xF2, 0xF3, 0xF4, 0xF5, 0xF6, 0xF7,
        0xF8, 0xF9, 0xFA, 0xFB, 0xFC, 0xFD, 0xFE, 0xFF,
        // upper case
        0xC0, 0xC1, 0xC2, 0xC3, 0xC4, 0xC5, 0xC6, 0xC7,
        0xC8, 0xC9, 0xCA, 0xCB, 0xCC, 0xCD, 0xCE, 0xCF,
        0xD0, 0xD1, 0xD2, 0xD3, 0xD4, 0xD5, 0xD6, 0xD7,
        0xD8, 0xD9, 0xDA, 0xDB, 0xDC, 0xDD, 0xDE, 0xDF,
        // numbers
        '0', '1', '2', '3', '4', '5', '6', '7', '8', '9'
    };

    /**
     * Converts a character to lower case according to the given charset.
     * <p>
     * The charset is recognised by reference identity, so callers must pass one of the
     * arrays declared in this class ({@link #UnicodeRussian}, {@link #KOI8} or
     * {@link #CP1251}); a copy with equal contents is treated as an unknown charset.
     * For a recognised charset, a letter inside that charset's 32-letter upper-case range
     * is shifted to the matching lower-case code, and a letter inside the lower-case range
     * is returned unchanged. Every other input, including any character passed with an
     * unknown or {@code null} charset, is delegated to {@link Character#toLowerCase(char)}.
     *
     * @param letter  the character to convert, expressed in the code values of {@code charset}
     * @param charset one of the charset tables of this class; may be {@code null}
     * @return the lower-case form of {@code letter}
     */
    public static char toLowerCase(char letter, char[] charset)
    {
        if (charset == UnicodeRussian)
        {
            if (letter >= '\u0430' && letter <= '\u044F')
            {
                // already lower case
                return letter;
            }
            if (letter >= '\u0410' && letter <= '\u042F')
            {
                // Unicode places the lower-case block above the upper-case block
                return (char) (letter + CASE_OFFSET);
            }
        }

        if (charset == KOI8)
        {
            if (letter >= 0xe0 && letter <= 0xff)
            {
                // KOI8 places the upper-case block above the lower-case block, hence the subtraction
                return (char) (letter - CASE_OFFSET);
            }
            if (letter >= 0xc0 && letter <= 0xdf)
            {
                // already lower case
                return letter;
            }
        }

        if (charset == CP1251)
        {
            if (letter >= 0xC0 && letter <= 0xDF)
            {
                // CP1251 places the lower-case block above the upper-case block
                return (char) (letter + CASE_OFFSET);
            }
            if (letter >= 0xE0 && letter <= 0xFF)
            {
                // already lower case
                return letter;
            }
        }

        // Unknown charset, or a character outside the tabulated letter ranges.
        return Character.toLowerCase(letter);
    }
}