1 package org.apache.lucene.analysis.ru;
2
3 /**
4 * Licensed to the Apache Software Foundation (ASF) under one or more
5 * contributor license agreements. See the NOTICE file distributed with
6 * this work for additional information regarding copyright ownership.
7 * The ASF licenses this file to You under the Apache License, Version 2.0
8 * (the "License"); you may not use this file except in compliance with
9 * the License. You may obtain a copy of the License at
10 *
11 * http://www.apache.org/licenses/LICENSE-2.0
12 *
13 * Unless required by applicable law or agreed to in writing, software
14 * distributed under the License is distributed on an "AS IS" BASIS,
15 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16 * See the License for the specific language governing permissions and
17 * limitations under the License.
18 */
19
/**
 * Russian stemming algorithm implementation (see http://snowball.sourceforge.net
 * for a detailed description of the algorithm).
 *
 * <p>The stemmer only recognises the lower-case Cyrillic letters declared below;
 * any other character is treated as a non-vowel that matches no ending.
 *
 * <p>An instance keeps the region offsets of the word currently being stemmed in
 * its fields, so a single instance must not be used by several threads at once.
 * {@link #stemWord(String)} creates a fresh instance for every call.
 */
class RussianStemmer {

  // Offsets (in the input word) at which the RV, R1 and R2 regions start.
  // A value of 0 means that the corresponding region was not found.
  private int rv, r1, r2;

  // Letters of the Russian alphabet that are referenced by the tables below.
  private static final char A = '\u0430';
  private static final char V = '\u0432';
  private static final char G = '\u0433';
  private static final char E = '\u0435';
  private static final char I = '\u0438';
  private static final char I_ = '\u0439';
  private static final char L = '\u043B';
  private static final char M = '\u043C';
  private static final char N = '\u043D';
  private static final char O = '\u043E';
  private static final char S = '\u0441';
  private static final char T = '\u0442';
  private static final char U = '\u0443';
  private static final char X = '\u0445';
  private static final char SH = '\u0448';
  private static final char SHCH = '\u0449';
  private static final char Y = '\u044B';
  private static final char SOFT = '\u044C';
  private static final char AE = '\u044D';
  private static final char IU = '\u044E';
  private static final char IA = '\u044F';

  private static final char[] VOWELS = { A, E, I, O, U, Y, AE, IU, IA };

  // Ending tables. findEnding() scans every table from its LAST entry to its
  // first and stops at the first match, so each table lists shorter endings
  // before longer ones to make the longest ending win.

  private static final char[][] PERFECTIVE_GERUND_ENDINGS_1 = {
    { V },
    { V, SH, I },
    { V, SH, I, S, SOFT }
  };

  private static final char[][] PERFECTIVE_GERUND_1_PREDECESSORS = {
    { A },
    { IA }
  };

  private static final char[][] PERFECTIVE_GERUND_ENDINGS_2 = {
    { I, V },
    { Y, V },
    { I, V, SH, I },
    { Y, V, SH, I },
    { I, V, SH, I, S, SOFT },
    { Y, V, SH, I, S, SOFT }
  };

  private static final char[][] ADJECTIVE_ENDINGS = {
    { E, E },
    { I, E },
    { Y, E },
    { O, E },
    { E, I_ },
    { I, I_ },
    { Y, I_ },
    { O, I_ },
    { E, M },
    { I, M },
    { Y, M },
    { O, M },
    { I, X },
    { Y, X },
    { U, IU },
    { IU, IU },
    { A, IA },
    { IA, IA },
    { O, IU },
    { E, IU },
    { I, M, I },
    { Y, M, I },
    { E, G, O },
    { O, G, O },
    { E, M, U },
    { O, M, U }
  };

  private static final char[][] PARTICIPLE_ENDINGS_1 = {
    { SHCH },
    { E, M },
    { N, N },
    { V, SH },
    { IU, SHCH }
  };

  private static final char[][] PARTICIPLE_ENDINGS_2 = {
    { I, V, SH },
    { Y, V, SH },
    { U, IU, SHCH }
  };

  private static final char[][] PARTICIPLE_1_PREDECESSORS = {
    { A },
    { IA }
  };

  private static final char[][] REFLEXIVE_ENDINGS = {
    { S, IA },
    { S, SOFT }
  };

  private static final char[][] VERB_ENDINGS_1 = {
    { I_ },
    { L },
    { N },
    { L, O },
    { N, O },
    { E, T },
    { IU, T },
    { L, A },
    { N, A },
    { L, I },
    { E, M },
    { N, Y },
    { E, T, E },
    { I_, T, E },
    { T, SOFT },
    { E, SH, SOFT },
    { N, N, O }
  };

  private static final char[][] VERB_ENDINGS_2 = {
    { IU },
    { U, IU },
    { E, N },
    { E, I_ },
    { IA, T },
    { U, I_ },
    { I, L },
    { Y, L },
    { I, M },
    { Y, M },
    { I, T },
    { Y, T },
    { I, L, A },
    { Y, L, A },
    { E, N, A },
    { I, T, E },
    { I, L, I },
    { Y, L, I },
    { I, L, O },
    { Y, L, O },
    { E, N, O },
    { U, E, T },
    { U, IU, T },
    { E, N, Y },
    { I, T, SOFT },
    { Y, T, SOFT },
    { I, SH, SOFT },
    { E, I_, T, E },
    { U, I_, T, E }
  };

  private static final char[][] VERB_1_PREDECESSORS = {
    { A },
    { IA }
  };

  private static final char[][] NOUN_ENDINGS = {
    { A },
    { U },
    { I_ },
    { O },
    // IU belongs to the published noun-ending list; a second U entry used to
    // stand here. Verb group 2 also contains IU and is tried before nouns, so
    // the stemming result is the same either way.
    { IU },
    { E },
    { Y },
    { I },
    { SOFT },
    { IA },
    { E, V },
    { O, V },
    { I, E },
    { SOFT, E },
    { IA, X },
    { I, IU },
    { E, I },
    { I, I },
    { E, I_ },
    { O, I_ },
    { E, M },
    { A, M },
    { O, M },
    { A, X },
    { SOFT, IU },
    { I, IA },
    { SOFT, IA },
    { I, I_ },
    { IA, M },
    { IA, M, I },
    { A, M, I },
    { I, E, I_ },
    { I, IA, M },
    { I, E, M },
    { I, IA, X },
    { I, IA, M, I }
  };

  private static final char[][] SUPERLATIVE_ENDINGS = {
    { E, I_, SH },
    { E, I_, SH, E }
  };

  private static final char[][] DERIVATIONAL_ENDINGS = {
    { O, S, T },
    { O, S, T, SOFT }
  };

  private static final char[][] DOUBLE_N = {
    { N, N }
  };

  /**
   * Creates a stemmer. The instance carries no configuration; it only holds
   * scratch state for the word currently being stemmed.
   */
  public RussianStemmer() {
  }

  /**
   * Removes an adjectival ending: an adjective ending, optionally preceded by
   * a participle ending.
   *
   * @param stemmingZone the RV part of the word; shortened in place on success
   * @return {@code true} if an adjective ending was found and removed
   */
  private boolean adjectival(StringBuilder stemmingZone) {
    if (!findAndRemoveEnding(stemmingZone, ADJECTIVE_ENDINGS)) {
      return false;
    }
    // The participle ending is optional: strip it if present, but the overall
    // result is a success either way.
    if (!findAndRemoveEnding(stemmingZone, PARTICIPLE_ENDINGS_1, PARTICIPLE_1_PREDECESSORS)) {
      findAndRemoveEnding(stemmingZone, PARTICIPLE_ENDINGS_2);
    }
    return true;
  }

  /**
   * Removes a derivational ending, but only if the whole ending lies inside R2.
   *
   * @param stemmingZone the RV part of the word; shortened in place on success
   * @return {@code true} if an ending was removed
   */
  private boolean derivational(StringBuilder stemmingZone) {
    int endingLength = findEnding(stemmingZone, DERIVATIONAL_ENDINGS);
    if (endingLength == 0) {
      return false;
    }
    int endingStart = stemmingZone.length() - endingLength;
    // r2 == 0 means the word has no R2 region at all, so nothing can lie in
    // it. Without this guard (r2 - rv) is negative and the range test below
    // would accept every ending.
    if (r2 == 0 || r2 - rv > endingStart) {
      return false;
    }
    stemmingZone.setLength(endingStart);
    return true;
  }

  /**
   * Looks for one of the given endings in the part of the stemming zone that
   * stops at {@code startIndex} (inclusive).
   *
   * @param stemmingZone   text to inspect
   * @param startIndex     index of the last character that belongs to the inspected
   *                       text; may be -1, in which case nothing matches
   * @param theEndingClass candidate endings, scanned from the last entry to the first
   * @return the length of the first matching ending, or 0 if none matches
   */
  private int findEnding(StringBuilder stemmingZone, int startIndex, char[][] theEndingClass) {
    for (int i = theEndingClass.length - 1; i >= 0; i--) {
      char[] ending = theEndingClass[i];
      int from = startIndex - ending.length + 1;
      // from < 0: the ending is longer than the text available
      if (from >= 0 && matchesAt(stemmingZone, from, ending)) {
        return ending.length;
      }
    }
    return 0;
  }

  /** Looks for one of the given endings at the very end of the stemming zone. */
  private int findEnding(StringBuilder stemmingZone, char[][] theEndingClass) {
    return findEnding(stemmingZone, stemmingZone.length() - 1, theEndingClass);
  }

  /** Tells whether {@code text} contains {@code expected} starting at index {@code from}. */
  private static boolean matchesAt(CharSequence text, int from, char[] expected) {
    for (int k = 0; k < expected.length; k++) {
      if (text.charAt(from + k) != expected[k]) {
        return false;
      }
    }
    return true;
  }

  /**
   * Finds an ending of the given class at the end of the stemming zone and cuts it off.
   *
   * @return {@code true} if an ending was found and removed
   */
  private boolean findAndRemoveEnding(StringBuilder stemmingZone, char[][] theEndingClass) {
    int endingLength = findEnding(stemmingZone, theEndingClass);
    if (endingLength == 0) {
      return false;
    }
    stemmingZone.setLength(stemmingZone.length() - endingLength);
    return true;
  }

  /**
   * Finds an ending of the given class at the end of the stemming zone and cuts it
   * off, provided that the ending is directly preceded by one of the given
   * predecessors. The predecessor itself is kept. Only the first ending that
   * matches is considered; if its predecessor check fails no other ending is tried.
   *
   * @return {@code true} if an ending was found and removed
   */
  private boolean findAndRemoveEnding(StringBuilder stemmingZone,
      char[][] theEndingClass, char[][] thePredecessors) {
    int endingLength = findEnding(stemmingZone, theEndingClass);
    if (endingLength == 0) {
      return false;
    }
    int endingStart = stemmingZone.length() - endingLength;
    if (findEnding(stemmingZone, endingStart - 1, thePredecessors) == 0) {
      return false;
    }
    stemmingZone.setLength(endingStart);
    return true;
  }

  /**
   * Computes the start offsets of the RV, R1 and R2 regions of the given word
   * and stores them in the instance fields. A region that would be empty (or
   * that does not exist) is recorded as 0.
   *
   * @param word the word to analyse; must not be {@code null}
   */
  private void markPositions(String word) {
    rv = 0;
    r1 = 0;
    r2 = 0;
    final int length = word.length();

    // RV starts right after the first vowel.
    int i = skipRun(word, 0, false) + 1;
    if (i >= length) {
      return; // RV zone is empty
    }
    rv = i;

    // R1 starts after the first non-vowel that follows that vowel run.
    i = skipRun(word, i, true) + 1;
    if (i >= length) {
      return; // R1 zone is empty
    }
    r1 = i;

    // R2 repeats the R1 rule inside R1: reach the next vowel run ...
    i = skipRun(word, i, false) + 1;
    if (i >= length) {
      return; // R2 zone is empty
    }
    // ... and step over the first non-vowel behind it.
    i = skipRun(word, i, true) + 1;
    if (i >= length) {
      return; // R2 zone is empty
    }
    r2 = i;
  }

  /**
   * Returns the first index at or after {@code from} whose character is NOT of
   * the requested kind (vowel if {@code vowelRun} is true, non-vowel otherwise),
   * or the word length if the run extends to the end of the word.
   */
  private int skipRun(String word, int from, boolean vowelRun) {
    int i = from;
    while (i < word.length() && isVowel(word.charAt(i)) == vowelRun) {
      i++;
    }
    return i;
  }

  /**
   * Checks whether the character is one of the (lower-case) Russian vowels.
   *
   * @param letter character to test
   * @return {@code true} if the character is a vowel
   */
  private boolean isVowel(char letter) {
    for (char vowel : VOWELS) {
      if (letter == vowel) {
        return true;
      }
    }
    return false;
  }

  /**
   * Removes a noun ending.
   *
   * @param stemmingZone the RV part of the word; shortened in place on success
   * @return {@code true} if an ending was removed
   */
  private boolean noun(StringBuilder stemmingZone) {
    return findAndRemoveEnding(stemmingZone, NOUN_ENDINGS);
  }

  /**
   * Removes a perfective gerund ending. Group 1 endings must be preceded by
   * one of their predecessors; group 2 endings are removed unconditionally.
   *
   * @param stemmingZone the RV part of the word; shortened in place on success
   * @return {@code true} if an ending was removed
   */
  private boolean perfectiveGerund(StringBuilder stemmingZone) {
    return findAndRemoveEnding(
            stemmingZone, PERFECTIVE_GERUND_ENDINGS_1, PERFECTIVE_GERUND_1_PREDECESSORS)
        || findAndRemoveEnding(stemmingZone, PERFECTIVE_GERUND_ENDINGS_2);
  }

  /**
   * Removes a reflexive ending.
   *
   * @param stemmingZone the RV part of the word; shortened in place on success
   * @return {@code true} if an ending was removed
   */
  private boolean reflexive(StringBuilder stemmingZone) {
    return findAndRemoveEnding(stemmingZone, REFLEXIVE_ENDINGS);
  }

  /**
   * Removes a single trailing letter I from the stemming zone.
   *
   * @param stemmingZone the RV part of the word; shortened in place on success
   * @return {@code true} if the letter was removed
   */
  private boolean removeI(StringBuilder stemmingZone) {
    return removeTrailing(stemmingZone, I);
  }

  /**
   * Removes a single trailing soft sign from the stemming zone.
   *
   * @param stemmingZone the RV part of the word; shortened in place on success
   * @return {@code true} if the soft sign was removed
   */
  private boolean removeSoft(StringBuilder stemmingZone) {
    return removeTrailing(stemmingZone, SOFT);
  }

  /** Drops the last character of the zone if it equals {@code letter}. */
  private static boolean removeTrailing(StringBuilder stemmingZone, char letter) {
    int last = stemmingZone.length() - 1;
    if (last < 0 || stemmingZone.charAt(last) != letter) {
      return false;
    }
    stemmingZone.setLength(last);
    return true;
  }

  /**
   * Finds the stem for the given Russian word.
   *
   * @param input the word to stem; must not be {@code null}
   * @return the stem, or {@code input} itself if the word has no RV region
   */
  public String stem(String input) {
    markPositions(input);
    if (rv == 0) {
      return input; // RV wasn't detected, nothing to stem
    }
    // All stemming happens inside RV; the prefix before it is kept untouched.
    StringBuilder stemmingZone = new StringBuilder(input.substring(rv));

    // Step 1: perfective gerund, otherwise an optional reflexive ending
    // followed by the first of adjectival / verb / noun that applies.
    if (!perfectiveGerund(stemmingZone)) {
      reflexive(stemmingZone);
      if (!adjectival(stemmingZone) && !verb(stemmingZone)) {
        noun(stemmingZone);
      }
    }
    // Step 2
    removeI(stemmingZone);
    // Step 3
    derivational(stemmingZone);
    // Step 4
    superlative(stemmingZone);
    undoubleN(stemmingZone);
    removeSoft(stemmingZone);

    return input.substring(0, rv) + stemmingZone.toString();
  }

  /**
   * Removes a superlative ending.
   *
   * @param stemmingZone the RV part of the word; shortened in place on success
   * @return {@code true} if an ending was removed
   */
  private boolean superlative(StringBuilder stemmingZone) {
    return findAndRemoveEnding(stemmingZone, SUPERLATIVE_ENDINGS);
  }

  /**
   * Replaces a trailing double N by a single N.
   *
   * @param stemmingZone the RV part of the word; shortened in place on success
   * @return {@code true} if the zone ended in a double N
   */
  private boolean undoubleN(StringBuilder stemmingZone) {
    if (findEnding(stemmingZone, DOUBLE_N) == 0) {
      return false;
    }
    // Only one of the two letters is dropped.
    stemmingZone.setLength(stemmingZone.length() - 1);
    return true;
  }

  /**
   * Removes a verb ending. Group 1 endings must be preceded by one of their
   * predecessors; group 2 endings are removed unconditionally.
   *
   * @param stemmingZone the RV part of the word; shortened in place on success
   * @return {@code true} if an ending was removed
   */
  private boolean verb(StringBuilder stemmingZone) {
    return findAndRemoveEnding(stemmingZone, VERB_ENDINGS_1, VERB_1_PREDECESSORS)
        || findAndRemoveEnding(stemmingZone, VERB_ENDINGS_2);
  }

  /**
   * Convenience method that stems a single word with a newly created stemmer.
   *
   * @param theWord the word to stem; must not be {@code null}
   * @return the stem of the word
   */
  public static String stemWord(String theWord) {
    RussianStemmer stemmer = new RussianStemmer();
    return stemmer.stem(theWord);
  }
}