Skip to content
Snippets Groups Projects
Commit edc3bf87 authored by Andre Maroneze's avatar Andre Maroneze
Browse files

Merge branch 'add-stmr' into 'master'

[stmr] add case study

See merge request !43
parents 0e40a316 9c254423
No related branches found
No related tags found
1 merge request!43[stmr] add case study
Pipeline #59682 failed
Showing
with 48973 additions and 0 deletions
......@@ -174,6 +174,11 @@ solitaire:
TARGET: solitaire
<<: *make_job
stmr:
variables:
TARGET: stmr
<<: *make_job
tsvc:
variables:
TARGET: tsvc
......
......@@ -84,6 +84,7 @@ TARGETS=\
safestringlib \
semver \
solitaire \
stmr \
tsvc \
tweetnacl-usable \
verisec \
......
......@@ -166,6 +166,7 @@ when available. We also summarize the license of each directory below.
- `safestringlib`: MIT
- `semver`: MIT
- `solitaire`: public domain, see `solitaire.c`
- `stmr`: MIT
- `tsvc`: MIT, see `license.txt`
- `tweetnacl-usable`: public domain, see `LICENSE.txt`
- `verisec`: several, according to each app
......
# Makefile template for Frama-C/Eva case studies.
# For details and usage information, see the Frama-C User Manual.
### Prologue. Do not modify this block. #######################################
-include path.mk
FRAMAC ?= frama-c
include $(shell $(FRAMAC)-config -print-lib-path)/analysis-scripts/prologue.mk
###############################################################################
# Edit below as needed. Suggested flags are optional.
MACHDEP = x86_64
FRAMAC_SHARE = $(shell $(FRAMAC)-config -print-share-path)
## Preprocessing flags (for -cpp-extra-args)
CPPFLAGS += \
## General flags
FCFLAGS += \
-add-symbolic-path=..:. \
-kernel-warn-key annot:missing-spec=abort \
-kernel-warn-key typing:implicit-function-declaration=abort \
## Eva-specific flags
EVAFLAGS += \
-eva-warn-key builtins:missing-spec=abort \
-eva-precision 2 \
## GUI-only flags
FCGUIFLAGS += \
## Analysis targets (suffixed with .eva)
TARGETS = stmr.eva
### Each target <t>.eva needs a rule <t>.parse with source files as prerequisites
stmr.parse: \
../stmr.c \
../test.c \
$(FRAMAC_SHARE)/libc/stdio.c \
$(FRAMAC_SHARE)/libc/string.c \
### Epilogue. Do not modify this block. #######################################
include $(shell $(FRAMAC)-config -print-lib-path)/analysis-scripts/epilogue.mk
###############################################################################
# optional, for OSCS
-include ../../Makefile.common
directory file line function property kind status property
. stmr.c 66 isConsonant initialization Unknown \initialized(b + index_0)
. stmr.c 66 isConsonant mem_access Unknown \valid_read(b + index_0)
. stmr.c 108 getMeasure signed_overflow Unknown index_0 + 1 ≤ 2147483647
. stmr.c 111 getMeasure signed_overflow Unknown index_0 + 1 ≤ 2147483647
. stmr.c 123 getMeasure signed_overflow Unknown index_0 + 1 ≤ 2147483647
. stmr.c 126 getMeasure signed_overflow Unknown index_0 + 1 ≤ 2147483647
. stmr.c 127 getMeasure signed_overflow Unknown position + 1 ≤ 2147483647
. stmr.c 138 getMeasure signed_overflow Unknown index_0 + 1 ≤ 2147483647
. stmr.c 141 getMeasure signed_overflow Unknown index_0 + 1 ≤ 2147483647
. stmr.c 152 vowelInStem signed_overflow Unknown index_0 + 1 ≤ 2147483647
. stmr.c 164 isDoubleConsonant initialization Unknown \initialized(b + (int)(index_0 - 1))
. stmr.c 164 isDoubleConsonant initialization Unknown \initialized(b + index_0)
. stmr.c 164 isDoubleConsonant mem_access Unknown \valid_read(b + (int)(index_0 - 1))
. stmr.c 164 isDoubleConsonant mem_access Unknown \valid_read(b + index_0)
. stmr.c 189 cvc initialization Unknown \initialized(b + index_0)
. stmr.c 189 cvc mem_access Unknown \valid_read(b + index_0)
. stmr.c 204 ends initialization Unknown \initialized(b + k)
. stmr.c 204 ends mem_access Unknown \valid_read(b + k)
. stmr.c 208 ends signed_overflow Unknown (int)(k - k0) + 1 ≤ 2147483647
. stmr.c 227 setTo precondition of memmove Unknown valid_dest: valid_or_empty(dest, n)
. stmr.c 229 setTo signed_overflow Unknown j + length ≤ 2147483647
. stmr.c 266 step1ab initialization Unknown \initialized(b + k)
. stmr.c 266 step1ab mem_access Unknown \valid_read(b + k)
. stmr.c 271 step1ab initialization Unknown \initialized(b + (int)(k - 1))
. stmr.c 271 step1ab mem_access Unknown \valid_read(b + (int)(k - 1))
. stmr.c 292 step1ab initialization Unknown \initialized(b + k)
. stmr.c 292 step1ab mem_access Unknown \valid_read(b + k)
. stmr.c 308 step1c mem_access Unknown \valid(b + k)
. stmr.c 318 step2 initialization Unknown \initialized(b + (int)(k - 1))
. stmr.c 318 step2 mem_access Unknown \valid_read(b + (int)(k - 1))
. stmr.c 457 step3 initialization Unknown \initialized(b + k)
. stmr.c 457 step3 mem_access Unknown \valid_read(b + k)
. stmr.c 508 step4 initialization Unknown \initialized(b + (int)(k - 1))
. stmr.c 508 step4 mem_access Unknown \valid_read(b + (int)(k - 1))
. stmr.c 566 step4 initialization Unknown \initialized(b + j)
. stmr.c 566 step4 mem_access Unknown \valid_read(b + j)
. stmr.c 628 step5 initialization Unknown \initialized(b + k)
. stmr.c 628 step5 mem_access Unknown \valid_read(b + k)
. stmr.c 636 step5 initialization Unknown \initialized(b + k)
. stmr.c 636 step5 mem_access Unknown \valid_read(b + k)
. test.c 21 assertStem signed_overflow Unknown tmp_0 + 1 ≤ 2147483647
. test.c 21 assertStem precondition of strlen Unknown valid_string_s: valid_read_string(s)
. test.c 21 assertStem mem_access Unknown \valid(result + (int)(tmp_0 + 1))
. test.c 24 assertStem signed_overflow Unknown errorCount + 1 ≤ 2147483647
. test.c 27 assertStem precondition of fprintf_va_2 Unknown valid_read_string(param0)
. test.c 27 assertStem precondition of fprintf_va_2 Unknown valid_read_string(param1)
. test.c 27 assertStem precondition of fprintf_va_2 Unknown valid_read_string(param2)
. test.c 34 assertStem signed_overflow Unknown assertionCount + 1 ≤ 2147483647
. test.c 63 main precondition of strlen Unknown valid_string_s: valid_read_string(s)
. test.c 63 main mem_access Unknown \valid(lineIn + (size_t)(tmp - 1))
. test.c 64 main precondition of strlen Unknown valid_string_s: valid_read_string(s)
. test.c 64 main mem_access Unknown \valid(lineOut + (size_t)(tmp_0 - 1))
FRAMAC_SHARE/libc stdio.c 90 getline mem_access Unknown \valid(*lineptr + tmp_2)
FRAMAC_SHARE/libc stdio.c 93 getline mem_access Unknown \valid(*lineptr + cur)
FRAMAC_SHARE/libc stdio.h 207 fprintf_va_2 precondition Unknown valid_read_string(param0)
FRAMAC_SHARE/libc stdio.h 207 fprintf_va_2 precondition Unknown valid_read_string(param1)
FRAMAC_SHARE/libc stdio.h 207 fprintf_va_2 precondition Unknown valid_read_string(param2)
FRAMAC_SHARE/libc string.c 146 strcmp initialization Unknown \initialized(s1 + i)
FRAMAC_SHARE/libc string.c 146 strcmp initialization Unknown \initialized(s2 + i)
FRAMAC_SHARE/libc string.c 146 strcmp mem_access Unknown \valid_read(s1 + i)
FRAMAC_SHARE/libc string.c 146 strcmp mem_access Unknown \valid_read(s2 + i)
FRAMAC_SHARE/libc string.c 149 strcmp initialization Unknown \initialized((unsigned char *)s1 + i)
FRAMAC_SHARE/libc string.c 149 strcmp initialization Unknown \initialized((unsigned char *)s2 + i)
FRAMAC_SHARE/libc string.c 149 strcmp mem_access Unknown \valid_read((unsigned char *)s1 + i)
FRAMAC_SHARE/libc string.c 149 strcmp mem_access Unknown \valid_read((unsigned char *)s2 + i)
FRAMAC_SHARE/libc string.c 169 memcmp initialization Unknown \initialized(p1 + i)
FRAMAC_SHARE/libc string.c 169 memcmp mem_access Unknown \valid_read(p1 + i)
FRAMAC_SHARE/libc string.c 318 strdup precondition of strlen Unknown valid_string_s: valid_read_string(s)
FRAMAC_SHARE/libc string.c 324 strdup precondition of memcpy Unknown valid_dest: valid_or_empty(dest, n)
FRAMAC_SHARE/libc string.c 324 strdup precondition of memcpy Unknown valid_src: valid_read_or_empty(src, n)
FRAMAC_SHARE/libc string.h 95 memcpy precondition Unknown valid_dest: valid_or_empty(dest, n)
FRAMAC_SHARE/libc string.h 96 memcpy precondition Unknown valid_src: valid_read_or_empty(src, n)
FRAMAC_SHARE/libc string.h 120 memmove precondition Unknown valid_dest: valid_or_empty(dest, n)
FRAMAC_SHARE/libc string.h 141 strlen precondition Unknown valid_string_s: valid_read_string(s)
[metrics] Eva coverage statistics
=======================
Syntactically reachable functions = 17 (out of 17)
Semantically reached functions = 17
Coverage estimation = 100.0%
[metrics] References to non-analyzed functions
------------------------------------
[metrics] Statements analyzed by Eva
--------------------------
500 stmts in analyzed functions, 497 stmts analyzed (99.4%)
assertStem: 23 stmts out of 23 (100.0%)
cvc: 26 stmts out of 26 (100.0%)
ends: 17 stmts out of 17 (100.0%)
getMeasure: 39 stmts out of 39 (100.0%)
isDoubleConsonant: 7 stmts out of 7 (100.0%)
main: 36 stmts out of 36 (100.0%)
replace: 4 stmts out of 4 (100.0%)
setTo: 4 stmts out of 4 (100.0%)
stem: 16 stmts out of 16 (100.0%)
step1ab: 48 stmts out of 48 (100.0%)
step1c: 6 stmts out of 6 (100.0%)
step2: 93 stmts out of 93 (100.0%)
step3: 34 stmts out of 34 (100.0%)
step5: 17 stmts out of 17 (100.0%)
vowelInStem: 15 stmts out of 15 (100.0%)
step4: 93 stmts out of 94 (98.9%)
isConsonant: 19 stmts out of 21 (90.5%)
stmr.c:74:[eva] warning: Using specification of function isConsonant for recursive calls.
Analysis of function isConsonant is thus incomplete and its soundness
relies on the written specification.
This diff is collapsed.
[metrics] Defined functions (17)
======================
assertStem (3 calls); cvc (2 calls); ends (56 calls); getMeasure (6 calls);
isConsonant (9 calls); isDoubleConsonant (2 calls); main (0 call);
replace (28 calls); setTo (6 calls); stem (1 call); step1ab (1 call);
step1c (1 call); step2 (1 call); step3 (1 call); step4 (1 call);
step5 (1 call); vowelInStem (2 calls);
Specified-only functions (0)
============================
Undefined and unspecified functions (0)
=======================================
'Extern' global variables (0)
=============================
Potential entry points (1)
==========================
main;
Global metrics
==============
Sloc = 500
Decision point = 132
Global variables = 6
If = 106
Loop = 6
Goto = 37
Assignment = 154
Exit point = 17
Function = 17
Function call = 152
Pointer dereferencing = 22
Cyclomatic complexity = 149
# Build, test and coverage rules for the stemmer and its test program.
# Sources of the test program.
TEST = test.c stmr.c
# Object files, one per source listed in TEST.
OBJ_TEST = $(TEST:.c=.o)
CFLAGS = -D_GNU_SOURCE -std=c99
# Despite its name, LFLAGS holds compiler warning options; the `.c.o` rule
# below passes it on the compile command line.
LFLAGS = -Wall -Wno-format-y2k -W -Wstrict-prototypes -Wmissing-prototypes \
-Wpointer-arith -Wreturn-type -Wcast-qual -Wwrite-strings -Wswitch \
-Wshadow -Wcast-align -Wbad-function-cast -Wchar-subscripts -Winline \
-Wnested-externs -Wredundant-decls
# Options used to build the coverage-instrumented binary.
COVFLAGS = -Wall -fprofile-arcs -ftest-coverage
# Link the test program from the object files.
test: $(OBJ_TEST)
$(CC) $(OBJ_TEST) -o $@
# Build the instrumented binary straight from the sources in TEST.
# NOTE(review): the prerequisites are the object files although the recipe
# compiles the sources and hard-codes `gcc` instead of $(CC) - confirm intended.
coverage: $(OBJ_TEST)
gcc $(COVFLAGS) $(TEST) -o $@
.SUFFIXES: .c .o
# Suffix rule: compile one source file into one object file.
.c.o:
$(CC) $< $(CFLAGS) $(LFLAGS) -c -o $@
# Run the instrumented binary, then report coverage for stmr.
run-coverage: coverage
./coverage && gcov stmr
# Build and run the test program.
run-test: test
./test
# Remove binaries, objects and coverage data.
# NOTE(review): `*.gc{ov,da,no}` relies on shell brace expansion - confirm the
# shell used by make supports it.
clean:
rm -f coverage test $(OBJ_TEST) *.gc{ov,da,no}
.PHONY: clean run-coverage run-test
Martin Porter’s Stemming algorithm as a C library
https://github.com/wooorm/stmr.c
HAS_DYN_ALLOC, HAS_RECURSION, NO_FLOAT
This diff is collapsed.
This diff is collapsed.
github: wooorm
(The MIT License)
Copyright (c) 2014 Titus Wormer <tituswormer@gmail.com>
Permission is hereby granted, free of charge, to any person obtaining
a copy of this software and associated documentation files (the
'Software'), to deal in the Software without restriction, including
without limitation the rights to use, copy, modify, merge, publish,
distribute, sublicense, and/or sell copies of the Software, and to
permit persons to whom the Software is furnished to do so, subject to
the following conditions:
The above copyright notice and this permission notice shall be
included in all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED 'AS IS', WITHOUT WARRANTY OF ANY KIND,
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
{
"name": "stmr.c",
"version": "1.0.0",
"description": "The Porter Stemmer algorithm",
"license": "MIT",
"keywords": [
"martin",
"porter",
"stemmer",
"algorithm"
],
"repo": "wooorm/stmr.c",
"src": [
"stmr.h",
"stmr.c"
]
}
# stmr(3) [![Build Status][travis-badge]][travis] [![Coverage Status][coveralls-badge]][coveralls]
Martin Porter’s [Stemming algorithm][algo] as a C library.
There’s also a CLI: [stmr(1)][cli].
## Installation
[clib][]:
```bash
clib install wooorm/stmr.c
```
Or clone the repo.
## Usage
### `int stem(char *pointer, int start, int end)`
```c
#include <stdio.h>
#include <string.h>
#include "stmr.h"
int
main(int argc, char **argv) {
char *word = argv[1];
int end = stem(word, 0, strlen(word) - 1);
word[end + 1] = 0;
printf("%s", word);
}
```
## Related
* [`stemmer`][lib] — Same algorithm in JavaScript
* [`stmr`][cli] — CLI in C
## License
[MIT][license] © [Titus Wormer][author]
<!-- Definitions -->
[travis-badge]: https://img.shields.io/travis/wooorm/stmr.c.svg
[travis]: https://travis-ci.org/wooorm/stmr.c
[coveralls-badge]: https://img.shields.io/coveralls/wooorm/stmr.c.svg
[coveralls]: https://coveralls.io/github/wooorm/stmr.c
[license]: license
[author]: http://wooorm.com
[algo]: http://tartarus.org/martin/PorterStemmer/
[cli]: https://github.com/wooorm/stmr
[lib]: https://github.com/words/stemmer
[clib]: https://github.com/clibs/clib
/* This is the Porter stemming algorithm, coded up in ANSI C by the
* author. It may be regarded as canonical, in that it follows the
* algorithm presented in
*
* Porter, 1980, An algorithm for suffix stripping, Program, Vol. 14,
* no. 3, pp 130-137,
*
* only differing from it at the points marked --DEPARTURE-- below.
*
* See also http://www.tartarus.org/~martin/PorterStemmer
*
* The algorithm as described in the paper could be exactly replicated
* by adjusting the points of DEPARTURE, but this is barely necessary,
* because (a) the points of DEPARTURE are definitely improvements, and
* (b) no encoding of the Porter stemmer I have seen is anything like
* as exact as this version, even with the points of DEPARTURE!
*
* You can compile it on Unix with 'gcc -O3 -o stem stem.c' after which
* 'stem' takes a list of inputs and sends the stemmed equivalent to
* stdout.
*
* The algorithm as encoded here is particularly fast.
*
* Release 1: was many years ago
* Release 2: 11 Apr 2013
* fixes a bug noted by Matt Patenaude <matt@mattpatenaude.com>,
*
* case 'o': if (ends("\03" "ion") && (b[j] == 's' || b[j] == 't')) break;
* ==>
* case 'o': if (ends("\03" "ion") && j >= k0 && (b[j] == 's' || b[j] == 't')) break;
*
* to avoid accessing b[k0-1] when the word in b is "ion".
* Release 3: 25 Mar 2014
* fixes a similar bug noted by Klemens Baum <klemensbaum@gmail.com>,
* that if step1ab leaves a one letter result (ied -> i, aing -> a etc),
* step2 and step4 access the byte before the first letter. So we skip
* steps after step1ab unless k > k0. */
#include <string.h>
#include "stmr.h"
/* The main part of the stemming algorithm starts here. b is a buffer
* holding a word to be stemmed. The letters are in b[k0], b[k0+1] ...
* ending at b[k]. In fact k0 = 0 in this demo program. k is readjusted
* downwards as the stemming progresses. Zero termination is not in fact
* used in the algorithm.
*
* Note that only lower case sequences are stemmed. Forcing to lower case
* should be done before stem(...) is called. */
/* Buffer for the word to be stemmed; `stem()` points it at the
 * caller's string on every call. */
static char *b;
/* Offset in `b` of the last letter of the word. It moves as
 * suffixes are removed or replaced. */
static int k;
/* Offset in `b` of the first letter of the word. */
static int k0;
/* j is a general offset into the string. `ends()` leaves it at the
 * offset just before the suffix it matched. */
static int j;
/**
 * TRUE when `b[index]` is a consonant.
 *
 * `a`, `e`, `i`, `o` and `u` are never consonants. A `y` is a
 * consonant when it is the first letter of the word
 * (`index == k0`); anywhere else it is a consonant exactly when
 * the letter before it is not. Every other character counts as
 * a consonant.
 */
/*@ assigns \result \from indirect:b[index], indirect:index; */
static int
isConsonant(int index) {
    const char letter = b[index];

    if (letter == 'a' || letter == 'e' || letter == 'i' ||
        letter == 'o' || letter == 'u') {
        return FALSE;
    }

    if (letter != 'y') {
        return TRUE;
    }

    /* A leading `y` is always a consonant. */
    if (index == k0) {
        return TRUE;
    }

    /* Otherwise `y` takes the opposite role of its predecessor. */
    return !isConsonant(index - 1);
}
/* Count the vowel-consonant sequences in `b[k0]` ... `b[j]`.
 * If C is a consonant sequence and V a vowel sequence, and
 * <..> indicates arbitrary presence:
 *
 *   <C><V>       gives 0
 *   <C>VC<V>     gives 1
 *   <C>VCVC<V>   gives 2
 *   <C>VCVCVC<V> gives 3
 *   ....
 *
 * The scan stops, and the count so far is returned, as soon as
 * the cursor moves past `j`.
 */
static int
getMeasure() {
    int count = 0;
    int at = k0;

    /* Skip the optional leading consonant sequence. */
    while (at <= j && isConsonant(at)) {
        at++;
    }

    if (at > j) {
        return count;
    }

    /* `at` is on a vowel; step over it. */
    at++;

    for (;;) {
        /* Rest of the vowel sequence. */
        while (at <= j && !isConsonant(at)) {
            at++;
        }

        if (at > j) {
            return count;
        }

        /* A consonant follows the vowels: one more VC pair. */
        at++;
        count++;

        /* Rest of the consonant sequence. */
        while (at <= j && isConsonant(at)) {
            at++;
        }

        if (at > j) {
            return count;
        }

        /* `at` is on the first vowel of the next sequence. */
        at++;
    }
}
/* `TRUE` when at least one of `b[k0]` ... `b[j]` is a vowel,
 * `FALSE` otherwise (including when the range is empty). */
static int
vowelInStem() {
    int at;

    for (at = k0; at <= j; at++) {
        if (!isConsonant(at)) {
            return TRUE;
        }
    }

    return FALSE;
}
/* `TRUE` when `j` and `(j-1)` are the same consonant. */
static int
isDoubleConsonant(int index) {
if (b[index] != b[index - 1]) {
return FALSE;
}
return isConsonant(index);
}
/* `TRUE` when `index - 2, index - 1, index` has the form
 * `consonant - vowel - consonant` and the final consonant is
 * not `"w"`, `"x"`, or `"y"`. Used when deciding whether to
 * restore an `e` at the end of a short word.
 *
 * Such as:
 *
 *   `cav(e)`, `lov(e)`, `hop(e)`, `crim(e)`, but not `snow`,
 *   `box`, `tray`.
 *
 * `FALSE` when fewer than three letters lie between `k0` and
 * `index`.
 */
static int
cvc(int index) {
    /* Three letters are needed. */
    if (index < k0 + 2) {
        return FALSE;
    }

    if (!isConsonant(index)) {
        return FALSE;
    }

    if (isConsonant(index - 1)) {
        return FALSE;
    }

    if (!isConsonant(index - 2)) {
        return FALSE;
    }

    switch (b[index]) {
        case 'w':
        case 'x':
        case 'y':
            return FALSE;
        default:
            return TRUE;
    }
}
/* `ends(value)` is `TRUE` when `b[k0]` ... `b[k]` ends with the
 * suffix in `value`.
 *
 * `value` is length-prefixed: `value[0]` holds the number of
 * characters in the suffix and the characters follow it.
 *
 * On a match, `j` is set to the offset of the letter just before
 * the suffix. `j` is left untouched otherwise. */
static int
ends(const char *value) {
    const int size = value[0];

    /* Tiny speed-up: compare the last characters first. */
    if (b[k] != value[size]) {
        return FALSE;
    }

    /* The suffix cannot be longer than the word. */
    if (k - k0 + 1 < size) {
        return FALSE;
    }

    if (memcmp(b + k - size + 1, value + 1, size) != 0) {
        return FALSE;
    }

    j = k - size;

    return TRUE;
}
/* `setTo(value)` overwrites the letters starting at `j + 1` with
 * the characters in `value` and moves `k` to the last letter
 * written.
 *
 * `value` is length-prefixed: `value[0]` holds the number of
 * characters and the characters follow it. */
static void
setTo(const char *value) {
    const int size = value[0];
    const char *letters = value + 1;

    memmove(b + j + 1, letters, size);

    k = j + size;
}
/* Replace the letters after `j` with `value` (see `setTo()`),
 * but only when the stem `b[k0]` ... `b[j]` has a measure
 * greater than `0`. Does nothing otherwise. */
static void
replace(const char *value) {
    if (getMeasure() <= 0) {
        return;
    }

    setTo(value);
}
/* `step1ab()` gets rid of plurals, `-ed`, `-ing`.
 *
 * Such as:
 *
 *   caresses -> caress
 *   ponies -> poni
 *   ties -> ti
 *   caress -> caress
 *   cats -> cat
 *
 *   feed -> feed
 *   agreed -> agree
 *   disabled -> disable
 *
 *   matting -> mat
 *   mating -> mate
 *   meeting -> meet
 *   milling -> mill
 *   messing -> mess
 *
 *   meetings -> meet
 */
static void
step1ab() {
    /* Plurals. */
    if (b[k] == 's') {
        if (ends("\04" "sses")) {
            k -= 2;
        } else if (ends("\03" "ies")) {
            setTo("\01" "i");
        } else if (b[k - 1] != 's') {
            k--;
        }
    }

    /* `-eed` only loses its `d` when the stem has a measure
     * above zero. */
    if (ends("\03" "eed")) {
        if (getMeasure() > 0) {
            k--;
        }

        return;
    }

    /* `-ed` and `-ing` are only removed from a stem that
     * contains a vowel. */
    if (!ends("\02" "ed") && !ends("\03" "ing")) {
        return;
    }

    if (!vowelInStem()) {
        return;
    }

    k = j;

    /* Tidy up what is left of the word. */
    if (ends("\02" "at")) {
        setTo("\03" "ate");
        return;
    }

    if (ends("\02" "bl")) {
        setTo("\03" "ble");
        return;
    }

    if (ends("\02" "iz")) {
        setTo("\03" "ize");
        return;
    }

    if (isDoubleConsonant(k)) {
        /* Drop one letter of the pair, unless the letter that
         * would become the last one is `l`, `s` or `z`. */
        switch (b[k - 1]) {
            case 'l':
            case 's':
            case 'z':
                break;
            default:
                k--;
                break;
        }

        return;
    }

    /* Restore an `e` on short words. */
    if (getMeasure() == 1 && cvc(k)) {
        setTo("\01" "e");
    }
}
/* `step1c()` turns a terminal `"y"` into `"i"` when the letters
 * before it contain a vowel. */
static void
step1c() {
    if (!ends("\01" "y") || !vowelInStem()) {
        return;
    }

    b[k] = 'i';
}
/* `step2()` maps double suffixes to single ones, so -ization
 * ( = -ize plus -ation) maps to -ize etc.
 *
 * The rules are grouped by the second-to-last letter of the
 * suffix; only the group matching `b[k - 1]` is tried, in table
 * order, and the first suffix that matches wins. The replacement
 * is only applied when the string before the suffix gives
 * getMeasure() > 0 (see `replace()`). */
static void
step2() {
    static const struct {
        char key;                /* Second-to-last letter of `suffix`. */
        const char *suffix;      /* Length-prefixed suffix to look for. */
        const char *replacement; /* Length-prefixed replacement. */
    } rules[] = {
        {'a', "\07" "ational", "\03" "ate"},
        {'a', "\06" "tional", "\04" "tion"},

        {'c', "\04" "enci", "\04" "ence"},
        {'c', "\04" "anci", "\04" "ance"},

        {'e', "\04" "izer", "\03" "ize"},

        /* --DEPARTURE--: To match the published algorithm,
         * replace the next entry with:
         *
         * ```
         * {'l', "\04" "abli", "\04" "able"},
         * ```
         */
        {'l', "\03" "bli", "\03" "ble"},
        {'l', "\04" "alli", "\02" "al"},
        {'l', "\05" "entli", "\03" "ent"},
        {'l', "\03" "eli", "\01" "e"},
        {'l', "\05" "ousli", "\03" "ous"},

        {'o', "\07" "ization", "\03" "ize"},
        {'o', "\05" "ation", "\03" "ate"},
        {'o', "\04" "ator", "\03" "ate"},

        {'s', "\05" "alism", "\02" "al"},
        {'s', "\07" "iveness", "\03" "ive"},
        {'s', "\07" "fulness", "\03" "ful"},
        {'s', "\07" "ousness", "\03" "ous"},

        {'t', "\05" "aliti", "\02" "al"},
        {'t', "\05" "iviti", "\03" "ive"},
        {'t', "\06" "biliti", "\03" "ble"},

        /* --DEPARTURE--: To match the published algorithm,
         * delete the next entry. */
        {'g', "\04" "logi", "\03" "log"}
    };

    const char penultimate = b[k - 1];
    size_t at;

    for (at = 0; at < sizeof rules / sizeof rules[0]; at++) {
        if (rules[at].key != penultimate) {
            continue;
        }

        if (ends(rules[at].suffix)) {
            replace(rules[at].replacement);
            return;
        }
    }
}
/* `step3()` deals with -ic-, -full, -ness etc., with a similar
 * strategy to step2.
 *
 * The rules are grouped by the last letter of the suffix; only
 * the group matching `b[k]` is tried, in table order, and the
 * first suffix that matches wins. An empty replacement removes
 * the suffix. */
static void
step3() {
    static const struct {
        char key;                /* Last letter of `suffix`. */
        const char *suffix;      /* Length-prefixed suffix to look for. */
        const char *replacement; /* Length-prefixed replacement. */
    } rules[] = {
        {'e', "\05" "icate", "\02" "ic"},
        {'e', "\05" "ative", "\00" ""},
        {'e', "\05" "alize", "\02" "al"},

        {'i', "\05" "iciti", "\02" "ic"},

        {'l', "\04" "ical", "\02" "ic"},
        {'l', "\03" "ful", "\00" ""},

        {'s', "\04" "ness", "\00" ""}
    };

    const char last = b[k];
    size_t at;

    for (at = 0; at < sizeof rules / sizeof rules[0]; at++) {
        if (rules[at].key != last) {
            continue;
        }

        if (ends(rules[at].suffix)) {
            replace(rules[at].replacement);
            return;
        }
    }
}
/* `step4()` takes off -ant, -ence etc., in context <c>vcvc<v>.
 *
 * The candidate suffixes are selected by the second-to-last
 * letter of the word, `b[k - 1]`, and tried in order. When one
 * matches, it is removed only if the remaining stem has a
 * measure greater than `1`. */
static void
step4() {
    int matched;

    switch (b[k - 1]) {
        case 'a':
            matched = ends("\02" "al");
            break;
        case 'c':
            matched = ends("\04" "ance") || ends("\04" "ence");
            break;
        case 'e':
            matched = ends("\02" "er");
            break;
        case 'i':
            matched = ends("\02" "ic");
            break;
        case 'l':
            matched = ends("\04" "able") || ends("\04" "ible");
            break;
        case 'n':
            matched = ends("\03" "ant") || ends("\05" "ement") ||
                      ends("\04" "ment") || ends("\03" "ent");
            break;
        case 'o':
            /* -ion only counts after `s` or `t`; `j >= k0` keeps
             * the look-behind inside the word when the word is
             * just "ion". The second test takes care of -ous. */
            matched = (ends("\03" "ion") && j >= k0 &&
                       (b[j] == 's' || b[j] == 't')) ||
                      ends("\02" "ou");
            break;
        case 's':
            matched = ends("\03" "ism");
            break;
        case 't':
            matched = ends("\03" "ate") || ends("\03" "iti");
            break;
        case 'u':
            matched = ends("\03" "ous");
            break;
        case 'v':
            matched = ends("\03" "ive");
            break;
        case 'z':
            matched = ends("\03" "ize");
            break;
        default:
            return;
    }

    if (!matched) {
        return;
    }

    if (getMeasure() > 1) {
        k = j;
    }
}
/* `step5()` removes a final `-e` when the measure is greater
 * than `1`, or when it is exactly `1` and the letters before
 * the `e` are not of the `cvc()` form. It then changes `-ll`
 * to `-l` when the measure is greater than `1`.
 *
 * The measure is taken over the whole word as it stands on
 * entry (`j` is set to `k` first). */
static void
step5() {
    j = k;

    if (b[k] == 'e') {
        const int measure = getMeasure();

        if (measure > 1) {
            k--;
        } else if (measure == 1 && !cvc(k - 1)) {
            k--;
        }
    }

    if (b[k] != 'l') {
        return;
    }

    if (!isDoubleConsonant(k)) {
        return;
    }

    if (getMeasure() > 1) {
        k--;
    }
}
/* In `stem(p, index, position)`, `p` is a `char` pointer, and
 * the string to be stemmed is from `p[index]` to `p[position]`
 * (inclusive).
 *
 * Typically, `index` is zero and `position` is the offset to
 * the last character of a string, `(p[position + 1] == '\0')`.
 * The stemmer adjusts the characters `p[index]` ...
 * `p[position]` in place and returns the new end-point of the
 * string, `k`.
 *
 * Stemming never increases word length, so
 * `index <= k <= position`.
 *
 * The parameters are copied into the file-scope statics `b`,
 * `k` and `k0`, which all the step functions work on. */
int
stem(char *p, int index, int position) {
    /* Copy the parameters into statics. */
    b = p;
    k0 = index;
    k = position;

    /* --DEPARTURE--: strings of length 1 or 2 don't go through
     * the stemming process, although no mention is made of this
     * in the published algorithm. Drop this test to match the
     * published algorithm. */
    if (k > k0 + 1) {
        step1ab();

        /* The later steps look at the letter before `b[k]`, so
         * they only run when more than one letter is left. */
        if (k > k0) {
            step1c();
            step2();
            step3();
            step4();
            step5();
        }
    }

    return k;
}
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment