Home Reference Source

src/stemmers/stemmer.js

import { stopwords } from '../util/stopwords.js';

import { AggressiveTokenizer as Tokenizer } from '../tokenizers/aggressive-tokenizer.js';

export const stemmer = function() {
    const stemmer = this;

    stemmer.stem = token => token;

    stemmer.addStopWord = stopWord => {
        stopwords.words.push(stopWord);
    };

    stemmer.addStopWords = moreStopWords => {
        stopwords.words = stopwords.words.concat(moreStopWords);
    };

    stemmer.removeStopWord = function(stopWord) {
        this.removeStopWords([stopWord])
    };

    stemmer.removeStopWords = moreStopWords => {
        moreStopWords.forEach(stopWord => {
            const idx = stopwords.words.indexOf(stopWord);
            if (idx >= 0) {
                stopwords.words.splice(idx, 1);
            }
        });

    };


    stemmer.tokenizeAndStem = (text, keepStops) => {
        const stemmedTokens = [];
        const lowercaseText = text.toLowerCase();
        const tokens = new Tokenizer().tokenize(lowercaseText);

        if (keepStops) {
            tokens.forEach(token => {
                stemmedTokens.push(stemmer.stem(token));
            });
        }

        else {
            tokens.forEach(token => {
                if (!stopwords.words.includes(token))
                    stemmedTokens.push(stemmer.stem(token));
            });
        }

        return stemmedTokens;
    };

    stemmer.attach = () => {
        String.prototype.stem = function() {
            return stemmer.stem(this);
        };

        String.prototype.tokenizeAndStem = function(keepStops) {
            return stemmer.tokenizeAndStem(this, keepStops);
        };
    };
};