termFilter property Null safety

TermFilter termFilter
override

A filter function that returns a collection of terms from term:

  • return an empty collection if the term is to be excluded from analysis;
  • return multiple terms if the term is split; and/or
  • return modified term(s), such as applying a stemmer algorithm.

Implementation

/// A filter function that maps a single term to the set of terms to analyze.
///
/// The returned function trims the term and then:
/// - returns an empty set if the trimmed term is empty or is contained in
///   [stopWords];
/// - adds the term and a copy with every `.` removed if the term is a key of
///   [abbreviations];
/// - adds the mapped (trimmed) value if the term is a key of [termExceptions];
/// - otherwise cleans the term with [characterFilter] and adds the cleaned
///   term, a copy without apostrophes and hyphens, and the parts obtained by
///   splitting the cleaned term at non-word characters.
///
/// Candidates produced by the last case are only kept when they are longer
/// than one character and not contained in [stopWords]; split parts that are
/// keys of [termExceptions] are replaced by the mapped value instead.
///
/// Finally every candidate is replaced by its [termExceptions] entry or, when
/// it has none, is passed through [lemmatizer] and then [stemmer] (the result
/// is looked up in [termExceptions] once more). Empty results are dropped.
@override
TermFilter get termFilter => (Term term) async {
      // Work on a whitespace-trimmed copy rather than reassigning the
      // parameter.
      final trimmed = term.trim();
      final terms = <String>{};

      // A candidate is only worth keeping when it is longer than a single
      // character (this excludes possessives etc.) and is not a stop word.
      bool isIndexable(String candidate) =>
          candidate.length > 1 && !stopWords.contains(candidate);

      // Empty terms and stop words yield no candidates at all.
      if (trimmed.isNotEmpty && !stopWords.contains(trimmed)) {
        final exception = termExceptions[trimmed]?.trim();
        if (abbreviations.containsKey(trimmed)) {
          // Keep the abbreviation and a version with no periods.
          terms.addAll({trimmed, trimmed.replaceAll('.', '').trim()});
        } else if (exception != null) {
          terms.add(exception);
        } else {
          // Per the original notes, [characterFilter] normalizes quote marks
          // and dashes, strips enclosing quotes and removes trailing
          // non-alphanumeric characters. NOTE(review): its implementation is
          // not visible here - confirm.
          final cleaned = characterFilter(trimmed);
          if (isIndexable(cleaned)) {
            terms.add(cleaned);
            // Also keep a version without apostrophes and/or hyphens, but
            // only when it passes the same filter as every other candidate;
            // otherwise a stop word or single character could be emitted.
            final unHyphenated =
                cleaned.replaceAll(RegExp(r"['\-]"), '').trim();
            if (isIndexable(unHyphenated)) {
              terms.add(unHyphenated);
            }
            // Split at runs of non-word characters that are preceded or
            // followed by a non-digit, so separators between two numbers
            // are left intact.
            final splitTerms = cleaned.split(RegExp(
                r'(?<=[^0-9\b])[^a-zA-Z0-9À-öø-ÿ]+|[^a-zA-Z0-9À-öø-ÿ]+(?=[^0-9\b])'));
            for (final splitTerm in splitTerms) {
              final splitException =
                  termExceptions[splitTerm.trim()]?.trim();
              if (splitException != null) {
                // An exception replaces the split part unconditionally.
                terms.add(splitException);
              } else if (isIndexable(splitTerm)) {
                terms.add(splitTerm);
              }
            }
          }
        }
      }

      // Map every candidate to its exception, or lemmatize and stem it, then
      // discard anything that ended up empty.
      return terms
          .map((e) {
            final mapped = termExceptions[e];
            if (mapped != null) {
              return mapped;
            }
            final stemmedTerm = stemmer(lemmatizer(e.trim())).trim();
            return termExceptions[stemmedTerm] ?? stemmedTerm;
          })
          .where((e) => e.isNotEmpty)
          .toSet();
    };