Analyzer

picturesafe-search supports the definition of custom analyzers for the Elasticsearch text analysis feature.

Default analyzer

In the default configuration, a picturesafe-search specific built-in analyzer is activated. The resulting index settings look like this:

{
  "picturesafe-search-sample-20200424-115854-030": {
    "settings": {
      "index": {
        "mapping": {
          "total_fields": {
            "limit": "1000"
          }
        },
        "number_of_shards": "1",
        "provided_name": "picturesafe-search-sample-20200424-115854-030",
        "max_result_window": "10000",
        "creation_date": "1587722334143",
        "analysis": {
          "filter": {
            "filter_word_delimiter": {
              "type": "word_delimiter_graph",
              "split_on_numerics": "false",
              "split_on_case_change": "false"
            }
          },
          "analyzer": {
            "default": {
              "filter": [
                "filter_word_delimiter",
                "lowercase",
                "trim"
              ],
              "char_filter": [
                "umlaut_mapping"
              ],
              "tokenizer": "standard"
            }
          },
          "char_filter": {
            "umlaut_mapping": {
              "type": "mapping",
              "mappings": [
                "ä=>ae",
                "Ä=>Ae",
                "ö=>oe",
                "Ö=>Oe",
                "ü=>ue",
                "Ü=>Ue",
                "ß=>ss"
              ]
            }
          }
        },
        "number_of_replicas": "0",
        "uuid": "jTRyYSkRS5S7xuh9CzMrlQ",
        "version": {
          "created": "7060199"
        }
      }
    }
  }
}

This analyzer can be deactivated in elasticsearch.properties:

elasticsearch.index.default_analyzer.enabled=false

If you disable the picturesafe-search default analyzer, Elasticsearch uses its built-in Standard Analyzer.

Create custom analyzer

When the picturesafe-search or Elasticsearch built-in analyzers do not fulfill your needs, you can create custom analyzers.

If the default configuration is used, you can simply extend DefaultIndexConfiguration, as the following example shows:

/**
 * Index configuration that extends the default picturesafe-search configuration
 * with a custom analyzer for file names.
 *
 * <p>The analyzer uses a {@code char_group} tokenizer that splits on whitespace,
 * {@code .}, {@code -}, {@code _} and newline, followed by the {@code lowercase}
 * token filter.
 */
public class CustomIndexConfiguration extends DefaultIndexConfiguration {

    /** Name of the custom analyzer; reference it from field configurations. */
    public static final String CUSTOM_ANALYZER_NAME = "file_name";

    /** Name of the custom tokenizer; shared by its definition and the analyzer that uses it. */
    private static final String FILE_NAME_TOKENIZER_NAME = "file_name_tokenizer";

    /**
     * Adds the file name tokenizer and analyzer to the preset configuration
     * provided by the super class.
     *
     * @return the preset configuration of the super class, extended by the custom
     *         tokenizer and analyzer
     * @throws RuntimeException if building the settings content fails with an
     *         {@link IOException}; the original exception is kept as cause
     */
    @Bean
    @Override
    public StandardIndexPresetConfiguration indexPresetConfiguration() {

        StandardIndexPresetConfiguration cfg = super.indexPresetConfiguration();
        try {
            // Tokenizer: split the file name on the listed separator characters.
            IndexSettingsObject fileNameTokenizer
                = new IndexSettingsObject(FILE_NAME_TOKENIZER_NAME);
            fileNameTokenizer.content().startObject()
                    .field("type", "char_group")
                    .array("tokenize_on_chars", "whitespace", ".", "-", "_", "\n")
                    .endObject();

            // Analyzer: custom tokenizer from above plus the lowercase token filter.
            IndexSettingsObject fileNameAnalyzer
                = new IndexSettingsObject(CUSTOM_ANALYZER_NAME);
            fileNameAnalyzer.content().startObject()
                    .field("type", "custom")
                    .field("tokenizer", FILE_NAME_TOKENIZER_NAME)
                    .array("filter", "lowercase")
                    .endObject();

            cfg.addCustomTokenizers(fileNameTokenizer);
            cfg.addCustomAnalyzers(fileNameAnalyzer);
        } catch (IOException e) {
            throw new RuntimeException("Failed to set custom analyzer!", e);
        }
        return cfg;
    }

    /**
     * Disables the picturesafe-search built-in default analyzer.
     *
     * <p>Alternatively add the following property to elasticsearch.properties:
     * {@code elasticsearch.index.default_analyzer.enabled=false}
     *
     * @return always {@code false}
     */
    @Override
    protected boolean isDefaultAnalyzerEnabled() {
        return false;
    }
}

The CustomIndexConfiguration shown above has to be imported into the application configuration; its analyzer can then be used in a field configuration:

/**
 * Application configuration that imports {@link CustomIndexConfiguration} in place
 * of the default index configuration and assigns the custom analyzer to the
 * {@code filename} field.
 */
@Configuration
@ComponentScan(basePackages = {"de.picturesafe.search.elasticsearch"})
@Import({DefaultClientConfiguration.class,
    CustomIndexConfiguration.class, DefaultQueryConfiguration.class})
public class Config {

    /**
     * Declares the field configurations of the index.
     *
     * @return a single-element list with the {@code filename} text field, analyzed
     *         by the analyzer named {@link CustomIndexConfiguration#CUSTOM_ANALYZER_NAME}
     */
    @Bean
    List<FieldConfiguration> fieldConfigurations() {
        return Collections.singletonList(
            StandardFieldConfiguration.builder("filename", ElasticsearchType.TEXT)
                // Qualified so the snippet does not depend on a static import.
                .analyzer(CustomIndexConfiguration.CUSTOM_ANALYZER_NAME).build()
        );
    }
}

The StandardFieldConfiguration builder assigns a defined custom analyzer to the field it should be applied to:

StandardFieldConfiguration.builder("filename", ElasticsearchType.TEXT)
    .analyzer(CUSTOM_ANALYZER_NAME).build()

This field configuration leads to the following Elasticsearch mapping:

{
  "picturesafe-search-sample-20200424-134252-455": {
    "mappings": {
      "properties": {
        "filename": {
          "type": "text",
          "analyzer": "file_name"
        },
        "fulltext": {
          "type": "text"
        }
      }
    }
  }
}

A custom analyzer sample can be found here.