码迷,mamicode.com
首页 > Web开发 > 详细

Lucene 7.2.1 自定义Analyzer和TokenFilter

时间:2018-03-14 16:57:23      阅读:499      评论:0      收藏:0      [点我收藏+]

标签:throw   span   push   size   string   gets   eclips   throws   跟踪   

1.自定义Analyzer:

import com.dys.lucene.filter.SameWordTokenFilter;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.standard.StandardTokenizer;

/**
 * Analyzer whose chain is a {@link StandardTokenizer} followed by a
 * {@link SameWordTokenFilter}.
 */
public class SameWordAnalyzer extends Analyzer {

    /**
     * Builds the tokenizer/filter chain used for a field.
     *
     * @param fieldName name of the field being analyzed; it is not consulted here,
     *                  so every field receives the same chain
     * @return components with the tokenizer as source and the filter as final stream
     */
    @Override
    protected TokenStreamComponents createComponents(String fieldName) {
        final StandardTokenizer source = new StandardTokenizer();
        // The filter wraps the tokenizer, so it is the stream consumers read from.
        return new TokenStreamComponents(source, new SameWordTokenFilter(source));
    }
}

2.自定义TokenFilter

import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;

import java.io.IOException;
import java.util.HashMap;
import java.util.Map;
import java.util.Stack;

/**
 * Token filter that emits hard-coded alternative words after selected tokens.
 *
 * <p>When the text of an upstream token is a key of {@code SAME_WORDS}, the token
 * itself is emitted first. Each mapped word is then emitted as an additional token
 * with a position increment of 0; its remaining attributes are restored from the
 * state captured for the original token.
 */
public class SameWordTokenFilter extends TokenFilter {

    /** Fixed lookup table: token text to the extra words emitted after it. */
    private static final Map<String, String[]> SAME_WORDS = new HashMap<>();

    static {
        SAME_WORDS.put("美", new String[]{"美丽", "好看"});
        SAME_WORDS.put("花", new String[]{"鲜花", "花朵"});
    }

    private final CharTermAttribute charTermAttribute;
    private final PositionIncrementAttribute positionIncrementAttribute;

    /** Extra words still waiting to be emitted for the most recent matching token. */
    private final Stack<String> stack;

    /** Attribute state of the most recent matching token; restored for each extra word. */
    private State state;

    /**
     * Creates the filter on top of the given stream.
     *
     * @param input upstream token stream to read tokens from
     */
    public SameWordTokenFilter(TokenStream input) {
        super(input);
        this.stack = new Stack<>();
        this.charTermAttribute = this.addAttribute(CharTermAttribute.class);
        this.positionIncrementAttribute = this.addAttribute(PositionIncrementAttribute.class);
    }

    /**
     * Advances to the next token: a pending extra word if one is queued, otherwise
     * the next upstream token.
     *
     * @return {@code true} if a token was produced, {@code false} once the upstream
     *         stream is exhausted and no extra words are pending
     * @throws IOException if the upstream stream fails
     */
    @Override
    public final boolean incrementToken() throws IOException {

        // Drain pending extra words, one per call, before pulling from upstream.
        if (!this.stack.isEmpty()) {
            this.restoreState(this.state);

            this.charTermAttribute.setEmpty();
            this.charTermAttribute.append(this.stack.pop());

            // Increment 0 places the extra word at the same position as the original token.
            this.positionIncrementAttribute.setPositionIncrement(0);

            return true;
        }

        if (!this.input.incrementToken()) {
            return false;
        }

        String term = this.charTermAttribute.toString();

        // Remember the original token's attributes only when extra words were queued for it.
        if (this.getSameWords(term)) {
            this.state = this.captureState();
        }

        return true;
    }

    /**
     * Discards pending extra words and the captured state. Without this, words queued
     * for a stream that was not consumed to the end would still be on the stack the
     * next time this filter instance is used.
     *
     * @throws IOException if resetting the upstream stream fails
     */
    @Override
    public void reset() throws IOException {
        super.reset();
        this.stack.clear();
        this.state = null;
    }

    /**
     * Queues the extra words mapped to {@code name}, if any.
     *
     * <p>Words are pushed in table order, so they are later popped (and emitted) in
     * reverse table order.
     *
     * @param name token text to look up
     * @return {@code true} if {@code name} has an entry and its words were queued,
     *         {@code false} otherwise
     */
    private boolean getSameWords(String name) {

        String[] words = SAME_WORDS.get(name);

        if (words == null) {
            return false;
        }

        for (String word : words) {
            this.stack.push(word);
        }

        return true;
    }
}

3.使用自定义Analyzer和自定义TokenFilter

ArrayList<String> strings = new ArrayList<String>() {{
            this.add("小鬼子");
            this.add("美国佬");
        }};
        Analyzer analyzer = new CustomStandardAnalyzer(strings);
        String content = "小鬼子 and 美国佬 are playing together!";
        TokenStream tokenStream = analyzer.tokenStream("myfield", content);
        tokenStream.reset();
        CharTermAttribute charTermAttribute = tokenStream.addAttribute(CharTermAttribute.class);
        while (tokenStream.incrementToken()) {
            // 已经过滤掉自定义停用词
            // 输出:playing   together
            System.out.println(charTermAttribute.toString());
        }
        tokenStream.end();
        tokenStream.close();
        
        analyzer.close();

4.代码解释:Analyzer 和 TokenFilter 之间的具体关联,可以使用 Eclipse 的 DEBUG 功能单步跟踪来理解。

Lucene 7.2.1 自定义Analyzer和TokenFilter

标签:throw   span   push   size   string   gets   eclips   throws   跟踪   

原文地址:https://www.cnblogs.com/dingyingsi/p/8568104.html

(0)
(0)
   
举报
评论 一句话评论(0)
登录后才能评论!
© 2014 mamicode.com 版权所有  联系我们:gaon5@hotmail.com
迷上了代码!