How do I highlight search results with 3 or 4 words before and after in Lucene? - PullRequest
0 votes
/ 23 May 2019

I am new to Lucene. I am trying to highlight search results and include only 3 or 4 words before and after each hit.

I tried the sample code attached below and extended it to try different fragmenters; I also experimented with Spans and with both the Highlighter and UnifiedHighlighter classes.

The various examples I found on the web were for Lucene v3 and v4, and none of them work with the current version.

// package com.makble.lucenesearchhighlight;

// Downloaded from: http://makble.com/how-to-do-lucene-search-highlight-example

import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.StringReader;
import java.io.UnsupportedEncodingException;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.tokenattributes.*;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.FieldType;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexOptions;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.Term;
import org.apache.lucene.queryparser.classic.ParseException;
import org.apache.lucene.queryparser.classic.QueryParser;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.RegexpQuery;
import org.apache.lucene.search.PhraseQuery;
import org.apache.lucene.search.FuzzyQuery;
import org.apache.lucene.search.ScoreMode;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.search.highlight.*;
import org.apache.lucene.search.uhighlight.*;
import org.apache.lucene.search.spans.*;
import org.apache.lucene.store.RAMDirectory;

public class makble {

    public static Analyzer analyzer = new StandardAnalyzer();
    public static IndexWriterConfig config = new IndexWriterConfig(
            analyzer);
    public static RAMDirectory ramDirectory = new RAMDirectory();
    public static IndexWriter indexWriter;

    public static Query queryToSearch = null;
    // Control which fragmenter to use, both false means use default
    // SimpleFragmenter
    public static boolean useNullFrag = false;
    public static boolean useSpanFrag = false;

    public static IndexReader idxReader;
    public static IndexSearcher idxSearcher;
    public static TopDocs hits;
    public static String content_file =
      "<path to some file>";
    public static String doc_title = "How to read child UTF8 text file into"
                       + " String type in Java";
    public static String query_field = "title";
    public static String alt_query_field = "content";
    public static String queryString = "child implied";
    public static String query_type = "parse";
    // slop value for phrase queries
    public static int pq_slop = 0;

    // max edits value for fuzzy queries
    public static int max_edits = 2;

    // Maximum number of fragments getBestTextFragments may return
    public static int maxNumFragments = 1000;

    // Show all tokens
    public static boolean showTokens = false;
    public static boolean unifiedhighlight = false;

    public static String readFileString(String file) {
        StringBuilder text = new StringBuilder();
        // try-with-resources so the reader is always closed
        try (BufferedReader in = new BufferedReader(new InputStreamReader(
                new FileInputStream(new File(file)), "UTF8"))) {
            String line;
            while ((line = in.readLine()) != null) {
                text.append(line).append("\r\n");
            }
        } catch (UnsupportedEncodingException e) {
            e.printStackTrace();
        } catch (FileNotFoundException e) {
            e.printStackTrace();
        } catch (IOException e) {
            e.printStackTrace();
        }

        return text.toString();
    }

    private static void do_unifiedhighlight() {
        System.out.println("\nUnified Highlighter results.\n");
        UnifiedHighlighter highlighter = new UnifiedHighlighter(
                idxSearcher, analyzer);
        try {
            String[] fragments = highlighter.highlight(query_field,
                                                       queryToSearch, hits);

            System.out.println("Fragments.\n");
            for(String f : fragments)
            {
                System.out.println(f);
            }

            // To see which fragment belongs to which doc
            System.out.println("\nDocs and Fragments.\n");

            for (int i = 0; i < hits.scoreDocs.length; i++)
            {
                int docid = hits.scoreDocs[i].doc;
                Document doc = idxSearcher.doc(docid);

                // This index only has "title" and "content" fields;
                // there is no stored "path" field to retrieve.
                String title = doc.get("title");
                System.out.println(title);
                System.out.println(fragments[i]);
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    private static void do_highlight() {
        try {
            SimpleHTMLFormatter htmlFormatter = new SimpleHTMLFormatter();
            QueryScorer qScorer = new QueryScorer(queryToSearch);
            Highlighter highlighter = new Highlighter(
                    htmlFormatter, qScorer);
            if (useNullFrag) {
                NullFragmenter nullFrag = new NullFragmenter();
                highlighter.setTextFragmenter(nullFrag);
            }
            if (useSpanFrag) {
                SimpleSpanFragmenter spanFrag =
                    new SimpleSpanFragmenter(qScorer);
                highlighter.setTextFragmenter(spanFrag);
            }

            System.out.println("reader maxDoc is " + idxReader.maxDoc());
            System.out.println("scoreDoc size: " + hits.scoreDocs.length);
            // Iterate over the returned ScoreDocs rather than totalHits,
            // so we never index past the end of the scoreDocs array.
            for (int i = 0; i < hits.scoreDocs.length; i++) {
                System.out.println("\nstart highlight the ALT field - "
                                   + alt_query_field);
                int id = hits.scoreDocs[i].doc;
                Document docHit = idxSearcher.doc(id);
                String text = docHit.get(alt_query_field);
                TokenStream tokenStream = TokenSources.getAnyTokenStream(idxReader, id, alt_query_field, analyzer);
                TextFragment[] frag = highlighter.getBestTextFragments(tokenStream, text, false, maxNumFragments);
                for (int j = 0; j < frag.length; j++) {
                    if ((frag[j] != null) && (frag[j].getScore() > 0)) {
                        System.out.println(frag[j].getFragNum() + ": (" +
                                           frag[j].toString().length() +
                                           ") " + frag[j].toString());
                    }
                }

                System.out.println("\nstart highlight the field - "
                                   + query_field);
                // "title" was indexed with term vectors enabled
                text = docHit.get(query_field);
                tokenStream = TokenSources.getAnyTokenStream(
                        idxSearcher.getIndexReader(), hits.scoreDocs[i].doc,
                        query_field, analyzer);
                frag = highlighter.getBestTextFragments(tokenStream, text,
                        false, maxNumFragments);
                for (int j = 0; j < frag.length; j++) {
                    if ((frag[j] != null) && (frag[j].getScore() > 0)) {
                        System.out.println(frag[j].getFragNum() + ": (" +
                                           frag[j].toString().length() +
                                           ") " + frag[j].toString());
                    }
                }
            }
        } catch (IOException e) {
            e.printStackTrace();
        } catch (InvalidTokenOffsetsException e) {
            e.printStackTrace();
        }
    }

    @SuppressWarnings("deprecation")
    public static void main(String[] args) throws IOException {
        /**
         * Parse args
         */
        String usage =
            "Usage:\tjava -classpath \"<path to lucene jars>\" makble [-h] [-help]\n"
                    + " [-field 'title|content'] [-title t]\n"
                    + " [-file filename] [-query q]\n"
                    + " [-nullFrag | -spanFrag]\n"
                    + " [-showTokens] [-unifiedHighlighter]\n"
                    + " [[-query_type 'parse'] | [-query_type 'term'] |\n"
                    + " [-query_type 'span'] |\n"
                    + " [-query_type 'phrase' [-slop <number>]] |\n"
                    + " [-query_type 'fuzzy' [-maxedits <number>]] |\n"
                    + " [-query_type 'regex'] |\n"
                    + " [-query_type 'boolean']]\n\n";
        if (args.length > 0 && ("-h".equals(args[0]) ||
                                "-help".equals(args[0]))) {
            System.out.println(usage);
            System.exit(0);
        }

        for (int i = 0; i < args.length; i++) {
            if ("-field".equals(args[i])) {
                query_field = args[i+1];
                if (query_field.equals("title")) {
                    alt_query_field = "content";
                } else if (query_field.equals("content")) {
                    alt_query_field = "title";
                } else {
                    System.out.println(usage);
                    System.exit(0);
                }
                i++;
            } else if ("-title".equals(args[i])) {
                doc_title = args[i+1];
                i++;
            } else if ("-file".equals(args[i])) {
                content_file = args[i+1];
                i++;
            } else if ("-query".equals(args[i])) {
                queryString = args[i+1];
                i++;
            } else if ("-unifiedHighlighter".equals(args[i])) {
                unifiedhighlight = true;
            } else if ("-nullFrag".equals(args[i])) {
                if (useSpanFrag) {
                    System.out.println(
                        "Can NOT specify both -nullFrag and -spanFrag\n");
                    System.out.println(usage);
                    System.exit(0);
                } else {
                    useNullFrag = true;
                }
            } else if ("-spanFrag".equals(args[i])) {
                if (useNullFrag) {
                    System.out.println(
                        "Can NOT specify both -nullFrag and -spanFrag\n");
                    System.out.println(usage);
                    System.exit(0);
                } else {
                    useSpanFrag = true;
                }
            } else if ("-showTokens".equals(args[i])) {
                showTokens = true;
            } else if ("-query_type".equals(args[i])) {
                query_type = args[i+1];
                if (!(query_type.equals("parse") ||
                      query_type.equals("term") ||
                      query_type.equals("span") ||
                      query_type.equals("regex") ||
                      query_type.equals("phrase") ||
                      query_type.equals("fuzzy") ||
                      query_type.equals("boolean"))) {
                    System.out.println(usage);
                    System.exit(0);
                }
                i++;
            } else if ("-slop".equals(args[i])) {
                // slop is only valid for phrase queries
                if (query_type.equals("phrase")) {
                    try {
                        pq_slop = Integer.parseInt(args[i+1]);
                        i++;
                    } catch (NumberFormatException e) {
                        System.out.println("\n\n-slop MUST be an integer\n\n");
                        System.out.println(usage);
                        System.exit(0);
                    }
                } else {
                    System.out.println("\n\n-slop can only be specified with -query_type 'phrase'\n\n");
                    System.out.println(usage);
                    System.exit(0);
                }
            } else if ("-maxedits".equals(args[i])) {
                // maxedits is only valid for fuzzy queries
                if (query_type.equals("fuzzy")) {
                    try {
                        max_edits = Integer.parseInt(args[i+1]);
                        i++;
                    } catch (NumberFormatException e) {
                        System.out.println("\n\n-maxedits MUST be an integer\n\n");
                        System.out.println(usage);
                        System.exit(0);
                    }
                } else {
                    System.out.println("\n\n-maxedits can only be specified with -query_type 'fuzzy'\n\n");
                    System.out.println(usage);
                    System.exit(0);
                }
            }
        }
        System.out.println("Content file      is\n  " + content_file);
        System.out.println("Document title    is\n  " + doc_title);
        System.out.println("Query field       is " + query_field);
        System.out.println("Alt Query field   is " + alt_query_field);
        System.out.println("Query String      is '" + queryString + "'");
        System.out.println("Query Type        is " + query_type);
        if (query_type.equals("phrase")) {
            System.out.println("Phrase Query slop is " + pq_slop);
        }
        if (query_type.equals("fuzzy")) {
            System.out.println("Fuzzy Query max edits is " + max_edits);
        }
        if (useNullFrag) {
            System.out.println("\nUsing NullFragmenter");
        }
        if (useSpanFrag) {
            System.out.println("\nUsing SpanFragmenter");
        }
        if (unifiedhighlight) {
            System.out.println("\nUsing Unified Highlighter");
        }

        Document doc = new Document(); // create a new document

        /**
         * Create a field with term vector enabled
         */
        FieldType type = new FieldType();
        type.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS);
        type.setStored(true);
        type.setStoreTermVectors(true);
        type.setTokenized(true);
        type.setStoreTermVectorOffsets(true);

        // "title" uses the custom FieldType above (term vectors enabled);
        // "content" is a plain stored TextField.
        String fileString = readFileString(content_file);
        Field field = new Field("title", doc_title, type);
        Field f = new TextField("content", fileString,
                Field.Store.YES);
        doc.add(field);
        doc.add(f);

        try {
            indexWriter = new IndexWriter(ramDirectory, config);
            indexWriter.addDocument(doc);
            indexWriter.close();

            idxReader = DirectoryReader.open(ramDirectory);
            idxSearcher = new IndexSearcher(idxReader);
            if (query_type.equals("parse")) {
                QueryParser qp = new QueryParser(query_field, analyzer);
                queryToSearch = qp.parse(queryString);
            } else if (query_type.equals("term")) {
                Term nt = new Term(query_field, queryString);
                queryToSearch = new TermQuery(nt);
            } else if (query_type.equals("span")) {
                Term nt = new Term(query_field, queryString);
                queryToSearch = new SpanTermQuery(nt);
            } else if (query_type.equals("regex")) {
                Term rxt = new Term(query_field, queryString);
                queryToSearch = new RegexpQuery(rxt);
            } else if (query_type.equals("phrase")) {
                queryToSearch = new PhraseQuery(pq_slop, query_field,
                                                queryString.split(" "));
            } else if (query_type.equals("fuzzy")) {
                Term nt = new Term(query_field, queryString);
                queryToSearch = new FuzzyQuery(nt, max_edits);
            } else if (query_type.equals("boolean")) {
                System.out.println("\n\nboolean query is NOT implemented yet!\n\n");
                System.exit(0);
            }
            if (queryToSearch != null) {
                System.out.println("\nQuery : " + queryToSearch.toString()
                                   + "\n");
            }

            // Here is where the searching, etc starts
            hits = idxSearcher.search(queryToSearch, idxReader.maxDoc());

            // Look at the spans directly for a span query.
            if (query_type.equals("span")) {
                SpanTermQuery stq = (SpanTermQuery) queryToSearch;
                SpanWeight spw = stq.createWeight(idxSearcher,
                          ScoreMode.COMPLETE_NO_SCORES, 1f);
                Spans spans =
                    spw.getSpans(idxReader.leaves().get(0),
                                 SpanWeight.Postings.POSITIONS);
                // getSpans returns null when the term does not occur
                // in this segment.
                if (spans == null) {
                    System.out.println("\n\nNo spans found\n\n");
                } else {
                    System.out.println("\n\nspans: " + spans + "\n\n");
                    // A Spans must be positioned on a document with
                    // nextDoc() before iterating its positions, and
                    // nextStartPosition() must be advanced inside the
                    // loop (the original version never advanced and
                    // looped forever).
                    while (spans.nextDoc() != Spans.NO_MORE_DOCS) {
                        while (spans.nextStartPosition()
                                   != Spans.NO_MORE_POSITIONS) {
                            System.out.println("span start: "
                                + spans.startPosition()
                                + ", end: " + spans.endPosition());
                        }
                    }
                }
            }
            if (unifiedhighlight) {
                do_unifiedhighlight();
            } else {
                do_highlight();
            }

        } catch (IOException | ParseException e) {
            e.printStackTrace();
        }

        if (showTokens) {
            System.out.println("\n\n--------------------\n\n");

            StringReader newSR =
                new StringReader(readFileString(content_file));
            TokenStream ts = analyzer.tokenStream("myfield", newSR);
            // Attributes to view about the TokenStream
            OffsetAttribute offsetAtt =
                ts.addAttribute(OffsetAttribute.class);
            CharTermAttribute chartermAtt =
                ts.addAttribute(CharTermAttribute.class);
            TermToBytesRefAttribute tobytesAtt =
                ts.addAttribute(TermToBytesRefAttribute.class);
            PositionIncrementAttribute posincrAtt =
                ts.addAttribute(PositionIncrementAttribute.class);
            try {
                int absPos = 0;
                // Resets this stream to the beginning. (Required)
                ts.reset();
                while (ts.incrementToken()) {
                    // Use AttributeSource.reflectAsString(boolean)
                    // for token stream debugging.
                    // System.out.println("token: "
                    //                    + ts.reflectAsString(true));
                    int sOff = offsetAtt.startOffset();
                    int eOff = offsetAtt.endOffset();
                    int tLen = chartermAtt.length();
                    absPos += posincrAtt.getPositionIncrement();

                    System.out.println("token term  : "
                                       + chartermAtt.toString());
                    System.out.println("token bytes : "
                                       + tobytesAtt.getBytesRef());
                    System.out.println("  token pos, start, end, len: "
                                       + absPos + ", " + sOff + ", " + eOff
                                       + ", " + tLen);
                }
                // Perform end-of-stream operations,
                // e.g. set the final offset.
                ts.end();
            } catch (IOException e) {
                e.printStackTrace();
            } finally {
                ts.close(); // Release resources associated with this stream.
            }
        }
    }
}

When I run this with my sample content (which I can't share), I get 4 highlighted results, but each one is highlighted as a large fragment (around 100 characters). When I run the same content through the UnifiedHighlighter, I get only 1 result rather than the 4 shown by the Highlighter.
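
From what I can tell (please correct me if this is wrong), both observations may just be defaults: the classic Highlighter falls back to SimpleFragmenter with a 100-character fragment size, and UnifiedHighlighter returns a single passage per document unless you pass a maxPassages argument. A sketch of what I mean, reusing the variable names from the listing above (uHighlighter standing in for the UnifiedHighlighter instance):

// Classic Highlighter: give SimpleSpanFragmenter an explicit fragment
// size in characters; ~40 chars is roughly 3-4 words of context
// around a short hit (the default is 100, matching what I see).
highlighter.setTextFragmenter(new SimpleSpanFragmenter(qScorer, 40));

// UnifiedHighlighter: ask for more than the default single passage
// per document via the maxPassages overload.
int maxPassages = 10;
String[] fragments = uHighlighter.highlight(query_field, queryToSearch,
                                            hits, maxPassages);

I have not verified that this gives exactly 3 or 4 words on each side, since these fragmenters count characters rather than words.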

Any suggestions you can provide on how to get highlighted search results with just 3 or 4 words before and after each hit would be appreciated.
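
In case it helps frame an answer: one direction I have been considering is a custom Fragmenter that counts tokens instead of characters, since the Fragmenter interface only requires start() and isNewFragment(). A minimal, untested sketch (the class name TokenCountFragmenter is my own):

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.search.highlight.Fragmenter;

// Starts a new fragment every N tokens, so a one-word hit plus
// ~4 words of context on each side fits in a 9-token fragment.
public class TokenCountFragmenter implements Fragmenter {
    private final int tokensPerFragment;
    private int tokenCount;

    public TokenCountFragmenter(int tokensPerFragment) {
        this.tokensPerFragment = tokensPerFragment;
    }

    @Override
    public void start(String originalText, TokenStream stream) {
        tokenCount = 0;
    }

    @Override
    public boolean isNewFragment() {
        // The Highlighter calls this once per token; returning true
        // closes the current fragment and starts a new one.
        return (++tokenCount % tokensPerFragment) == 0;
    }
}

It would be used as highlighter.setTextFragmenter(new TokenCountFragmenter(9)); I don't know whether this is the intended way to get word-based fragments, which is part of what I'm asking.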

I would also welcome pointers to reading material. The API docs are good, but on their own they are not enough to understand how the API is meant to be used.
