How do I highlight search results with 3 or 4 words before and after in Lucene? - PullRequest
0 votes
/ 23 May 2019

I am new to Lucene. I am trying to highlight search results and include only 3 or 4 words before and after each hit.

I tried the sample code attached below and extended it to try different fragmenters; I also experimented with Spans and with both the Highlighter and UnifiedHighlighter classes.

The various examples I found on the web were for Lucene v3 and v4, and none of them work with the current version.

// package com.makble.lucenesearchhighlight;

// Downloaded from: http://makble.com/how-to-do-lucene-search-highlight-example

import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.StringReader;
import java.io.UnsupportedEncodingException;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.tokenattributes.*;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.FieldType;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexOptions;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.Term;
import org.apache.lucene.queryparser.classic.ParseException;
import org.apache.lucene.queryparser.classic.QueryParser;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.RegexpQuery;
import org.apache.lucene.search.PhraseQuery;
import org.apache.lucene.search.FuzzyQuery;
import org.apache.lucene.search.ScoreMode;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.search.highlight.*;
import org.apache.lucene.search.uhighlight.*;
import org.apache.lucene.search.spans.*;
import org.apache.lucene.store.RAMDirectory;

public class makble {

    public static Analyzer analyzer = new StandardAnalyzer();
    public static IndexWriterConfig config = new IndexWriterConfig(
            analyzer);
    public static RAMDirectory ramDirectory = new RAMDirectory();
    public static IndexWriter indexWriter;

    public static Query queryToSearch = null;
    // Control which fragmenter to use, both false means use default
    // SimpleFragmenter
    public static boolean useNullFrag = false;
    public static boolean useSpanFrag = false;

    public static IndexReader idxReader;
    public static IndexSearcher idxSearcher;
    public static TopDocs hits;
    public static String content_file =
      "<path to some file>";
    public static String doc_title = "How to read child UTF8 text file into"
                       + " String type in Java";
    public static String query_field = "title";
    public static String alt_query_field = "content";
    public static String queryString = "child implied";
    public static String query_type = "parse";
    // slop value for phrase queries
    public static int pq_slop = 0;

    // max edits value for fuzzy queries
    public static int max_edits = 2;

    // Maximum number of fragments getBestTextFragments may return
    public static int maxNumFragments = 1000;

    // Show all tokens
    public static boolean showTokens = false;
    public static boolean unifiedhighlight = false;

    public static String readFileString(String file) {
        StringBuilder text = new StringBuilder();
        // try-with-resources so the reader is always closed
        try (BufferedReader in = new BufferedReader(new InputStreamReader(
                new FileInputStream(new File(file)), "UTF8"))) {
            String line;
            while ((line = in.readLine()) != null) {
                text.append(line).append("\r\n");
            }
        } catch (UnsupportedEncodingException e) {
            e.printStackTrace();
        } catch (FileNotFoundException e) {
            e.printStackTrace();
        } catch (IOException e) {
            e.printStackTrace();
        }

        return text.toString();
    }

    private static void do_unifiedhighlight() {
        System.out.println("\nUnified Highlighter results.\n");
        UnifiedHighlighter highlighter = new UnifiedHighlighter(
                idxSearcher, analyzer);
        try {
            String[] fragments = highlighter.highlight(query_field,
                                                       queryToSearch, hits);

            System.out.println("Fragments.\n");
            for(String f : fragments)
            {
                System.out.println(f);
            }

            // To see which fragment belongs to which doc
            System.out.println("\nDocs and Fragments.\n");

            for (int i = 0; i < hits.scoreDocs.length; i++)
            {
                int docid = hits.scoreDocs[i].doc;
                Document doc = idxSearcher.doc(docid);

                // This index only has "title" and "content" fields;
                // there is no stored "path" field to retrieve.
                String title = doc.get("title");
                System.out.println(title);
                System.out.println(fragments[i]);
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    private static void do_highlight() {
        try {
            SimpleHTMLFormatter htmlFormatter = new SimpleHTMLFormatter();
            QueryScorer qScorer = new QueryScorer(queryToSearch);
            Highlighter highlighter = new Highlighter(
                    htmlFormatter, qScorer);
            if (useNullFrag) {
                NullFragmenter nullFrag = new NullFragmenter();
                highlighter.setTextFragmenter(nullFrag);
            }
            if (useSpanFrag) {
                SimpleSpanFragmenter spanFrag =
                    new SimpleSpanFragmenter(qScorer);
                highlighter.setTextFragmenter(spanFrag);
            }

            System.out.println("reader maxDoc is " + idxReader.maxDoc());
            System.out.println("scoreDoc size: " + hits.scoreDocs.length);
            // Iterate over the returned ScoreDocs rather than totalHits,
            // so we never index past the end of the scoreDocs array.
            for (int i = 0; i < hits.scoreDocs.length; i++) {
                System.out.println("\nstart highlight the ALT field - "
                                   + alt_query_field);
                int id = hits.scoreDocs[i].doc;
                Document docHit = idxSearcher.doc(id);
                String text = docHit.get(alt_query_field);
                TokenStream tokenStream = TokenSources.getAnyTokenStream(idxReader, id, alt_query_field, analyzer);
                TextFragment[] frag = highlighter.getBestTextFragments(tokenStream, text, false, maxNumFragments);
                for (int j = 0; j < frag.length; j++) {
                    if ((frag[j] != null) && (frag[j].getScore() > 0)) {
                        System.out.println(frag[j].getFragNum() + ": (" +
                                           frag[j].toString().length() +
                                           ") " + frag[j].toString());
                    }
                }

                System.out.println("\nstart highlight the field - "
                                   + query_field);
                // "title" was indexed with term vectors enabled
                text = docHit.get(query_field);
                tokenStream = TokenSources.getAnyTokenStream(
                        idxSearcher.getIndexReader(), hits.scoreDocs[i].doc,
                        query_field, analyzer);
                frag = highlighter.getBestTextFragments(tokenStream, text,
                        false, maxNumFragments);
                for (int j = 0; j < frag.length; j++) {
                    if ((frag[j] != null) && (frag[j].getScore() > 0)) {
                        System.out.println(frag[j].getFragNum() + ": (" +
                                           frag[j].toString().length() +
                                           ") " + frag[j].toString());
                    }
                }
            }
        } catch (IOException e) {
            e.printStackTrace();
        } catch (InvalidTokenOffsetsException e) {
            e.printStackTrace();
        }
    }

    @SuppressWarnings("deprecation")
    public static void main(String[] args) throws IOException {
        /**
         * Parse args
         */
        String usage =
            "Usage:\tjava -classpath \"<path to lucene jars>\" makble [-h] [-help]\n"
                    + " [-field 'title|content'] [-title t]\n"
                    + " [-file filename] [-query q]\n"
                    + " [-nullFrag | -spanFrag]\n"
                    + " [-showTokens] [-unifiedHighlighter]\n"
                    + " [[-query_type 'parse'] | [-query_type 'term'] |\n"
                    + " [-query_type 'span'] |\n"
                    + " [-query_type 'phrase' [-slop <number>]] |\n"
                    + " [-query_type 'fuzzy' [-maxedits <number>]] |\n"
                    + " [-query_type 'regex'] |\n"
                    + " [-query_type 'boolean']]\n\n";
        if (args.length > 0 && ("-h".equals(args[0]) ||
                                "-help".equals(args[0]))) {
            System.out.println(usage);
            System.exit(0);
        }

        for (int i = 0; i < args.length; i++) {
            if ("-field".equals(args[i])) {
                query_field = args[i+1];
                if (query_field.equals("title")) {
                    alt_query_field = "content";
                } else if (query_field.equals("content")) {
                    alt_query_field = "title";
                } else {
                    System.out.println(usage);
                    System.exit(0);
                }
                i++;
            } else if ("-title".equals(args[i])) {
                doc_title = args[i+1];
                i++;
            } else if ("-file".equals(args[i])) {
                content_file = args[i+1];
                i++;
            } else if ("-query".equals(args[i])) {
                queryString = args[i+1];
                i++;
            } else if ("-unifiedHighlighter".equals(args[i])) {
                unifiedhighlight = true;
            } else if ("-nullFrag".equals(args[i])) {
                if (useSpanFrag) {
                    System.out.println(
                        "Can NOT specify both -nullFrag and -spanFrag\n");
                    System.out.println(usage);
                    System.exit(0);
                } else {
                    useNullFrag = true;
                }
            } else if ("-spanFrag".equals(args[i])) {
                if (useNullFrag) {
                    System.out.println(
                        "Can NOT specify both -nullFrag and -spanFrag\n");
                    System.out.println(usage);
                    System.exit(0);
                } else {
                    useSpanFrag = true;
                }
            } else if ("-showTokens".equals(args[i])) {
                showTokens = true;
            } else if ("-query_type".equals(args[i])) {
                query_type = args[i+1];
                if (!(query_type.equals("parse") ||
                      query_type.equals("term") ||
                      query_type.equals("span") ||
                      query_type.equals("regex") ||
                      query_type.equals("phrase") ||
                      query_type.equals("fuzzy") ||
                      query_type.equals("boolean"))) {
                    System.out.println(usage);
                    System.exit(0);
                }
                i++;
            } else if ("-slop".equals(args[i])) {
                // slop is only valid for phrase queries
                if (query_type.equals("phrase")) {
                    try {
                        pq_slop = Integer.parseInt(args[i+1]);
                        i++;
                    } catch (NumberFormatException e) {
                        System.out.println("\n\n-slop MUST be an integer\n\n");
                        System.out.println(usage);
                        System.exit(0);
                    }
                } else {
                    System.out.println("\n\n-slop can only be specified with -query_type 'phrase'\n\n");
                    System.out.println(usage);
                    System.exit(0);
                }
            } else if ("-maxedits".equals(args[i])) {
                // maxedits is only valid for fuzzy queries
                if (query_type.equals("fuzzy")) {
                    try {
                        max_edits = Integer.parseInt(args[i+1]);
                        i++;
                    } catch (NumberFormatException e) {
                        System.out.println("\n\n-maxedits MUST be an integer\n\n");
                        System.out.println(usage);
                        System.exit(0);
                    }
                } else {
                    System.out.println("\n\n-maxedits can only be specified with -query_type 'fuzzy'\n\n");
                    System.out.println(usage);
                    System.exit(0);
                }
            }
        }
        System.out.println("Content file      is\n  " + content_file);
        System.out.println("Document title    is\n  " + doc_title);
        System.out.println("Query field       is " + query_field);
        System.out.println("Alt Query field   is " + alt_query_field);
        System.out.println("Query String      is '" + queryString + "'");
        System.out.println("Query Type        is " + query_type);
        if (query_type.equals("phrase")) {
            System.out.println("Phrase Query slop is " + pq_slop);
        }
        if (query_type.equals("fuzzy")) {
            System.out.println("Fuzzy Query max edits is " + max_edits);
        }
        if (useNullFrag) {
            System.out.println("\nUsing NullFragmenter");
        }
        if (useSpanFrag) {
            System.out.println("\nUsing SpanFragmenter");
        }
        if (unifiedhighlight) {
            System.out.println("\nUsing Unified Highlighter");
        }

        Document doc = new Document(); // create a new document

        /**
         * Create a field with term vector enabled
         */
        FieldType type = new FieldType();
        type.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS);
        type.setStored(true);
        type.setStoreTermVectors(true);
        type.setTokenized(true);
        type.setStoreTermVectorOffsets(true);

        // "title" uses the custom FieldType above (term vectors enabled);
        // "content" is a plain stored TextField.
        String fileString = readFileString(content_file);
        Field field = new Field("title", doc_title, type);
        Field f = new TextField("content", fileString,
                Field.Store.YES);
        doc.add(field);
        doc.add(f);

        try {
            indexWriter = new IndexWriter(ramDirectory, config);
            indexWriter.addDocument(doc);
            indexWriter.close();

            idxReader = DirectoryReader.open(ramDirectory);
            idxSearcher = new IndexSearcher(idxReader);
            if (query_type.equals("parse")) {
                QueryParser qp = new QueryParser(query_field, analyzer);
                queryToSearch = qp.parse(queryString);
            } else if (query_type.equals("term")) {
                Term nt = new Term(query_field, queryString);
                queryToSearch = new TermQuery(nt);
            } else if (query_type.equals("span")) {
                Term nt = new Term(query_field, queryString);
                queryToSearch = new SpanTermQuery(nt);
            } else if (query_type.equals("regex")) {
                Term rxt = new Term(query_field, queryString);
                queryToSearch = new RegexpQuery(rxt);
            } else if (query_type.equals("phrase")) {
                queryToSearch = new PhraseQuery(pq_slop, query_field,
                                                queryString.split(" "));
            } else if (query_type.equals("fuzzy")) {
                Term nt = new Term(query_field, queryString);
                queryToSearch = new FuzzyQuery(nt, max_edits);
            } else if (query_type.equals("boolean")) {
                System.out.println("\n\nboolean query is NOT implemented yet!\n\n");
                System.exit(0);
            }
            if (queryToSearch != null) {
                System.out.println("\nQuery : " + queryToSearch.toString()
                                   + "\n");
            }

            // Here is where the searching, etc starts
            hits = idxSearcher.search(queryToSearch, idxReader.maxDoc());

            // Look at the spans directly for a span query.
            if (query_type.equals("span")) {
                SpanTermQuery stq = (SpanTermQuery) queryToSearch;
                SpanWeight spw = stq.createWeight(idxSearcher,
                          ScoreMode.COMPLETE_NO_SCORES, 1f);
                Spans spans =
                    spw.getSpans(idxReader.leaves().get(0),
                                 SpanWeight.Postings.POSITIONS);
                // getSpans returns null when the term does not occur
                // in this segment.
                if (spans == null) {
                    System.out.println("\n\nNo spans found\n\n");
                } else {
                    System.out.println("\n\nspans: " + spans + "\n\n");
                    // A Spans must be positioned on a document with
                    // nextDoc() before iterating its positions, and
                    // nextStartPosition() must be advanced inside the
                    // loop (the original version never advanced and
                    // looped forever).
                    while (spans.nextDoc() != Spans.NO_MORE_DOCS) {
                        while (spans.nextStartPosition()
                                   != Spans.NO_MORE_POSITIONS) {
                            System.out.println("span start: "
                                + spans.startPosition()
                                + ", end: " + spans.endPosition());
                        }
                    }
                }
            }
            if (unifiedhighlight) {
                do_unifiedhighlight();
            } else {
                do_highlight();
            }

        } catch (IOException | ParseException e) {
            e.printStackTrace();
        }

        if (showTokens) {
            System.out.println("\n\n--------------------\n\n");

            StringReader newSR =
                new StringReader(readFileString(content_file));
            TokenStream ts = analyzer.tokenStream("myfield", newSR);
            // Attributes to view about the TokenStream
            OffsetAttribute offsetAtt =
                ts.addAttribute(OffsetAttribute.class);
            CharTermAttribute chartermAtt =
                ts.addAttribute(CharTermAttribute.class);
            TermToBytesRefAttribute tobytesAtt =
                ts.addAttribute(TermToBytesRefAttribute.class);
            PositionIncrementAttribute posincrAtt =
                ts.addAttribute(PositionIncrementAttribute.class);
            try {
                int absPos = 0;
                // Resets this stream to the beginning. (Required)
                ts.reset();
                while (ts.incrementToken()) {
                    // Use AttributeSource.reflectAsString(boolean)
                    // for token stream debugging.
                    // System.out.println("token: "
                    //                    + ts.reflectAsString(true));
                    int sOff = offsetAtt.startOffset();
                    int eOff = offsetAtt.endOffset();
                    int tLen = chartermAtt.length();
                    absPos += posincrAtt.getPositionIncrement();

                    System.out.println("token term  : "
                                       + chartermAtt.toString());
                    System.out.println("token bytes : "
                                       + tobytesAtt.getBytesRef());
                    System.out.println("  token pos, start, end, len: "
                                       + absPos + ", " + sOff + ", " + eOff
                                       + ", " + tLen);
                }
                // Perform end-of-stream operations,
                // e.g. set the final offset.
                ts.end();
            } catch (IOException e) {
                e.printStackTrace();
            } finally {
                ts.close(); // Release resources associated with this stream.
            }
        }
    }
}

When I run this with my sample content (which I can't share), I get 4 highlighted results, but each one is highlighted as a large fragment (around 100 characters). When I run the same content through the UnifiedHighlighter, I get only 1 result rather than the 4 shown by the Highlighter.
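
From what I can tell (please correct me if this is wrong), both observations may just be defaults: the classic Highlighter falls back to SimpleFragmenter with a 100-character fragment size, and UnifiedHighlighter returns a single passage per document unless you pass a maxPassages argument. A sketch of what I mean, reusing the variable names from the listing above (uHighlighter standing in for the UnifiedHighlighter instance):

// Classic Highlighter: give SimpleSpanFragmenter an explicit fragment
// size in characters; ~40 chars is roughly 3-4 words of context
// around a short hit (the default is 100, matching what I see).
highlighter.setTextFragmenter(new SimpleSpanFragmenter(qScorer, 40));

// UnifiedHighlighter: ask for more than the default single passage
// per document via the maxPassages overload.
int maxPassages = 10;
String[] fragments = uHighlighter.highlight(query_field, queryToSearch,
                                            hits, maxPassages);

I have not verified that this gives exactly 3 or 4 words on each side, since these fragmenters count characters rather than words.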

Any suggestions you can provide on how to get highlighted search results with just 3 or 4 words before and after each hit would be appreciated.
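
In case it helps frame an answer: one direction I have been considering is a custom Fragmenter that counts tokens instead of characters, since the Fragmenter interface only requires start() and isNewFragment(). A minimal, untested sketch (the class name TokenCountFragmenter is my own):

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.search.highlight.Fragmenter;

// Starts a new fragment every N tokens, so a one-word hit plus
// ~4 words of context on each side fits in a 9-token fragment.
public class TokenCountFragmenter implements Fragmenter {
    private final int tokensPerFragment;
    private int tokenCount;

    public TokenCountFragmenter(int tokensPerFragment) {
        this.tokensPerFragment = tokensPerFragment;
    }

    @Override
    public void start(String originalText, TokenStream stream) {
        tokenCount = 0;
    }

    @Override
    public boolean isNewFragment() {
        // The Highlighter calls this once per token; returning true
        // closes the current fragment and starts a new one.
        return (++tokenCount % tokensPerFragment) == 0;
    }
}

It would be used as highlighter.setTextFragmenter(new TokenCountFragmenter(9)); I don't know whether this is the intended way to get word-based fragments, which is part of what I'm asking.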

I would also welcome pointers to reading material. The API docs are good, but on their own they are not enough to understand how the API is meant to be used.
