Я новичок в Lucene. Я пытаюсь выделить результаты поиска и добавить только 3 или 4 слова до и после найденных результатов.
Я взял приведённый ниже пример кода и расширил его, добавив различные фрагментаторы (Fragmenter); также я попробовал Spans и оба класса — Highlighter и UnifiedHighlighter.
Различные примеры, которые я нашёл в Сети, были написаны для Lucene v3 и v4, и ни один из них не работает с текущей версией.
// package com.makble.lucenesearchhighlight;
// Downloaded from: http://makble.com/how-to-do-lucene-search-highlight-example
import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.StringReader;
import java.io.UnsupportedEncodingException;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.tokenattributes.*;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.FieldType;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.FieldInfo;
import org.apache.lucene.index.IndexOptions;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.Term;
import org.apache.lucene.queryparser.classic.ParseException;
import org.apache.lucene.queryparser.classic.QueryParser;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.RegexpQuery;
import org.apache.lucene.search.PhraseQuery;
import org.apache.lucene.search.FuzzyQuery;
import org.apache.lucene.search.ScoreMode;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.search.highlight.*;
import org.apache.lucene.search.uhighlight.*;
import org.apache.lucene.search.spans.*;
import org.apache.lucene.store.RAMDirectory;
import org.apache.lucene.util.Version;
/**
 * Command-line demo that indexes a single two-field document ("title" and
 * "content") into an in-memory Lucene index, runs one query against it and
 * prints highlighted fragments using either {@code Highlighter} or
 * {@code UnifiedHighlighter}.
 *
 * <p>All settings live in public static fields so they can be set either
 * from the command line (see {@link #main(String[])}) or directly by a caller.
 */
public class makble {
    public static Analyzer analyzer = new StandardAnalyzer();
    public static IndexWriterConfig config = new IndexWriterConfig(analyzer);
    public static RAMDirectory ramDirectory = new RAMDirectory();
    public static IndexWriter indexWriter;
    public static Query queryToSearch = null;
    // Control which fragmenter to use; both false means the Highlighter keeps
    // its default fragmenter.
    public static boolean useNullFrag = false;
    public static boolean useSpanFrag = false;
    public static IndexReader idxReader;
    public static IndexSearcher idxSearcher;
    public static TopDocs hits;
    public static String content_file =
            "<path to some file>";
    public static String doc_title = "How to read child UTF8 text file into"
            + " String type in Java";
    public static String query_field = "title";
    public static String alt_query_field = "content";
    public static String queryString = "child implied";
    public static String query_type = "parse";
    // Slop value for phrase queries.
    public static int pq_slop = 0;
    // Max edits value for fuzzy queries.
    public static int max_edits = 2;
    // Maximum number of fragments (passages) to request per document.
    public static int maxNumFragments = 1000;
    // Dump every token the analyzer produces for the content file.
    public static boolean showTokens = false;
    public static boolean unifiedhighlight = false;

    private static final String USAGE =
            "Usage:\tjava -classpath \"<path to lucene jars>\" makble [-h] [-help]\n"
            + " [-field 'title|content'] [-title t]\n"
            + " [-file filename] [-query q]\n"
            + " [-nullFrag | -spanFrag]\n"
            + " [-showTokens] [-unifiedHighlighter]\n"
            + " [[-query_type 'parse'] | [-query_type 'term'] |\n"
            + " [-query_type 'span'] |\n"
            + " [-query_type 'phrase' [-slop <number>]] |\n"
            + " [-query_type 'fuzzy' [-maxedits <number>]] |\n"
            + " [-query_type 'regex'] |\n"
            + " [-query_type 'boolean']]\n\n";

    /**
     * Reads a UTF-8 text file into a string, terminating every line with
     * {@code "\r\n"}.
     *
     * @param file path of the file to read
     * @return the text read; on an I/O error the stack trace is printed and
     *         whatever was read up to that point (possibly empty) is returned
     */
    public static String readFileString(String file) {
        StringBuilder text = new StringBuilder();
        // try-with-resources so the reader is closed even when reading fails.
        try (BufferedReader in = new BufferedReader(new InputStreamReader(
                new FileInputStream(new File(file)), "UTF8"))) {
            String line;
            while ((line = in.readLine()) != null) {
                text.append(line).append("\r\n");
            }
        } catch (IOException e) {
            // Also covers FileNotFoundException and UnsupportedEncodingException.
            e.printStackTrace();
        }
        return text.toString();
    }

    /**
     * Highlights {@link #query_field} of every hit with the UnifiedHighlighter
     * and prints the result, first as a flat list and then per document.
     * Up to {@link #maxNumFragments} passages are requested per document.
     */
    private static void do_unifiedhighlight() {
        System.out.println("\nUnified Highlighter results.\n");
        UnifiedHighlighter highlighter = new UnifiedHighlighter(
                idxSearcher, analyzer);
        try {
            // Pass an explicit passage limit: the three-argument overload
            // yields a single passage per document.
            String[] fragments = highlighter.highlight(query_field,
                    queryToSearch, hits, maxNumFragments);
            System.out.println("Fragments.\n");
            for (String f : fragments) {
                System.out.println(f);
            }
            // Show which fragment belongs to which document.
            System.out.println("\nDocs and Fragments.\n");
            for (int i = 0; i < hits.scoreDocs.length; i++) {
                int docid = hits.scoreDocs[i].doc;
                Document doc = idxSearcher.doc(docid);
                // Documents are indexed with "title" and "content" only, so
                // the title is used to identify the document.
                System.out.println(doc.get("title"));
                System.out.println(fragments[i]);
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    /**
     * Highlights both {@link #alt_query_field} and {@link #query_field} of
     * every hit with the classic Highlighter and prints each fragment that
     * scored above zero. The fragmenter is chosen by {@link #useNullFrag} and
     * {@link #useSpanFrag}.
     */
    private static void do_highlight() {
        try {
            SimpleHTMLFormatter htmlFormatter = new SimpleHTMLFormatter();
            QueryScorer qScorer = new QueryScorer(queryToSearch);
            Highlighter highlighter = new Highlighter(
                    htmlFormatter, qScorer);
            if (useNullFrag) {
                highlighter.setTextFragmenter(new NullFragmenter());
            }
            if (useSpanFrag) {
                highlighter.setTextFragmenter(
                        new SimpleSpanFragmenter(qScorer));
            }
            System.out.println("reader maxDoc is " + idxReader.maxDoc());
            System.out.println("scoreDoc size: " + hits.scoreDocs.length);
            // Iterate over the returned hits, not totalHits, which may be
            // larger than the scoreDocs array.
            for (int i = 0; i < hits.scoreDocs.length; i++) {
                int id = hits.scoreDocs[i].doc;
                Document docHit = idxSearcher.doc(id);
                System.out.println("\nstart highlight the ALT field - "
                        + alt_query_field);
                printBestFragments(highlighter, docHit, id, alt_query_field);
                System.out.println("\nstart highlight the field - "
                        + query_field);
                printBestFragments(highlighter, docHit, id, query_field);
            }
        } catch (IOException e) {
            e.printStackTrace();
        } catch (InvalidTokenOffsetsException e) {
            e.printStackTrace();
        }
    }

    /**
     * Prints the best-scoring fragments of one stored field of one hit.
     *
     * @param highlighter configured highlighter to use
     * @param docHit      stored document of the hit
     * @param docId       index-wide document id of the hit
     * @param fieldName   field whose stored text is highlighted
     */
    private static void printBestFragments(Highlighter highlighter,
            Document docHit, int docId, String fieldName)
            throws IOException, InvalidTokenOffsetsException {
        String text = docHit.get(fieldName);
        if (text == null) {
            // Nothing stored for this field, so there is nothing to highlight.
            return;
        }
        TokenStream tokenStream = TokenSources.getAnyTokenStream(
                idxReader, docId, fieldName, analyzer);
        TextFragment[] frag = highlighter.getBestTextFragments(
                tokenStream, text, false, maxNumFragments);
        for (int j = 0; j < frag.length; j++) {
            if ((frag[j] != null) && (frag[j].getScore() > 0)) {
                System.out.println(frag[j].getFragNum() + ": ("
                        + frag[j].toString().length()
                        + ") " + frag[j].toString());
            }
        }
    }

    /** Prints an optional message followed by the usage text and exits with status 1. */
    private static void usageError(String message) {
        if (message != null) {
            System.out.println(message);
        }
        System.out.println(USAGE);
        System.exit(1);
    }

    /**
     * Returns the value following the option at index {@code i}, or reports a
     * usage error when the option is the last argument.
     */
    private static String optionValue(String[] args, int i) {
        if (i + 1 >= args.length) {
            usageError("\n\n" + args[i] + " requires a value\n\n");
        }
        return args[i + 1];
    }

    /** Returns true when {@code type} is one of the query types listed in the usage text. */
    private static boolean isKnownQueryType(String type) {
        switch (type) {
            case "parse":
            case "term":
            case "span":
            case "regex":
            case "phrase":
            case "fuzzy":
            case "boolean":
                return true;
            default:
                return false;
        }
    }

    /**
     * Applies command-line options to the static settings. Unrecognized
     * arguments are ignored. {@code -slop} and {@code -maxedits} are accepted
     * only after the matching {@code -query_type} has been given.
     */
    private static void parseArgs(String[] args) {
        for (int i = 0; i < args.length; i++) {
            String arg = args[i];
            if ("-field".equals(arg)) {
                query_field = optionValue(args, i);
                if (query_field.equals("title")) {
                    alt_query_field = "content";
                } else if (query_field.equals("content")) {
                    alt_query_field = "title";
                } else {
                    usageError(null);
                }
                i++;
            } else if ("-title".equals(arg)) {
                doc_title = optionValue(args, i);
                i++;
            } else if ("-file".equals(arg)) {
                content_file = optionValue(args, i);
                i++;
            } else if ("-query".equals(arg)) {
                queryString = optionValue(args, i);
                i++;
            } else if ("-unifiedHighlighter".equals(arg)) {
                unifiedhighlight = true;
            } else if ("-nullFrag".equals(arg)) {
                if (useSpanFrag) {
                    usageError(
                            "Can NOT specify both -nullFrag and -spanFrag\n");
                }
                useNullFrag = true;
            } else if ("-spanFrag".equals(arg)) {
                if (useNullFrag) {
                    usageError(
                            "Can NOT specify both -nullFrag and -spanFrag\n");
                }
                useSpanFrag = true;
            } else if ("-showTokens".equals(arg)) {
                showTokens = true;
            } else if ("-query_type".equals(arg)) {
                query_type = optionValue(args, i);
                if (!isKnownQueryType(query_type)) {
                    usageError(null);
                }
                i++;
            } else if ("-slop".equals(arg)) {
                // slop is only valid for phrase queries
                if (!query_type.equals("phrase")) {
                    usageError("\n\n-slop can only be specified with -query_type 'phrase'\n\n");
                }
                try {
                    pq_slop = Integer.parseInt(optionValue(args, i));
                    i++;
                } catch (NumberFormatException e) {
                    usageError("\n\n-slop MUST be an integer\n\n");
                }
            } else if ("-maxedits".equals(arg)) {
                // maxedits is only valid for fuzzy queries
                if (!query_type.equals("fuzzy")) {
                    usageError("\n\n-maxedits can only be specified with -query_type 'fuzzy'\n\n");
                }
                try {
                    max_edits = Integer.parseInt(optionValue(args, i));
                    i++;
                } catch (NumberFormatException e) {
                    usageError("\n\n-maxedits MUST be an integer\n\n");
                }
            }
        }
    }

    /** Echoes the effective settings to standard output. */
    private static void printSettings() {
        System.out.println("Content file is\n " + content_file);
        System.out.println("Document title is\n " + doc_title);
        System.out.println("Query field is " + query_field);
        System.out.println("Alt Query field is " + alt_query_field);
        System.out.println("Query String is '" + queryString + "'");
        System.out.println("Query Type is " + query_type);
        if (query_type.equals("phrase")) {
            System.out.println("Phrase Query slop is " + pq_slop);
        }
        if (query_type.equals("fuzzy")) {
            System.out.println("Fuzzy Query max edits is " + max_edits);
        }
        if (useNullFrag) {
            System.out.println("\nUsing NullFragmenter");
        }
        if (useSpanFrag) {
            System.out.println("\nUsing SpanFragmenter");
        }
        if (unifiedhighlight) {
            System.out.println("\nUsing Unified Highlighter");
        }
    }

    /**
     * Builds the document to index: a stored "title" field with term vectors
     * and offsets enabled, and a stored "content" text field.
     *
     * @param fileString text to index as the "content" field
     */
    private static Document buildDocument(String fileString) {
        FieldType type = new FieldType();
        type.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS);
        type.setStored(true);
        type.setStoreTermVectors(true);
        type.setTokenized(true);
        type.setStoreTermVectorOffsets(true);
        Document doc = new Document();
        doc.add(new Field("title", doc_title, type));
        doc.add(new TextField("content", fileString, Field.Store.YES));
        return doc;
    }

    /**
     * Creates the query selected by {@link #query_type} for
     * {@link #query_field} and {@link #queryString}.
     *
     * @return the query, or {@code null} when the query type is not
     *         implemented or not recognized (a message is printed)
     */
    private static Query buildQuery() throws ParseException {
        switch (query_type) {
            case "parse":
                return new QueryParser(query_field, analyzer)
                        .parse(queryString);
            case "term":
                return new TermQuery(new Term(query_field, queryString));
            case "span":
                return new SpanTermQuery(new Term(query_field, queryString));
            case "regex":
                return new RegexpQuery(new Term(query_field, queryString));
            case "phrase":
                // Split on any whitespace run so repeated spaces do not
                // produce empty phrase terms.
                return new PhraseQuery(pq_slop, query_field,
                        queryString.trim().split("\\s+"));
            case "fuzzy":
                return new FuzzyQuery(new Term(query_field, queryString),
                        max_edits);
            case "boolean":
                System.out.println("\n\nboolean query is NOT implemented yet!\n\n");
                return null;
            default:
                System.out.println("\n\nUnknown query type '" + query_type + "'\n\n");
                return null;
        }
    }

    /**
     * Prints the start and end position of every span matched by the current
     * {@code SpanTermQuery}, across all index segments.
     */
    private static void printSpans() throws IOException {
        SpanTermQuery stq = (SpanTermQuery) queryToSearch;
        SpanWeight spw = stq.createWeight(idxSearcher,
                ScoreMode.COMPLETE_NO_SCORES, 1f);
        boolean found = false;
        for (int leaf = 0; leaf < idxReader.leaves().size(); leaf++) {
            Spans spans = spw.getSpans(idxReader.leaves().get(leaf),
                    SpanWeight.Postings.POSITIONS);
            if (spans == null) {
                // The term does not occur in this segment.
                continue;
            }
            System.out.println("\n\nspans: " + spans.toString() + "\n\n");
            // Spans must be positioned on a document before positions are
            // read, and each call below advances the iterator.
            while (spans.nextDoc() != Spans.NO_MORE_DOCS) {
                while (spans.nextStartPosition() != Spans.NO_MORE_POSITIONS) {
                    found = true;
                    System.out.println("\n\nspan start: "
                            + spans.startPosition()
                            + ", end: " + spans.endPosition());
                }
            }
        }
        if (!found) {
            System.out.println("\n\nNo spans found\n\n");
        }
    }

    /**
     * Runs the analyzer over {@code text} and prints each token's term, bytes,
     * absolute position, start/end offsets and length.
     */
    private static void printTokens(String text) {
        System.out.println("\n\n--------------------\n\n");
        try (TokenStream ts = analyzer.tokenStream("myfield",
                new StringReader(text))) {
            // Attributes to view about the TokenStream
            OffsetAttribute offsetAtt =
                    ts.addAttribute(OffsetAttribute.class);
            CharTermAttribute chartermAtt =
                    ts.addAttribute(CharTermAttribute.class);
            TermToBytesRefAttribute tobytesAtt =
                    ts.addAttribute(TermToBytesRefAttribute.class);
            PositionIncrementAttribute posincrAtt =
                    ts.addAttribute(PositionIncrementAttribute.class);
            int absPos = 0;
            // Resets this stream to the beginning. (Required)
            ts.reset();
            while (ts.incrementToken()) {
                int sOff = offsetAtt.startOffset();
                int eOff = offsetAtt.endOffset();
                int tLen = chartermAtt.length();
                absPos += posincrAtt.getPositionIncrement();
                System.out.println("token term : "
                        + chartermAtt.toString());
                System.out.println("token bytes : "
                        + tobytesAtt.getBytesRef());
                System.out.println(" token pos, start, end, len: "
                        + absPos + ", " + sOff + ", " + eOff
                        + ", " + tLen);
            }
            // Perform end-of-stream operations,
            // e.g. set the final offset.
            ts.end();
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    /**
     * Entry point: parses options, indexes the document, runs the query and
     * prints highlighted fragments (and, with {@code -showTokens}, the token
     * dump for the content file).
     *
     * @param args command-line options; see the usage text printed by
     *             {@code -h}
     * @throws IOException if closing the index reader fails
     */
    @SuppressWarnings("deprecation")
    public static void main(String[] args) throws IOException {
        if (args.length > 0 && ("-h".equals(args[0])
                || "-help".equals(args[0]))) {
            System.out.println(USAGE);
            System.exit(0);
        }
        parseArgs(args);
        printSettings();

        String fileString = readFileString(content_file);
        Document doc = buildDocument(fileString);
        try {
            indexWriter = new IndexWriter(ramDirectory, config);
            try {
                indexWriter.addDocument(doc);
            } finally {
                // Close (and thereby commit) even if adding the document fails.
                indexWriter.close();
            }
            idxReader = DirectoryReader.open(ramDirectory);
            idxSearcher = new IndexSearcher(idxReader);

            queryToSearch = buildQuery();
            if (queryToSearch == null) {
                // Nothing to search for; buildQuery already explained why.
                return;
            }
            System.out.println("\nQuery : " + queryToSearch.toString()
                    + "\n");

            // Here is where the searching, etc starts
            hits = idxSearcher.search(queryToSearch, idxReader.maxDoc());
            if (query_type.equals("span")) {
                printSpans();
            }
            if (unifiedhighlight) {
                do_unifiedhighlight();
            } else {
                do_highlight();
            }
        } catch (IOException e) {
            e.printStackTrace();
        } catch (ParseException e) {
            e.printStackTrace();
        } finally {
            if (idxReader != null) {
                idxReader.close();
            }
        }
        if (showTokens) {
            printTokens(fileString);
        }
    }
}
Когда я запускаю этот код на своём образце контента (которым я не могу поделиться), я получаю 4 выделенных результата, но каждый из них выводится в виде большого фрагмента (около 100 символов).
Когда я запускаю тот же контент с помощью UnifiedHighlighter, я получаю только 1 результат, а не 4, показанные с помощью Highlighter.
Буду благодарен за любые предложения о том, как выделять результаты поиска, показывая только 3 или 4 слова до и после совпадения.
Я также буду рад рекомендациям по материалам для чтения. Документация по API хороша, но её недостаточно, чтобы понять, как лучше всего использовать API.