Question

Хотелось бы, чтобы в дополнение к стандартному поиску по терминам с tf-idf-сходством по текстовому полю документы оценивались ещё и по «сходству» числовых полей. Это сходство должно зависеть от расстояния между значением в запросе и значением в документе (например, гауссова функция с m = [пользовательский ввод], s = 0,5).

То есть, скажем, документы описывают людей, и документ person имеет два поля:

описание (полный текст)
возраст (числовой).

Я хочу найти документы типа

описание: (x y z) возраст: 30

Но возраст должен быть не фильтром, а частью оценки (для человека в возрасте 30 лет множитель будет 1,0, для 25-летнего — 0,8 и т. д.).

Можно ли добиться этого разумным способом?

ОБНОВЛЕНИЕ: В итоге я обнаружил, что это можно сделать, обернув ValueSourceQuery и TermQuery в CustomScoreQuery. Смотрите моё решение ниже.

ОБНОВЛЕНИЕ 2: Поскольку версии Lucene быстро меняются, хочу добавить, что решение было протестировано на Lucene 3.0 (Java).

jakub.g · Answer 1 · 09 мая 2011

Итак, вот (немного многословное) подтверждение концепции в виде полного теста JUnit. Его эффективность на большом индексе я пока не проверял, но, насколько я понимаю, после прогрева он должен работать хорошо, при условии что оперативной памяти достаточно для кэширования числовых полей.

  package tests;

  import org.apache.lucene.analysis.Analyzer;
  import org.apache.lucene.analysis.WhitespaceAnalyzer;
  import org.apache.lucene.document.Document;
  import org.apache.lucene.document.Field;
  import org.apache.lucene.document.NumericField;
  import org.apache.lucene.index.IndexWriter;
  import org.apache.lucene.queryParser.QueryParser;
  import org.apache.lucene.search.IndexSearcher;
  import org.apache.lucene.search.Query;
  import org.apache.lucene.search.ScoreDoc;
  import org.apache.lucene.search.TopDocs;
  import org.apache.lucene.search.function.CustomScoreQuery;
  import org.apache.lucene.search.function.IntFieldSource;
  import org.apache.lucene.search.function.ValueSourceQuery;
  import org.apache.lucene.store.Directory;
  import org.apache.lucene.store.RAMDirectory;
  import org.apache.lucene.util.Version;

  import junit.framework.TestCase;

  /**
   * Proof-of-concept JUnit 3 test showing how to combine a full-text relevance
   * score with a Gaussian "closeness" score computed from a numeric field.
   *
   * <p>Each indexed document has a text field ({@code content}) and a numeric
   * year-of-birth field ({@code dob}). The final score of a hit is the content
   * score multiplied by a bell-shaped weight that equals 1.0 when the person's
   * age matches the requested age and falls off as the age moves away from it.
   */
  public class AgeAndContentScoreQueryTest extends TestCase
  {
     /**
      * Year used to turn a year of birth into an age. Kept as a fixed constant
      * (rather than the current date) so the printed scores stay reproducible.
      */
     private static final int REFERENCE_YEAR = 2011;

     /**
      * Wraps a content query and a value-source query over the year-of-birth
      * field, and multiplies the content score by a Gaussian age weight.
      */
     public class AgeAndContentScoreQuery extends CustomScoreQuery
     {
        /** Age (in years) at which the age weight reaches its maximum of 1.0. */
        protected float peakX;
        /** Standard deviation of the Gaussian, in years; larger means a wider, more tolerant curve. */
        protected float sigma;

        /**
         * @param subQuery    query producing the content (text) score
         * @param valSrcQuery query exposing the numeric year-of-birth value as a score
         * @param peakX       age for which the age weight is best
         * @param sigma       standard deviation of the age weight, in years
         */
        public AgeAndContentScoreQuery(Query subQuery, ValueSourceQuery valSrcQuery, float peakX, float sigma) {
           super(subQuery, valSrcQuery);
           this.setStrict(true); // do not normalize score values from ValueSourceQuery!
           this.peakX = peakX;
           this.sigma = sigma;
        }

        /**
         * Combines the content score with the Gaussian age weight.
         *
         * @param doc           document id (unused here)
         * @param subQueryScore score of the content query for this document
         * @param valSrcScore   year-of-birth field value, delivered as a float
         * @return {@code subQueryScore * exp(-(age - peakX)^2 / (2 * sigma^2))}
         */
        @Override
        public float customScore(int doc, float subQueryScore, float valSrcScore){
           // subQueryScore is the tf-idf score from the content query
           float contentScore = subQueryScore;

           // valSrcScore is the year of birth; derive the age from it
           float age = REFERENCE_YEAR - valSrcScore;

           // Gaussian weight: exp(-d^2 / (2 * sigma^2)).
           // The denominator must be parenthesised: "/ 2*sigma*sigma" would
           // divide by 2 and then MULTIPLY by sigma^2, inverting sigma's meaning.
           double deviation = age - peakX;
           float ageScore = (float) Math.exp(-(deviation * deviation) / (2.0 * sigma * sigma));

           float finalScore = ageScore * contentScore;

           System.out.println("#contentScore: " + contentScore);
           System.out.println("#ageValue:     " + (int) age);
           System.out.println("#ageScore:     " + ageScore);
           System.out.println("#finalScore:   " + finalScore);
           System.out.println("+++++++++++++++++");

           return finalScore;
        }
     }

     protected Directory directory;
     protected Analyzer analyzer = new WhitespaceAnalyzer();
     protected String fieldNameContent = "content";
     protected String fieldNameDOB = "dob";

     /** Builds a small in-memory index with three people. */
     @Override
     protected void setUp() throws Exception
     {
        directory = new RAMDirectory();

        // indexed documents
        String[] contents = {"foo baz1", "foo baz2 baz3", "baz4"};
        int[] dobs = {1991, 1981, 1987}; // year of birth

        IndexWriter writer = new IndexWriter(directory, analyzer, IndexWriter.MaxFieldLength.UNLIMITED);
        try
        {
           for (int i = 0; i < contents.length; i++)
           {
              Document doc = new Document();
              doc.add(new Field(fieldNameContent, contents[i], Field.Store.YES, Field.Index.ANALYZED)); // store & index
              doc.add(new NumericField(fieldNameDOB, Field.Store.YES, true).setIntValue(dobs[i]));      // store & index
              writer.addDocument(doc);
           }
        }
        finally
        {
           // close even if indexing fails, so the writer is not left open on the directory
           writer.close();
        }
     }

     /** Runs the combined query and prints every hit with its final score. */
     public void testSearch() throws Exception
     {
        String inputTextQuery = "foo bar";
        float peak = 27.0f;
        // Standard deviation in years. With sigma = 10 an age 3 years off the
        // peak is weighted exp(-9/200), i.e. about 0.956.
        float sigma = 10.0f;

        QueryParser parser = new QueryParser(Version.LUCENE_30, fieldNameContent, analyzer);
        Query contentQuery = parser.parse(inputTextQuery);

        ValueSourceQuery dobQuery = new ValueSourceQuery( new IntFieldSource(fieldNameDOB) );
         // or: FieldScoreQuery dobQuery = new FieldScoreQuery(fieldNameDOB,Type.INT);

        CustomScoreQuery finalQuery = new AgeAndContentScoreQuery(contentQuery, dobQuery, peak, sigma);

        IndexSearcher searcher = new IndexSearcher(directory);
        try
        {
           TopDocs docs = searcher.search(finalQuery, 10);

           System.out.println("\nDocuments found:\n");
           for(ScoreDoc match : docs.scoreDocs)
           {
              Document d = searcher.doc(match.doc);
              System.out.println("CONTENT: " + d.get(fieldNameContent) );
              System.out.println("D.O.B.:  " + d.get(fieldNameDOB) );
              System.out.println("SCORE:   " + match.score );
              System.out.println("-----------------");
           }
        }
        finally
        {
           searcher.close();
        }
     }
  }

Пользовательская оценка Lucene для числовых полей

Пожалуйста, войдите или зарегистрируйтесь, чтобы ответить на этот вопрос.

Ответы [ 2 ]

Пожалуйста, войдите или зарегистрируйтесь, чтобы добавить комментарий.

Пожалуйста, войдите или зарегистрируйтесь, чтобы добавить комментарий.

Пользовательская оценка Lucene для числовых полей

Пожалуйста, войдите или зарегистрируйтесь, чтобы ответить на этот вопрос.

Ответы [ 2 ]

Пожалуйста, войдите или зарегистрируйтесь, чтобы добавить комментарий.

Пожалуйста, войдите или зарегистрируйтесь, чтобы добавить комментарий.

Похожие темы