Вот реализация Okapi BM25. Используя её вместе с рекомендациями с SQLite.org, вы сможете упорядочить результаты запроса MATCH по релевантности. Всё это написано на VB.Net, а запрос выполнялся с помощью функций System.Data.SQLite. Пользовательскую функцию SQLiteFunction, приведённую в конце, можно без проблем вызывать из SQL-кода, если этот SQL-код выполняется с помощью функций System.Data.SQLite.
''' <summary>
''' Parses a raw SQLite FTS matchinfo blob (a sequence of 32-bit values laid out in
''' the order phrase count, column count, total documents, per-column average token
''' counts, per-column token counts, per-column longest-subsequence matches, then
''' three hit counters per phrase/column pair) and derives relevance figures from it,
''' including an Okapi BM25 rank.
''' </summary>
Public Class MatchInfo
    ' Each phrase/column pair owns three consecutive values in the hit data:
    ' hits in this row, hits in all rows, and number of documents with hits.
    Private Const HitValuesPerCell As Integer = 3
    ' Size in bytes of one encoded value in the raw blob.
    Private Const BytesPerValue As Integer = 4
    ' Number of leading scalar values (phrases, columns, total documents).
    Private Const ScalarHeaderValues As Integer = 3

    ''' <summary>Number of matchable phrases in the query (first value of the blob).</summary>
    Property matchablePhrases As Integer
    ''' <summary>Number of user-defined columns in the table (second value of the blob).</summary>
    Property userDefinedColumns As Integer
    ''' <summary>Total number of documents in the table (third value of the blob).</summary>
    Property totalDocuments As Integer

    Private _int32HitData As List(Of Integer)
    Private _longestSubsequencePhraseMatches As New List(Of Integer)
    Private _tokensInDocument As New List(Of Integer)
    Private _averageTokensInDocument As New List(Of Integer)

    Private _max_hits_this_row As Integer?
    ''' <summary>
    ''' Largest <see cref="hits_this_row"/> value over every phrase/column pair
    ''' (never less than 0). Computed on first access and cached.
    ''' </summary>
    Public ReadOnly Property max_hits_this_row As Integer
        Get
            If Not _max_hits_this_row.HasValue Then
                _max_hits_this_row = MaxOverAllCells(AddressOf hits_this_row)
            End If
            Return _max_hits_this_row.Value
        End Get
    End Property

    Private _max_hits_all_rows As Integer?
    ''' <summary>
    ''' Largest <see cref="hits_all_rows"/> value over every phrase/column pair
    ''' (never less than 0). Computed on first access and cached.
    ''' </summary>
    Public ReadOnly Property max_hits_all_rows As Integer
        Get
            If Not _max_hits_all_rows.HasValue Then
                _max_hits_all_rows = MaxOverAllCells(AddressOf hits_all_rows)
            End If
            Return _max_hits_all_rows.Value
        End Get
    End Property

    Private _max_docs_with_hits As Integer?
    ''' <summary>
    ''' Largest <see cref="docs_with_hits"/> value over every phrase/column pair
    ''' (never less than 0). Computed on first access and cached.
    ''' </summary>
    Public ReadOnly Property max_docs_with_hits As Integer
        Get
            If Not _max_docs_with_hits.HasValue Then
                _max_docs_with_hits = MaxOverAllCells(AddressOf docs_with_hits)
            End If
            Return _max_docs_with_hits.Value
        End Get
    End Property

    Private _BM25Rank As Double?
    ''' <summary>
    ''' Okapi BM25 rank of the current row, summed over every phrase/column pair.
    ''' Computed on first access and cached. Higher means more relevant; never negative.
    ''' </summary>
    Public ReadOnly Property BM25Rank As Double
        Get
            If Not _BM25Rank.HasValue Then
                _BM25Rank = ComputeBM25Rank()
            End If
            Return _BM25Rank.Value
        End Get
    End Property

    ''' <summary>
    ''' Decodes the raw matchinfo blob.
    ''' </summary>
    ''' <param name="raw_pcnalsx_MatchInfo">
    ''' The blob as a byte array of 4-byte values, decoded with <see cref="BitConverter"/>.
    ''' </param>
    ''' <exception cref="ArgumentNullException">The blob is Nothing.</exception>
    ''' <exception cref="ArgumentException">
    ''' The blob length is not a multiple of 4 bytes, or it is too short to hold the
    ''' three leading counters plus the three per-column sections.
    ''' </exception>
    Public Sub New(raw_pcnalsx_MatchInfo As Byte())
        If raw_pcnalsx_MatchInfo Is Nothing Then
            Throw New ArgumentNullException("raw_pcnalsx_MatchInfo")
        End If
        If raw_pcnalsx_MatchInfo.Length Mod BytesPerValue <> 0 Then
            Throw New ArgumentException("matchinfo data length must be a multiple of 4 bytes.", "raw_pcnalsx_MatchInfo")
        End If

        ' Decode every 4-byte group once; the values are unsigned in the blob, so the
        ' narrowing to Integer is made explicit here.
        Dim values As New List(Of Integer)(raw_pcnalsx_MatchInfo.Length \ BytesPerValue)
        For offset = 0 To raw_pcnalsx_MatchInfo.Length - 1 Step BytesPerValue
            values.Add(CInt(BitConverter.ToUInt32(raw_pcnalsx_MatchInfo, offset)))
        Next

        If values.Count < ScalarHeaderValues Then
            Throw New ArgumentException("matchinfo data is too short to contain the phrase, column and document counts.", "raw_pcnalsx_MatchInfo")
        End If

        Me.matchablePhrases = values(0)
        Me.userDefinedColumns = values(1)
        Me.totalDocuments = values(2)

        If Me.matchablePhrases < 0 OrElse Me.userDefinedColumns < 0 Then
            Throw New ArgumentException("matchinfo phrase and column counts must not be negative.", "raw_pcnalsx_MatchInfo")
        End If

        ' Three per-column sections follow the scalar header.
        Dim headerLength As Long = ScalarHeaderValues + 3L * Me.userDefinedColumns
        If values.Count < headerLength Then
            Throw New ArgumentException("matchinfo data is too short for the declared number of columns.", "raw_pcnalsx_MatchInfo")
        End If

        ' Walk the decoded values with a cursor instead of repeatedly removing the
        ' head of the list (which shifts the whole list on every call).
        Dim cursor As Integer = ScalarHeaderValues
        _averageTokensInDocument.AddRange(values.GetRange(cursor, Me.userDefinedColumns))
        cursor += Me.userDefinedColumns
        _tokensInDocument.AddRange(values.GetRange(cursor, Me.userDefinedColumns))
        cursor += Me.userDefinedColumns
        _longestSubsequencePhraseMatches.AddRange(values.GetRange(cursor, Me.userDefinedColumns))
        cursor += Me.userDefinedColumns

        ' Everything left over is the per-phrase/per-column hit data.
        _int32HitData = values.GetRange(cursor, values.Count - cursor)
    End Sub

    ''' <summary>Number of hits for the phrase in the column within the current row.</summary>
    ''' <param name="phrase">0-based phrase index.</param>
    ''' <param name="column">0-based column index.</param>
    Public Function hits_this_row(phrase As Integer, column As Integer) As Integer
        Return _int32HitData(HitIndex(phrase, column) + 0)
    End Function

    ''' <summary>Number of hits for the phrase in the column across all rows.</summary>
    ''' <param name="phrase">0-based phrase index.</param>
    ''' <param name="column">0-based column index.</param>
    Public Function hits_all_rows(phrase As Integer, column As Integer) As Integer
        Return _int32HitData(HitIndex(phrase, column) + 1)
    End Function

    ''' <summary>Number of rows in which the column contains at least one hit for the phrase.</summary>
    ''' <param name="phrase">0-based phrase index.</param>
    ''' <param name="column">0-based column index.</param>
    Public Function docs_with_hits(phrase As Integer, column As Integer) As Integer
        Return _int32HitData(HitIndex(phrase, column) + 2)
    End Function

    ' Index of the first of the three hit values belonging to a phrase/column pair.
    Private Function HitIndex(phrase As Integer, column As Integer) As Integer
        Return HitValuesPerCell * (column + phrase * userDefinedColumns)
    End Function

    ' Largest value the given accessor yields over all phrase/column pairs, floored at 0.
    Private Function MaxOverAllCells(cellValue As Func(Of Integer, Integer, Integer)) As Integer
        Dim best As Integer = 0
        For p = 0 To matchablePhrases - 1
            For c = 0 To userDefinedColumns - 1
                best = Math.Max(best, cellValue(p, c))
            Next
        Next
        Return best
    End Function

    ' Sums the Okapi BM25 score (http://en.wikipedia.org/wiki/Okapi_BM25) of every
    ' phrase/column pair for the current row.
    Private Function ComputeBM25Rank() As Double
        ' k1 calibrates term-frequency scaling: 0 gives a binary model (no term
        ' frequency); larger values let repeated occurrences keep adding to the score.
        Const k1 As Double = 1.2
        ' b calibrates document-length normalization: 0 disables it, 1 scales the
        ' term weight fully by document length.
        Const b As Double = 0.75

        Dim rank As Double = 0
        For column = 0 To userDefinedColumns - 1
            ' Length normalization depends only on the column, so compute it once.
            Dim lengthNorm As Double = k1 * (1 - b + b * _tokensInDocument(column) / _averageTokensInDocument(column))
            For phrase = 0 To matchablePhrases - 1
                Dim termFrequency As Integer = hits_this_row(phrase, column)
                ' A phrase that does not occur contributes nothing; skipping it also
                ' keeps a 0/0 length ratio of an empty column from turning the sum into NaN.
                If termFrequency = 0 Then Continue For

                ' IDF is defined on the number of documents containing the phrase,
                ' not on the total number of occurrences.
                Dim docsWithTerm As Integer = docs_with_hits(phrase, column)
                Dim IDF As Double = Math.Log((totalDocuments - docsWithTerm + 0.5) / (docsWithTerm + 0.5))
                Dim score As Double = IDF * ((termFrequency * (k1 + 1)) / (termFrequency + lengthNorm))

                ' Very common phrases yield a negative IDF; they must not lower the rank.
                If Double.IsNaN(score) OrElse score < 0 Then
                    score = 0
                End If
                rank += score
            Next
        Next
        Return rank
    End Function
End Class
''' <summary>
''' SQLite scalar function registered under the name "Rank" with one argument.
''' It wraps its argument in a <see cref="MatchInfo"/> and returns that object's
''' BM25 rank.
''' </summary>
<SQLiteFunction("Rank", 1, FunctionType.Scalar)>
Public Class Rank
    Inherits SQLiteFunction

    ''' <summary>Computes the BM25 rank for the blob passed as the first argument.</summary>
    ''' <param name="args">Function arguments; the first one is passed to the MatchInfo constructor as a byte array.</param>
    ''' <returns>The <see cref="MatchInfo.BM25Rank"/> value of the parsed blob.</returns>
    Public Overrides Function Invoke(args() As Object) As Object
        Dim matchInfoBlob As Byte() = CType(args(0), Byte())
        Dim parsed As New MatchInfo(matchInfoBlob)
        Return parsed.BM25Rank
    End Function
End Class