SQLite全文搜索相关性排名

时间:2011-09-01 14:44:39

标签: search sqlite indexing full-text-search

我正在使用sqlite3的fts4扩展来启用全文索引和文本数据搜索。它运作良好,但我注意到结果根本没有按相关性排序。我想我已经习惯了Lucene。我看到过一些使用matchinfo()的结果编写自定义排名函数的简短建议,但我不清楚具体如何实现,也不知道是否有更完善的示例。其他人是怎么处理这个问题的?

4 个答案:

答案 0 :(得分:7)

文档中有一个完整的示例,请参见附录A的末尾。您需要做更多的工作才能获得良好的相关性排名,因为所提供的函数仅适合入门。例如,matchinfo(table,'pcnalx')提供了足够的信息来实现Okapi BM25。

答案 1 :(得分:6)

似乎明显缺乏关于如何在C中实现Okapi BM25的文档,而且大家似乎心照不宣地把具体实现留给用户作为练习。

好吧,我找到了一位很棒的程序员Radford “rads” Smith,他把自己的实现放到了GitHub上:

https://github.com/rads/sqlite-okapi-bm25

它只实现了BM25,不过我现在正在研究针对BM25F的调整……

……我的版本在这里:

https://github.com/neozenith/sqlite-okapi-bm25

答案 2 :(得分:1)

以下是Okapi BM25的实现。将此与SQLite.org中的建议结合使用,可帮助您生成按相关性排序的MATCH查询。这完全是用VB.Net编写的,查询是通过System.Data.SQLite函数调用的。只要SQL代码是通过System.Data.SQLite函数执行的,就可以直接在SQL代码中调用末尾的自定义SQLiteFunction。

''' <summary>
''' Parses the blob produced by the SQLite FTS3/FTS4 call
''' <c>matchinfo(table, 'pcnalsx')</c> and derives ranking figures from it,
''' most notably an Okapi BM25 relevance score for the current row.
''' </summary>
''' <remarks>
''' Layout of the blob, as a sequence of 32-bit integers:
'''   p - number of matchable phrases in the query (1 value)
'''   c - number of user defined columns in the table (1 value)
'''   n - number of rows in the table (1 value)
'''   a - average number of tokens per column (c values)
'''   l - number of tokens in the current row, per column (c values)
'''   s - longest subsequence of phrase matches, per column (c values)
'''   x - for every (phrase, column) pair three values: hits in this row,
'''       hits in all rows, number of rows with at least one hit
''' </remarks>
Public Class MatchInfo
    ' Okapi BM25 free parameters (see http://en.wikipedia.org/wiki/Okapi_BM25).
    ' k1 calibrates the document term frequency scaling; 0 corresponds to a
    ' binary model with no term frequency.
    Private Const BM25_K1 As Double = 1.2
    ' b calibrates the scaling by document length: 0 means no length
    ' normalization, 1 means fully scaling the term weight by document length.
    Private Const BM25_B As Double = 0.75

    ' Number of integers stored per (phrase, column) pair in the 'x' section.
    Private Const HIT_FIELDS As Integer = 3
    ' Number of bytes per integer in the raw blob.
    Private Const BYTES_PER_VALUE As Integer = 4
    ' Number of leading single-value fields (p, c, n).
    Private Const HEADER_VALUES As Integer = 3

    Property matchablePhrases As Integer
    Property userDefinedColumns As Integer
    Property totalDocuments As Integer
    Private _int32HitData As List(Of Integer)
    Private _longestSubsequencePhraseMatches As New List(Of Integer)
    Private _tokensInDocument As New List(Of Integer)
    Private _averageTokensInDocument As New List(Of Integer)

    Private _max_hits_this_row As Integer?
    ''' <summary>Largest per-(phrase, column) hit count within the current row (cached).</summary>
    Public ReadOnly Property max_hits_this_row As Integer
        Get
            If Not _max_hits_this_row.HasValue Then
                _max_hits_this_row = MaxHitField(0)
            End If
            Return _max_hits_this_row.Value
        End Get
    End Property

    Private _max_hits_all_rows As Integer?
    ''' <summary>Largest per-(phrase, column) hit count over all rows (cached).</summary>
    Public ReadOnly Property max_hits_all_rows As Integer
        Get
            If Not _max_hits_all_rows.HasValue Then
                _max_hits_all_rows = MaxHitField(1)
            End If
            Return _max_hits_all_rows.Value
        End Get
    End Property

    Private _max_docs_with_hits As Integer?
    ''' <summary>Largest per-(phrase, column) count of rows containing a hit (cached).</summary>
    Public ReadOnly Property max_docs_with_hits As Integer
        Get
            If Not _max_docs_with_hits.HasValue Then
                _max_docs_with_hits = MaxHitField(2)
            End If
            Return _max_docs_with_hits.Value
        End Get
    End Property

    Private _BM25Rank As Double?
    ''' <summary>
    ''' Okapi BM25 score of the current row, summed over every
    ''' (phrase, column) pair. Negative per-term scores are clamped to 0.
    ''' The value is computed once and cached.
    ''' </summary>
    Public ReadOnly Property BM25Rank As Double
        Get
            If Not _BM25Rank.HasValue Then
                Dim total As Double = 0

                For column As Integer = 0 To userDefinedColumns - 1
                    ' Length of this row's column relative to the average.
                    ' An average of 0 would yield NaN/Infinity, so fall back
                    ' to a neutral ratio of 1 (no length normalization).
                    Dim lengthRatio As Double = 1.0
                    If _averageTokensInDocument(column) > 0 Then
                        lengthRatio = _tokensInDocument(column) / _averageTokensInDocument(column)
                    End If

                    For phrase As Integer = 0 To matchablePhrases - 1
                        Dim termFrequency As Double = hits_this_row(phrase, column)
                        If termFrequency = 0 Then
                            ' A term that does not occur contributes nothing.
                            Continue For
                        End If

                        ' BM25's IDF is defined on the number of documents that
                        ' contain the term, NOT on the total number of hits:
                        ' the total hit count can exceed totalDocuments, which
                        ' makes the logarithm's argument negative (NaN).
                        Dim docsWithTerm As Double = docs_with_hits(phrase, column)
                        Dim IDF As Double = Math.Log((totalDocuments - docsWithTerm + 0.5) / (docsWithTerm + 0.5))

                        Dim score As Double = IDF * (termFrequency * (BM25_K1 + 1)) /
                            (termFrequency + BM25_K1 * (1 - BM25_B + BM25_B * lengthRatio))

                        ' Only positive contributions count; this also keeps a
                        ' NaN from inconsistent input out of the sum.
                        If score > 0 Then
                            total += score
                        End If
                    Next
                Next

                _BM25Rank = total
            End If

            Return _BM25Rank.Value
        End Get
    End Property

    ''' <summary>
    ''' Decodes a <c>matchinfo(table, 'pcnalsx')</c> blob.
    ''' </summary>
    ''' <param name="raw_pcnalsx_MatchInfo">Raw blob; a sequence of 32-bit integers in machine byte order.</param>
    ''' <exception cref="ArgumentNullException">The blob is Nothing.</exception>
    ''' <exception cref="ArgumentException">The blob is not a whole number of 32-bit values, or is shorter than its own header announces.</exception>
    Public Sub New(raw_pcnalsx_MatchInfo As Byte())
        If raw_pcnalsx_MatchInfo Is Nothing Then
            Throw New ArgumentNullException("raw_pcnalsx_MatchInfo")
        End If
        If raw_pcnalsx_MatchInfo.Length Mod BYTES_PER_VALUE <> 0 Then
            Throw New ArgumentException("The matchinfo blob length must be a multiple of 4 bytes.", "raw_pcnalsx_MatchInfo")
        End If

        Dim values As New List(Of Integer)(raw_pcnalsx_MatchInfo.Length \ BYTES_PER_VALUE)
        For i As Integer = 0 To raw_pcnalsx_MatchInfo.Length - 1 Step BYTES_PER_VALUE
            ' The blob holds unsigned values; narrow explicitly to Integer.
            values.Add(CInt(BitConverter.ToUInt32(raw_pcnalsx_MatchInfo, i)))
        Next

        If values.Count < HEADER_VALUES Then
            Throw New ArgumentException("The matchinfo blob is too short to contain the p, c and n fields.", "raw_pcnalsx_MatchInfo")
        End If

        'take the raw data and parse it out
        Me.matchablePhrases = values(0)
        Me.userDefinedColumns = values(1)
        Me.totalDocuments = values(2)

        ' header + (a, l, s: one value per column each) + x section
        Dim expectedCount As Long = HEADER_VALUES + 3L * userDefinedColumns +
            CLng(HIT_FIELDS) * matchablePhrases * userDefinedColumns
        If values.Count < expectedCount Then
            Throw New ArgumentException("The matchinfo blob is shorter than its header announces; was it produced with the 'pcnalsx' format?", "raw_pcnalsx_MatchInfo")
        End If

        ' Walk the list with a cursor instead of repeatedly removing the head
        ' element, which would shift the whole list on every read.
        Dim cursor As Integer = HEADER_VALUES

        'remember that the columns are 0-based
        _averageTokensInDocument.AddRange(values.GetRange(cursor, userDefinedColumns))
        cursor += userDefinedColumns

        _tokensInDocument.AddRange(values.GetRange(cursor, userDefinedColumns))
        cursor += userDefinedColumns

        _longestSubsequencePhraseMatches.AddRange(values.GetRange(cursor, userDefinedColumns))
        cursor += userDefinedColumns

        ' Everything that is left belongs to the 'x' section.
        _int32HitData = values.GetRange(cursor, values.Count - cursor)
    End Sub

    ''' <summary>Number of times the phrase occurs in the column of the current row.</summary>
    ''' <param name="phrase">0-based phrase index.</param>
    ''' <param name="column">0-based column index.</param>
    Public Function hits_this_row(phrase As Integer, column As Integer) As Integer
        Return _int32HitData(HIT_FIELDS * (column + phrase * userDefinedColumns) + 0)
    End Function

    ''' <summary>Number of times the phrase occurs in the column over all rows.</summary>
    ''' <param name="phrase">0-based phrase index.</param>
    ''' <param name="column">0-based column index.</param>
    Public Function hits_all_rows(phrase As Integer, column As Integer) As Integer
        Return _int32HitData(HIT_FIELDS * (column + phrase * userDefinedColumns) + 1)
    End Function

    ''' <summary>Number of rows in which the column contains at least one occurrence of the phrase.</summary>
    ''' <param name="phrase">0-based phrase index.</param>
    ''' <param name="column">0-based column index.</param>
    Public Function docs_with_hits(phrase As Integer, column As Integer) As Integer
        Return _int32HitData(HIT_FIELDS * (column + phrase * userDefinedColumns) + 2)
    End Function

    ' Returns the largest value (never below 0) of one of the three hit fields
    ' over every (phrase, column) pair; fieldOffset selects the field (0, 1 or 2).
    Private Function MaxHitField(fieldOffset As Integer) As Integer
        Dim best As Integer = 0
        For p As Integer = 0 To matchablePhrases - 1
            For c As Integer = 0 To userDefinedColumns - 1
                best = Math.Max(best, _int32HitData(HIT_FIELDS * (c + p * userDefinedColumns) + fieldOffset))
            Next
        Next
        Return best
    End Function
End Class

''' <summary>
''' Scalar SQL function "Rank" taking one argument: a matchinfo blob.
''' It returns the BM25 rank that MatchInfo computes for that blob.
''' </summary>
<SQLiteFunction("Rank", 1, FunctionType.Scalar)>
Public Class Rank
    Inherits SQLiteFunction

    ''' <summary>Converts the first argument to a byte array and returns its BM25 rank.</summary>
    ''' <param name="args">Arguments of the SQL call; only the first one is read.</param>
    Public Overrides Function Invoke(args() As Object) As Object
        Dim matchInfoBlob As Byte() = CType(args(0), Byte())
        Dim parsed As New MatchInfo(matchInfoBlob)
        Return parsed.BM25Rank
    End Function

End Class

答案 3 :(得分:1)

对于FTS5,根据SQLite FTS5 Extension

  • FTS5没有matchinfo()
  • FTS5支持ORDER BY rank

非常简单,类似

-- Return the rows of email that match 'fts5', sorted by the rank column (default ascending order, no DESC).
SELECT * FROM email WHERE email MATCH 'fts5' ORDER BY rank;

无需添加DESC即可正常工作。