在solr搜索中更准确的结果集

时间:2014-01-07 10:19:53

标签: search solr lucene solr4

我刚在一个电子商务应用程序中实现了 Apache Solr 搜索。我需要微调搜索结果列表,以获得更好的用户体验。由于我是 Solr 搜索的新手,希望得到一些指导。

我需要的是更相关、更精确的结果。举个例子,如果用户输入 'pen',它应该列出:

  • pens
  • 笔架
  • 笔式驱动器(也很好,但笔应该是高优先级)

但显示

  • 笔架
  • 笔式驱动器
  • dispensers(分配器)

前面几项没有问题,但对于 dispensers(分配器),单词 pen 只是出现在该单词的中间(dis-pen-sers),因此搜索 pen 时不应该把 dispensers 列出来。怎样才能做到这一点?

更新-1:

schema.xml
<types>
        <!-- Field type definitions referenced by the type="..." attribute of the field declarations. -->
        <!-- Plain string type backed by solr.StrField. -->
        <fieldType name="string" class="solr.StrField" sortMissingLast="true" omitNorms="true" />
        <!-- boolean type: "true" or "false" -->
        <fieldType name="boolean" class="solr.BoolField" sortMissingLast="true" omitNorms="true" />
        <!-- Binary data type. The data should be sent and retrieved as Base64 encoded strings. -->
        <!-- NOTE(review): this element is spelled "fieldtype" (lowercase "t") while every sibling uses "fieldType";
             XML names are case-sensitive, so confirm the schema loader accepts both spellings. -->
        <fieldtype name="binary" class="solr.BinaryField" />
        <!-- Trie numeric types declared with precisionStep="0". -->
        <fieldType name="int" class="solr.TrieIntField" precisionStep="0" omitNorms="true" positionIncrementGap="0" />
        <fieldType name="float" class="solr.TrieFloatField" precisionStep="0" omitNorms="true" positionIncrementGap="0" />
        <fieldType name="long" class="solr.TrieLongField" precisionStep="0" omitNorms="true" positionIncrementGap="0" />
        <fieldType name="double" class="solr.TrieDoubleField" precisionStep="0" omitNorms="true" positionIncrementGap="0" />
        <!-- "t"-prefixed variants of the same Trie classes, declared with precisionStep="8". -->
        <fieldType name="tint" class="solr.TrieIntField" precisionStep="8" omitNorms="true" positionIncrementGap="0" />
        <fieldType name="tfloat" class="solr.TrieFloatField" precisionStep="8" omitNorms="true" positionIncrementGap="0" />
        <fieldType name="tlong" class="solr.TrieLongField" precisionStep="8" omitNorms="true" positionIncrementGap="0" />
        <fieldType name="tdouble" class="solr.TrieDoubleField" precisionStep="8" omitNorms="true" positionIncrementGap="0" />
        <!-- Trie date type declared with precisionStep="0". -->
        <fieldType name="date" class="solr.TrieDateField" omitNorms="true" precisionStep="0" positionIncrementGap="0" />
        <!-- A Trie based date field for faster date range queries and date faceting. -->
        <fieldType name="tdate" class="solr.TrieDateField" omitNorms="true" precisionStep="6" positionIncrementGap="0" />
        <!-- "p"-prefixed types backed by the non-Trie IntField / LongField / FloatField / DoubleField / DateField classes. -->
        <fieldType name="pint" class="solr.IntField" omitNorms="true" />
        <fieldType name="plong" class="solr.LongField" omitNorms="true" />
        <fieldType name="pfloat" class="solr.FloatField" omitNorms="true" />
        <fieldType name="pdouble" class="solr.DoubleField" omitNorms="true" />
        <fieldType name="pdate" class="solr.DateField" sortMissingLast="true" omitNorms="true" />
        <!-- "s"-prefixed types backed by the Sortable*Field classes. -->
        <fieldType name="sint" class="solr.SortableIntField" sortMissingLast="true" omitNorms="true" />
        <fieldType name="slong" class="solr.SortableLongField" sortMissingLast="true" omitNorms="true" />
        <fieldType name="sfloat" class="solr.SortableFloatField" sortMissingLast="true" omitNorms="true" />
        <fieldType name="sdouble" class="solr.SortableDoubleField" sortMissingLast="true" omitNorms="true" />
        <!-- Type backed by solr.RandomSortField. -->
        <fieldType name="random" class="solr.RandomSortField" indexed="true" />
        <!-- A text field that only splits on whitespace for exact matching of words; tokens are lowercased. -->
        <fieldType name="text_ws" class="solr.TextField" positionIncrementGap="100">
            <analyzer>
                <tokenizer class="solr.WhitespaceTokenizerFactory" />
                <filter class="solr.LowerCaseFilterFactory" />
            </analyzer>
        </fieldType>
        <!-- Text keyword: KeywordTokenizerFactory with no filters, so no lowercasing is applied. -->
        <fieldType name="text_keyword" class="solr.TextField" positionIncrementGap="100">
            <analyzer>
                <tokenizer class="solr.KeywordTokenizerFactory" />
            </analyzer>
        </fieldType>
        <!-- Text path: path hierarchy tokenizer configured with delimiter "\" and replace "/". -->
        <fieldType name="text_path" class="solr.TextField" positionIncrementGap="100">
            <analyzer>
                <tokenizer class="solr.PathHierarchyTokenizerFactory" delimiter="\" replace="/" />
            </analyzer>
        </fieldType>
        <!-- FOR AUTO SUGGESTION FIELD -->
        <!-- NOTE(review): despite the name "edgytext", the index analyzer below uses NGramTokenizerFactory
             (minGramSize="1", maxGramSize="50"), not an edge n-gram tokenizer or filter. Presumably this is why
             a query such as "pen" also matches in the middle of longer words such as "dispensers".
             Confirm whether prefix-only (edge) grams on word tokens were intended. -->
        <fieldType name="edgytext" class="solr.TextField" positionIncrementGap="100">
            <!-- Index time: n-gram tokenizer, then synonyms, stop words, lowercasing and duplicate removal, in that order. -->
            <analyzer type="index">
                <tokenizer class="solr.NGramTokenizerFactory" minGramSize="1" maxGramSize="50"/>
                <filter class="solr.SynonymFilterFactory" synonyms="synonyms.txt" ignoreCase="true" expand="false" />
                <filter class="solr.StopFilterFactory" ignoreCase="true" words="stopwords.txt" />
                <filter class="solr.LowerCaseFilterFactory" />
                <filter class="solr.RemoveDuplicatesTokenFilterFactory" />
            </analyzer>
            <!-- Query time: KeywordTokenizerFactory, followed by the same synonym, stop word and lowercase filters. -->
            <analyzer type="query">
                <tokenizer class="solr.KeywordTokenizerFactory" />
                <filter class="solr.SynonymFilterFactory" synonyms="synonyms.txt" ignoreCase="true" expand="false" />
                <filter class="solr.StopFilterFactory" ignoreCase="true" words="stopwords.txt" />
                <filter class="solr.LowerCaseFilterFactory" />
            </analyzer>
        </fieldType>
        <!-- Text Shingle: whitespace tokens combined into shingles (maxShingleSize="7", unigrams kept), then lowercased. -->
        <fieldType name="text_shingle" class="solr.TextField" positionIncrementGap="100">
            <analyzer>
                <tokenizer class="solr.WhitespaceTokenizerFactory" />
                <filter class="solr.ShingleFilterFactory" maxShingleSize="7" outputUnigrams="true"/>
                <filter class="solr.LowerCaseFilterFactory" />
            </analyzer>
        </fieldType>
    </types>
    <!-- Fields definition -->
        <fields>
            <!-- Identifier fields. NOTE(review): unique_id does not set "stored", unlike the other fields; confirm the intended value. -->
            <field name="unique_id" type="string" indexed="true" required="true" />
            <field name="products_id" type="string" indexed="true" stored="true" required="true" />
            <field name="sku" type="text_keyword" indexed="true" stored="true" omitNorms="true" />
            <!-- Integer attributes; category_id is the only multi-valued one. -->
            <field name="category_id" type="int" indexed="true" stored="true" multiValued="true" />
            <field name="store_id" type="int" indexed="true" stored="true" />
            <field name="website_id" type="int" indexed="true" stored="true" />
            <field name="product_status" type="int" indexed="true" stored="true" />
            <field name="category_path" type="text_path" indexed="true" stored="true" multiValued="true" omitNorms="true" />
            <!-- Text fields: textSpell and textSearchText use the whitespace type "text_ws"; textSearch uses the n-gram type "edgytext". -->
            <field name="textSpell" type="text_ws" stored="true" indexed="true" multiValued="true" omitNorms="true" />
            <field name="textSearch" type="edgytext" indexed="true" stored="true" multiValued="true" omitNorms="true" omitTermFreqAndPositions="true" />
            <field name="textSearchText" type="text_ws" indexed="true" stored="true" multiValued="true" omitNorms="true" omitTermFreqAndPositions="true" />
            <field name="_version_" type="long" indexed="true" stored="true" />
            <!-- Dynamic fields: the field name suffix selects the field type. -->
            <dynamicField name="*_int" type="int" indexed="true" stored="true" />
            <dynamicField name="*_varchar" type="text_keyword" indexed="true" stored="true" omitNorms="true" omitTermFreqAndPositions="true" />
            <dynamicField name="*_text" type="edgytext" indexed="true" stored="true" multiValued="true" omitNorms="true" omitTermFreqAndPositions="true" />
            <dynamicField name="*_decimal" type="float" indexed="true" stored="true" />
            <dynamicField name="*_datetime" type="date" indexed="true" stored="true" />
            <dynamicField name="*_static" type="string" indexed="true" stored="true" />
            <!-- The two boost suffixes do not set omitNorms or omitTermFreqAndPositions, unlike *_text above. -->
            <dynamicField name="*_boost" type="edgytext" indexed="true" stored="true" multiValued="true" />
            <dynamicField name="*_boost_exact" type="text_ws" indexed="true" stored="true" multiValued="true" />
            <dynamicField name="*_facet" type="text_keyword" indexed="true" stored="true" multiValued="true" omitNorms="true" />
        </fields>

更新-2:

solrconfig.xml

2 个答案:

答案 0 :(得分:1)

edgytext类型的textSearch字段基于字母级ngrams,即

输入:"bicycle"

输出:"bicy","bicyc","icyc","icycl","cycl","cycle","ycle"

示例来自: https://cwiki.apache.org/confluence/display/solr/Tokenizers#Tokenizers-N-GramTokenizer

这就是您会看到词内(即匹配到单词中间部分)搜索结果的原因。如果您想要的是词元(token)级别的 n-gram,请按照 text_shingle 类型来定义搜索字段。顺便说一句,请留意 maxShingleSize:值 7 似乎偏高,会显著增大索引的体积。

答案 1 :(得分:1)

我建议这样做以获得最佳效果:

  1. 从 this(原文此处为链接)所指向的示例中复制 text_general FieldType,并粘贴到 schema.xml 的 <types> 标记内

  2. 将当前 schema.xml 中对应的这一行更改为:<field name="textSearch" type="text_general" indexed="true" stored="true" multiValued="true" omitNorms="true" omitTermFreqAndPositions="true" />

  3. 重新启动Solr服务器并重新索引数据。

  4. 示例搜索查询 - http://solr-server:8983/solr/english/select?q=pen&defType=edismax&qf=textSearch