Lucene特殊字符搜索

时间:2018-10-09 11:55:26

标签: java apache lucene

我正在尝试把数据库搜索迁移到Lucene搜索。我有几个包含数据的文本文件,其中一个文本文件中的示例数据如下:

  

N =以太网,L =无效,IM = XX123,SN = 286-054-754,HBF =无效,BON =无效,   VSR = null,DUID = null,MID = 2,IP = 10.21.122.136,MAC = 60:C7:98:17:57:80,   SYNC = false,GN = null,CustParam3 = null,CustParam2 = null,VV = 1.06.0007,   CustParam5 = null,CustParam4 = null,CustParam7 = null,CustParam6 = null,   BUNAME =空,PN = M132-409-01-R,CustParam8 =空,CS = 2015-09-30   19:49:25.0,CST =不活动,BL = 3.2,EE =关闭,TID = 190,PRL = VEM,PAV =空,   FAV = null,MON = 2016-04-06 11:13:40.507,DON = null,LPDR = 2015-09-30   19:50:23.85,SSID = null,PIP = null,DID = null,MDATE = null,   OV = rel-20120625-SC-3.1.2-B,CID = null,ICBI = false,TID = null,   LCR = 2015-10-01 01:50:30.297,SS =无近期通信,CBU =空,   GMVR =,LID = store,FF = 167340,HFP = RATNERCO >> blore,ISA = false,   TF = null,FAM = null,LDPDR = 2015-09-30 19:50:39.113,STVER = True,   SID = null,LHB = 2015-09-30 21:50:30.297,IDSS = false,FR = 81796,   LMOS = 2015-09-30 19:49:50.503,LCUS = null,MNAME = XX 123,BBUID = null,   CON = null,DBUN = null,ISDRA = false,POSV = null,UUID = 2,TRAM = null,   SPOL = 000000000,CustomField1 = null,CustomField2 = null,   CustomField3 = null,MUID = 2DE02CF3-0663-420A-8918-7A550E29F570,   CustomField4 = null,CustomField5 = null,HNAME = blore,customparam1 = null,   HID = 1048,LBDT = 2015-07-06 12:03:45.0,DIC = null,AT = None,LID = null,   IDSA = false,LMPS = 2015-09-30 15:49:50.457,MBUN = System,CNC = Ethernet,   LOC = null

我使用StandardAnalyzer创建索引并进行搜索,但无法按字符串UUID=1准确搜索:返回的结果中还包含了不含UUID=1的文档(我总共有两个文件,两个文件的内容都被返回了)。由于数据中含有特殊字符,我也尝试过WhitespaceAnalyzer,但那样不会返回任何数据。我还创建了一个自定义分析器,它使用空格分词器、小写过滤器和标准令牌过滤器,但没有帮助。此外,我扩展了StopwordAnalyzerBase来创建自己的分析器,并使用NormalizeCharMap替换特殊字符,这有所帮助,但之后我无法进行通配符搜索。

请哪位帮我解决这个问题。我是Lucene的新手。

import java.io.IOException;
import java.io.InputStream;
import java.nio.charset.StandardCharsets;
import java.nio.file.FileVisitResult;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.nio.file.SimpleFileVisitor;
import java.nio.file.attribute.BasicFileAttributes;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.core.KeywordAnalyzer;
import org.apache.lucene.analysis.core.WhitespaceAnalyzer;
import org.apache.lucene.analysis.custom.CustomAnalyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.Field.Store;
import org.apache.lucene.document.LongPoint;
import org.apache.lucene.document.StringField;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.IndexWriterConfig.OpenMode;
import org.apache.lucene.index.Term;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;

public class IndexCreator
{
public void createIndex(String inputFiles,  String indexPath)
{
    //Input Path Variable
    final Path docDir = Paths.get(inputFiles);

    try
    {
        //org.apache.lucene.store.Directory instance
        Directory dir = FSDirectory.open( Paths.get(indexPath) );

        //analyzer with the default stop words
        //Analyzer analyzer = new NewStandardAnalyzer();
        //Analyzer analyzer =  buildAnalyzer();
        //Analyzer analyzer =  new WhitespaceAnalyzer();

        Analyzer analyzer = new StandardAnalyzer();
        //IndexWriter Configuration
        IndexWriterConfig iwc = new IndexWriterConfig(analyzer);
        iwc.setOpenMode(OpenMode.CREATE_OR_APPEND);

        //IndexWriter writes new index files to the directory
        IndexWriter writer = new IndexWriter(dir, iwc);

        //Its recursive method to iterate all files and directories
        indexDocs(writer, docDir);

        writer.commit();
    }
    catch (IOException e)
    {
        e.printStackTrace();
    }
}

private void indexDocs(final IndexWriter writer, Path path) throws 
IOException
{
    //Directory?
    if (Files.isDirectory(path))
    {
        //Iterate directory
        Files.walkFileTree(path, new SimpleFileVisitor<Path>()
        {
            @Override
            public FileVisitResult visitFile(Path file, BasicFileAttributes attrs) throws IOException
            {
                try
                {
                    //Index this file
                    indexDoc(writer, file, attrs.lastModifiedTime().toMillis());
                }
                catch (IOException ioe)
                {
                    ioe.printStackTrace();
                }
                return FileVisitResult.CONTINUE;
            }
        });
    }
    else
    {
        //Index this file
        indexDoc(writer, path, Files.getLastModifiedTime(path).toMillis());
    }
}

private void indexDoc(IndexWriter writer, Path file, long lastModified) throws IOException
{
    try (InputStream stream = Files.newInputStream(file))
    {
        //Create lucene Document
        Document doc = new Document();

        String content = new String(Files.readAllBytes(file));
        //content = content.replace("-", "\\-");
        //content = content.replace(":", "\\:");
        //content = content.replace("=", "\\=");
        //content = content.replace(".", "\\.");
        doc.add(new StringField("path", file.toString(), Field.Store.YES));
        doc.add(new LongPoint("modified", lastModified));
        doc.add(new TextField("contents", content, Store.YES));

        //Updates a document by first deleting the document(s)
        //containing <code>term</code> and then adding the new
        //document.  The delete and then add are atomic as seen
        //by a reader on the same index
        writer.updateDocument(new Term("path", file.toString()), doc);
    }
}

    public static Analyzer buildAnalyzer() throws IOException {
        return CustomAnalyzer.builder()
                .withTokenizer("whitespace")
                .addTokenFilter("lowercase")
                .addTokenFilter("standard")
                .build();

}

public static void main(String[] args) {

        IndexCreator indexCreator = new IndexCreator();indexCreator.createIndex(
"C:\\Lucene\\LuceneLatest\\LuceneLatestModified\\Data", 
 , 
"C:\\Lucene\\LuceneLatest\\LuceneLatestModified\\Index");
        System.out.println("Done");
    }
}


    import java.io.IOException;
    import java.nio.file.Paths;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.core.KeywordAnalyzer;
import org.apache.lucene.analysis.core.SimpleAnalyzer;
import org.apache.lucene.analysis.core.WhitespaceAnalyzer;
import org.apache.lucene.analysis.custom.CustomAnalyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.queryparser.classic.QueryParser;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;

/**
 * Runs a classic {@code QueryParser} query against the {@code contents} field
 * of the Lucene index stored in {@link #INDEX_DIR} and prints the top hits.
 */
public class Searcher
{
    /** Directory that contains the Lucene index. */
    private static final String INDEX_DIR =
            "C:\\Lucene\\LuceneLatest\\LuceneLatestModified\\Index";

    /**
     * Opens the index, runs the sample query {@code UUID=1} and prints the results.
     *
     * @param args unused
     * @throws Exception if the index cannot be opened or the query cannot be parsed
     */
    public static void main(String[] args) throws Exception
    {
        Searcher searcher = new Searcher();

        // The directory and reader hold open files; try-with-resources
        // releases them (reader first) even when the search fails.
        try (Directory dir = FSDirectory.open(Paths.get(INDEX_DIR));
             IndexReader reader = DirectoryReader.open(dir))
        {
            // An IndexSearcher searches over a single IndexReader.
            IndexSearcher indexSearcher = new IndexSearcher(reader);

            // Other queries that were tried:
            //   NETWORKCONFIGURATION=Ethernet AND MACADDRESS=60\:C7\:98\:17\:57\:80
            //   NETWORKCONFIGURATION=Ethern*
            searcher.searchInContent("UUID=1", indexSearcher);
        }
    }

    /**
     * Parses {@code textToFind} with a {@code StandardAnalyzer}-backed
     * {@code QueryParser} on the {@code contents} field, fetches up to ten
     * hits and prints the total hit count followed by each hit's path, score
     * and stored content.
     *
     * @param textToFind query text in classic query-parser syntax
     * @param searcher   searcher to run the query against
     * @throws Exception if the query cannot be parsed or the index cannot be read
     */
    private void searchInContent(String textToFind, IndexSearcher searcher) throws Exception
    {
        // NOTE(review): StandardAnalyzer presumably splits "UUID=1" into separate
        // terms, so the parsed query may match documents that contain only one of
        // them - confirm by inspecting query.toString().
        QueryParser qp = new QueryParser("contents", new StandardAnalyzer());
        Query query = qp.parse(textToFind);

        TopDocs hits = searcher.search(query, 10);

        System.out.println("Total Results :: " + hits.totalHits);

        for (ScoreDoc sd : hits.scoreDocs)
        {
            Document d = searcher.doc(sd.doc);
            System.out.println("Path : "+ d.get("path") + ", Score : " + sd.score + ", Content : "+d.get("contents"));
        }
    }

    /**
     * Builds a {@code CustomAnalyzer} made of the {@code whitespace} tokenizer
     * followed by the {@code lowercase} and {@code standard} token filters.
     *
     * @return the configured analyzer
     * @throws IOException if the analyzer components cannot be loaded
     */
    public static Analyzer buildAnalyzer() throws IOException
    {
        return CustomAnalyzer.builder()
                .withTokenizer("whitespace")
                .addTokenFilter("lowercase")
                .addTokenFilter("standard")
                .build();
    }
}

0 个答案:

没有答案