带有日期参数的Lucene搜索

时间:2019-06-12 15:45:57

标签: java lucene

我对 Lucene 框架还很陌生。由于我们需要在几毫秒内搜索大量数据,因此正在尝试使用 Lucene 框架。

场景:

  • 我们有 EmployeeDto,已在 Lucene 中对其建立索引。在下面的示例中,我只硬编码了 6 条记录。

  • 我有 2 个参数,应作为搜索查询的输入参数。

 EmployeeDto.java
 // Data-transfer object describing one employee record that gets indexed in Lucene.
 private String firstName;
 private String lastName;
 private Long employeeId;
 private Integer salary;
 // Dates bounding the record; the search date is meant to fall between them.
 private Date startDate;
 private Date terminationDate;
 // Getters and setters omitted for brevity.


 EmployeeLucene.java
 /**
  * Minimal Lucene 4.0 demo: indexes six hard-coded employees into an in-memory
  * index and runs a trailing-wildcard query against the "fullName" field.
  */
 public class EmployeeLucene {

 /**
  * Builds the index, runs the search and prints the matching employees.
  *
  * @param args args[0] is the name prefix to search for; no other argument is
  *             read, so a date passed as args[1] has no effect on the result
  * @throws IOException    if the index cannot be written or read
  * @throws ParseException if one of the hard-coded date strings cannot be parsed
  */
 public static void main(String[] args) throws IOException, ParseException {
     // 0. Specify the analyzer for tokenizing text.
     //    The same analyzer should be used for indexing and searching
     StandardAnalyzer analyzer = new StandardAnalyzer(Version.LUCENE_40);

     // Day-precision pattern used for the hard-coded start/termination dates below.
     final DateFormat DATE_FORMAT = new SimpleDateFormat("yyyy-MM-dd");

     // 1. create the index
     Directory index = new RAMDirectory();
     IndexWriterConfig config = new IndexWriterConfig(Version.LUCENE_40, analyzer);
     IndexWriter w = new IndexWriter(index, config);
     long starttimeOfLoad = Calendar.getInstance().getTimeInMillis();
     System.out.println("Data Loading started");

     // Sample data: (firstName, lastName, employeeId, salary, startDate, terminationDate).
     addEmployee(w, new EmployeeDto("John", "Smith", new Long(101), 10000, DATE_FORMAT.parse("2010-05-05"), DATE_FORMAT.parse("2018-05-05")));
     addEmployee(w, new EmployeeDto("Bill", "Thomas", new Long(102), 12000, DATE_FORMAT.parse("2011-06-06"), DATE_FORMAT.parse("2015-03-10")));
     addEmployee(w, new EmployeeDto("Franklin", "Robinson", new Long(102), 12000, DATE_FORMAT.parse("2011-04-04"), DATE_FORMAT.parse("2015-07-07")));
     addEmployee(w, new EmployeeDto("Thomas", "Boone", new Long(102), 12000, DATE_FORMAT.parse("2011-02-02"), DATE_FORMAT.parse("2015-03-10")));
     addEmployee(w, new EmployeeDto("John", "Smith", new Long(103), 13000, DATE_FORMAT.parse("2019-05-05"), DATE_FORMAT.parse("2099-12-31")));
     addEmployee(w, new EmployeeDto("Bill", "Thomas", new Long(102), 14000, DATE_FORMAT.parse("2011-06-06"), DATE_FORMAT.parse("2099-12-31")));

     w.close();
     System.out.println("Data Loaded. Completed in " + (Calendar.getInstance().getTimeInMillis() - starttimeOfLoad));


     // 2. query
     // Only the name prefix is turned into a query; the date fields are not constrained here.
     // NOTE(review): if parsing fails, q stays null and is still handed to searcher.search below.
     Query q = null;
     try {
         q = new QueryParser(Version.LUCENE_40, "fullName", analyzer).parse(args[0] + "*");
     } catch (org.apache.lucene.queryparser.classic.ParseException e) {
         e.printStackTrace();
     }

     // 3. search
     // NOTE(review): the reader opened here is not closed anywhere in this method.
     long starttime = Calendar.getInstance().getTimeInMillis();
     int hitsPerPage = 100;
     IndexReader reader = DirectoryReader.open(index);
     IndexSearcher searcher = new IndexSearcher(reader);
     TopScoreDocCollector collector = TopScoreDocCollector.create(hitsPerPage, true);
     searcher.search(q, collector);
     ScoreDoc[] hits = collector.topDocs().scoreDocs;

     // 4. display results
     System.out.println("Found " + hits.length + " hits.");
     List<EmployeeDto> employeeDtoList = new ArrayList<EmployeeDto>();
     for (int i = 0; i < hits.length; ++i) {
         int docId = hits[i].doc;
         Document d = searcher.doc(docId);
         // Rebuild a DTO from the stored fields; the two date fields are not read back.
         employeeDtoList.add(new EmployeeDto(d.get("firstName"), d.get("lastName"), Long.valueOf(d.get("employeeId")),
                 Integer.valueOf(d.get("salary"))));
     }

     System.out.println(employeeDtoList.size());
     System.out.println(employeeDtoList);
     System.out.println("Time taken:" + (Calendar.getInstance().getTimeInMillis() - starttime) + " ms");

 }

 /**
  * Adds one employee to the index as a single Lucene document.
  *
  * @param w           writer the document is added to
  * @param employeeDto employee whose values are indexed and stored
  * @throws IOException if the writer fails to add the document
  */
 private static void addEmployee(IndexWriter w, EmployeeDto employeeDto) throws IOException, ParseException {
     Document doc = new Document();

     // "fullName" is first name + space + last name; it is the field the query parser targets.
     doc.add(new TextField("fullName", employeeDto.getFirstName() + " " + employeeDto.getLastName(), Field.Store.YES));
     doc.add(new TextField("firstName", employeeDto.getFirstName(), Field.Store.YES));
     doc.add(new TextField("lastName", employeeDto.getLastName(), Field.Store.YES));
     doc.add(new LongField("employeeId", employeeDto.getEmployeeId(), Field.Store.YES));
     doc.add(new LongField("salary", employeeDto.getSalary(), Field.Store.YES));
     // Dates are indexed as the long returned by Date.getTime() (epoch milliseconds).
     doc.add(new LongField("startDate", employeeDto.getStartDate().getTime(), Field.Store.YES));
     doc.add(new LongField("terminationDate", employeeDto.getTerminationDate().getTime(), Field.Store.YES));
     w.addDocument(doc);
 }

 }
 I run the program as "java EmployeeLucene thom 2014-05-05".
 I expect only 2 results, but I am getting 3 hits.

问题:

  • 如何在查询中包含第二个参数?第二个参数应该大于 "startDate" 且小于 "terminationDate"。
  • 我们能否在文档中直接包含 EmployeeDto 本身,以避免在获得命中结果后再创建 EmployeeDto 列表?

1 个答案:

答案 0 :(得分:1)

首先,您会得到三个结果,是因为有三条记录的全名匹配 "thom*",即记录 2、4 和 6。

第二,Lucene 4.0版本确实很旧。

最后,一种查询日期是否介于 startDate 和 terminationDate 之间的方法如下:

 // 2. query
 // Combined query: the name must match AND the search date must lie strictly
 // between startDate and terminationDate (startDate < searchDate < terminationDate).
 BooleanQuery finalQuery = null;
 try {
    // final query
    finalQuery = new BooleanQuery();

    // thom* query
    Query fullName = new QueryParser(Version.LUCENE_40, "fullName", analyzer).parse("thom" + "*");
    finalQuery.add(fullName, Occur.MUST); // MUST implies that the keyword must occur.

    // greaterStartDate query: startDate < searchDate.
    // The lower bound is open (null); the upper bound is exclusive so that a record
    // starting exactly on the search date does not match, mirroring the exclusive
    // bound used for terminationDate below.
    long searchDate = DATE_FORMAT.parse("2014-05-05").getTime();
    Query greaterStartDate = NumericRangeQuery.newLongRange("startDate", null, searchDate, true, false);
    finalQuery.add(greaterStartDate, Occur.MUST); // Using all "MUST" occurs is equivalent to "AND" operator

    // lessTerminationDate query: searchDate < terminationDate.
    // Exclusive lower bound, open (null) upper bound.
    Query lessTerminationDate = NumericRangeQuery.newLongRange("terminationDate", searchDate, null, false, false);
    finalQuery.add(lessTerminationDate, Occur.MUST);

 } catch (org.apache.lucene.queryparser.classic.ParseException e) {
     e.printStackTrace();
 }
  
    

我们能否在文档中直接包含 EmployeeDto 本身,以避免在获得命中结果后再创建 EmployeeDto 列表?

  

不是我所知道的。

编辑:版本7.0.1

     // 0. Specify the analyzer for tokenizing text.
     //    The same analyzer should be used for indexing and searching
     StandardAnalyzer analyzer = new StandardAnalyzer();

     // Day-precision pattern used for the hard-coded dates and for the search date.
     final DateFormat DATE_FORMAT = new SimpleDateFormat("yyyy-MM-dd");

     // 1. create the index
     Directory index = new RAMDirectory();
     IndexWriterConfig config = new IndexWriterConfig(analyzer);
     long starttimeOfLoad = System.currentTimeMillis();
     System.out.println("Data Loading started");

     // try-with-resources closes the writer even when one of the addEmployee calls throws.
     // Long.valueOf replaces the deprecated Long(long) constructor.
     try (IndexWriter w = new IndexWriter(index, config)) {
         addEmployee(w, new EmployeeDto("John", "Smith", Long.valueOf(101L), 10000, DATE_FORMAT.parse("2010-05-05"), DATE_FORMAT.parse("2018-05-05")));
         addEmployee(w, new EmployeeDto("Bill", "Thomas", Long.valueOf(102L), 12000, DATE_FORMAT.parse("2011-06-06"), DATE_FORMAT.parse("2015-10-10")));
         addEmployee(w, new EmployeeDto("Franklin", "Robinson", Long.valueOf(102L), 12000, DATE_FORMAT.parse("2011-04-04"), DATE_FORMAT.parse("2015-07-07")));
         addEmployee(w, new EmployeeDto("Thomas", "Boone", Long.valueOf(102L), 12000, DATE_FORMAT.parse("2011-02-02"), DATE_FORMAT.parse("2015-03-10")));
         addEmployee(w, new EmployeeDto("John", "Smith", Long.valueOf(103L), 13000, DATE_FORMAT.parse("2019-05-05"), DATE_FORMAT.parse("2099-12-31")));
         addEmployee(w, new EmployeeDto("Bill", "Thomas", Long.valueOf(102L), 14000, DATE_FORMAT.parse("2011-06-06"), DATE_FORMAT.parse("2099-12-31")));
     }
     System.out.println("Data Loaded. Completed in " + (System.currentTimeMillis() - starttimeOfLoad));

     // 2. query
     // Combined query: the name must match AND the search date must lie strictly
     // between the start date and the termination date.
     // NOTE: finalQuery stays null if the name query cannot be parsed.
     BooleanQuery finalQuery = null;
     try {
        // final query
        BooleanQuery.Builder builder = new BooleanQuery.Builder();

        // thom* query
        Query fullName = new QueryParser("fullName", analyzer).parse("thom" + "*");
        builder.add(fullName, Occur.MUST); // MUST implies that the keyword must occur.

        // greaterStartDate query: startDate < searchDate.
        // LongPoint ranges are inclusive on both ends, so the upper bound is moved
        // down by one millisecond to exclude a start date equal to the search date.
        long searchDate = DATE_FORMAT.parse("2014-05-05").getTime();
        Query greaterStartDate = LongPoint.newRangeQuery("startDatePoint", Long.MIN_VALUE, Math.addExact(searchDate, -1));
        builder.add(greaterStartDate, Occur.MUST); // Using all "MUST" occurs is equivalent to "AND" operator

        // lessTerminationDate query: searchDate < terminationDate.
        // Lower bound moved up by one millisecond for the same reason.
        Query lessTerminationDate = LongPoint.newRangeQuery("terminationDatePoint", Math.addExact(searchDate, 1), Long.MAX_VALUE);
        builder.add(lessTerminationDate, Occur.MUST);
        finalQuery = builder.build();

     } catch (org.apache.lucene.queryparser.classic.ParseException e) {
         e.printStackTrace();
     }

     // 3. search
     long starttime = System.currentTimeMillis();
     int hitsPerPage = 100;
     List<EmployeeDto> employeeDtoList = new ArrayList<>();
     // try-with-resources: the reader is closed once the hits have been converted,
     // instead of being left open for the rest of the program.
     try (IndexReader reader = DirectoryReader.open(index)) {
         IndexSearcher searcher = new IndexSearcher(reader);
         TopScoreDocCollector collector = TopScoreDocCollector.create(hitsPerPage);
         searcher.search(finalQuery, collector);
         ScoreDoc[] hits = collector.topDocs().scoreDocs;

         // 4. display results
         System.out.println("Found " + hits.length + " hits.");
         for (ScoreDoc hit : hits) {
             // Stored fields must be read while the reader is still open.
             Document d = searcher.doc(hit.doc);
             employeeDtoList.add(new EmployeeDto(d.get("firstName"), d.get("lastName"), Long.valueOf(d.get("employeeId")),
                     Integer.valueOf(d.get("salary"))));
         }
     }

     System.out.println(employeeDtoList.size());
     System.out.println(employeeDtoList);
     System.out.println("Time taken:" + (System.currentTimeMillis() - starttime) + " ms");

 }

 /**
  * Turns one employee into a Lucene document and adds it to the index.
  *
  * <p>Each date is written twice: as a StoredField under its plain name and as a
  * LongPoint under the name suffixed with "Point".
  *
  * @param w           writer the document is added to
  * @param employeeDto employee whose values are indexed and stored
  * @throws IOException if the writer fails to add the document
  */
 private static void addEmployee(IndexWriter w, EmployeeDto employeeDto) throws IOException {
     final Document employeeDoc = new Document();

     // Name fields: analyzed text, also stored so they can be read back from a hit.
     final String fullNameText = employeeDto.getFirstName() + " " + employeeDto.getLastName();
     employeeDoc.add(new TextField("fullName", fullNameText, Store.YES));
     employeeDoc.add(new TextField("firstName", employeeDto.getFirstName(), Store.YES));
     employeeDoc.add(new TextField("lastName", employeeDto.getLastName(), Store.YES));

     // Numeric values that are only stored.
     employeeDoc.add(new StoredField("employeeId", employeeDto.getEmployeeId()));
     employeeDoc.add(new StoredField("salary", employeeDto.getSalary()));

     // Start date as the long returned by Date.getTime().
     final long startMillis = employeeDto.getStartDate().getTime();
     employeeDoc.add(new StoredField("startDate", startMillis));
     employeeDoc.add(new LongPoint("startDatePoint", startMillis));

     // Termination date, handled the same way.
     final long terminationMillis = employeeDto.getTerminationDate().getTime();
     employeeDoc.add(new StoredField("terminationDate", terminationMillis));
     employeeDoc.add(new LongPoint("terminationDatePoint", terminationMillis));

     w.addDocument(employeeDoc);
 }

编辑:日期字段同时以 LongPoint 和 StoredField 两种类型存储。LongPoint 类型可用于 LongPoint.newRangeQuery,但之后无法将其作为值取回;StoredField 类型可以作为存储值取回,但不能用于范围查询。尽管此示例并未取回日期字段,但版本 4 中的字段确实同时具备这两种功能。如果您不打算取回这些值,可以删除日期对应的 StoredField。

相关问题