Thursday, April 11, 2013

Simple Auto Suggester in Lucene 4.1 using AnalyzingSuggester

An autosuggestion and autocomplete system is a must for any software that has a search engine in it. Google has set a new bar with its excellent autosuggestion and autocomplete system. So, it would be great if we could build an autosuggester system ourselves with little coding, while understanding what goes on behind it. This is where Lucene comes in. Here, I will show you a simple autosuggester program that first builds the data set to be used for autosuggestion and then looks up that data set to produce suggestions. The data set is a simple list of words and their counts. The three classes shown below demonstrate a simple autosuggester.

LuceneAutoSuggesterTest.java

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
package com.lucene.autosuggester;

import java.io.IOException;
import java.util.List;

import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.search.suggest.Lookup.LookupResult;
import org.apache.lucene.search.suggest.analyzing.AnalyzingSuggester;
import org.apache.lucene.util.Version;
import org.junit.Test;

/**
 * Smoke test that builds an {@link AnalyzingSuggester} from a tiny in-memory
 * dictionary and prints the suggestions returned for the prefix "ba".
 */
public class LuceneAutoSuggesterTest {

 /**
  * Feeds four weighted words to the suggester, looks up "ba" and writes each
  * result to standard output as {@code key:value}.
  *
  * @throws IOException if building the suggester or the lookup fails
  */
 @Test
 public void luceneAutoSuggester() throws IOException {
  // Word/weight pairs that make up the suggestion dictionary.
  WordFreq[] dictionary = {
    new WordFreq("ball", 50),
    new WordFreq("bar", 10),
    new WordFreq("ba", 12),
    new WordFreq("ballon", 6)
  };

  StandardAnalyzer analyzer = new StandardAnalyzer(Version.LUCENE_41);
  AnalyzingSuggester suggester = new AnalyzingSuggester(analyzer);
  suggester.build(new WordFreqArrayIterator(dictionary));

  // Arguments: the typed prefix, the onlyMorePopular flag, and the
  // maximum number of suggestions requested.
  List<LookupResult> suggestions = suggester.lookup("ba", false, 100);

  System.out.println("Suggested words for input \"ba\"");
  for (int i = 0; i < suggestions.size(); i++) {
   LookupResult suggestion = suggestions.get(i);
   System.out.println(suggestion.key + ":" + suggestion.value);
  }
 }
}


WordFreq.java

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
package com.lucene.autosuggester;

import org.apache.lucene.util.BytesRef;

public class WordFreq {
 public long count;
 public BytesRef term;

 public WordFreq(BytesRef term, long count) {
  this.term = term;
  this.count = count;
 }

 public WordFreq(String term, long count) {
  this(new BytesRef(term), count);
 }
}


WordFreqArrayIterator.java

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
package com.lucene.autosuggester;

import java.util.Arrays;
import java.util.Comparator;
import java.util.Iterator;

import org.apache.lucene.search.spell.TermFreqIterator;
import org.apache.lucene.util.BytesRef;

public final class WordFreqArrayIterator implements TermFreqIterator {
 private WordFreq current;
 private final Iterator<WordFreq> wordFreqIterator;

 public WordFreqArrayIterator(Iterator<WordFreq> wordFreqIterator) {
  this.wordFreqIterator = wordFreqIterator;
 }

 public WordFreqArrayIterator(WordFreq[] list) {
  this(Arrays.asList(list).iterator());
 }

 @Override
 public Comparator<BytesRef> getComparator() {
  return null;
 }

 @Override
 public BytesRef next() {
  if (wordFreqIterator.hasNext()) {
   current = wordFreqIterator.next();
   return current.term;
  }
  return null;
 }

 @Override
 public long weight() {
  return current.count;
 }
}

For more reference on AnalyzingSuggester, BytesRef and TermFreqIterator, see their documentation.

For more info on AnalyzingSuggester visit:
http://blog.mikemccandless.com/2012/09/lucenes-new-analyzing-suggester.html

Wednesday, April 10, 2013

Real time search in lucene aka NRTManager.

Here, I am going to post a simple program that demonstrates real time searching using NRTManager. For more info on lucene's real time search visit: http://blog.mikemccandless.com/2011/11/near-real-time-readers-with-lucenes.html
 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
package com.immunesecurity.NRTManager;

import org.apache.lucene.document.Field;

import org.apache.lucene.document.StringField;
import java.io.IOException;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.MatchAllDocsQuery;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.search.NRTManager.TrackingIndexWriter;
import org.apache.lucene.search.SearcherFactory;
import org.apache.lucene.store.RAMDirectory;
import org.apache.lucene.util.Version;
import org.apache.lucene.store.Directory;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.search.NRTManager;
import org.apache.lucene.search.NRTManagerReopenThread;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.index.LogByteSizeMergePolicy;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * Minimal near-real-time (NRT) search demo: indexes one document through a
 * {@link TrackingIndexWriter}, waits until the {@link NRTManager} has a
 * searcher that covers that change, then runs a match-all query and logs the
 * number of hits.
 */
public class NRTManagerTest {

 /**
  * Runs the demo end to end: init, index, search, then release resources.
  *
  * @param args unused
  * @throws Exception if set-up, searching or shutdown fails
  */
 public static void main(String[] args) throws Exception {
  NRTManagerTest nrtManagerTest = new NRTManagerTest();
  nrtManagerTest.init();
  try {
   nrtManagerTest.index();
   // No fixed sleep is needed: search() blocks on the generation
   // recorded by index() until the reopened searcher can see it.
   nrtManagerTest.search();
  } finally {
   nrtManagerTest.close();
  }
 }

 // Declared before 'analyzer' because field initializers run in textual order.
 private final Version version = Version.LUCENE_40;
 private final Analyzer analyzer = new StandardAnalyzer(version);
 private final Directory dir = new RAMDirectory();
 /** Generation returned by the most recent successful index change. */
 private long latestGen;
 private final Logger logger = LoggerFactory.getLogger(getClass());
 private NRTManager nrtManager;
 private NRTManagerReopenThread reopenThread;
 private TrackingIndexWriter tiw;
 private IndexWriter writer;

 /**
  * Adds a single document with a stored {@code id} field and remembers the
  * generation the tracking writer assigns to the change, so that
  * {@link #search()} can wait for exactly that change to become visible.
  * An {@link IOException} is logged (with stack trace) and not rethrown.
  */
 private void index() {
  Document newDoc = new Document();
  Field idField = new Field("id", "6", StringField.TYPE_STORED);
  newDoc.add(idField);
  try {
   // Keep the returned generation; without it waitForGeneration()
   // would wait for generation 0 and not for this document.
   latestGen = tiw.addDocument(newDoc, analyzer);
  } catch (IOException ex) {
   logger.error("Failed to index document", ex);
  }
 }

 /**
  * Creates the index writer over the in-memory directory, wraps it in a
  * {@link TrackingIndexWriter} / {@link NRTManager} pair and starts the
  * daemon thread that periodically reopens the searcher.
  *
  * @throws Exception if the writer or manager cannot be created
  */
 private void init() throws Exception {
  IndexWriterConfig cfg = new IndexWriterConfig(version, analyzer);
  cfg.setRAMBufferSizeMB(128);
  LogByteSizeMergePolicy mp = new LogByteSizeMergePolicy();
  mp.setUseCompoundFile(false);
  cfg.setMergePolicy(mp);
  cfg.setOpenMode(IndexWriterConfig.OpenMode.CREATE);

  writer = new IndexWriter(dir, cfg);

  tiw = new TrackingIndexWriter(writer);
  nrtManager = new NRTManager(tiw, new SearcherFactory());
  // Run the reopen thread slightly above the caller's priority, capped
  // at the JVM maximum.
  int priority = Math.min(Thread.currentThread().getPriority() + 2,
    Thread.MAX_PRIORITY);
  // Arguments after the manager are the max and min staleness targets
  // in seconds.
  reopenThread = new NRTManagerReopenThread(nrtManager, 2, 0.03);
  reopenThread.setName("NRT Reopen Thread");
  reopenThread.setPriority(priority);
  reopenThread.setDaemon(true);
  reopenThread.start();
 }

 /**
  * Waits until a searcher covering {@link #latestGen} is available, runs a
  * match-all query for up to 100 hits and logs how many were returned.
  * The acquired searcher is always released.
  *
  * @throws Exception if acquiring the searcher or searching fails
  */
 private void search() throws Exception {
  nrtManager.waitForGeneration(latestGen);
  IndexSearcher searcher = nrtManager.acquire();
  try {
   TopDocs topDocs = searcher.search(new MatchAllDocsQuery(), 100);
   logger.info("no of results found={}", topDocs.scoreDocs.length);
  } finally {
   nrtManager.release(searcher);
  }
 }

 /**
  * Stops the reopen thread and closes the manager, writer and directory,
  * in that order (dependents before the resources they use).
  *
  * @throws IOException if any resource fails to close
  */
 private void close() throws IOException {
  reopenThread.close();
  nrtManager.close();
  writer.close();
  dir.close();
 }
}