Supermind Consulting Blog 
Solr - ElasticSearch - Big Data

Posts about Lucene / Solr / Elastic Search / Nutch

Java port of Quicksilver-style Live Search

Posted by Kelvin on 19 Nov 2012 | Tagged as: Lucene / Solr / Elastic Search / Nutch, programming

Here's a straight Java port of the Quicksilver live-search algorithm, found here:

quicksilver.js contains the actual algorithm in JavaScript.

It uses the same input strings as the demo page at

import java.io.IOException;
import java.util.TreeSet;

public class Quicksilver {
  public static void main(String[] args) throws IOException {
    for (ScoreDoc doc : getScores("DGHTD")) System.out.println(doc);
    for (ScoreDoc doc : getScores("Web")) System.out.println(doc);
    for (ScoreDoc doc : getScores("jhn nnmkr")) System.out.println(doc);
    for (ScoreDoc doc : getScores("wp")) System.out.println(doc);
  }

  public static TreeSet<ScoreDoc> getScores(String term) {
    term = term.toLowerCase();
    TreeSet<ScoreDoc> scores = new TreeSet<ScoreDoc>();
    for (int i = 0; i < cache.length; i++) {
      float score = score(cache[i], term, 0);
      if (score > 0) {
        scores.add(new ScoreDoc(score, i));
      }
    }
    return scores;
  }

  public static float score(String str, String abbreviation, int offset) {
//    int offset ? offset : 0 // TODO: I think this is unused... remove

    if (abbreviation.length() == 0) return 0.9f;
    if (abbreviation.length() > str.length()) return 0.0f;

    for (int i = abbreviation.length(); i > 0; i--) {
      String sub_abbreviation = abbreviation.substring(0, i);
      int index = str.indexOf(sub_abbreviation);

      if (index < 0) continue;
      if (index + abbreviation.length() > str.length() + offset) continue;

      String next_string = str.substring(index + sub_abbreviation.length());
      String next_abbreviation = null;

      if (i >= abbreviation.length())
        next_abbreviation = "";
      else
        next_abbreviation = abbreviation.substring(i);

      float remaining_score = score(next_string, next_abbreviation, offset + index);

      if (remaining_score > 0) {
        float score = str.length() - next_string.length();

        if (index != 0) {
          int j = 0;

          char c = str.charAt(index - 1);
          if (c == 32 || c == 9) {
            for (j = (index - 2); j >= 0; j--) {
              c = str.charAt(j);
              score -= ((c == 32 || c == 9) ? 1 : 0.15);
            }
            // XXX maybe not port this heuristic
            //          } else if ([[NSCharacterSet uppercaseLetterCharacterSet] characterIsMember:[self characterAtIndex:matchedRange.location]]) {
            //            for (j = matchedRange.location-1; j >= (int) searchRange.location; j--) {
            //              if ([[NSCharacterSet uppercaseLetterCharacterSet] characterIsMember:[self characterAtIndex:j]])
            //                score--;
            //              else
            //                score -= 0.15;
            //            }
          } else {
            score -= index;
          }
        }

        score += remaining_score * next_string.length();
        score /= str.length();
        return score;
      }
    }
    return 0.0f;
  }

  public static class ScoreDoc implements Comparable<ScoreDoc> {

    public float score;
    public int doc;
    public String term;

    public ScoreDoc(float score, int doc) {
      this.score = score;
      this.doc = doc;
      this.term = cache[doc];
    }

    public int compareTo(ScoreDoc o) {
      if (o.score < score) return -1;
      if (o.score > score) return 1;
      return doc - o.doc; // tie-break on doc so TreeSet doesn't drop docs with equal scores
    }

    public boolean equals(Object o) {
      if (this == o) return true;
      if (o == null || getClass() != o.getClass()) return false;

      ScoreDoc scoreDoc = (ScoreDoc) o;

      if (doc != scoreDoc.doc) return false;
      if (, score) != 0) return false;

      return true;
    }

    public int hashCode() {
      int result = (score != +0.0f ? Float.floatToIntBits(score) : 0);
      result = 31 * result + doc;
      return result;
    }

    @Override public String toString() {
      final StringBuilder sb = new StringBuilder();
      sb.append("ScoreDoc{score=").append(score);
      sb.append(", doc=").append(doc);
      sb.append(", term='").append(term).append('\'');
      sb.append('}');
      return sb.toString();
    }
  }

  public static String[] cache = new String[]{
      "The Well-Designed Web",
      "Welcome John Nunemaker",
      "Sidebar Creative: The Next Steps",
      "The Web/Desktop Divide",
      "2007 in Review",
      "Don't Complicate the Solution",
      "Blog to Business",
      "Single Line CSS",
      "Comments Work Again",
      "The iPhone Effect",
      "Greek Blogger Camp",
      "FeedBurner FeedSmith",
      "Download Counter Update 1.3",
      "Branding Reworked",
      "Productivity and Fascination",
      "Passing the Torch",
      "Goodbye Austin",
      "Ordered Shirts",
      "Sidebar Creative",
      "Building the Modern Web",
      "Open for Business",
      "The Art and Science of CSS",
      "WP Tiger Administration v3.0",
      "Cleaning House",
      "Tiger Admin 3.0 Beta Testing",
      "Rails and MVC",
      "Updates and More",
      "FeedBurner Plugin v2.1 Released",
      "The Global Health Crisis",
      "WP FeedBurner v2.1 Beta",
      "Web Development and Information Technology",
      "On Becoming a Dad",
      "Tiger Admin and Shuttle",
      "Staying Small in a Big Place: Part 1",
      "WaSP eduTF Interview",
      "Planned Parenthood",
      "IE7 and Clearing Floats",
      "SXSWi 2006: Dan Gilbert – How To Do Exactly the Right Thing at All Possible Times",
      "SXSWi 2006: Traditional Design and New Technology",
      "SXSWi 2006: Almost There",
      "HOWTO: Animated Live Search",
      "Leaving Solo",
      "Tagged for Four Things",
      "Automotive Interface",
      "Another FeedBurner Plugin Update",
      "WP Tiger Admin 2.0",
      "WordPress FeedBurner Plugin for 2.0",
      "SXSWi 2006",
      "Statistical AJAX",
      "Semantics and Design",
      "Download Counter Update",
      "Best Buy, Worst Experience",
      "A Realign, or Whatever",
      "Stop with the Jargon",
      "10K+ for Tiger Plugin",
      "Flock and Integration",
      "Only the Beginning",
      "A Tip of the Hat",
      "3 Years",
      "Pepper: Download Counter",
      "Launch: Notre Dame College of Arts and Letters",
      "Innovation, Progress, and Imagination",
      "This Thing Here",
      "Web Developer Opening",
      "WordPress Administration Design: Tiger",
      "SAJAX ColdFusion POST Request Method",
      "An Underscore Solution",
      "Google and the Underscore",
      "The Hand Off",
      "WordPress Upgrade and RSS",
      "WordPress FeedBurner Plugin",
      "Documentation Process",
      "WordPress Underscore Plugin",
      "CMS Release",
      "Two Suggestions for iTunes",
      "Call for Good Music",
      "A Change of Platform",
      "Point/Counterpoint: The Wrapper Div",
      "IE7 List, As Requested",
      "I'm a Switcher",
      "Breadcrumb Trails",
      "Output Code",
      "Bending the Matrix",
      "Children's Resource Group",
      "Do You Freelance?",
      "Project Management Software",
      "I Can't Stand It!",
      "Shiver Me Timbers!",
      "NDWG V1.0",
      "Dealing with IE5/Mac",
      "To All",
      "A Natural Progression",
      "Finishing the Basement",
      "Where Do You Live?",
      "The Recursion Project",
      "Clearing Floats: The FnE Method",
      "Ordered Zen",
      "Comment RSS",
      "Wordpress Code",
      "Writing Lean CSS",
      "v3.0 CMYK",
      "A Clean Slate",
      "Working for the Irish",
      "Excuse the Mess",
      "A Little Help",
      "Design Revisions",
      "FTOS Round 2",
      "I Love Storms",
      "One Gig?",
      "AD:TECH 2004 Chicago",
      "Thanks and Response",
      " v2.0",
      "Skuzzy Standards",
      "Simple List",
      "Anger Management",
      "A Practical Start to Web Standards",
      "Irony and Progress",
      "The Familiar Chirping of Crickets",
      "Results of FTOS Round 1",
      "Figure This Out, Steve",
      "Increasing Developer Productivity",
      "One Down",
      "Content Management Your Way",
      "We Have Liftoff",
      "The Great Divide",
      "What's in a Name?",
      "Just How Important is Validation?"};

  static {
    for (int i = 0, n = cache.length; i < n; i++) {
      cache[i] = cache[i].toLowerCase();
    }
  }
}

Apache Solr vs ElasticSearch – the website

Posted by Kelvin on 14 Nov 2012 | Tagged as: Lucene / Solr / Elastic Search / Nutch

Just spent the day hacking together a website that does a blow-by-blow examination of Solr vs ElasticSearch.

Hopefully it'll address any questions people might have about whether to use Solr or ES.

Let me know what you think!

The anatomy of a Lucene Tokenizer

Posted by Kelvin on 12 Nov 2012 | Tagged as: Lucene / Solr / Elastic Search / Nutch

A term is the unit of search in Lucene. A Lucene document comprises a set of terms. Tokenization means splitting up a string into tokens, or terms.

A Lucene Tokenizer is what Lucene (and, correspondingly, Solr) uses to tokenize text.

To implement a custom Tokenizer, you extend org.apache.lucene.analysis.Tokenizer.

The only method you need to implement is public boolean incrementToken(). incrementToken returns false for EOF, true otherwise.

Tokenizers generally take a Reader input in the constructor, which is the source to be tokenized.

With each invocation of incrementToken(), the Tokenizer is expected to produce the next token by setting the values of TermAttributes. This happens by adding TermAttributes to the superclass, usually as fields in the Tokenizer, e.g.

public class MyCustomTokenizer extends Tokenizer {
  private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
  // ... incrementToken() implementation below
}

Here, a CharTermAttribute is added to the superclass. A CharTermAttribute stores the term text.

Here's one way to set the value of the term text in incrementToken().

public boolean incrementToken() throws IOException {
  if (done) return false; // done is a boolean field in the tokenizer
  done = true;
  int upto = 0;
  char[] buffer = new char[512];
  while (true) {
    final int length =, upto, buffer.length - upto); // input is the reader set in the ctor
    if (length == -1) break;
    upto += length;
  }
  termAtt.copyBuffer(buffer, 0, upto); // copy the buffered chars into the CharTermAttribute -- this is what sets the term text
  return true;
}

And that's pretty much all you need to start writing custom Lucene tokenizers!
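
To see it in action, here's a minimal, hypothetical driver (my own sketch, not from the original post), assuming MyCustomTokenizer has the usual Reader constructor and the Lucene 3.x-era API used above:

import;
import;

import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

public class MyCustomTokenizerDemo {
  public static void main(String[] args) throws IOException {
    Tokenizer tokenizer = new MyCustomTokenizer(new StringReader("some text to tokenize"));
    CharTermAttribute term = tokenizer.addAttribute(CharTermAttribute.class);
    while (tokenizer.incrementToken()) {
      System.out.println(term.toString()); // with the tokenizer above, the whole input comes back as a single term
    }
    tokenizer.end();
    tokenizer.close();
  }
}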

Tokenizing second-level and top-level domain for a URL in Lucene and Solr

Posted by Kelvin on 12 Nov 2012 | Tagged as: Lucene / Solr / Elastic Search / Nutch

In my previous post, I described how to extract second- and top-level domains from a URL in Java.

Now, I'll build a Lucene Tokenizer out of it, and a Solr TokenizerFactory class.

DomainTokenizer doesn't do anything really fancy. It returns the hostname as the first token, the 2nd-level domain as the second token, and the top-level domain as the last token.

e.g. given the URL, it'll return

Doing so allows you to quickly return all documents in the Lucene or Solr index matching the second-level domain or the TLD.

package org.supermind.solr.analysis;

import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

import;
import;
import;

public class DomainTokenizer extends Tokenizer {
  private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);

  public static final int STATE_UNINITIALIZED = -1;
  public static final int STATE_INITIALIZED = 0;
  public static final int STATE_2LD = 1;
  public static final int STATE_TLD = 2;
  public static final int STATE_DONE = 3;

  private int state = STATE_UNINITIALIZED;

  private URL url = null;
  private SecondLDExtractor extractor;
  private boolean index2LD;
  private boolean indexTLD;

  public DomainTokenizer(Reader input, SecondLDExtractor extractor, boolean index2LD, boolean indexTLD) {
    super(input);
    this.extractor = extractor;
    this.index2LD = index2LD;
    this.indexTLD = indexTLD;
  }

  public boolean incrementToken() throws IOException {
    if (state == STATE_DONE) return false;

    if (this.url == null) {
      state = STATE_INITIALIZED;

      // read the entire input: it is expected to be a single URL
      StringBuilder sb = new StringBuilder();
      int upto = 0;
      char[] buffer = new char[512];
      while (true) {
        final int length =, upto, buffer.length - upto);
        if (length == -1) break;
        upto += length;
      }
      sb.append(buffer, 0, upto);

      this.url = new URL(sb.toString());
      termAtt.setEmpty().append(url.getHost()); // first token: the full hostname
      if (!index2LD && !indexTLD) state = STATE_DONE;
      return true;
    } else if (index2LD && state < STATE_2LD) {
      state = STATE_2LD;
      String twold = extractor.extract2LD(url.getHost());
      termAtt.setEmpty().append(twold);         // second token: the 2nd-level domain
      return true;
    } else if (indexTLD && state < STATE_TLD) {
      state = STATE_TLD;
      String tld = extractor.extractTLD(url.getHost());
      termAtt.setEmpty().append(tld);           // last token: the TLD
      return true;
    }
    state = STATE_DONE;
    return false;
  }
}

and here's the corresponding Solr TokenizerFactory.

package org.supermind.solr.analysis;

import org.apache.lucene.analysis.Tokenizer;
import org.apache.solr.analysis.BaseTokenizerFactory;

import;
import java.util.Map;

public class DomainTokenizerFactory extends BaseTokenizerFactory {
  private SecondLDExtractor extractor;
  private boolean index2LD;
  private boolean indexTLD;

  public void init(Map<String, String> args) {
    super.init(args);
    index2LD = getBoolean("index2LD", true);
    indexTLD = getBoolean("indexTLD", true);
    if (index2LD || indexTLD) {
      initTLDExtractor();
    }
  }

  private void initTLDExtractor() {
    extractor = new SecondLDExtractor();
    extractor.init();    // parse effective_tld_names.dat
    extractor.compile(); // build the regex
  }

  public Tokenizer create(Reader input) {
    DomainTokenizer tokenizer = new DomainTokenizer(input, extractor, index2LD, indexTLD);
    return tokenizer;
  }
}

Here's a sample fieldType definition.

<fieldType name="domain" class="solr.TextField" positionIncrementGap="100">
  <analyzer>
    <tokenizer class="org.supermind.solr.analysis.DomainTokenizerFactory"/>
  </analyzer>
</fieldType>
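
And a hypothetical field using that type (the field name here is my own, not from the original schema):

<field name="domain" type="domain" indexed="true" stored="false"/>

You can then filter on the second-level domain or TLD tokens directly, e.g. fq=domain:example.com, with the exact token text depending on what extract2LD/extractTLD return.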

Extracting second-level domains and top-level domains (TLD) from a URL in Java

Posted by Kelvin on 12 Nov 2012 | Tagged as: Lucene / Solr / Elastic Search / Nutch

It turns out that extracting second- and top-level domains is not a simple task. The primary difficulty is that in addition to the usual suspects (.com, .org, .net, etc.), there are the country suffixes (.uk, .it, .de, etc.) which need to be accounted for.

Regex alone has no way of handling this. Mozilla's Public Suffix List (the effective_tld_names.dat file) contains a somewhat authoritative list of TLDs and ccTLDs that we can use.

Here follows a Java class which parses this list, builds a regex from it, and extracts the TLD and second-level domain from a hostname. You'll need to download effective_tld_names.dat (from the Public Suffix List) and place it in the same directory as the Java class.

In my next post, I'll build a Lucene Tokenizer out of this, so it can be used in Lucene and Solr.

package org.supermind.solr.analysis;

import;
import;
import;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

public class SecondLDExtractor {
  private StringBuilder sb = new StringBuilder();
  private Pattern pattern;

  public void init() {
    try {
      ArrayList<String> terms = new ArrayList<String>();

      BufferedReader br = new BufferedReader(new InputStreamReader(getClass().getResourceAsStream("effective_tld_names.dat")));
      String s = null;
      while ((s = br.readLine()) != null) {
        s = s.trim();
        if (s.length() == 0 || s.startsWith("//") || s.startsWith("!")) continue;
        terms.add(s);
      }
      br.close();
      Collections.sort(terms, new StringLengthComparator());
      for (String t : terms) add(t);
    } catch (IOException e) {
      throw new IllegalStateException(e);
    }
  }

  protected void add(String s) {
    s = s.replace(".", "\\.");
    s = "\\." + s;
    if (s.contains("*")) {
      // wildcard rules like *.ck become \..+\.ck
      s = s.replace("*", ".+");
    }
    sb.append(s).append("|");
  }

  public void compile() {
    if (sb.length() > 0) sb.deleteCharAt(sb.length() - 1); // drop the trailing "|"
    sb.insert(0, "[^.]+?(");
    sb.append(")$");
    pattern = Pattern.compile(sb.toString());
    sb = null;
  }

  public String extract2LD(String host) {
    Matcher m = pattern.matcher(host);
    if (m.find()) {
      return; // the second-level domain, e.g. "example.com"
    }
    return null;
  }

  public String extractTLD(String host) {
    Matcher m = pattern.matcher(host);
    if (m.find()) {
      return; // the matched suffix, e.g. ".com" (includes the leading dot)
    }
    return null;
  }

  public static class StringLengthComparator implements Comparator<String> {
    public int compare(String s1, String s2) {
      if (s1.length() > s2.length()) return -1;
      if (s1.length() < s2.length()) return 1;
      return 0;
    }
  }
}
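
Here's a minimal usage sketch (my example, not from the original post), assuming effective_tld_names.dat sits next to the class on the classpath:

SecondLDExtractor extractor = new SecondLDExtractor();
extractor.init();    // parse the effective_tld_names.dat rules
extractor.compile(); // build the regex

System.out.println(extractor.extract2LD("")); // e.g. bbc.co.uk
System.out.println(extractor.extractTLD("")); // e.g. .co.uk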

Book review of Apache Solr 3.1 Cookbook

Posted by Kelvin on 30 Jun 2012 | Tagged as: Lucene / Solr / Elastic Search / Nutch

I recently got a chance to review Apache Solr 3.1 Cookbook by Rafal Kuc, published by PacktPub.

Now, to give a bit of context: I help folks implement and customize Solr professionally, so I know a fair bit about how Solr works, and I'm also quite familiar with the code internals. I was therefore pleasantly surprised, when leafing through the table of contents, to find at least a couple of entries that had me wondering: now how would I do that?

Here's the high-level TOC:

Chapter 1: Apache Solr Configuration
Chapter 2: Indexing your Data
Chapter 3: Analyzing your Text Data
Chapter 4: Solr Administration
Chapter 5: Querying Solr
Chapter 6: Using Faceting Mechanism
Chapter 7: Improving Solr Performance
Chapter 8: Creating Applications that use Solr and Developing your Own Solr Modules
Chapter 9: Using Additional Solr Functionalities
Chapter 10: Dealing with Problems

And here's a list of the recipes in Chapter 5, to give you a feel of the recipes:

Chapter 5: Querying Solr
Asking for a particular field value
Sorting results by a field value
Choosing a different query parser
How to search for a phrase, not a single word
Boosting phrases over words
Positioning some documents over others on a query
Positioning documents with words closer to each other first
Sorting results by a distance from a point
Getting documents with only a partial match
Affecting scoring with function
Nesting queries

You can view the full table of contents from the PacktPub website.

Now, first of all, this is one of those cookbook-style books with lots of snippets showing how to do stuff in Solr. If you know next to nothing about Solr, this book is not for you. As the PacktPub site says:

This book is part of Packt's Cookbook series… The recipes deal with common problems of working with Solr by using easy-to-understand, real-life examples. The book is not in any way a complete Apache Solr reference…

If, however, you're just past beginner level, want to dig a little deeper into Solr, and find the FAQs, tutorials, Solr wiki, etc. too confusing/verbose/disorganized, then I think Apache Solr 3.1 Cookbook is probably exactly what you need.

The examples are concise, stand-alone, and can be readily implemented in 5 minutes or less. They're a non-threatening way to get past the beginner level, and also offer a glimpse at some of Solr's more advanced functionality.

Oddly enough, the reviews on the web (Amazon, Goodreads and Google Books) all rate this book mediocrely, with an average of 3+ stars. In my opinion, this book easily deserves at least 4, if not 4.5 stars, assuming you're not a complete Solr n00b.

OK, I admit the writing is a little repetitive at times (the author is Polish), and some of the recipes are really, really basic, but nonetheless, for a cookbook-style guide aimed at the beginner-to-intermediate crowd, I think it's great!

Get it from amazon here:
More details at PacktPub

Simplistic noun-phrase chunking with POS tags in Java

Posted by Kelvin on 16 Jun 2012 | Tagged as: Lucene / Solr / Elastic Search / Nutch, programming

I needed to extract noun phrases from text. The way this is generally done is using Part-of-Speech (POS) tags. OpenNLP has both a POS-tagger and a noun-phrase chunker. However, it's really really really slow!

I decided to look into alternatives, and chanced upon QTag.

QTag is a "freely available, language independent POS-Tagger. It is implemented in Java, and has been successfully tested on Mac OS X, Linux, and Windows."

It's waaay faster than OpenNLP for POS-tagging, though I haven't done any benchmarks as to accuracy.

Here's my really simplistic but adequate implementation of noun-phrase chunking using QTag.

  private static Qtag qt;

  public static List<String> chunkQtag(String str) throws IOException {
    List<String> result = new ArrayList<String>();
    if (qt == null) {
      qt = new Qtag("lib/english"); // path to QTag's english resource files
    }

    String[] split = str.split("\n");
    for (String line : split) {
      String s = qt.tagLine(line, true);
      String lastTag = null;
      String lastToken = null;
      StringBuilder accum = new StringBuilder();
      for (String token : s.split("\n")) {
        String[] s2 = token.split("\t");
        if (s2.length < 2) continue;
        String tag = s2[1];

        if (tag.equals("JJ")
            || tag.startsWith("NN")
            || tag.startsWith("??")
            || (lastTag != null && lastTag.startsWith("NN") && s2[0].equalsIgnoreCase("of"))
            || (lastToken != null && lastToken.equalsIgnoreCase("of") && s2[0].equalsIgnoreCase("the"))
            ) {
          // still inside a noun phrase: accumulate the token
          if (accum.length() > 0) accum.append(' ');
          accum.append(s2[0]);
        } else {
          // phrase boundary: flush whatever has accumulated
          if (accum.length() > 0) {
            result.add(accum.toString());
            accum = new StringBuilder();
          }
        }
        lastTag = tag;
        lastToken = s2[0];
      }
      if (accum.length() > 0) {
        result.add(accum.toString());
      }
    }
    return result;
  }

The method returns a list of noun phrases.
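
For example, a hypothetical call (the exact phrases you get back depend on QTag's tagging):

List<String> phrases = chunkQtag("The quick brown fox jumped over the lazy dog near the bank of the river.");
System.out.println(phrases); // something like: [quick brown fox, lazy dog, bank of the river]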

Separating relevance signals from document content in Solr or Lucene

Posted by Kelvin on 16 Jun 2012 | Tagged as: Lucene / Solr / Elastic Search / Nutch

Full-text search has traditionally been about the indexing and ranking of a corpus of unstructured text content.

The vector space model (VSM) and its cousins, in addition to structural ranking algorithms such as PageRank, have been the authoritative ways of ranking documents.

However, with the recent proliferation of personalization, analytics, social networks and the like, there are increasing ways of determining document relevance, both globally and on a per-user basis. Some call these relevance signals.

Global relevance signals are simple to incorporate into Solr, either as a separate field + query-time boost, or an index-time document boost.

However, there has not traditionally been a satisfactory way of incorporating per-user relevance signals in Lucene/Solr's search process. We'll therefore be focusing on user-specific relevance signals for the rest of this document…

Before going further, here are some examples of user-specific relevance signals:

  • clickstream data
  • search logs
  • user preferences
  • likes, +1, etc
  • purchase history
  • blog, twitter, tumblr feed
  • social graph

I'm going to describe a system of incorporating user-specific relevance signals into your Solr searches in a scalable fashion.


In your Lucene/Solr index, store the documents you want searched. These can be products, companies, jobs, etc. They can be of multiple data-types, and each doc needs a unique id.

Relevance signals

In a separate sql/nosql database, store your relevance signals. They should be structured in a way which doesn't require complex joins, and be keyed by user-id. i.e. with a single get() query, you should be able to retrieve all necessary relevance data for that user.

One way of doing this is storing the relevance data as json, with individual fields as object ids.

You should also preferably pre-process the relevance data so there is a float/integer which provides the "score" or "value" of that signal.

For example:
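
{"SPN332": 4, "SPN105": 1, "SPN211": 9}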


In this JSON example, the SPNxxx are product ids, and the integer value is the score.


Now implement a custom FunctionQuery in Solr which accepts the userid as a parameter. Usage will look something like this: influence(201)^0.5, where influence is the name of the FunctionQuery, 201 is the user id, and 0.5 is the boost weight.

In the FunctionQuery, issue the DB request and obtain the relevance signal json, e.g. the example above.

Now within the ValueSource itself, load the data ids via FieldCache, and reference the json. The code looks something like:

@Override public DocValues getValues(Map context, IndexReader reader) throws IOException {
    final String[] lookup = FieldCache.DEFAULT.getStrings(reader, idField);
    return new DocValues() {
      @Override public float floatVal(int doc) {
        final String id = lookup[doc];
        if (jsonObj == null) return 0;
        Object v = jsonObj.get(id);
        if (v == null) return 0;
        if (v instanceof Number) {
          return ((Number) v).floatValue(); // handles both integer and float scores
        }
        return 0;
      }
      // ... the other DocValues methods (intVal, strVal, toString, etc.) are omitted here
    };
  }

What's happening here is that the value of the id field is retrieved for each document via the FieldCache. With our JSON example above, the id value could be something like "SPN332".

This is then used to check against the JSON object. If it exists, the integer/float value is returned as the functionquery score of that doc. Else, 0 is returned.
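
For reference, here's a rough sketch of the surrounding plumbing: the ValueSourceParser that parses influence(201) and hands the JSON to the ValueSource. The class names and the fetchRelevanceJson() helper are hypothetical, and the DB call is stubbed out:

public class InfluenceValueSourceParser extends ValueSourceParser {
  @Override public ValueSource parse(FunctionQParser fp) throws ParseException {
    String userId = fp.parseArg();                   // the "201" in influence(201)
    JSONObject jsonObj = fetchRelevanceJson(userId); // the single get() against your sql/nosql store
    return new InfluenceValueSource("id", jsonObj);  // a ValueSource whose getValues() looks like the code above
  }

  private JSONObject fetchRelevanceJson(String userId) {
    // stub: look the user up in redis/mongo/etc and parse the stored json
    return new JSONObject();
  }
}

It gets registered in solrconfig.xml with something like:

<valueSourceParser name="influence" class="com.example.InfluenceValueSourceParser"/>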

ElasticSearch 0.19 extension points

Posted by Kelvin on 14 Jun 2012 | Tagged as: Lucene / Solr / Elastic Search / Nutch

A list of the extension points exposed by ElasticSearch (as of 0.19.4)

  • Analysis plugins – use different kinds of analyzers
  • River plugins – A river is an external datasource which ES indexes
  • Transport plugins – Different means of exposing ES API, e.g. Thrift, memcached
  • Site plugins – for running various ES-related webapps, like the ES head admin webapp
  • Custom REST endpoint – lets you define a REST action by extending BaseRestHandler.
  • Scripting plugins – providing support for using different scripting languages as search scripts
  • NativeScripts – loosely equivalent to Solr's FunctionQuery. Allows you to return "script fields", compute custom scores, or perform search filtering.

As far as I can tell (from the source), there's no equivalent of Solr's SearchComponent, which allows you to modify the search request processing pipeline in an extremely flexible manner.

Connecting Redis to ElasticSearch for custom scoring with nativescripts

Posted by Kelvin on 14 Jun 2012 | Tagged as: Lucene / Solr / Elastic Search / Nutch

After connecting Redis and MongoDB to Solr, I figured it'd be interesting to do the same with ElasticSearch. Here's the result of my experiments:

We'll be implementing this using AbstractSearchScript, which is roughly ElasticSearch's version of Solr's FunctionQuery.

ES' NativeScriptFactory corresponds loosely to Solr's ValueSourceParser, and AbstractSearchScript to ValueSource.

import org.elasticsearch.common.Nullable;
import org.elasticsearch.script.ExecutableScript;
import org.elasticsearch.script.NativeScriptFactory;

import java.util.Map;

public class RedisNativeScriptFactory implements NativeScriptFactory {
  @Override public ExecutableScript newScript(@Nullable Map<String, Object> params) {
    return new RedisScript(params);
  }
}

import net.minidev.json.JSONObject;
import net.minidev.json.JSONValue;
import org.elasticsearch.script.AbstractFloatSearchScript;
import redis.clients.jedis.Jedis;

import java.util.Map;

public class RedisScript extends AbstractFloatSearchScript {
  private String idField;
  private String redisKey;
  private String redisValue;
  private final Jedis jedis;
  private JSONObject obj;

  public RedisScript(Map<String, Object> params) {
    this.idField = (String) params.get("idField");
    this.redisKey = (String) params.get("redisKey");
    this.redisValue = (String) params.get("redisValue");
    jedis = new Jedis("localhost");
    String v = jedis.hget(redisKey, redisValue);
    if (v != null) {
      obj = (JSONObject) JSONValue.parse(v);
    } else {
      obj = new JSONObject();
    }
  }

  @Override public float runAsFloat() {
    String id = doc().field(idField).stringValue();
    Object v = obj.get(id);
    if (v != null) {
      try {
        return Float.parseFloat(v.toString());
      } catch (NumberFormatException e) {
        return 0;
      }
    }
    return 0;
  }
}

Now in config/elasticsearch.yml, add this:
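
script.native.redis.type: com.example.RedisNativeScriptFactory   # script.native.<name>.type -> your factory's fully-qualified class name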


Change redis to whatever you want the script name to be, and change the class name accordingly too.

Now, to use this:

curl -XGET 'http://localhost:9200/electronics/product/_search' -d '{
  "query" : {
     "custom_score" : {
       "query" : { "match_all" : {} },
       "script" : "redis",
       "params" : {
          "idField" : "id",
          "redisKey" : "bar",
          "redisValue" : "500"
       },
       "lang" : "native"
     }
  }
}'


PS: My implementation of RedisScript assumes a Redis hash has been populated with a json object corresponding to an idField. Here's a class populating the redis hash. JSON objects are created with the json-smart package, but you can plug in your favourite json lib:

public static void main(String[] args) {
    Jedis jedis = new Jedis("localhost");
    int num = 100000;
    Random r = new Random();
    for (int i = 0; i < num; ++i) {
      JSONObject o = new JSONObject();
      int numberOfEntries = r.nextInt(100);
      for (int j = 0; j < numberOfEntries; ++j) {
        o.put("es" + j, r.nextInt(100));
      }
      String json = o.toJSONString(JSONStyle.MAX_COMPRESS);
      jedis.hset("bar", Integer.toString(i), json);
    }
  }
