Tokenizing second-level and top-level domain for a URL in Lucene and Solr
Posted by Kelvin on 12 Nov 2012 at 08:36 pm | Tagged as: Lucene / Solr / Elastic Search / Nutch
In my previous post, I described how to extract second- and top-level domains from a URL in Java.
Now, I'll build a Lucene Tokenizer out of it, and a Solr TokenizerFactory class.
DomainTokenizer doesn't do anything really fancy. It emits the hostname as the first token, then the second-level domain (prefixed with a dot) as the second token, and finally the top-level domain as the last token.
e.g. given the URL http://www.supermind.org, it'll return
www.supermind.org
.supermind.org
.org
Doing so allows you to quickly return all documents in the Lucene or Solr index matching the second-level domain or the TLD.
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import java.io.IOException;
import java.io.Reader;
import java.net.URL;
/**
 * Tokenizer that treats its whole input as a single URL and emits up to three
 * tokens derived from the URL's host:
 * <ol>
 *   <li>the host itself (always),</li>
 *   <li>{@code "."} followed by {@code extractor.extract2LD(host)} when
 *       {@code index2LD} is set,</li>
 *   <li>{@code extractor.extractTLD(host)} when {@code indexTLD} is set.</li>
 * </ol>
 *
 * <p>The input must parse with {@link URL#URL(String)}; otherwise the
 * {@code MalformedURLException} (an {@code IOException}) thrown by the URL
 * constructor propagates out of {@link #incrementToken()}.
 *
 * <p>NOTE(review): the TLD token is appended exactly as returned by
 * {@code extractTLD}; whether it carries a leading dot depends on
 * {@code SecondLDExtractor}, which is not visible here - confirm.
 */
public class DomainTokenizer extends Tokenizer {
  private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);

  /** No input has been consumed yet. */
  public static final int STATE_UNINITIALIZED = -1;
  /** Input has been read and the host token emitted. */
  public static final int STATE_INITIALIZED = 0;
  /** The second-level-domain token has been emitted. */
  public static final int STATE_2LD = 1;
  /** The top-level-domain token has been emitted. */
  public static final int STATE_TLD = 2;
  /** No further tokens will be produced for the current input. */
  public static final int STATE_DONE = 3;

  private int state = STATE_UNINITIALIZED;
  private URL url = null;
  private final SecondLDExtractor extractor;
  private final boolean index2LD;
  private final boolean indexTLD;

  /**
   * @param input     reader supplying the URL text
   * @param extractor used to derive the second-level and top-level domain from the host;
   *                  only dereferenced when {@code index2LD} or {@code indexTLD} is true
   * @param index2LD  whether to emit the second-level-domain token
   * @param indexTLD  whether to emit the top-level-domain token
   */
  public DomainTokenizer(Reader input, SecondLDExtractor extractor, boolean index2LD, boolean indexTLD) {
    super(input);
    this.extractor = extractor;
    this.index2LD = index2LD;
    this.indexTLD = indexTLD;
  }

  /**
   * Emits the next token (host, then optional 2LD, then optional TLD).
   *
   * @return {@code true} if a token was written to the term attribute, {@code false} when exhausted
   * @throws IOException if reading the input fails or the input is not a valid URL
   */
  @Override
  public boolean incrementToken() throws IOException {
    if (state == STATE_DONE) {
      return false;
    }
    clearAttributes();
    if (this.url == null) {
      // First call for this input: consume the reader and emit the host.
      state = STATE_INITIALIZED;
      this.url = new URL(readFully(input));
      if (!index2LD && !indexTLD) {
        state = STATE_DONE;
      }
      termAtt.append(url.getHost());
      return true;
    } else if (index2LD && state < STATE_2LD) {
      state = STATE_2LD;
      String twold = extractor.extract2LD(url.getHost());
      termAtt.append("." + twold);
      return true;
    } else if (indexTLD && state < STATE_TLD) {
      state = STATE_TLD;
      String tld = extractor.extractTLD(url.getHost());
      termAtt.append(tld);
      return true;
    }
    state = STATE_DONE;
    return false;
  }

  /**
   * Clears the per-input state so a reused tokenizer instance starts over on
   * the next input instead of staying exhausted or re-emitting tokens for the
   * previously parsed URL.
   */
  @Override
  public void reset() throws IOException {
    super.reset();
    state = STATE_UNINITIALIZED;
    url = null;
  }

  /**
   * Reads the reader to exhaustion and returns its content.
   *
   * <p>Each read starts at offset 0 and only the chars actually read are
   * appended, so the result contains neither unread buffer padding nor
   * duplicated chunks, and inputs longer than the buffer terminate normally.
   */
  private static String readFully(Reader reader) throws IOException {
    StringBuilder sb = new StringBuilder();
    char[] buffer = new char[512];
    int length;
    while ((length = reader.read(buffer, 0, buffer.length)) != -1) {
      sb.append(buffer, 0, length);
    }
    return sb.toString();
  }
}
And here's the corresponding Solr TokenizerFactory:
import org.apache.lucene.analysis.Tokenizer;
import org.apache.solr.analysis.BaseTokenizerFactory;
import java.io.Reader;
import java.util.Map;
/**
 * Solr factory that builds {@link DomainTokenizer} instances.
 *
 * <p>Two settings are read in {@link #init(Map)} through {@code getBoolean},
 * each with a fallback argument of {@code true}: {@code index2LD} and
 * {@code indexTLD}. Both are handed unchanged to every tokenizer created.
 */
public class DomainTokenizerFactory extends BaseTokenizerFactory {
  // Stays null when both index2LD and indexTLD are false.
  private SecondLDExtractor extractor;
  private boolean index2LD;
  private boolean indexTLD;

  /**
   * Reads the factory settings and, if at least one of the two domain tokens
   * is requested, creates and initializes the shared extractor.
   *
   * @param args factory arguments, passed to the superclass initializer
   */
  @Override
  public void init(Map<String, String> args) {
    super.init(args);
    assureMatchVersion();

    index2LD = getBoolean("index2LD", true);
    indexTLD = getBoolean("indexTLD", true);

    final boolean extractorNeeded = index2LD || indexTLD;
    if (!extractorNeeded) {
      return;
    }
    extractor = new SecondLDExtractor();
    extractor.init();
  }

  /**
   * Creates a tokenizer over {@code input} using this factory's extractor and flags.
   *
   * @param input reader supplying the URL text
   * @return a new {@link DomainTokenizer}
   */
  public Tokenizer create(Reader input) {
    return new DomainTokenizer(input, extractor, index2LD, indexTLD);
  }
}
Here's a sample fieldType definition.
<fieldType name="domain" class="solr.TextField">
<analyzer>
<tokenizer class="org.supermind.solr.analysis.DomainTokenizerFactory"/>
</analyzer>
</fieldType>
-
http://www.supermind.org/blog/1078/extracting-second-level-domains-and-top-level-domains-tld-from-a-url-in-java Extracting second-level domains and top-level domains (TLD) from a URL in Java :: Kelvin Tan - Lucene Solr crawl Consultant
