Skip to content

Instantly share code, notes, and snippets.

@dpressel
Created January 19, 2018 18:17
Show Gist options
  • Save dpressel/b727c293fb57bd0acd2e92be2d78c51e to your computer and use it in GitHub Desktop.
Save dpressel/b727c293fb57bd0acd2e92be2d78c51e to your computer and use it in GitHub Desktop.
Adding a char shingling FX for sgdtk struct
package org.sgdtk.struct;
import java.util.ArrayList;
import java.util.List;
/**
 * Extracts an array of character shingle features (overlapping character
 * n-grams) from one component of a {@link State} in a sequence.
 *
 * Each shingle is emitted as the namespace string immediately followed by the
 * n-gram text.
 *
 * @author dpressel
 */
public class ShingledCharFeatureExtractor implements FeatureExtractorInterface
{
    private final String ns;
    private final int featureIndex;
    private final int ngram;

    /**
     * Create an extractor that reads the component at index 0 of each state.
     *
     * @param ns namespace, prepended verbatim to every shingle
     * @param ngram number of characters in each shingle, must be at least 1
     * @throws IllegalArgumentException if {@code ngram} is less than 1
     */
    public ShingledCharFeatureExtractor(String ns, int ngram)
    {
        this(ns, ngram, 0);
    }

    /**
     * Constructor, you would probably not need to do this under normal circumstances
     *
     * @param ns namespace, prepended verbatim to every shingle
     * @param ngram number of characters in each shingle, must be at least 1
     * @param featureIndex The offset of the feature in a CONLL file
     * @throws IllegalArgumentException if {@code ngram} is less than 1
     */
    public ShingledCharFeatureExtractor(String ns, int ngram, int featureIndex)
    {
        // Fail fast: a zero-width shingle is meaningless and a negative width
        // would only surface later as a substring failure inside run().
        if (ngram < 1)
        {
            throw new IllegalArgumentException("ngram must be >= 1, got " + ngram);
        }
        this.ns = ns;
        this.featureIndex = featureIndex;
        this.ngram = ngram;
    }

    /**
     * Given a sequence of {@link State} objects which are multi-dimensional,
     * made up of components themselves, and a position in that sequence, extract
     * every character shingle of the configured component at that position.
     *
     * For a term of length {@code n} this yields {@code n - ngram + 1} features,
     * in left-to-right order; e.g. "abcd" with {@code ngram == 2} gives
     * {@code ns + "ab"}, {@code ns + "bc"}, {@code ns + "cd"}.
     *
     * @param states A sequence
     * @param current The current absolute position in the sequence
     * @return the namespaced shingles, or an empty array when the term is
     *         shorter than {@code ngram}
     */
    @Override
    public String[] run(List<State> states, int current)
    {
        String term = states.get(current).atIndex(this.featureIndex);

        // Number of valid start offsets; non-positive when the term is too short.
        int count = term.length() - ngram + 1;
        if (count <= 0)
        {
            return new String[0];
        }

        String[] shingles = new String[count];
        // Iterate over START offsets so the final window [length - ngram, length)
        // is included; the previous loop bounded an end index by the start-offset
        // count and therefore dropped the trailing shingles.
        for (int start = 0; start < count; ++start)
        {
            shingles[start] = ns + term.substring(start, start + ngram);
        }
        return shingles;
    }

    /**
     * The order of this feature is the max of all of its parts
     *
     * @return The order; this extractor always reports 1
     */
    @Override
    public int getOrder()
    {
        return 1;
    }

    /**
     * Get the number of parts in the extractor
     *
     * @return the number of parts; this extractor always reports 1
     */
    @Override
    public int size()
    {
        return 1;
    }
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment