Skip to content

Instantly share code, notes, and snippets.

@kburaya
Created December 21, 2017 19:05
Show Gist options
  • Save kburaya/c15950d2aab82bb8e72d789d77b9be45 to your computer and use it in GitHub Desktop.
Save kburaya/c15950d2aab82bb8e72d789d77b9be45 to your computer and use it in GitHub Desktop.
import net.bbridge.core.extractor.TopicsFeatureExtractor;
import net.bbridge.core.text.processing.TextProcessor;
import net.bbridge.feature.extractor.text.TextTopicsFeatureExtractor;
import java.io.File;
import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Paths;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
/**
 * Command-line driver that trains a {@link TextTopicsFeatureExtractor} on a folder of
 * plain-text books and then runs one extraction on a short sample phrase.
 *
 * <p>Every regular file in the books folder is read, split on whitespace and regrouped
 * into documents of a bounded character size; the documents from all books form the
 * training set.
 */
public class Main {

    /** Folder whose regular files are read as books. */
    private static final String BOOKS_PATH = "/Users/ksburaya/Documents/NUS/lda-librusec/books/";

    /**
     * Path handed to the extractor constructor.
     * NOTE(review): presumably the location of the persisted model - confirm against
     * TextTopicsFeatureExtractor.
     */
    private static final String MODEL_PATH = "/Users/ksburaya/Documents/NUS/lda-librusec/librusec.lda";

    /** A document is closed once its words total more than this many characters. */
    private static final int DOCUMENT_SIZE = 500;

    /**
     * Builds the training set from the books folder, trains the extractor and prints the
     * features extracted for a sample phrase.
     *
     * @param args ignored
     * @throws IllegalStateException if the books folder cannot be listed
     */
    public static void main(String[] args) {
        // NOTE(review): the meaning of the numeric arguments is defined by
        // TextTopicsFeatureExtractor and is not visible from this file.
        TextTopicsFeatureExtractor extractor = new TextTopicsFeatureExtractor(100, 0.5, 0.1,
                10, 1000,
                MODEL_PATH, 2);

        // listFiles() returns null (not an empty array) when the path is missing or is
        // not a directory; fail with a clear message instead of an NPE in the loop.
        File[] bookFiles = new File(BOOKS_PATH).listFiles();
        if (bookFiles == null) {
            throw new IllegalStateException("Cannot list book files in " + BOOKS_PATH);
        }

        List<TextProcessor.Result> trainData = new ArrayList<>();
        for (File file : bookFiles) {
            if (!file.isFile()) {
                continue;
            }
            try {
                // Decode explicitly as UTF-8 so the result does not depend on the
                // platform default charset.
                // NOTE(review): assumes the corpus is stored as UTF-8 - confirm.
                String text = new String(Files.readAllBytes(file.toPath()), StandardCharsets.UTF_8);
                trainData.addAll(splitIntoDocuments(text.split("\\s+"), DOCUMENT_SIZE));
            } catch (IOException e) {
                // Best effort: one unreadable book must not abort the whole training run.
                System.err.println("Skipping unreadable book " + file.getPath());
                e.printStackTrace();
            }
        }
        extractor.train(trainData);

        List<String> testList = new ArrayList<>();
        testList.add("привет");
        testList.add("как");
        testList.add("дела");
        TextProcessor.Result test = new TextProcessor.Result(testList, new HashMap<>());
        // NOTE(review): the cast relies on extractFeatures returning a HashMap at runtime.
        HashMap<String, Double> ldaResult = (HashMap<String, Double>) extractor.extractFeatures(test);
        System.out.println(ldaResult);
    }

    /**
     * Groups consecutive words into documents. Words are appended to the current document
     * until their combined length (whitespace excluded) exceeds {@code documentSize}; the
     * document is then emitted and a new one is started. Words remaining at the end are
     * emitted as a final, shorter document so no text is lost.
     *
     * @param tokens       words of one book, in reading order; empty strings are skipped
     * @param documentSize character threshold after which a document is closed
     * @return the generated documents, each backed by its own word list
     */
    private static List<TextProcessor.Result> splitIntoDocuments(String[] tokens, int documentSize) {
        List<TextProcessor.Result> documents = new ArrayList<>();
        List<String> words = new ArrayList<>();
        int charCounter = 0;
        for (String word : tokens) {
            if (word.isEmpty()) {
                // split() yields a leading empty token when the text starts with whitespace.
                continue;
            }
            words.add(word);
            charCounter += word.length();
            if (charCounter > documentSize) {
                documents.add(new TextProcessor.Result(words, new HashMap<>()));
                // Start a NEW list rather than clearing the old one: the Result just
                // created may hold a reference to it, and clearing would empty that
                // document as well.
                words = new ArrayList<>();
                charCounter = 0;
            }
        }
        if (!words.isEmpty()) {
            documents.add(new TextProcessor.Result(words, new HashMap<>()));
        }
        return documents;
    }
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment