Created
December 21, 2017 19:05
-
-
Save kburaya/c15950d2aab82bb8e72d789d77b9be45 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import java.io.File;
import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Paths;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import net.bbridge.core.extractor.TopicsFeatureExtractor;
import net.bbridge.core.text.processing.TextProcessor;
import net.bbridge.feature.extractor.text.TextTopicsFeatureExtractor;
public class Main { | |
public static void main(String[] args) { | |
String booksPath = "/Users/ksburaya/Documents/NUS/lda-librusec/books/"; | |
int documentSize = 500; | |
TextTopicsFeatureExtractor extractor = new TextTopicsFeatureExtractor(100, 0.5, 0.1, | |
10, 1000, | |
"/Users/ksburaya/Documents/NUS/lda-librusec/librusec.lda", 2); | |
// read each book and generate documents with N symbols size | |
List<TextProcessor.Result> trainData = new ArrayList<>(); | |
File folder = new File(booksPath); | |
File[] listOfFiles = folder.listFiles(); | |
for (File file : listOfFiles) { | |
if (file.isFile()) { | |
try { | |
String[] text = new String(Files.readAllBytes(Paths.get(file.getPath()))).split("\\s+"); | |
int charCounter = 0; | |
List<String> words = new ArrayList<>(); | |
for (String word: text) { | |
words.add(word); | |
charCounter += word.length(); | |
if (charCounter > documentSize) { | |
charCounter = 0; | |
TextProcessor.Result result = new TextProcessor.Result(words, new HashMap<>()); | |
trainData.add(result); | |
words.clear(); | |
} | |
} | |
} catch (IOException e) { | |
e.printStackTrace(); | |
} | |
} | |
} | |
extractor.train(trainData); | |
List<String> testList = new ArrayList<>(); | |
testList.add("привет"); | |
testList.add("как"); | |
testList.add("дела"); | |
TextProcessor.Result test = new TextProcessor.Result(testList, new HashMap<>()); | |
HashMap <String, Double> ldaResult = (HashMap<String, Double>) extractor.extractFeatures(test); | |
int i = 0; | |
} | |
} | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment