Created
February 18, 2016 08:23
-
-
Save sadedv/6846f1d048047663e85f to your computer and use it in GitHub Desktop.
EnglishWordsAndSentences
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
package preparingDB; | |
import java.io.BufferedReader; | |
import java.io.FileReader; | |
import java.io.IOException; | |
import java.io.PrintWriter; | |
import java.util.*; | |
import java.util.concurrent.LinkedBlockingQueue; | |
/**
 * Links every word of the rated word list to the sentences that contain it.
 *
 * <p>Input 1, {@link #engWordsBase}: tab-separated lines {@code id, word, rating}.<br>
 * Input 2, {@link #enRuBase}: tab-separated lines
 * {@code id, language, English text, language, Russian text}.<br>
 * Output, {@link #engWordsBaseWithLinks}: every usable word line, followed by a tab and the
 * colon-separated ids of the first {@value #MAX_LINKED_SENTENCES} sentences (in file order)
 * whose English text contains the word. A word that occurs in no sentence gets an empty id list.
 */
public class CreateEngWordBase {
    // Deliberately non-final so the paths can be redirected (e.g. to small test fixtures).
    static String enRuBase = "EngRusSentences.txt";
    static String engWordsBase = "EngWords.txt";
    static String engWordsBaseWithLinks = "EngWordsLinked.txt";

    /** Upper bound on the number of sentence ids stored per word. */
    private static final int MAX_LINKED_SENTENCES = 100;
    /** Number of tab-separated columns a well-formed sentence line has. */
    private static final int SENTENCE_FIELD_COUNT = 5;

    /**
     * Runs the linking: loads the word list, scans the sentence base once, writes the result.
     *
     * @param args ignored
     * @throws IOException if an input file cannot be read or the output file cannot be written
     */
    public static void main(String[] args) throws IOException {
        List<String> wordLines = new ArrayList<>();
        List<String> words = new ArrayList<>();
        Map<String, List<String>> idsByWord = new HashMap<>();

        try (BufferedReader wordsDbReader = new BufferedReader(new FileReader(engWordsBase))) {
            String wordLine;
            while ((wordLine = wordsDbReader.readLine()) != null) {
                String[] fields = wordLine.split("\\t");
                if (fields.length < 2) {
                    continue; // blank or malformed line: there is no word column to link
                }
                String word = fields[1].toLowerCase(Locale.ROOT);
                wordLines.add(wordLine);
                words.add(word);
                if (!idsByWord.containsKey(word)) {
                    List<String> ids = new ArrayList<>();
                    idsByWord.put(word, ids);
                }
            }
        }

        collectSentenceIds(idsByWord);

        try (PrintWriter printWriter = new PrintWriter(engWordsBaseWithLinks)) {
            for (int i = 0; i < wordLines.size(); i++) {
                printWriter.println(wordLines.get(i) + "\t" + joinIds(idsByWord.get(words.get(i))));
            }
            // PrintWriter never throws on write failures; surface them explicitly.
            if (printWriter.checkError()) {
                throw new IOException("Failed to write " + engWordsBaseWithLinks);
            }
        }
    }

    /**
     * Scans the sentence base once and appends each sentence id to the list of every wanted
     * word the sentence contains, until that word has {@value #MAX_LINKED_SENTENCES} ids.
     *
     * @param idsByWord lower-cased wanted words mapped to their (initially empty) id lists
     */
    private static void collectSentenceIds(Map<String, List<String>> idsByWord) throws IOException {
        try (BufferedReader dbReader = new BufferedReader(new FileReader(enRuBase))) {
            String line;
            while ((line = dbReader.readLine()) != null) {
                String[] fields = line.split("\\t");
                if (fields.length < SENTENCE_FIELD_COUNT) {
                    continue; // skip incomplete sentence records instead of crashing
                }
                String id = fields[0];
                // A set, so a word repeated inside one sentence links that sentence only once.
                Set<String> tokens = new HashSet<>(Arrays.asList(tokenize(fields[2])));
                for (String token : tokens) {
                    List<String> ids = idsByWord.get(token);
                    if (ids != null && ids.size() < MAX_LINKED_SENTENCES) {
                        ids.add(id);
                    }
                }
            }
        }
    }

    /**
     * Splits English text into lower-case tokens: runs of non-word characters (apostrophes
     * that start such a run are kept) and runs of digits become single spaces, then the text
     * is split on spaces. Empty tokens may appear in the result.
     */
    private static String[] tokenize(String engText) {
        return engText.replaceAll("(?!')\\W+", " ")
                .replaceAll("\\p{Digit}+", " ")
                .toLowerCase(Locale.ROOT)
                .split(" ");
    }

    /** Joins the ids with {@code ':'}; an empty list yields an empty string. */
    private static String joinIds(List<String> ids) {
        StringBuilder joined = new StringBuilder();
        for (String id : ids) {
            if (joined.length() > 0) {
                joined.append(':');
            }
            joined.append(id);
        }
        return joined.toString();
    }
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
package preparingDB; | |
import java.io.BufferedReader; | |
import java.io.FileReader; | |
import java.io.IOException; | |
import java.io.PrintWriter; | |
import java.util.*; | |
/**
 * Builds the word list ordered by rating.
 *
 * <p>Reads the tab-separated sentence base {@link #enRuBase}
 * ({@code id, language, English text, language, Russian text}), counts for every word the number
 * of sentences it occurs in, and writes {@link #filtred} as tab-separated lines
 * {@code rank, word, count}, most frequent word first. Words with the same count are written in
 * alphabetical order so that the output is reproducible.
 */
public class CreateEngWordBaseWithLinkedSentences {
    // Deliberately non-final so the paths can be redirected (e.g. to small test fixtures).
    static String enRuBase = "EngRusSentences.txt";
    static String filtred = "EngWords.txt";

    /** Number of tab-separated columns a well-formed sentence line has. */
    private static final int SENTENCE_FIELD_COUNT = 5;

    /**
     * Counts the words of the sentence base and writes the ranked list.
     *
     * @param args ignored
     * @throws IOException if the sentence base cannot be read or the list cannot be written
     */
    public static void main(String[] args) throws IOException {
        Map<String, Integer> wordCountMap = countSentencesPerWord();

        List<Map.Entry<String, Integer>> ranked = new ArrayList<>(wordCountMap.entrySet());
        Collections.sort(ranked, new Comparator<Map.Entry<String, Integer>>() {
            @Override
            public int compare(Map.Entry<String, Integer> a, Map.Entry<String, Integer> b) {
                int byCount = Integer.compare(b.getValue(), a.getValue()); // descending count
                return byCount != 0 ? byCount : a.getKey().compareTo(b.getKey());
            }
        });

        try (PrintWriter printWriter = new PrintWriter(filtred)) {
            int idCount = 0;
            for (Map.Entry<String, Integer> entry : ranked) {
                idCount++;
                printWriter.println(idCount + "\t" + entry.getKey() + "\t" + entry.getValue());
            }
            // PrintWriter never throws on write failures; surface them explicitly.
            if (printWriter.checkError()) {
                throw new IOException("Failed to write " + filtred);
            }
        }
    }

    /**
     * Streams the sentence base and counts, per word, the number of sentences containing it.
     * A word repeated within one sentence is counted once for that sentence.
     */
    private static Map<String, Integer> countSentencesPerWord() throws IOException {
        Map<String, Integer> wordCountMap = new HashMap<>();
        try (BufferedReader fileReader = new BufferedReader(new FileReader(enRuBase))) {
            String line;
            while ((line = fileReader.readLine()) != null) {
                String[] fields = line.split("\\t");
                if (fields.length < SENTENCE_FIELD_COUNT) {
                    continue; // skip blank or incomplete records instead of crashing
                }
                Set<String> wordsOfSentence = new HashSet<>();
                for (String word : tokenize(fields[2])) {
                    if (isCountable(word)) {
                        wordsOfSentence.add(word);
                    }
                }
                for (String word : wordsOfSentence) {
                    Integer previous = wordCountMap.get(word);
                    wordCountMap.put(word, previous == null ? 1 : previous + 1);
                }
            }
        }
        return wordCountMap;
    }

    /**
     * Splits English text into lower-case tokens: runs of non-word characters (apostrophes
     * that start such a run are kept) and runs of digits become single spaces, then the text
     * is split on spaces. Empty tokens may appear in the result.
     */
    private static String[] tokenize(String engText) {
        return engText.replaceAll("(?!')\\W+", " ")
                .replaceAll("\\p{Digit}+", " ")
                .toLowerCase(Locale.ROOT)
                .split(" ");
    }

    /**
     * Decides whether a token goes into the word list: it must be longer than one character
     * (or be "i"), must not start with an apostrophe, and must not be "the", "tom", "mary"
     * or a form starting with "tom'" / "mary'".
     */
    private static boolean isCountable(String word) {
        if (word.length() <= 1 && !"i".equals(word)) {
            return false;
        }
        if (word.startsWith("'")) {
            return false;
        }
        return !"the".equals(word)
                && !"tom".equals(word)
                && !"mary".equals(word)
                && !word.startsWith("tom'")
                && !word.startsWith("mary'");
    }
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
package server; | |
import java.io.*; | |
import java.util.*; | |
import java.util.concurrent.LinkedBlockingQueue; | |
/**
 * Console trainer: the user enters words (or pastes text), the program picks example
 * sentences containing those words from the sentence base and shows them one by one,
 * revealing the Russian translation on request.
 *
 * <p>File formats, as read by this class: {@link #wordsBase} holds tab-separated
 * {@code id, word, rating} lines; {@link #dataBase} holds tab-separated
 * {@code id, language, English text, language, Russian text} lines; {@link #currentTraining}
 * is a scratch file of {@code English text TAB Russian text} lines written by this class.
 */
public class Practice {
    /** Number of sentences per studied word that go into one training session. */
    static int limitSentencesForOneWord = 5;
    /** Maximum number of candidate sentences collected per word before the random pick. */
    static int hundred = 50;
    /** Number of candidate sentences found so far for the word currently being searched. */
    static int countOfHundred;
    static String firstLang;
    static String secondLang;
    static Set<String> inputWords = new HashSet<>();
    static String dataBase = "EngRusSentences.txt";
    static String wordsBase = "EngWords.txt";
    static String currentTraining = "currentTraining.txt";
    static BufferedReader reader = new BufferedReader(new InputStreamReader(System.in));
    static Queue<String> queueOfStudiedWords = new LinkedBlockingQueue<>();
    static Map<String, List<String>> wordEngTextRusText = new LinkedHashMap<>();

    /** Maximum number of words studied in one training session. */
    private static final int MAX_STUDIED_WORDS = 100;
    /** Number of tab-separated columns a usable sentence line has. */
    private static final int SENTENCE_FIELD_COUNT = 5;

    /**
     * Runs one complete training session.
     *
     * @param args ignored
     * @throws IOException if one of the data files or the console cannot be read, or the
     *                     scratch file cannot be written
     */
    public static void main(String[] args) throws IOException {
        selectLanguage();
        selectWords();
        wordFilter();
        searchEngSentences();
        buffering();
        practice();
    }

    /**
     * Keeps only those entered words that exist in the word base and queues them in the order
     * they appear in that file, up to {@value #MAX_STUDIED_WORDS} words.
     */
    private static void wordFilter() throws IOException {
        try (BufferedReader fileReader = new BufferedReader(new FileReader(wordsBase))) {
            String line;
            while (queueOfStudiedWords.size() < MAX_STUDIED_WORDS
                    && (line = fileReader.readLine()) != null) {
                String[] fields = line.split("\\t");
                if (fields.length < 2) {
                    continue; // blank or malformed line: no word column
                }
                String wordInFile = fields[1];
                if (inputWords.contains(wordInFile)) {
                    queueOfStudiedWords.add(wordInFile);
                }
            }
        }
    }

    /**
     * The training itself. For every line of the scratch file: prints the English sentence,
     * waits for a line of input and prints the translation if that line is empty, then waits
     * for one more line before moving on. Stops early when the console input ends.
     */
    private static void practice() throws IOException {
        try (BufferedReader fileReader = new BufferedReader(new FileReader(currentTraining))) {
            String line;
            while ((line = fileReader.readLine()) != null) {
                String[] parts = line.split("\\t");
                if (parts.length < 2) {
                    continue; // not an "English TAB Russian" pair
                }
                System.out.print(parts[0]);
                String answer = reader.readLine();
                if (answer == null) {
                    break; // console closed: nobody is left to train
                }
                if (answer.isEmpty()) {
                    System.out.println(parts[1]);
                }
                if (reader.readLine() == null) {
                    break;
                }
            }
        } finally {
            reader.close();
        }
    }

    /**
     * Writes the sentences chosen for this session to the scratch file. A word with fewer than
     * {@link #limitSentencesForOneWord} candidates contributes all of them in their original
     * order; otherwise that many candidates are picked at random without repetition.
     */
    private static void buffering() throws IOException {
        Random rnd = new Random();
        try (PrintWriter printWriter = new PrintWriter(new File(currentTraining))) {
            for (List<String> engTextRusText : wordEngTextRusText.values()) {
                // Work on a copy so the candidate lists kept in the shared map stay intact.
                List<String> candidates = new ArrayList<>(engTextRusText);
                if (candidates.size() >= limitSentencesForOneWord) {
                    Collections.shuffle(candidates, rnd);
                }
                int take = Math.min(candidates.size(), limitSentencesForOneWord);
                for (int i = 0; i < take; i++) {
                    printWriter.println(candidates.get(i));
                }
            }
            // PrintWriter never throws on write failures; surface them explicitly.
            if (printWriter.checkError()) {
                throw new IOException("Failed to write " + currentTraining);
            }
        }
    }

    /**
     * For every studied word, collects the first {@link #hundred} sentences of the sentence
     * base that contain it and stores them in {@link #wordEngTextRusText}. The random pick
     * for the session is made later from these candidates, so trainings vary.
     */
    private static void searchEngSentences() throws IOException {
        for (String word : queueOfStudiedWords) {
            List<String> engTextRusText = new ArrayList<>();
            countOfHundred = 0;
            try (BufferedReader fileReader = new BufferedReader(new FileReader(dataBase))) {
                String line;
                while (countOfHundred < hundred && (line = fileReader.readLine()) != null) {
                    String sentenceWithTheWord = engSentencesMapper(word, line);
                    if (!sentenceWithTheWord.isEmpty()) {
                        engTextRusText.add(sentenceWithTheWord);
                        countOfHundred++;
                    }
                }
            }
            wordEngTextRusText.put(word, engTextRusText);
        }
    }

    /**
     * Checks one sentence-base line for the given word.
     *
     * @param word the studied word; compared case-insensitively against the sentence tokens
     * @param line one raw line of the sentence base
     * @return {@code English text TAB Russian text} if the line has exactly
     *         {@value #SENTENCE_FIELD_COUNT} columns and its English text contains the word,
     *         otherwise an empty string
     */
    private static String engSentencesMapper(String word, String line) {
        String[] fields = line.split("\\t");
        if (fields.length != SENTENCE_FIELD_COUNT) {
            return "";
        }
        String engText = fields[2];
        String rusText = fields[4];
        String wanted = word.toLowerCase(Locale.ROOT);
        for (String token : tokenize(engText)) {
            if (token.equals(wanted)) {
                return engText + "\t" + rusText;
            }
        }
        return "";
    }

    /**
     * Reads the words to study from the console until an empty line or the end of input;
     * every non-empty token of the entered text is added to {@link #inputWords}.
     */
    private static void selectWords() throws IOException {
        System.out.println("Введите слова для изучения или скопируйте текст. Для продолжения всегда нажимайте Enter.");
        String line;
        while ((line = reader.readLine()) != null && !line.isEmpty()) {
            for (String word : tokenize(line)) {
                if (!word.isEmpty()) {
                    inputWords.add(word);
                }
            }
        }
    }

    /**
     * Selects the language pair. Only Russian as the native language and English as the
     * studied language are set here; no user input is read.
     */
    private static void selectLanguage() {
        firstLang = "rus";
        secondLang = "eng";
    }

    /**
     * Splits text into lower-case tokens: runs of non-word characters (apostrophes that start
     * such a run are kept) and runs of digits become single spaces, then the text is split on
     * spaces. Empty tokens may appear in the result.
     */
    private static String[] tokenize(String text) {
        return text.replaceAll("(?!')\\W+", " ")
                .replaceAll("\\p{Digit}+", " ")
                .toLowerCase(Locale.ROOT)
                .split(" ");
    }
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment