Created
March 1, 2012 08:34
-
-
Save alexxv/1948330 to your computer and use it in GitHub Desktop.
Date parser
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// Reads task titles from a TSV file, runs every title through the date parser, and for each
// date group found at position 0 of a title writes one tab-separated diagnostic row to
// task_dates_at_the_beginning.tsv. Every normalized title is also echoed to out.txt.
final Parser p = new Parser();
List<String> strings = Files.readLines(new File("c:\\users\\alexv\\Desktop\\task_titles.tsv"), Charset.defaultCharset());

// The five sets below are referenced only by the disabled filter that is kept as a comment
// inside the loop; they are retained so the filter can be re-enabled without rebuilding them.
final Set<Character> chars = newHashSet(' ', '@', ':', '-');
final Set<String> good_last_words = newHashSet("on", "@", "by", "due", "date", "untill", "til", "till", "at", "for", "before", "after", "in");
Set<String> text_firsts = newHashSet("today", "tomorrow", "this", "next");
final Set<String> syntaxTreeBlackList = newHashSet("(DATE_TIME_ALTERNATIVE (DATE_TIME (EXPLICIT_TIME (HOURS_OF_DAY ) (MINUTES_OF_HOUR ))))", "tomorrow", "this", "next");
final Set<String> syntaxTreeWhiteList = newHashSet("(DATE_TIME_ALTERNATIVE (DATE_TIME (RELATIVE_DATE (SEEK > by_day (DAY_OF_WEEK )))))",
        "(DATE_TIME_ALTERNATIVE (DATE_TIME (EXPLICIT_DATE (MONTH_OF_YEAR ) (DAY_OF_MONTH ))))",
        "(DATE_TIME_ALTERNATIVE (DATE_TIME (RELATIVE_DATE (SEEK > by_day day))))",
        "(DATE_TIME_ALTERNATIVE (DATE_TIME (EXPLICIT_TIME (HOURS_OF_DAY ) (MINUTES_OF_HOUR ) pm)))",
        //"(DATE_TIME_ALTERNATIVE (DATE_TIME (EXPLICIT_TIME (HOURS_OF_DAY ) (MINUTES_OF_HOUR ) am)))",
        "(DATE_TIME_ALTERNATIVE (DATE_TIME (RELATIVE_DATE (SEEK > by_day (MONTH_OF_YEAR )))))",
        "(DATE_TIME_ALTERNATIVE (DATE_TIME (EXPLICIT_TIME (HOURS_OF_DAY ) (MINUTES_OF_HOUR ) (SECONDS_OF_MINUTE ) pm)))",
        "(DATE_TIME_ALTERNATIVE (DATE_TIME (EXPLICIT_DATE (MONTH_OF_YEAR ) (DAY_OF_MONTH ) (YEAR_OF ))))",
        "(DATE_TIME_ALTERNATIVE (DATE_TIME (RELATIVE_DATE (SEEK > by_day (DAY_OF_WEEK ))) (EXPLICIT_TIME (HOURS_OF_DAY ) (MINUTES_OF_HOUR ))))",
        "(DATE_TIME_ALTERNATIVE (DATE_TIME (EXPLICIT_DATE (MONTH_OF_YEAR ) (DAY_OF_MONTH )) (EXPLICIT_TIME (HOURS_OF_DAY ) (MINUTES_OF_HOUR ))))");

int cnt = 0;     // number of input lines processed so far
int failed = 0;  // number of lines whose parsing/inspection threw a non-I/O exception
long start = System.currentTimeMillis();

// Nested try/finally guarantees both writers are closed even when a write or the second
// constructor throws.
final BufferedWriter bw = new BufferedWriter(new FileWriter("c:\\users\\alexv\\Desktop\\task_dates_at_the_beginning.tsv"));
try {
    final BufferedWriter bw2 = new BufferedWriter(new FileWriter("c:\\users\\alexv\\Desktop\\out.txt"));
    try {
        for (String string : strings) {
            // Tabs are replaced so the title cannot break the tab-separated output row.
            String line = string.trim().replace("\t", " ");
            cnt++;
            if (cnt % 10000 == 0) {
                System.out.println(cnt); // progress indicator
            }
            bw2.write(line + "\n");
            try {
                for (DateGroup dateGroup : p.parse(line)) {
                    // Only date groups that start the title are reported.
                    if (dateGroup.getPosition() != 0) {
                        continue;
                    }
                    String text = dateGroup.getText();
                    String[] text_words = text.split("\\s+");
                    String text_first_word = text_words.length > 0 ? text_words[0] : "";
                    String text_last_word = text_words.length > 0 ? text_words[text_words.length - 1] : "";

                    // Character immediately following the matched text, or '\0' when the match
                    // runs to the end of the line. (The original comparison was inverted, so
                    // this column was never populated.)
                    char next_char = text.length() < line.length() ? line.charAt(text.length()) : '\0';

                    // First word after the matched text. The remainder is trimmed before
                    // splitting: a leading separator would otherwise make split() return an
                    // empty first element.
                    String[] words = line.substring(text.length()).trim().split("\\s+");
                    String first_word = words.length > 0 ? words[0] : "";

                    String syntaxTree = dateGroup.getSyntaxTree().toStringTree();
                    // Digits removed and "am" rewritten to "pm" so trees differing only in
                    // those details collapse to the same string.
                    String syntaxTreeClean = syntaxTree.replaceAll("\\d+", "").replace("am", "pm").trim();

                    // Disabled filter, kept for reference:
                    // if (chars.contains(last_char)) { // space : - , @
                    // if (syntaxTreeBlackList.contains(syntaxTree))
                    //     continue;
                    // if (last_words.contains(last_word.toLowerCase()) || last_char == '@' || text_firsts.contains(text_first_word.toLowerCase()))
                    // if (syntaxTreeWhiteList.contains(syntaxTree)) ||
                    //     good_last_words.contains(last_word.toLowerCase()) ||
                    //     last_char == '@' ||
                    //     (syntaxTree.equals("(DATE_TIME_ALTERNATIVE (DATE_TIME (EXPLICIT_TIME (HOURS_OF_DAY ) (MINUTES_OF_HOUR ) (SECONDS_OF_MINUTE ) pm)))") && (text_first_word.equals("this") || text_first_word.equals("tonight") || text_first_word.equals("in")))
                    // ) {
                    bw.write(text + "\t" + line + "\t" + first_word + "\t" + text_first_word + "\t" + text_last_word + "\t" + next_char + "\t" + syntaxTree + "\t" + syntaxTreeClean + "\n");
                }
            } catch (IOException e) {
                // A failure to write the output file is not a per-line problem; surface it
                // the same way the bw2.write above does.
                throw e;
            } catch (Exception e) {
                // Best effort: a title that cannot be parsed or inspected is skipped, but
                // counted so the failures are visible in the summary.
                failed++;
            }
        }
    } finally {
        bw2.close();
    }
} finally {
    bw.close();
}
System.err.println("Processed " + cnt + " lines in " + (System.currentTimeMillis() - start) + " ms; " + failed + " skipped after errors");
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment