Created
October 25, 2017 15:49
-
-
Save vi3k6i5/00a2a7ddd36e1de28ccc019da6185aed to your computer and use it in GitHub Desktop.
Benchmarking the timing performance of keyword extraction using regex in Java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// Compare the results with FlashText here: https://gist.github.com/vi3k6i5/604eefd92866d081cfa19f862224e4a0
import java.util.regex.*; | |
import java.lang.StringBuilder; | |
import java.util.*; | |
/**
 * Micro-benchmark that times how long a regex alternation of whole-word keywords takes to scan a
 * randomly generated text, for a growing number of keywords.
 *
 * <p>The text ("story") is 5000 random words joined by single spaces. For each keyword count
 * 1, 1001, 2001, ..., 20001 a pattern of the form {@code \bw1\b|\bw2\b|...} is compiled and every
 * match in the story is consumed with {@link Matcher#find()}. Only the matching loop is timed;
 * pattern compilation is excluded.
 */
public class RegexBenchmark {

    /** Characters that random words are built from. */
    private static final String SALT_CHARS = "abcdefghijklmnopqrstuvwxyz1234567890";

    /** Candidate word lengths; every generated word uses one of these, chosen uniformly. */
    private static final int[] WORD_LENGTHS = {3, 4, 5, 6, 7, 8};

    /** Number of random words in the vocabulary. */
    private static final int WORD_COUNT = 100000;

    /** Number of vocabulary words that make up the text being searched. */
    private static final int STORY_WORD_COUNT = 5000;

    /** Largest keyword count that is benchmarked (inclusive). */
    private static final int MAX_KEYWORDS = 20001;

    /** Increment between successive keyword counts. */
    private static final int KEYWORD_STEP = 1000;

    private static final long NANOS_PER_MILLI = 1000000L;

    /** Single shared generator; avoids allocating a new {@link Random} per generated word. */
    private static final Random RANDOM = new Random();

    /**
     * Builds a random word out of lowercase ASCII letters and digits.
     *
     * @param length number of characters to generate; zero or negative yields an empty string
     * @return a random string of exactly {@code length} characters (empty if {@code length <= 0})
     */
    public static String getWordOfLength(int length) {
        StringBuilder word = new StringBuilder(Math.max(length, 0));
        while (word.length() < length) {
            // nextInt(bound) is uniform over [0, bound) and cannot round up to bound.
            word.append(SALT_CHARS.charAt(RANDOM.nextInt(SALT_CHARS.length())));
        }
        return word.toString();
    }

    /**
     * Generates the benchmark vocabulary: 100000 random words whose lengths are drawn uniformly
     * from 3 to 8 characters. Words are not de-duplicated.
     *
     * <p>Python equivalent:
     * {@code [get_word_of_length(random.choice([3, 4, 5, 6, 7, 8])) for i in range(100000)]}
     *
     * @return a new mutable list of random words
     */
    public static ArrayList<String> getWordsList() {
        ArrayList<String> words = new ArrayList<String>(WORD_COUNT);
        for (int i = 0; i < WORD_COUNT; i++) {
            int length = WORD_LENGTHS[RANDOM.nextInt(WORD_LENGTHS.length)];
            words.add(getWordOfLength(length));
        }
        return words;
    }

    /**
     * Picks {@code n} elements at distinct positions of {@code list}, uniformly at random.
     *
     * <p>The input list is left untouched and the result is an independent list, so later calls
     * cannot alter a previously returned selection. Elements that occur more than once in
     * {@code list} may occur more than once in the result.
     *
     * @param list the elements to choose from
     * @param n how many elements to pick
     * @return a new list of {@code n} elements, or {@code null} if {@code list} holds fewer than
     *     {@code n} elements
     * @throws IllegalArgumentException if {@code n} is negative
     */
    public static List<String> pickNRandomElements(ArrayList<String> list, int n) {
        int length = list.size();
        if (n < 0) {
            throw new IllegalArgumentException("n must be non-negative: " + n);
        }
        if (length < n) {
            return null;
        }
        // Work on a private copy so the caller's list keeps its order.
        List<String> pool = new ArrayList<String>(list);
        // Partial Fisher-Yates shuffle: only the last n slots need to be randomized.
        for (int i = length - 1; i >= length - n; --i) {
            Collections.swap(pool, i, RANDOM.nextInt(i + 1));
        }
        // Copy the tail so the result does not keep the whole pool reachable.
        return new ArrayList<String>(pool.subList(length - n, length));
    }

    /**
     * Concatenates the elements of {@code list}, placing {@code delim} between neighbours.
     *
     * @param list the strings to join, in iteration order
     * @param delim the separator inserted between consecutive elements
     * @return the joined string; empty if {@code list} is empty
     */
    public static String join(List<String> list, String delim) {
        StringBuilder joined = new StringBuilder();
        String separator = "";
        for (String element : list) {
            joined.append(separator).append(element);
            separator = delim;
        }
        return joined.toString();
    }

    /**
     * Runs the benchmark and prints, in order: the vocabulary size, the number of words in the
     * story, the story length in characters, and then one
     * {@code "<keyword count> execution time: <milliseconds>"} line per keyword count.
     *
     * @param args ignored
     */
    public static void main(String[] args) {
        ArrayList<String> wordList = getWordsList();
        System.out.println(wordList.size());

        List<String> chosenWords = pickNRandomElements(wordList, STORY_WORD_COUNT);
        System.out.println(chosenWords.size());

        String story = join(chosenWords, " ");
        System.out.println(story.length());

        for (int keywordsLength = 1; keywordsLength <= MAX_KEYWORDS; keywordsLength += KEYWORD_STEP) {
            List<String> keywords = pickNRandomElements(wordList, keywordsLength);
            Pattern pattern = Pattern.compile(buildKeywordPattern(keywords));
            long elapsedMillis = timeFindAll(pattern, story);
            System.out.println(keywordsLength + " execution time: " + elapsedMillis);
        }
    }

    /** Builds the alternation {@code \bk1\b|\bk2\b|...}; keywords are inserted unescaped. */
    private static String buildKeywordPattern(List<String> keywords) {
        StringBuilder regex = new StringBuilder();
        for (String keyword : keywords) {
            if (regex.length() > 0) {
                regex.append('|');
            }
            regex.append("\\b").append(keyword).append("\\b");
        }
        return regex.toString();
    }

    /** Returns the milliseconds spent consuming every match of {@code pattern} in {@code text}. */
    private static long timeFindAll(Pattern pattern, String text) {
        Matcher matcher = pattern.matcher(text);
        // nanoTime is monotonic, unlike the wall clock, so it is safe for elapsed-time measurement.
        long start = System.nanoTime();
        while (matcher.find()) {
            // Nothing to do per match; only the total scan time is of interest.
        }
        return (System.nanoTime() - start) / NANOS_PER_MILLI;
    }
}
// keywords_length / time in milliseconds
// 1 execution time: 7
// 1001 execution time: 425
// 2001 execution time: 718
// 3001 execution time: 1084
// 4001 execution time: 1461
// 5001 execution time: 1791
// 6001 execution time: 2257
// 7001 execution time: 2655
// 8001 execution time: 3048
// 9001 execution time: 3417
// 10001 execution time: 3744
// 11001 execution time: 4092
// 12001 execution time: 4427
// 13001 execution time: 4724
// 14001 execution time: 5057
// 15001 execution time: 5204
// 16001 execution time: 5494
// 17001 execution time: 5777
// 18001 execution time: 6049
// 19001 execution time: 6419
// 20001 execution time: 6620
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment