Created
September 1, 2015 14:37
-
-
Save saptak/b89d5d28702f3cfcc833 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import java.io.BufferedReader; | |
import java.io.File; | |
import java.io.FileReader; | |
import java.lang.reflect.Array; | |
import java.security.MessageDigest; | |
import java.security.NoSuchAlgorithmException; | |
import java.util.*; | |
/**
 * Reports the 20 most frequent words in a pseudo-random sample of lines taken from an input file.
 *
 * <p>The sample is driven by a {@link Random} seeded from the SHA digest of the user name, so the
 * same user name always selects the same line indexes. Words are split on {@link #delimiters},
 * trimmed, lower-cased, and filtered against {@link #stopWordsArray} before being counted.
 */
public class MP1 {
    /** Number of line indexes drawn for the sample. */
    private static final int SAMPLE_SIZE = 10000;
    /** Exclusive upper bound of the drawn line indexes; the input needs at least this many lines. */
    private static final int NUMBER_OF_LINES = 50000;
    /** Maximum number of words returned by {@link #process()}. */
    private static final int TOP_COUNT = 20;

    Random generator;
    String userName;
    String inputFileName;
    String delimiters = " \t,;.?!-:@[](){}_*/";
    String[] stopWordsArray = {"i", "me", "my", "myself", "we", "our", "ours", "ourselves", "you", "your", "yours",
            "yourself", "yourselves", "he", "him", "his", "himself", "she", "her", "hers", "herself", "it", "its",
            "itself", "they", "them", "their", "theirs", "themselves", "what", "which", "who", "whom", "this", "that",
            "these", "those", "am", "is", "are", "was", "were", "be", "been", "being", "have", "has", "had", "having",
            "do", "does", "did", "doing", "a", "an", "the", "and", "but", "if", "or", "because", "as", "until", "while",
            "of", "at", "by", "for", "with", "about", "against", "between", "into", "through", "during", "before",
            "after", "above", "below", "to", "from", "up", "down", "in", "out", "on", "off", "over", "under", "again",
            "further", "then", "once", "here", "there", "when", "where", "why", "how", "all", "any", "both", "each",
            "few", "more", "most", "other", "some", "such", "no", "nor", "not", "only", "own", "same", "so", "than",
            "too", "very", "s", "t", "can", "will", "just", "don", "should", "now"};

    /**
     * Seeds {@link #generator} from the SHA digest of the lower-cased, trimmed seed string.
     *
     * <p>NOTE: the folding below is intentionally left exactly as written. Shift distances on a
     * {@code long} wrap modulo 64, so digest bytes past the eighth are added back onto the low
     * bytes. Any change here (including the charset used by {@code getBytes()}) would change the
     * generated sample.
     *
     * @param seed text the random sequence is derived from
     * @throws NoSuchAlgorithmException if the "SHA" digest is not available
     */
    void initialRandomGenerator(String seed) throws NoSuchAlgorithmException {
        MessageDigest messageDigest = MessageDigest.getInstance("SHA");
        messageDigest.update(seed.toLowerCase().trim().getBytes());
        byte[] seedDigest = messageDigest.digest();

        long longSeed = 0;
        for (int i = 0; i < seedDigest.length; i++) {
            longSeed += ((long) seedDigest[i] & 0xffL) << (8 * i);
        }

        this.generator = new Random(longSeed);
    }

    /**
     * Re-seeds the generator from {@link #userName} and draws the sample of line indexes.
     *
     * @return {@value #SAMPLE_SIZE} indexes, each in {@code [0, NUMBER_OF_LINES)}; duplicates are possible
     * @throws NoSuchAlgorithmException if the "SHA" digest is not available
     */
    Integer[] getIndexes() throws NoSuchAlgorithmException {
        Integer[] ret = new Integer[SAMPLE_SIZE];

        this.initialRandomGenerator(this.userName);
        for (int i = 0; i < SAMPLE_SIZE; i++) {
            ret[i] = generator.nextInt(NUMBER_OF_LINES);
        }
        return ret;
    }

    /**
     * @param userName      seed for the line sample
     * @param inputFileName path of the text file to analyse
     */
    public MP1(String userName, String inputFileName) {
        this.userName = userName;
        this.inputFileName = inputFileName;
    }

    /**
     * Reads the whole file and returns the lines selected by {@link #getIndexes()}, in draw order.
     *
     * <p>Failures are propagated instead of being converted to a {@code null} result, and the
     * reader is closed even when reading fails.
     *
     * @param filename path of the file to read
     * @return the sampled lines (a line appears once per time its index was drawn)
     * @throws java.io.IOException      if the file cannot be read or has fewer lines than a drawn index
     * @throws NoSuchAlgorithmException if the "SHA" digest is not available
     */
    private List<String> readFile(String filename) throws java.io.IOException, NoSuchAlgorithmException {
        List<String> records = new ArrayList<String>();

        BufferedReader reader = new BufferedReader(new FileReader(filename));
        try {
            String line;
            while ((line = reader.readLine()) != null) {
                records.add(line);
            }
        } finally {
            reader.close();
        }

        Integer[] valLines = this.getIndexes();
        List<String> recordsSmall = new ArrayList<String>(valLines.length);
        for (int index : valLines) {
            if (index >= records.size()) {
                throw new java.io.IOException("'" + filename + "' has only " + records.size()
                        + " lines but line index " + index + " was requested");
            }
            recordsSmall.add(records.get(index));
        }
        return recordsSmall;
    }

    /**
     * Returns a copy of the map ordered by value descending, with equal values ordered by key
     * ascending.
     *
     * <p>Values are compared with {@code compareTo} rather than {@code ==}: boxed integers
     * outside the small-value cache are distinct objects, so reference comparison would skip the
     * key tie-break for larger counts.
     *
     * @param unsortMap map whose keys and values are mutually {@link Comparable}
     * @return a {@link LinkedHashMap} iterating in the described order
     */
    @SuppressWarnings({ "unchecked", "rawtypes" })
    public static Map sortByDescValueAscKey(Map unsortMap) {
        List<Map.Entry> entries = new ArrayList<Map.Entry>(unsortMap.entrySet());

        Collections.sort(entries, new Comparator<Map.Entry>() {
            @Override
            public int compare(Map.Entry left, Map.Entry right) {
                // Right-to-left comparison yields descending values.
                int byValue = ((Comparable) right.getValue()).compareTo(left.getValue());
                if (byValue != 0) {
                    return byValue;
                }
                return ((Comparable) left.getKey()).compareTo(right.getKey());
            }
        });

        Map sortedMap = new LinkedHashMap();
        for (Map.Entry entry : entries) {
            sortedMap.put(entry.getKey(), entry.getValue());
        }
        return sortedMap;
    }

    /**
     * Counts the non-stop-words in the sampled lines and returns the most frequent ones.
     *
     * @return an array of length {@value #TOP_COUNT} holding the words ordered by count descending
     *         then alphabetically; trailing slots are {@code null} when fewer distinct words exist
     * @throws Exception if the input file cannot be read or the digest is unavailable
     */
    @SuppressWarnings({ "rawtypes", "unchecked" })
    public String[] process() throws Exception {
        String[] ret = new String[TOP_COUNT];

        List<String> lines = this.readFile(this.inputFileName);

        // Built once so each token costs a hash lookup instead of a linear scan.
        Set<String> stopWords = new HashSet<String>(Arrays.asList(stopWordsArray));

        // Only words that actually occur are entered, so no zero-count placeholder can leak
        // into the result.
        Map<String, Integer> groupedWords = new HashMap<String, Integer>();
        for (String line : lines) {
            StringTokenizer st = new StringTokenizer(line, delimiters);
            while (st.hasMoreTokens()) {
                String word = st.nextToken().trim().toLowerCase();
                if (stopWords.contains(word)) {
                    continue;
                }
                Integer seen = groupedWords.get(word);
                groupedWords.put(word, seen == null ? 1 : seen + 1);
            }
        }

        Map<String, Integer> descGroupedWords = sortByDescValueAscKey(groupedWords);

        int n = 0;
        for (String word : descGroupedWords.keySet()) {
            if (n == TOP_COUNT) {
                break;
            }
            ret[n++] = word;
        }
        return ret;
    }

    /**
     * Command-line entry point: {@code MP1 <User ID>}. Reads {@code ./input.txt} and prints one
     * result per line.
     */
    public static void main(String[] args) throws Exception {
        if (args.length < 1){
            System.out.println("MP1 <User ID>");
        }
        else {
            String userName = args[0];
            String inputFileName = "./input.txt";
            MP1 mp = new MP1(userName, inputFileName);
            String[] topItems = mp.process();
            for (String item: topItems){
                System.out.println(item);
            }
        }
    }
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment