Created
September 1, 2015 05:04
-
-
Save anonymous/970048548d77f8b78c2f to your computer and use it in GitHub Desktop.
some java code
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import java.io.File; | |
import java.lang.Integer; | |
import java.lang.System; | |
import java.lang.reflect.Array; | |
import java.security.MessageDigest; | |
import java.util.Comparator; | |
import java.security.NoSuchAlgorithmException; | |
import java.util.*; | |
import java.io.BufferedReader; | |
import java.io.FileReader; | |
import java.util.InputMismatchException; | |
public class MP1 { | |
Random generator; | |
String userName; | |
String inputFileName; | |
static final String delimiters = " \t,;.?!-:@[](){}_*/"; | |
static final String[] stopWordsArray = {"i", "me", "my", "myself", "we", "our", "ours", "ourselves", "you", "your", "yours", | |
"yourself", "yourselves", "he", "him", "his", "himself", "she", "her", "hers", "herself", "it", "its", | |
"itself", "they", "them", "their", "theirs", "themselves", "what", "which", "who", "whom", "this", "that", | |
"these", "those", "am", "is", "are", "was", "were", "be", "been", "being", "have", "has", "had", "having", | |
"do", "does", "did", "doing", "a", "an", "the", "and", "but", "if", "or", "because", "as", "until", "while", | |
"of", "at", "by", "for", "with", "about", "against", "between", "into", "through", "during", "before", | |
"after", "above", "below", "to", "from", "up", "down", "in", "out", "on", "off", "over", "under", "again", | |
"further", "then", "once", "here", "there", "when", "where", "why", "how", "all", "any", "both", "each", | |
"few", "more", "most", "other", "some", "such", "no", "nor", "not", "only", "own", "same", "so", "than", | |
"too", "very", "s", "t", "can", "will", "just", "don", "should", "now"}; | |
static final Set<String> stopWordsSet = new HashSet<String>(Arrays.asList(stopWordsArray)); | |
void initialRandomGenerator(String seed) throws NoSuchAlgorithmException { | |
MessageDigest messageDigest = MessageDigest.getInstance("SHA"); | |
messageDigest.update(seed.toLowerCase().trim().getBytes()); | |
byte[] seedMD5 = messageDigest.digest(); | |
long longSeed = 0; | |
for (int i = 0; i < seedMD5.length; i++) { | |
longSeed += ((long) seedMD5[i] & 0xffL) << (8 * i); | |
} | |
this.generator = new Random(longSeed); | |
} | |
// not using this till now | |
Integer[] getIndexes() throws NoSuchAlgorithmException { | |
Integer n = 10000; | |
Integer number_of_lines = 50000; | |
Integer[] ret = new Integer[n]; | |
this.initialRandomGenerator(this.userName); | |
for (int i = 0; i < n; i++) { | |
ret[i] = generator.nextInt(number_of_lines); | |
} | |
return ret; | |
} | |
public MP1(String userName, String inputFileName) { | |
this.userName = userName; | |
this.inputFileName = inputFileName; | |
} | |
public String[] process() throws Exception { | |
String[] ret = new String[20]; | |
HashMap<String, Integer> counter = new HashMap<String, Integer>(); | |
// get the indices for the user id | |
Integer[] indices = this.getIndexes(); | |
Arrays.sort(indices); | |
// reading the file here | |
List<String> lines = new ArrayList<String>(); | |
BufferedReader br = new BufferedReader(new FileReader(inputFileName)); | |
String line = br.readLine(); | |
while (line != null) { | |
lines.add(line); | |
line = br.readLine(); | |
} | |
String[] validLines = new String[indices.length]; | |
for (int i=0; i < validLines.length; i++) { | |
validLines[i] = lines.get(indices[i]); | |
} | |
for (String topic: validLines) { | |
ArrayList<String> words = splitWords(topic); | |
for (String word: words) { | |
int count = counter.containsKey(word) ? counter.get(word) + 1 : 1; | |
counter.put(word, count); | |
} | |
} | |
// build a sorted map | |
// NOTE: Extremely shitty hack to get the first 20 | |
TreeMap<String, Integer> sortedMap = SortByValue(counter); | |
int index = 0; | |
for (Map.Entry<String, Integer> entry : sortedMap.entrySet()) { | |
if (index < 20) { | |
ret[index] = entry.getKey(); | |
index += 1; | |
} else { | |
break; | |
} | |
} | |
return ret; | |
} | |
private TreeMap<String, Integer> SortByValue(HashMap<String, Integer> map) { | |
ValueComparator vc = new ValueComparator(map); | |
TreeMap<String, Integer> sortedMap = new TreeMap<String, Integer>(vc); | |
sortedMap.putAll(map); | |
return sortedMap; | |
} | |
public ArrayList<String> splitWords(String title) { | |
StringTokenizer st = new StringTokenizer(title, delimiters); | |
ArrayList<String> words = new ArrayList<String>(); | |
while (st.hasMoreTokens()) { | |
String token = st.nextToken().trim().toLowerCase(); | |
if (!stopWordsSet.contains(token)) { | |
words.add(token); | |
} | |
} | |
return words; | |
} | |
public static void main(String[] args) throws Exception{ | |
if (args.length < 1){ | |
System.out.println("MP1 <User ID>"); | |
} | |
else { | |
String userName = args[0]; | |
String inputFileName = "./input.txt"; | |
MP1 mp = new MP1(userName, inputFileName); | |
String[] topItems = mp.process(); | |
for (String item: topItems){ | |
System.out.println(item); | |
} | |
} | |
} | |
} | |
/**
 * Orders string keys by the integer associated with them in a backing map: larger
 * values come first, and keys with equal values fall back to natural string order.
 *
 * <p>Both keys passed to {@link #compare} must be present in the backing map;
 * a missing key fails with a {@link NullPointerException} when its value is unboxed.
 */
class ValueComparator implements Comparator<String> {
    Map<String, Integer> map;

    /**
     * @param base map supplying the value for each key; it is referenced, not copied
     */
    public ValueComparator(Map<String, Integer> base) {
        this.map = base;
    }

    /**
     * @return -1 when {@code a} has the larger value, 1 when {@code b} has the larger
     *         value, otherwise the result of {@code a.compareTo(b)}
     */
    @Override
    public int compare(String a, String b) {
        int valueOfA = map.get(a);
        int valueOfB = map.get(b);
        if (valueOfA != valueOfB) {
            // Descending by value: the key with the larger value sorts first.
            return valueOfA > valueOfB ? -1 : 1;
        }
        // Equal values: order the keys alphabetically so distinct keys never tie.
        return a.compareTo(b);
    }
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment