-
-
Save purijatin/c7b156af100412839802 to your computer and use it in GitHub Desktop.
some java code
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
package com.jp; | |
import java.io.BufferedReader; | |
import java.io.FileReader; | |
import java.security.MessageDigest; | |
import java.security.NoSuchAlgorithmException; | |
import java.util.*; | |
import java.util.stream.IntStream; | |
/*
 1) I personally prefer writing:
        HashMap<String, Integer> counter = new HashMap<>();
    over
        HashMap<String, Integer> counter = new HashMap<String, Integer>();
    Reason: it is less verbose. Given that Java is already so verbose, this gives some relief.
    The compiler's type inference automatically infers that the object is a map of `String` to `Integer`,
    so you never have to spell out the type arguments on the right-hand side. Technical reason:
    generics in Java are invariant, i.e. the line below is legal:
        List<Animal> ls = new ArrayList<Animal>();
    but these are not:
        List<Animal> ls = new ArrayList<Dog>();
        List<Dog> ls = new ArrayList<Animal>();
    Bluntly, the generic type arguments on the left and right of a parameterized class should be the same*.
    * Well, technically not always, in the case of wildcards. That is exactly why they are called wildcards :)
        HashMap<? extends Animal, ? super Dog> objectObjectHashMap = new HashMap<Animal, Dog>();
    I have written a blog post on this: http://purijatin.github.io/newsletters/generics/
    If you have time, please read it and give me feedback.
 */
/**
 * Reports the most frequent non-stop-words found in a pseudo-random sample of lines
 * taken from an input file.
 *
 * <p>The sample is driven by a {@link Random} seeded from the user name, so the same
 * user name and the same input file always produce the same result.
 */
public class MP1 {
    /** Number of line indices drawn per run. */
    private static final int SAMPLE_SIZE = 10000;
    /** Exclusive upper bound of the drawn indices; the input file must hold at least this many lines. */
    private static final int LINE_COUNT = 50000;
    /** Length of the array returned by {@link #process()}. */
    private static final int TOP_COUNT = 20;

    // Fields are final so that an instance can never be re-pointed at another user or file.
    final String userName;
    final String inputFileName;

    /** Characters that separate words; handed to {@link StringTokenizer}. */
    static final String delimiters = " \t,;.?!-:@[](){}_*/";
    /** Lower-case words that are ignored when counting. */
    static final String[] stopWordsArray = {"i", "me", "my", "myself", "we", "our", "ours", "ourselves", "you", "your", "yours",
            "yourself", "yourselves", "he", "him", "his", "himself", "she", "her", "hers", "herself", "it", "its",
            "itself", "they", "them", "their", "theirs", "themselves", "what", "which", "who", "whom", "this", "that",
            "these", "those", "am", "is", "are", "was", "were", "be", "been", "being", "have", "has", "had", "having",
            "do", "does", "did", "doing", "a", "an", "the", "and", "but", "if", "or", "because", "as", "until", "while",
            "of", "at", "by", "for", "with", "about", "against", "between", "into", "through", "during", "before",
            "after", "above", "below", "to", "from", "up", "down", "in", "out", "on", "off", "over", "under", "again",
            "further", "then", "once", "here", "there", "when", "where", "why", "how", "all", "any", "both", "each",
            "few", "more", "most", "other", "some", "such", "no", "nor", "not", "only", "own", "same", "so", "than",
            "too", "very", "s", "t", "can", "will", "just", "don", "should", "now"};
    /** {@link #stopWordsArray} as a set, for constant-time membership checks. */
    static final Set<String> stopWordsSet = new HashSet<>(Arrays.asList(stopWordsArray));

    /**
     * Creates a processor for one user and one input file.
     *
     * @param userName      seed text for the pseudo-random line selection
     * @param inputFileName path of the text file to sample from
     */
    public MP1(String userName, String inputFileName) {
        this.userName = userName;
        this.inputFileName = inputFileName;
    }

    /**
     * Builds a deterministic random generator from a textual seed.
     *
     * <p>Kept as a pure function (no field is written) so callers cannot observe or corrupt
     * shared generator state.
     *
     * @param seed text to derive the seed from; it is lower-cased and trimmed before hashing
     * @return a {@link Random} whose sequence depends only on {@code seed}
     * @throws NoSuchAlgorithmException if the {@code "SHA"} message digest is unavailable
     */
    Random initialRandomGenerator(String seed) throws NoSuchAlgorithmException {
        MessageDigest messageDigest = MessageDigest.getInstance("SHA");
        messageDigest.update(seed.toLowerCase().trim().getBytes());
        byte[] digest = messageDigest.digest();
        long longSeed = 0;
        for (int i = 0; i < digest.length; i++) {
            // A long shift only uses the low six bits of its distance, so bytes at index 8 and
            // above wrap around and are added onto the low-order positions again. This folding
            // is deliberately left as-is: changing it would change every generated sequence.
            longSeed += ((long) digest[i] & 0xffL) << (8 * i);
        }
        return new Random(longSeed);
    }

    /**
     * Draws {@link #SAMPLE_SIZE} line indices in the range {@code [0, LINE_COUNT)} with a plain loop.
     *
     * @return the drawn indices, in generation order
     * @throws NoSuchAlgorithmException if the seed digest is unavailable
     */
    private int[] getIndexes() throws NoSuchAlgorithmException {
        // Primitive ints are used throughout; boxing to Integer would buy nothing here.
        int[] ret = new int[SAMPLE_SIZE];
        Random generator = initialRandomGenerator(this.userName);
        for (int i = 0; i < ret.length; i++) {
            ret[i] = generator.nextInt(LINE_COUNT);
        }
        return ret;
    }

    /**
     * Stream-based equivalent of {@link #getIndexes()}; returns the same indices in the same order.
     *
     * @return the drawn indices, in generation order
     * @throws NoSuchAlgorithmException if the seed digest is unavailable
     */
    private int[] getIndexes2() throws NoSuchAlgorithmException {
        Random generator = initialRandomGenerator(this.userName);
        // IntStream.range yields SAMPLE_SIZE elements; IntStream.of(10000) would yield exactly one.
        // The stream is sequential, so the generator is consulted in encounter order.
        return IntStream.range(0, SAMPLE_SIZE).map(i -> generator.nextInt(LINE_COUNT)).toArray();
    }

    /**
     * Reads the input file, samples lines from it and ranks the words found in the sample.
     *
     * @return an array of length 20 holding the most frequent words, highest count first and
     *         alphabetical among equal counts; trailing slots are {@code null} when the sample
     *         contains fewer than 20 distinct words
     * @throws IndexOutOfBoundsException if a drawn index is not a valid line number of the file
     * @throws Exception                 if the file cannot be read or the seed digest is unavailable
     */
    public String[] process() throws Exception {
        int[] indices = getIndexes();

        // try-with-resources closes the reader even when reading fails part-way.
        List<String> lines = new ArrayList<>();
        try (BufferedReader reader = new BufferedReader(new FileReader(inputFileName))) {
            String line;
            while ((line = reader.readLine()) != null) {
                lines.add(line);
            }
        }

        // An index that was drawn more than once contributes its line that many times.
        HashMap<String, Integer> counter = new HashMap<>();
        for (int index : indices) {
            for (String word : splitWords(lines.get(index))) {
                counter.merge(word, 1, Integer::sum);
            }
        }

        String[] ret = new String[TOP_COUNT];
        int filled = 0;
        for (String word : sortByValue(counter).keySet()) {
            if (filled == ret.length) {
                break;
            }
            ret[filled++] = word;
        }
        return ret;
    }

    /**
     * Copies a word-count map into a map whose keys are ordered by descending count and,
     * for equal counts, alphabetically.
     *
     * <p>The comparator looks counts up in {@code map}, so the returned map may only be
     * queried with keys that are present in {@code map}.
     *
     * @param map word to occurrence count
     * @return a sorted copy of {@code map}
     */
    private TreeMap<String, Integer> sortByValue(HashMap<String, Integer> map) {
        // A lambda is enough here; a dedicated comparator class would only add noise.
        TreeMap<String, Integer> sortedMap = new TreeMap<>((left, right) -> {
            int byCount = Integer.compare(map.get(right), map.get(left));
            return byCount != 0 ? byCount : left.compareTo(right);
        });
        sortedMap.putAll(map);
        return sortedMap;
    }

    /**
     * Splits a line into lower-case words, dropping stop words.
     *
     * @param title the text to split on {@link #delimiters}
     * @return the remaining words in their original order; empty when nothing is left
     */
    public ArrayList<String> splitWords(String title) {
        StringTokenizer st = new StringTokenizer(title, delimiters);
        ArrayList<String> words = new ArrayList<>();
        while (st.hasMoreTokens()) {
            String token = st.nextToken().trim().toLowerCase();
            if (!stopWordsSet.contains(token)) {
                words.add(token);
            }
        }
        return words;
    }

    /**
     * Command-line entry point: {@code MP1 <User ID>}. Reads {@code ./input.txt} and prints one
     * result per line.
     *
     * @param args the user id as the first element (varargs, so it can also be called as
     *             {@code MP1.main("id")})
     * @throws Exception if processing fails
     */
    public static void main(String... args) throws Exception {
        if (args.length < 1) {
            System.out.println("MP1 <User ID>");
        } else {
            String userName = args[0];
            String inputFileName = "./input.txt";
            MP1 mp = new MP1(userName, inputFileName);
            String[] topItems = mp.process();
            for (String item : topItems) {
                System.out.println(item);
            }
        }
    }
}
// Removed ValueComparator because the comparator can be created anonymously inside the method.
prakhar1989
commented
Sep 3, 2015
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment