Skip to content

Instantly share code, notes, and snippets.

@ryanswanstrom
Created March 29, 2012 03:17
Show Gist options
  • Save ryanswanstrom/2232902 to your computer and use it in GitHub Desktop.
Save ryanswanstrom/2232902 to your computer and use it in GitHub Desktop.
A Java class to do some very basic analysis of a list of data scientist job postings.
package com.swgoof.datascience.job;
import java.io.BufferedReader;
import java.io.FileNotFoundException;
import java.io.FileReader;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashMap;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.StringTokenizer;
/**
* Counts how many times each word appears in a file of data scientist job
* descriptions. A line starting with '--' marks the start of the next
* job description.
* It also keeps track of which job descriptions each word occurs in.
*/
public class JobAnalysis {

    /** Input file of job descriptions, resolved against the working directory. */
    private static final String INPUT_FILE = "datasciencejobs.txt";

    /**
     * Number of per-job slots given to every {@link WordObj}. Words seen in a
     * job whose index falls outside {@code [0, NUM_OF_JOBS)} are still counted,
     * but {@link WordObj#flagJob(int)} does not record that job for them.
     */
    private static final int NUM_OF_JOBS = 16;

    /**
     * Reads the job description file, counts every word, and prints one line
     * per distinct word (ordered by ascending occurrence count) followed by the
     * number of distinct words.
     *
     * @param args the command line arguments (unused)
     * @throws FileNotFoundException if the input file cannot be opened
     * @throws IOException if reading the input file fails
     */
    public static void main(String[] args) throws FileNotFoundException, IOException {
        Map<String, WordObj> words;
        BufferedReader in = new BufferedReader(new FileReader(INPUT_FILE));
        try {
            words = countWords(in, NUM_OF_JOBS);
        } finally {
            // Close the reader even when reading fails part-way through.
            in.close();
        }

        List<Map.Entry<String, WordObj>> sortedWords = sortForListForWordObj(words);
        for (Map.Entry<String, WordObj> entry : sortedWords) {
            System.out.printf("'%s' %s \n", entry.getKey(), entry.getValue());
        }
        System.out.println("total words: " + sortedWords.size());
    }

    /**
     * Tallies every word read from {@code in}.
     *
     * <p>The job index starts at 0 and is incremented by each line that starts
     * with "--"; such a line is itself tokenized under the new index. Each line
     * is lower-cased, dots are removed (so "e.g." becomes "eg"), and every
     * remaining non-alphanumeric character is treated as a word separator.
     *
     * @param in reader positioned at the start of the job descriptions
     * @param numOfJobs number of per-job slots for each created {@link WordObj}
     * @return map from word to its statistics; empty when the input has no lines
     * @throws IOException if reading from {@code in} fails
     */
    private static Map<String, WordObj> countWords(BufferedReader in, int numOfJobs) throws IOException {
        Map<String, WordObj> words = new HashMap<String, WordObj>(150);
        int jobCounter = 0;
        String line;
        // Test for end of input before touching the line, so an empty file
        // yields an empty map instead of a NullPointerException.
        while ((line = in.readLine()) != null) {
            if (line.startsWith("--")) {
                jobCounter++;
            }
            // Locale.ROOT keeps the ASCII letters intact whatever the default
            // locale is (a Turkish locale would map 'I' to a dotless i, which
            // the ASCII-only filter below would turn into a separator).
            line = line.toLowerCase(Locale.ROOT);
            // replace all dots with nothing
            line = line.replaceAll("\\.", "");
            // replace all special chars with space
            line = line.replaceAll("[^A-Za-z0-9]", " ");

            StringTokenizer sp = new StringTokenizer(line);
            while (sp.hasMoreTokens()) {
                String word = sp.nextToken();
                WordObj known = words.get(word);
                if (known != null) {
                    // The map holds a reference, so mutating in place is enough.
                    known.increment(jobCounter);
                } else {
                    // A new WordObj already starts with a count of one.
                    WordObj wordObj = new WordObj(numOfJobs);
                    wordObj.flagJob(jobCounter);
                    words.put(word, wordObj);
                }
            }
        }
        return words;
    }

    /**
     * Returns the entries of the given map as a list sorted by ascending
     * occurrence count ({@link WordObj#getNum()}).
     *
     * @param unsortMap the word statistics to sort; it is not modified
     * @return a new list holding every entry of {@code unsortMap}, least
     *         frequent word first
     */
    private static List<Map.Entry<String, WordObj>> sortForListForWordObj(Map<String, WordObj> unsortMap) {
        List<Map.Entry<String, WordObj>> sortedList =
                new ArrayList<Map.Entry<String, WordObj>>(unsortMap.entrySet());
        // sort list based on comparator
        Collections.sort(sortedList, new Comparator<Map.Entry<String, WordObj>>() {
            @Override
            public int compare(Map.Entry<String, WordObj> o1, Map.Entry<String, WordObj> o2) {
                return o1.getValue().getNum().compareTo(o2.getValue().getNum());
            }
        });
        return sortedList;
    }
}
/**
* A class to keep track of how many times a word occurs and which
* job posts it appears in.
*/
class WordObj {

    /** Running total of occurrences; a new instance stands for a word seen once. */
    private int occurrences;

    /** One slot per job index; {@code true} once the word was flagged for that job. */
    private final boolean[] seenInJob;

    /**
     * Creates the statistics for a word that has been seen once so far and is
     * not yet flagged for any job.
     *
     * @param numOfJobs how many job slots to track
     */
    public WordObj(int numOfJobs) {
        this.seenInJob = new boolean[numOfJobs];
        this.occurrences = 1;
    }

    /**
     * Records one more occurrence of the word and flags the given job.
     *
     * @param jobNum index of the job the occurrence was found in
     */
    public void increment(int jobNum) {
        occurrences += 1;
        flagJob(jobNum);
    }

    /**
     * Marks the word as present in the given job. Indexes outside the tracked
     * range are ignored.
     *
     * @param jobNum index of the job to flag
     */
    public void flagJob(int jobNum) {
        if (jobNum < 0 || jobNum >= seenInJob.length) {
            return;
        }
        seenInJob[jobNum] = true;
    }

    /**
     * @return a summary giving the occurrence count and the number of flagged jobs
     */
    @Override
    public String toString() {
        int jobsWithWord = 0;
        for (int i = 0; i < seenInJob.length; i++) {
            if (seenInJob[i]) {
                jobsWithWord++;
            }
        }
        return String.format(" occured %d times and in %d job descriptions" , getNum(), jobsWithWord);
    }

    /**
     * @return the number of times the word occurred
     */
    public Integer getNum() {
        return occurrences;
    }
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment