Created
March 29, 2012 03:17
-
-
Save ryanswanstrom/2232902 to your computer and use it in GitHub Desktop.
A Java class to do some very basic analysis of a list of data scientist job postings.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
package com.swgoof.datascience.job; | |
import java.io.BufferedReader; | |
import java.io.FileNotFoundException; | |
import java.io.FileReader; | |
import java.io.IOException; | |
import java.util.ArrayList; | |
import java.util.Collections; | |
import java.util.Comparator; | |
import java.util.HashMap; | |
import java.util.List; | |
import java.util.Map; | |
import java.util.StringTokenizer; | |
/** | |
* This file is used to count the number of times a word appears in a file. | |
* The class reads a file of data scientist job descriptions, | |
* each separated by '--' | |
* It also keeps track of whether a word occurs in each job description. | |
*/ | |
public class JobAnalysis { | |
/** | |
* @param args the command line arguments | |
*/ | |
public static void main(String[] args) throws FileNotFoundException, IOException { | |
BufferedReader in = new BufferedReader(new FileReader("datasciencejobs.txt")); | |
int numOfJobs = 16; | |
int jobCounter = 0; | |
Map<String, WordObj> words = new HashMap<String, WordObj>(150); | |
String line = in.readLine(); | |
do { | |
if (line.startsWith("--")) { | |
jobCounter++; | |
} | |
line = line.toLowerCase(); | |
// replace all dots with nothing | |
line = line.replaceAll("\\.", ""); | |
// replace all special chars with space | |
line = line.replaceAll("[^A-Za-z0-9]", " "); | |
//System.out.println(line); | |
StringTokenizer sp = new StringTokenizer(line); | |
while (sp.hasMoreTokens()) { | |
String word = sp.nextToken(); | |
//System.out.println("word:: " + word); | |
WordObj tmp = words.get(word); | |
if (tmp != null) { | |
//System.out.println("found: " + word); | |
// inc counter | |
tmp.increment(jobCounter); | |
words.put(word, tmp); | |
} else { | |
WordObj wordObj = new WordObj(numOfJobs); | |
wordObj.flagJob(jobCounter); | |
words.put(word, wordObj); | |
} | |
} | |
line = in.readLine(); | |
} while (line != null); | |
in.close(); | |
//Map<String, Integer> sortedWords = sortForMap(words); | |
List<Map.Entry<String, WordObj>> sortedWords = sortForListForWordObj(words); | |
for (Map.Entry<String, WordObj> entry : sortedWords) { | |
System.out.printf("'%s' %s \n", entry.getKey(), entry.getValue()); | |
} | |
System.out.println("total words: " + sortedWords.size()); | |
} | |
/** | |
* This method will return a sorted List of WordObjs. | |
* | |
* @param unsortMap | |
* @return | |
*/ | |
private static List<Map.Entry<String, WordObj>> sortForListForWordObj(Map<String, WordObj> unsortMap) { | |
List<Map.Entry<String, WordObj>> sortedList = new ArrayList(unsortMap.entrySet()); | |
//sort list based on comparator | |
Collections.sort(sortedList, new Comparator() { | |
@Override | |
public int compare(Object o1, Object o2) { | |
return ((Comparable) ((WordObj)((Map.Entry) (o1)).getValue()).getNum()).compareTo(((WordObj)((Map.Entry) (o2)).getValue()).getNum()); | |
} | |
}); | |
return sortedList; | |
} | |
} | |
/**
 * Keeps track of how many times a word occurs and which job posts it
 * appears in.
 */
class WordObj {

    /**
     * Number of times the word occurs. Starts at 1: constructing the object
     * counts as the first occurrence.
     */
    private int num;

    /** {@code inJobs[i]} is {@code true} when the word was seen in job {@code i}. */
    private final boolean[] inJobs;

    /**
     * Creates the statistics for a word that has just been seen once.
     * No job is flagged yet; call {@link #flagJob(int)} for that.
     *
     * @param numOfJobs number of job slots to track
     */
    public WordObj(int numOfJobs) {
        this.num = 1;
        this.inJobs = new boolean[numOfJobs];
    }

    /**
     * Records one more occurrence of the word and flags the job it was seen in.
     *
     * @param jobNum index of the job description containing the occurrence
     */
    public void increment(int jobNum) {
        flagJob(jobNum);
        num++;
    }

    /**
     * Marks the word as present in the given job. Indices outside
     * {@code [0, numOfJobs)} are silently ignored.
     *
     * @param jobNum index of the job description
     */
    public void flagJob(int jobNum) {
        if (jobNum >= 0 && jobNum < inJobs.length) {
            inJobs[jobNum] = true;
        }
    }

    /**
     * @return a summary with the occurrence count and the number of flagged jobs
     */
    @Override
    public String toString() {
        int flagged = 0;
        for (boolean present : inJobs) {
            if (present) {
                flagged++;
            }
        }
        return String.format(" occured %d times and in %d job descriptions" , getNum(), flagged);
    }

    /**
     * @return the number of times the word occurs
     */
    public Integer getNum() {
        return num;
    }
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment