-
-
Save lovromazgon/304b6b65cb07c884a80b to your computer and use it in GitHub Desktop.
Java implementation of the Apriori algorithm for mining frequent itemsets - using multiple threads (default 8)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import java.io.*; | |
import java.util.*; | |
/** The class encapsulates an implementation of the Apriori algorithm to compute frequent itemsets. | |
* | |
* This version uses multiple threads to go through the dataset and is therefore 3x faster than the 1 thread version. | |
* | |
* Notice: To enable progress tracking (CLIProgressBar) follow these steps: | |
* 1. download this class and put it on the classpath: https://gist.github.com/lovromazgon/9c801554ceb56157de30 | |
* 2. uncomment lines 229 and 243 | |
* | |
* Usage as library: see {@link ExampleOfClientCodeOfApriori} | |
* | |
* @author Lovro Mazgon, implemented multithreaded version, 2015 | |
* Original authors: | |
* @author Martin Monperrus, University of Darmstadt, 2010 | |
* @author Nathan Magnus and Su Yibin, under the supervision of Howard Hamilton, University of Regina, June 2009. | |
* @copyright GNU General Public License v3 | |
* No reproduction in whole or part without maintaining this copyright notice | |
* and imposing this condition on any subsequent users. | |
*/ | |
public class Apriori extends Observable { | |
private static final int NUMBER_OF_THREADS = 8; | |
/** all input items */ | |
private final Iterable<int[]> allItems; | |
/** the list of current itemsets */ | |
private List<int[]> itemsets ; | |
/** set of distinct items in the dataset */ | |
private List<Integer> distinctItems; | |
/** number of different items in the dataset */ | |
private int numItems; | |
/** total number of transactions in transaFile */ | |
private int numTransactions; | |
/** minimum support for a frequent itemset in percentage, e.g. 0.8 */ | |
private double minSup; | |
/** This is the main interface to use this class as a library */ | |
public Apriori(Observer ob, Iterable<int[]> itemsets, double minSupport) { | |
allItems = itemsets; | |
configure(allItems, minSupport); | |
this.addObserver(ob); | |
} | |
/** computes numItems, numTransactions, and sets minSup */ | |
private void configure(Iterable<int[]> allItems, double minSupport) { | |
// setting minsupport | |
minSup = minSupport; | |
if (minSup>1 || minSup<0) throw new RuntimeException("minSup: bad value"); | |
// going thourgh the itemsets to compute distinctItems and numTransactions | |
numTransactions = 0; | |
Set<Integer> itemsSet = new HashSet<>(); | |
for (int[] itemset : allItems) { | |
numTransactions++; | |
for (int i : itemset) { | |
itemsSet.add(i); | |
} | |
} | |
this.distinctItems = new ArrayList<>(itemsSet); | |
// setting numItems | |
numItems = distinctItems.size(); | |
outputConfig(); | |
} | |
/** outputs the current configuration */ | |
private void outputConfig() { | |
//output config info to the user | |
log("Input configuration: " + numItems + " items, " + numTransactions + " transactions, "); | |
log("minsup = " + (minSup*100) + "%"); | |
} | |
/** starts the algorithm after configuration */ | |
public void run() throws Exception { | |
//start timer | |
long start = System.currentTimeMillis(); | |
// first we generate the candidates of size 1 | |
createItemsetsOfSize1(); | |
int itemsetNumber = 1; //the current itemset being looked at | |
int nbFrequentSets = 0; | |
while (itemsets.size() > 0) { | |
calculateFrequentItemsets(); | |
if (itemsets.size()!=0) { | |
nbFrequentSets += itemsets.size(); | |
log("Found " + itemsets.size() + " frequent itemsets of size " + itemsetNumber + " (with support " + (minSup*100) + "%)"); | |
createNewItemsetsFromPreviousOnes(); | |
} | |
itemsetNumber++; | |
} | |
//display the execution time | |
long end = System.currentTimeMillis(); | |
log("Execution time is: "+((double)(end-start)/1000) + " seconds."); | |
log("Found "+nbFrequentSets+ " frequents sets for support "+(minSup*100)+"% (absolute "+Math.round(numTransactions*minSup)+")"); | |
log("Done"); | |
} | |
/** puts in itemsets all sets of size 1, | |
* i.e. all possibles items of the datasets | |
*/ | |
private void createItemsetsOfSize1() { | |
itemsets = new ArrayList<int[]>(); | |
for(int i=0; i<numItems; i++) | |
{ | |
int[] cand = {i}; | |
itemsets.add(cand); | |
} | |
} | |
/** triggers actions if a frequent item set has been found */ | |
private void foundFrequentItemSet(int[] itemset, int support) { | |
int[] convertedItemset = new int[itemset.length]; | |
for (int i = 0; i < itemset.length; i++) { | |
convertedItemset[i] = distinctItems.get(itemset[i]); | |
} | |
this.setChanged(); | |
notifyObservers(convertedItemset); | |
System.out.println(Arrays.toString(convertedItemset) + " (" + ((support / (double) numTransactions))+" " + support + ")"); | |
} | |
/** outputs a message in Sys.err if not used as library */ | |
private void log(String message) { | |
System.err.println(message); | |
} | |
/** | |
* if m is the size of the current itemsets, | |
* generate all possible itemsets of size n+1 from pairs of current itemsets | |
* replaces the itemsets of itemsets by the new ones | |
*/ | |
private void createNewItemsetsFromPreviousOnes() | |
{ | |
// by construction, all existing itemsets have the same size | |
int currentSizeOfItemsets = itemsets.get(0).length; | |
log("Creating itemsets of size " + (currentSizeOfItemsets+1) + " based on " + itemsets.size() + " itemsets of size " + currentSizeOfItemsets); | |
HashMap<String, int[]> tempCandidates = new HashMap<>(); //temporary candidates | |
// compare each pair of itemsets of size n-1 | |
for(int i = 0; i < itemsets.size(); i++) { | |
for(int j = i+1; j < itemsets.size(); j++) { | |
int[] X = itemsets.get(i); | |
int[] Y = itemsets.get(j); | |
assert(X.length == Y.length); | |
//make a string of the first n-2 tokens of the strings | |
int[] newCand = new int[currentSizeOfItemsets+1]; | |
for (int s=0; s < newCand.length-1; s++) { | |
newCand[s] = X[s]; | |
} | |
int ndifferent = 0; | |
// then we find the missing value | |
for (int s1=0; s1 < Y.length; s1++) { | |
boolean found = false; | |
// is Y[s1] in X? | |
for(int s2=0; s2 < X.length; s2++) { | |
if (X[s2]==Y[s1]) { | |
found = true; | |
break; | |
} | |
} | |
if (!found){ // Y[s1] is not in X | |
ndifferent++; | |
// we put the missing value at the end of newCand | |
newCand[newCand.length -1] = Y[s1]; | |
} | |
} | |
// we have to find at least 1 different, otherwise it means that we have two times the same set in the existing candidates | |
assert(ndifferent>0); | |
if (ndifferent==1) { | |
// HashMap does not have the correct "equals" for int[] :-( | |
// I have to create the hash myself using a String :-( | |
// I use Arrays.toString to reuse equals and hashcode of String | |
Arrays.sort(newCand); | |
tempCandidates.put(Arrays.toString(newCand),newCand); | |
} | |
} | |
} | |
//set the new itemsets | |
itemsets = new ArrayList<>(tempCandidates.values()); | |
log("Created " + itemsets.size() + " unique itemsets of size "+(currentSizeOfItemsets+1)); | |
} | |
/** put "true" in trans[i] if the integer i is in line */ | |
private boolean[] itemsetToBooleanArray(int[] itemset) { | |
boolean[] trans = new boolean[numItems]; | |
for (int i : itemset) { | |
trans[distinctItems.indexOf(i)] = true; | |
} | |
return trans; | |
} | |
/** passes through the data to measure the frequency of sets in itemsets, | |
* then filters thoses who are under the minimum support (minSup) | |
*/ | |
private void calculateFrequentItemsets() throws Exception | |
{ | |
log("Passing through the data to compute the frequency of " + itemsets.size() + " itemsets of size " + itemsets.get(0).length); | |
List<int[]> frequentCandidates = new ArrayList<>(); //the frequent candidates for the current itemset | |
int count[] = new int[itemsets.size()]; //the number of successful matches, initialized by zeros | |
// create worker threads | |
Iterator<int[]> iterator = allItems.iterator(); | |
List<WorkerThread> threads = new ArrayList<>(); | |
for( int i = 0; i < NUMBER_OF_THREADS; i++) { | |
threads.add(new WorkerThread(count, iterator)); | |
} | |
// start worker threads | |
threads.forEach(WorkerThread::start); | |
// CLIProgressBar progressBar = new CLIProgressBar(numTransactions); | |
boolean running = true; | |
int progress; | |
while(running) { | |
//check if all are still running | |
running = false; | |
progress = 0; | |
for (WorkerThread wt : threads) { | |
progress += wt.getNumberOfProcessedItemsets(); | |
if (!running && wt.isAlive()) { | |
running = true; | |
} | |
} | |
// progressBar.print(progress); | |
Thread.sleep(5000); | |
} | |
for (int i = 0; i < itemsets.size(); i++) { | |
// if the count% is larger than the minSup%, add to the candidate to | |
// the frequent candidates | |
if ((count[i] / (double) (numTransactions)) >= minSup) { | |
foundFrequentItemSet(itemsets.get(i), count[i]); | |
frequentCandidates.add(itemsets.get(i)); | |
} | |
} | |
//new candidates are only the frequent candidates | |
itemsets = frequentCandidates; | |
} | |
private class WorkerThread extends Thread { | |
int[] count; | |
Iterator<int[]> iterator; | |
int numberOfProcessedItemsets; | |
public WorkerThread(int[] count, Iterator<int[]> itemsetsIterator) { | |
this.count = count; | |
this.iterator = itemsetsIterator; | |
this.numberOfProcessedItemsets = 0; | |
} | |
@Override | |
public void run() { | |
try { | |
while (true) { | |
int[] itemset = iterator.next(); | |
boolean[] trans = itemsetToBooleanArray(itemset); | |
// check each candidate | |
checkCandidate: | |
for (int c = 0; c < itemsets.size(); c++) { | |
// tokenize the candidate so that we know what items need to be present for a match | |
int[] cand = itemsets.get(c); | |
// check each item in the itemset to see if it is present in the transaction | |
for (int xx : cand) { | |
if (trans[xx] == false) { | |
continue checkCandidate; | |
} | |
} | |
// at this point it is a match, increase the count | |
count[c]++; | |
} | |
numberOfProcessedItemsets++; | |
} | |
} catch (NoSuchElementException e) { | |
//nothing wrong with this, means that we're done with the iteration | |
} | |
} | |
public int getNumberOfProcessedItemsets() { | |
return numberOfProcessedItemsets; | |
} | |
} | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment