import java.io.*;
import java.util.*;

/** The class encapsulates an implementation of the Apriori algorithm
 * to compute frequent itemsets.
 *
 * Datasets contain integers (>=0) separated by spaces, one transaction per line, e.g.
 * 1 2 3
 * 0 9
 * 1 9
 *
 * Usage with the command line:
 * $ java mining.Apriori fileName support
 * $ java mining.Apriori /tmp/data.dat 0.8
 * $ java mining.Apriori /tmp/data.dat 0.8 > frequent-itemsets.txt
 *
 * For a full library, see SPMF https://www.philippe-fournier-viger.com/spmf/
 *
 * @author Martin Monperrus, University of Darmstadt, 2010
 * @author Nathan Magnus and Su Yibin, under the supervision of Howard Hamilton, University of Regina, June 2009.
 * @copyright GNU General Public License v3
 * No reproduction in whole or part without maintaining this copyright notice
 * and imposing this condition on any subsequent users.
 */
public class Apriori extends Observable {

    public static void main(String[] args) throws Exception {
        Apriori ap = new Apriori(args);
    }

    /** the list of current itemsets */
    private List<int[]> itemsets;
    /** the name of the transaction file */
    private String transaFile;
    /** number of different items in the dataset */
    private int numItems;
    /** total number of transactions in transaFile */
    private int numTransactions;
    /** minimum support for a frequent itemset, as a fraction of transactions, e.g. 0.8 for 80% */
    private double minSup;
    /** by default, Apriori is used with the command line interface */
    private boolean usedAsLibrary = false;

    /** This is the main interface to use this class as a library */
    public Apriori(String[] args, Observer ob) throws Exception
    {
        usedAsLibrary = true;
        configure(args);
        this.addObserver(ob);
        go();
    }
    /** generates the apriori itemsets from a file
     *
     * @param args configuration parameters: args[0] is a filename, args[1] the min support (e.g. 0.8 for 80%)
     */
    public Apriori(String[] args) throws Exception
    {
        configure(args);
        go();
    }

    /** starts the algorithm after configuration */
    private void go() throws Exception {
        // start timer
        long start = System.currentTimeMillis();

        // first we generate the candidates of size 1
        createItemsetsOfSize1();
        int itemsetNumber = 1; // the size of the itemsets currently being looked at
        int nbFrequentSets = 0;

        while (itemsets.size() > 0)
        {
            calculateFrequentItemsets();

            if (itemsets.size() != 0)
            {
                nbFrequentSets += itemsets.size();
                log("Found " + itemsets.size() + " frequent itemsets of size " + itemsetNumber + " (with support " + (minSup * 100) + "%)");
                createNewItemsetsFromPreviousOnes();
            }

            itemsetNumber++;
        }

        // display the execution time
        long end = System.currentTimeMillis();
        log("Execution time is: " + ((double) (end - start) / 1000) + " seconds.");
        log("Found " + nbFrequentSets + " frequent sets for support " + (minSup * 100) + "% (absolute " + Math.round(numTransactions * minSup) + ")");
        log("Done");
    }

    /** triggers actions if a frequent itemset has been found */
    private void foundFrequentItemSet(int[] itemset, int support) {
        if (usedAsLibrary) {
            this.setChanged();
            notifyObservers(itemset);
        }
        else {
            System.out.println(Arrays.toString(itemset) + " (" + ((support / (double) numTransactions)) + " " + support + ")");
        }
    }

    /** outputs a message to System.err if not used as a library */
    private void log(String message) {
        if (!usedAsLibrary) {
            System.err.println(message);
        }
    }
    /** computes numItems and numTransactions, and sets minSup */
    private void configure(String[] args) throws Exception
    {
        // setting the transaction file
        if (args.length != 0) transaFile = args[0];
        else transaFile = "chess.dat"; // default

        // setting the minimum support
        if (args.length >= 2) minSup = Double.valueOf(args[1]).doubleValue();
        else minSup = .8; // by default
        if (minSup > 1 || minSup < 0) throw new Exception("minSup: bad value");

        // going through the file to compute numItems and numTransactions
        numItems = 0;
        numTransactions = 0;
        BufferedReader data_in = new BufferedReader(new FileReader(transaFile));
        while (data_in.ready()) {
            String line = data_in.readLine();
            if (line.matches("\\s*")) continue; // be friendly with empty lines
            numTransactions++;
            StringTokenizer t = new StringTokenizer(line, " ");
            while (t.hasMoreTokens()) {
                int x = Integer.parseInt(t.nextToken());
                if (x + 1 > numItems) numItems = x + 1;
            }
        }
        data_in.close();

        outputConfig();
    }

    /** outputs the current configuration */
    private void outputConfig() {
        // output config info to the user
        log("Input configuration: " + numItems + " items, " + numTransactions + " transactions, ");
        log("minsup = " + minSup * 100 + "%");
    }
    /** puts in itemsets all sets of size 1,
     *  i.e. all possible items of the dataset
     */
    private void createItemsetsOfSize1() {
        itemsets = new ArrayList<int[]>();
        for (int i = 0; i < numItems; i++)
        {
            int[] cand = {i};
            itemsets.add(cand);
        }
    }
    /**
     * if n is the size of the current itemsets,
     * generates all possible itemsets of size n+1 from pairs of current itemsets,
     * and replaces the contents of itemsets with the new candidates
     */
    private void createNewItemsetsFromPreviousOnes()
    {
        // by construction, all existing itemsets have the same size
        int currentSizeOfItemsets = itemsets.get(0).length;
        log("Creating itemsets of size " + (currentSizeOfItemsets + 1) + " based on " + itemsets.size() + " itemsets of size " + currentSizeOfItemsets);

        HashMap<String, int[]> tempCandidates = new HashMap<String, int[]>(); // temporary candidates

        // compare each pair of current itemsets
        for (int i = 0; i < itemsets.size(); i++)
        {
            for (int j = i + 1; j < itemsets.size(); j++)
            {
                int[] X = itemsets.get(i);
                int[] Y = itemsets.get(j);

                assert (X.length == Y.length);

                // the new candidate starts with all the items of X
                int[] newCand = new int[currentSizeOfItemsets + 1];
                for (int s = 0; s < newCand.length - 1; s++) {
                    newCand[s] = X[s];
                }

                // then we find the missing value
                int ndifferent = 0;
                for (int s1 = 0; s1 < Y.length; s1++)
                {
                    boolean found = false;
                    // is Y[s1] in X?
                    for (int s2 = 0; s2 < X.length; s2++) {
                        if (X[s2] == Y[s1]) {
                            found = true;
                            break;
                        }
                    }
                    if (!found) { // Y[s1] is not in X
                        ndifferent++;
                        // we put the missing value at the end of newCand
                        newCand[newCand.length - 1] = Y[s1];
                    }
                }

                // we have to find at least 1 different item, otherwise the same set appears twice in the existing candidates
                assert (ndifferent > 0);

                if (ndifferent == 1) {
                    // HashMap does not have the correct "equals" for int[] :-(
                    // so the key is built by hand as a String:
                    // Arrays.toString reuses equals and hashCode of String
                    Arrays.sort(newCand);
                    tempCandidates.put(Arrays.toString(newCand), newCand);
                }
            }
        }

        // set the new itemsets
        itemsets = new ArrayList<int[]>(tempCandidates.values());
        log("Created " + itemsets.size() + " unique itemsets of size " + (currentSizeOfItemsets + 1));
    }
/** put "true" in trans[i] if the integer i is in line */ | |
private void line2booleanArray(String line, boolean[] trans) { | |
Arrays.fill(trans, false); | |
StringTokenizer stFile = new StringTokenizer(line, " "); //read a line from the file to the tokenizer | |
//put the contents of that line into the transaction array | |
while (stFile.hasMoreTokens()) | |
{ | |
int parsedVal = Integer.parseInt(stFile.nextToken()); | |
trans[parsedVal]=true; //if it is not a 0, assign the value to true | |
} | |
} | |
    /** passes through the data to measure the frequency of the sets in {@link #itemsets},
     *  then filters out those that are below the minimum support (minSup)
     */
    private void calculateFrequentItemsets() throws Exception
    {
        log("Passing through the data to compute the frequency of " + itemsets.size() + " itemsets of size " + itemsets.get(0).length);

        List<int[]> frequentCandidates = new ArrayList<int[]>(); // the frequent candidates for the current itemset size
        boolean match; // whether the transaction has all the items in an itemset
        int[] count = new int[itemsets.size()]; // the number of successful matches, initialized to zero

        // load the transaction file
        BufferedReader data_in = new BufferedReader(new InputStreamReader(new FileInputStream(transaFile)));

        boolean[] trans = new boolean[numItems];

        // for each transaction
        for (int i = 0; i < numTransactions; i++) {
            String line = data_in.readLine();
            line2booleanArray(line, trans);

            // check each candidate
            for (int c = 0; c < itemsets.size(); c++) {
                match = true; // assume a match until an item is found missing
                // check each item in the itemset to see if it is present in the transaction
                int[] cand = itemsets.get(c);
                for (int xx : cand) {
                    if (trans[xx] == false) {
                        match = false;
                        break;
                    }
                }
                if (match) { // if at this point it is a match, increase the count
                    count[c]++;
                }
            }
        }

        data_in.close();

        for (int i = 0; i < itemsets.size(); i++) {
            // if the count% is larger than the minSup%, add the candidate to the frequent candidates
            if ((count[i] / (double) (numTransactions)) >= minSup) {
                foundFrequentItemSet(itemsets.get(i), count[i]);
                frequentCandidates.add(itemsets.get(i));
            }
        }

        // the new candidates are only the frequent candidates
        itemsets = frequentCandidates;
    }
}
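The constructor Apriori(String[] args, Observer ob) above is the library entry point: every frequent itemset found is passed to the registered Observer as an int[]. Below is a minimal sketch of client code along those lines; the class name AprioriClientSketch and the file path are placeholders, not part of the gist.

import java.util.Arrays;
import java.util.Observable;
import java.util.Observer;

public class AprioriClientSketch implements Observer {

    public static void main(String[] args) throws Exception {
        // same arguments as on the command line: transaction file and minimum support
        new Apriori(new String[] { "/tmp/data.dat", "0.8" }, new AprioriClientSketch());
    }

    /** called once per frequent itemset; the argument is the int[] itemset */
    @Override
    public void update(Observable o, Object arg) {
        int[] itemset = (int[]) arg;
        System.out.println("frequent itemset: " + Arrays.toString(itemset));
    }
}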
I need code for a fast distributed mining algorithm for association rules.
Please help me.
No, association rules are NOT implemented in this code :(, just the Apriori algorithm.
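For readers asking about rules: they can be derived afterwards from the frequent itemsets that Apriori outputs, since the confidence of a rule A => c is support(A ∪ {c}) / support(A). Here is a rough sketch, assuming you have collected every frequent itemset with its absolute support count into a map keyed by Arrays.toString of the sorted itemset (the helper class and that map are assumptions, not part of the gist).

import java.util.Arrays;
import java.util.Map;

public class RuleSketch {

    /** prints every rule (itemset minus c) => c whose confidence reaches minConf */
    static void printRules(Map<String, Integer> supportOf, double minConf) {
        for (Map.Entry<String, Integer> e : supportOf.entrySet()) {
            int[] itemset = parse(e.getKey());
            if (itemset.length < 2) continue; // need at least one item on each side
            int support = e.getValue();
            for (int consequent : itemset) {
                int[] antecedent = remove(itemset, consequent);
                Integer antSupport = supportOf.get(Arrays.toString(antecedent));
                if (antSupport == null) continue; // subsets of frequent sets should also be in the map
                double confidence = support / (double) antSupport;
                if (confidence >= minConf) {
                    System.out.println(Arrays.toString(antecedent) + " => " + consequent
                            + " (conf " + confidence + ")");
                }
            }
        }
    }

    /** inverse of Arrays.toString for an int[] such as "[1, 2, 3]" */
    static int[] parse(String key) {
        String[] parts = key.substring(1, key.length() - 1).split(", ");
        int[] result = new int[parts.length];
        for (int i = 0; i < parts.length; i++) result[i] = Integer.parseInt(parts[i]);
        return result;
    }

    /** returns the itemset without one item, order preserved */
    static int[] remove(int[] itemset, int item) {
        int[] result = new int[itemset.length - 1];
        int j = 0;
        for (int x : itemset) if (x != item) result[j++] = x;
        return result;
    }
}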
What if I'm getting my data from a database? How do I structure the data for the algorithm to use it?
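The algorithm only reads the flat-file format described in the class javadoc (integers >= 0 separated by spaces, one transaction per line), so database data just needs to be exported into that format first. Below is one possible sketch using plain JDBC; the connection string, table name "purchases" and its columns are made-up assumptions for your own schema.

import java.io.PrintWriter;
import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.ResultSet;
import java.sql.Statement;

public class ExportTransactionsSketch {
    public static void main(String[] args) throws Exception {
        // hypothetical schema: a table "purchases" with integer columns transaction_id and item_id
        try (Connection con = DriverManager.getConnection("jdbc:postgresql://localhost/shop", "user", "pwd");
             Statement st = con.createStatement();
             ResultSet rs = st.executeQuery(
                 "SELECT transaction_id, item_id FROM purchases ORDER BY transaction_id");
             PrintWriter out = new PrintWriter("/tmp/data.dat")) {

            int currentTransaction = -1;
            StringBuilder line = new StringBuilder();
            while (rs.next()) {
                int tid = rs.getInt(1);
                int item = rs.getInt(2);
                if (tid != currentTransaction) { // new transaction: flush the previous line
                    if (line.length() > 0) out.println(line);
                    line.setLength(0);
                    currentTransaction = tid;
                }
                if (line.length() > 0) line.append(' ');
                line.append(item); // items separated by single spaces, as Apriori expects
            }
            if (line.length() > 0) out.println(line); // last transaction
        }
    }
}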
Please provide me code for partition-based Apriori or divisive Apriori in Java.
Please provide me code for the Eclat algorithm in C++.
[email protected] is my mail address.
Please provide me code for the reverse Apriori algorithm in R or Java.
[email protected] is my mail ID.
The javadoc says "Usage as library: see {@link ExampleOfClientCodeOfApriori}". Where is this example?
@monperrus Everyone, be aware of the assumptions of this code. It assumes that your transaction DB contains item IDs all from 0 to n. If your items don't start at 0, e.g. [209 212 209 212 212 212; 45 63 89; 89 53 63], the above code will not work. The author should make appropriate changes in the configure() function.
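Indeed, configure() sets numItems to the largest item ID plus one and the code allocates a boolean array of that size, so large or sparse item IDs waste memory or break. One workaround, sketched below with made-up file names, is to remap the original item IDs to a dense 0..n-1 range before running Apriori (keeping the map to translate the results back).

import java.io.BufferedReader;
import java.io.FileReader;
import java.io.PrintWriter;
import java.util.HashMap;
import java.util.Map;
import java.util.StringTokenizer;

public class RemapItemsSketch {
    public static void main(String[] args) throws Exception {
        Map<Integer, Integer> denseIdOf = new HashMap<Integer, Integer>(); // original item ID -> dense ID
        try (BufferedReader in = new BufferedReader(new FileReader("/tmp/raw.dat"));
             PrintWriter out = new PrintWriter("/tmp/data.dat")) {
            String line;
            while ((line = in.readLine()) != null) {
                if (line.trim().isEmpty()) continue; // skip empty lines, like Apriori does
                StringBuilder remapped = new StringBuilder();
                StringTokenizer t = new StringTokenizer(line, " ");
                while (t.hasMoreTokens()) {
                    int original = Integer.parseInt(t.nextToken());
                    Integer dense = denseIdOf.get(original);
                    if (dense == null) { // first time this item is seen: give it the next free dense ID
                        dense = denseIdOf.size();
                        denseIdOf.put(original, dense);
                    }
                    if (remapped.length() > 0) remapped.append(' ');
                    remapped.append(dense);
                }
                out.println(remapped);
            }
        }
        System.err.println("distinct items remapped: " + denseIdOf.size());
    }
}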
I forked this code and added association rules to it, enjoy ;)
My Forked Apriori.java
@ENGINEER-RC thanks bro
I need the description of the dataset "retail.gz" available at http://fimi.ua.ac.be/data/. Please help me.
How can I add chess.dat in NetBeans?
Thanks so much for sharing! However, I found a typo in the code, specifically on line 151:
log("minsup = "+minSup+"%");
should be
log("minsup = "+minSup*100+"%");
I found a typo in the code, specifically, line 151
Thanks for reporting the typo! Fixed.
For a full library, see SPMF https://www.philippe-fournier-viger.com/spmf/
I want the same code, but for strings, not only integers. Any help?
I forked this code and added association rules to it, enjoy ;)
My Forked Apriori.java
Can you please share your file again? It seems that it doesn't exist anymore.
https://gist.github.com/tomrockdsouza/34bdc63161befad19ce33564a473fc58
llbbnn, did you implement the association rules from this code? Can you help?