Created
March 2, 2013 19:51
-
-
Save gupul2k/5072976 to your computer and use it in GitHub Desktop.
Simple TermDocument Matrix: This takes 3 arguments. Raw File, Word_List in Corpus, FWs List
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/* @(#) TermDocumentMatrix 1.00 2/25/2013
 *
 * [Copyright Information]
 */
/*
 * Revision History:
 * Revision  Version  Project  Change    Date  Author       Description
 * No.       No.      Code     Req. no.
 * 1         1                                 Sobhan Hota
 */
//package jclasses; | |
import java.io.BufferedReader; | |
import java.io.File; | |
import java.io.FileNotFoundException; | |
import java.io.FileOutputStream; | |
import java.io.FileReader; | |
import java.io.IOException; | |
import java.util.ArrayList; | |
import java.util.StringTokenizer; | |
/**
 * Generates one row of a term-document matrix for a single corpus file.
 *
 * <p>Usage: {@code java TermDocumentMatrix <Corpus> <Word List File> <FWs File>}
 *
 * <p>The term list is the word list with every entry of the FWs list removed
 * (FWs are presumably function words / stop words -- confirm with the data owner).
 * For every remaining term the corpus tokens (whitespace separated) are counted,
 * and two lines are appended to {@code ./output/Term_Doucment_Matrix.txt}:
 * a header line with the terms joined by {@code ", "} and a data line with the
 * counts (formatted as doubles, each followed by {@code ","}) ending with the
 * corpus file name.
 */
public class TermDocumentMatrix {

    /**
     * Output file, opened in append mode. The path (including its spelling) is
     * reproduced unchanged so new rows keep landing in the same file as before.
     */
    private static final String OUTPUT_FILE_NAME = "./output/" + "Term_Doucment_Matrix.txt";

    /**
     * Command-line entry point.
     *
     * @param argv exactly three values: corpus file, word list file, FWs file
     * @throws FileNotFoundException declared for backward compatibility; I/O
     *         failures are reported on standard error instead
     * @throws IOException declared for backward compatibility; see above
     */
    public static void main(String argv[]) throws FileNotFoundException, IOException {
        if (argv.length != 3) {
            System.out.println("Usage - Provide correct parameter value: java TermDocumentMatrix <Corpus> <Word List File> <FWs File> ");
            return;
        }
        String inp_Review_File = argv[0];   // corpus whose tokens are counted
        String inp_Features_File = argv[1]; // candidate terms, one per line
        String inp_FWs_File = argv[2];      // terms to exclude, one per line

        try {
            ArrayList<String> terms_List = collectAllTermsList(inp_Features_File, inp_FWs_File);
            if (terms_List.isEmpty()) {
                // Nothing to count; avoids emitting an empty header/data pair.
                System.err.println("No terms left after removing FWs; nothing was written.");
                return;
            }
            long[] term_Counts = countTerms(inp_Review_File, terms_List);

            File outputFile = new File(OUTPUT_FILE_NAME);
            File outputDir = outputFile.getParentFile();
            // Create ./output on demand instead of failing with FileNotFoundException.
            if (outputDir != null && !outputDir.mkdirs() && !outputDir.isDirectory()) {
                throw new IOException("Cannot create output directory " + outputDir);
            }
            try (FileOutputStream outstream = new FileOutputStream(outputFile, true)) {
                printToInputFile(outstream, buildHeaderRow(terms_List));
                printToInputFile(outstream, buildCountRow(term_Counts) + inp_Review_File);
            }
        } catch (IOException ioe) {
            // Top-level boundary of a command-line tool: report and finish.
            ioe.printStackTrace();
        }
    }

    /**
     * Reads the word list and the FWs list and returns the words that are not FWs.
     *
     * <p>Lines are trimmed; lines that are empty after trimming are ignored.
     * Every occurrence of an FW is removed (comparison is case-sensitive).
     * The order of the remaining words follows the word list file.
     *
     * @param features_Filename file with one candidate term per line
     * @param FWs_File file with one excluded term per line
     * @return the remaining terms, never {@code null}
     * @throws IOException if either file cannot be opened or read
     */
    public static ArrayList<String> collectAllTermsList(String features_Filename,
            String FWs_File) throws IOException {
        ArrayList<String> returnFeatures_List = readNonBlankLines(features_Filename);
        ArrayList<String> fws_List = readNonBlankLines(FWs_File);
        returnFeatures_List.removeAll(fws_List);
        return returnFeatures_List;
    }

    /**
     * Writes the given text followed by a single {@code "\n"} and flushes the stream.
     * The stream is left open; closing it is the caller's responsibility.
     * Bytes are produced with the platform default charset.
     *
     * @param outstream destination stream
     * @param content_To_Print text of the line, without line terminator
     * @throws java.io.FileNotFoundException declared for backward compatibility
     * @throws IOException if writing or flushing fails
     */
    public static void printToInputFile(FileOutputStream outstream,
            String content_To_Print) throws java.io.FileNotFoundException,
            IOException {
        outstream.write(content_To_Print.getBytes());
        outstream.write("\n".getBytes());
        outstream.flush();
    }

    /** Reads a text file and returns its trimmed lines, skipping blank ones. */
    private static ArrayList<String> readNonBlankLines(String fileName) throws IOException {
        ArrayList<String> lines = new ArrayList<String>();
        try (BufferedReader reader = new BufferedReader(new FileReader(fileName))) {
            String line;
            while ((line = reader.readLine()) != null) {
                String trimmed = line.trim();
                if (trimmed.length() > 0) {
                    lines.add(trimmed);
                }
            }
        }
        return lines;
    }

    /** Counts, in a single pass over the corpus, how many tokens match each term. */
    private static long[] countTerms(String corpusFile, ArrayList<String> terms) throws IOException {
        long[] counts = new long[terms.size()];
        try (BufferedReader reader = new BufferedReader(new FileReader(corpusFile))) {
            String line;
            while ((line = reader.readLine()) != null) {
                StringTokenizer tokens = new StringTokenizer(line);
                while (tokens.hasMoreTokens()) {
                    String token = tokens.nextToken();
                    for (int i = 0; i < counts.length; i++) {
                        if (matchesTerm(token, terms.get(i))) {
                            counts[i]++;
                        }
                    }
                }
            }
        }
        return counts;
    }

    /**
     * Tells whether a corpus token counts for a term: either the two are equal
     * ignoring case, or the token starts with the term (this prefix test is
     * case-sensitive, as in the original implementation).
     */
    private static boolean matchesTerm(String token, String term) {
        return token.equalsIgnoreCase(term) || token.startsWith(term);
    }

    /** Joins the terms with {@code ", "} to form the header line. */
    private static String buildHeaderRow(ArrayList<String> terms) {
        StringBuilder header = new StringBuilder();
        for (int i = 0; i < terms.size(); i++) {
            if (i > 0) {
                header.append(", ");
            }
            header.append(terms.get(i));
        }
        return header.toString();
    }

    /** Formats the counts as doubles (e.g. {@code 4.0}), each followed by a comma. */
    private static String buildCountRow(long[] counts) {
        StringBuilder row = new StringBuilder();
        for (int i = 0; i < counts.length; i++) {
            // The cast keeps the established "N.0" number format of the output file.
            row.append((double) counts[i]).append(",");
        }
        return row.toString();
    }
}//class ends
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment