@gupul2k
Created March 2, 2013 19:51
Simple Term-Document Matrix. Takes three arguments: the raw corpus file, the word list for the corpus, and the FWs (function words) list.
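For example, with hypothetical input files reviews.txt, word_list.txt and function_words.txt, the program could be run as:

java TermDocumentMatrix reviews.txt word_list.txt function_words.txt

Each run appends a header row of terms and one row of per-term counts for the given corpus file to ./output/Term_Document_Matrix.txt (the ./output directory must already exist).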
/* @(#) TermDocumentMatrix 1.00 2/25/2013
*
* [Copyright Information]
*/
/*
* Revision History:
* Revision No.   Version No.   Project Code   Change Req. No.   Date   Author        Description
* 1              1                                                     Sobhan Hota
*/
//package jclasses;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.FileReader;
import java.io.IOException;
import java.util.ArrayList;
import java.util.StringTokenizer;
/**
* Class Name : TermDocumentMatrix. Generates a term-document matrix row for the supplied
* corpus file against the supplied word list (with function words removed).
*
* Classes Used : Features
*/
public class TermDocumentMatrix {
public static void main(String argv[]) throws FileNotFoundException,
IOException{
String inp_Review_File = null;
String inp_Features_File = null;
String inp_FWs_File = null;
ArrayList<String> terms_List = null;
FileReader ip_RvwFileReader = null;
BufferedReader buff_RvwReader = null;
String all_rel_fre_Str = "";
String all_words_csv = "";
try {
if(argv.length != 3) {
System.out.println("Usage - Provide correct parameter value: java GenerateVectorData <Corpus> <Word List File> <FWs File> ");
return;
}
//Read text file specified for review
inp_Review_File = argv[0]; //Review File Name
inp_Features_File = argv[1]; //All words from Loss Data
inp_FWs_File = argv[2]; //Function words (FWs) to be removed from the word list
terms_List = collectAllTermsList(inp_Features_File,
inp_FWs_File);
//The corpus file is opened (and closed) inside the per-term loop below
//All Words in csv [as Header]
for(int bow_count =0; bow_count < terms_List.size(); bow_count++) {
String one_BoW = terms_List.get(bow_count);
all_words_csv = all_words_csv + one_BoW+", ";
}
//System.out.print(all_words_csv);
//Span across ALL the Features for this file
for(int bow_count =0; bow_count < terms_List.size(); bow_count++) {
String one_BoW = terms_List.get(bow_count).trim();
ip_RvwFileReader = new FileReader(inp_Review_File);
buff_RvwReader = new BufferedReader(ip_RvwFileReader);
String temp_Line_Str = null;
StringTokenizer stk = null;
double count_Term = 0;
while ((temp_Line_Str = buff_RvwReader.readLine()) != null) {
if(temp_Line_Str.length() > 0) {
stk = new StringTokenizer(temp_Line_Str);
//Count every token that equals this term (ignoring case) or starts with it
while(stk.hasMoreTokens()) {
String temp_Tok = stk.nextToken();
if(temp_Tok.equalsIgnoreCase(one_BoW) || temp_Tok.startsWith(one_BoW)) {
count_Term++;
}
}
}//if ends
}//while ends
buff_RvwReader.close(); //close before re-opening the corpus for the next term
all_rel_fre_Str = all_rel_fre_Str + count_Term+",";
}
//System.out.println(all_rel_fre_Str);
//Append the header row and this file's frequency row to the output
String out_put_File_Name = "./output/"+"Term_Document_Matrix.txt";
File outputFile = new File(out_put_File_Name);
FileOutputStream outstream = new FileOutputStream(outputFile, true);
printToInputFile(outstream, all_words_csv.substring(0, all_words_csv.length()-2));
printToInputFile(outstream, all_rel_fre_Str+inp_Review_File);
outstream.close();
} catch(FileNotFoundException fe) {
fe.printStackTrace();
} catch(IOException ioe) {
ioe.printStackTrace();
} catch(Exception e) {
e.printStackTrace();
}
}
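/*
* Illustrative output (hypothetical terms and counts): each run appends a comma-separated
* header row of the terms followed by one row of per-term counts ending with the corpus
* file name, e.g.
*   market, price, growth
*   2.0,0.0,5.0,reviews.txt
*/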
/**
* This method collects the features from a text file into an ArrayList and removes any
* feature that also appears in the FWs (function words) file.
*@param: String Features File Name, String FWs File Name
*@return ArrayList of terms
*/
public static ArrayList<String> collectAllTermsList(String features_Filename,
String FWs_File) throws IOException {
ArrayList<String> returnFeatures_List = new ArrayList<String>();
ArrayList<String> fws_List = new ArrayList<String>();
FileReader features_FileReader = new FileReader(features_Filename);
BufferedReader features_Buff_Rdr = new BufferedReader(features_FileReader);
FileReader fws_FileReader = new FileReader(FWs_File);
BufferedReader fws_Buff_Rdr = new BufferedReader(fws_FileReader);
String temp_Str = null;
try {
//Read All words into a List
while ((temp_Str = features_Buff_Rdr.readLine()) != null) {
if(temp_Str.length() > 0) {
returnFeatures_List.add(temp_Str.trim());
}
}
//Read FWs into a List
while ((temp_Str = fws_Buff_Rdr.readLine()) != null) {
if(temp_Str.length() > 0) {
fws_List.add(temp_Str.trim());
}
}
features_Buff_Rdr.close();
fws_Buff_Rdr.close();
//System.out.println("Before "+returnFeatures_List.size());
for(int fw_count =0; fw_count < fws_List.size(); fw_count++) {
String one_FW = fws_List.get(fw_count);
if(returnFeatures_List.contains(one_FW.trim())) {
returnFeatures_List.remove(one_FW.trim());
}
}
//System.out.println("After FWs removal "+returnFeatures_List.size());
} catch(FileNotFoundException fe) {
fe.printStackTrace();
} catch(IOException ioe) {
ioe.printStackTrace();
} catch(Exception e) {
e.printStackTrace();
}
return returnFeatures_List;
}
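/*
* Example call (hypothetical file names): returns every word listed in word_list.txt
* that does not also appear in function_words.txt.
*   ArrayList<String> terms = collectAllTermsList("word_list.txt", "function_words.txt");
*/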
/**
* This method writes one line of the vector (followed by a newline) to the output stream.
*@param: FileOutputStream outstream, String content_To_Print
*@return void
*/
public static void printToInputFile(FileOutputStream outstream,
String content_To_Print) throws java.io.FileNotFoundException,
IOException {
//Writing to File
byte[] buffer = null;
String newLine = "\n";
byte[] bufferNewLine = null;
try {
buffer = content_To_Print.getBytes();
outstream.write(buffer);
bufferNewLine = newLine.getBytes();
outstream.write(bufferNewLine);
outstream.flush();
//outstream.close();
} catch(FileNotFoundException fe) {
fe.printStackTrace();
} catch(IOException ioe) {
ioe.printStackTrace();
} catch(Exception e) {
e.printStackTrace();
}
}
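/*
* Example call (hypothetical values): writes the given content plus a trailing newline
* to an already opened output stream.
*   FileOutputStream out = new FileOutputStream(new File("./output/Term_Document_Matrix.txt"), true);
*   printToInputFile(out, "2.0,0.0,5.0,reviews.txt");
*   out.close();
*/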
}//class ends