sajidrahman · April 4, 2017 16:20
diff --git a/CSVReader.java b/CSVReader.java

 import java.io.BufferedReader;
 import java.io.File;
 import java.io.FileOutputStream;
 import java.io.FileReader;
 import java.io.FileWriter;
 import java.io.IOException;
 import java.util.HashSet;
 import java.util.Scanner;
 import java.util.Set;
 import java.util.StringTokenizer;
 import java.util.regex.Matcher;
 import java.util.regex.Pattern;

 import org.tartarus.snowball.SnowballStemmer;

 /**
  * Filters a tab-separated dump of posts by tag and writes the stemmed text of each
  * matching post to its own file.
  *
  * <p>For every input row, column 2 is parsed as a {@code <tag1><tag2>...} list. When at
  * least two of those tags appear in the configured tag list, column 1 and the
  * HTML-stripped column 4 are lower-cased, stripped of URLs, stop words and pure-number
  * tokens, stemmed with the Snowball English stemmer, and written to
  * {@code sec-<column 0>.txt} in the working directory.
  */
 public class CSVReader {

 	/** Whitespace-separated stop-word list; tokens found here are dropped before stemming. */
 	private static final String STOP_WORDS_FILE = "/Users/sajid/Desktop/stopwords.txt";

 	/** Whitespace-separated list of the tags a post is matched against. */
 	private static final String TAGS_FILE = "/Users/sajid/Desktop/tags.txt";

 	/** Tab-separated posts file; {@link #main} reads columns 0, 1, 2 and 4 of each row. */
 	private static final String POSTS_FILE = "/Users/sajid/Desktop/security-posts.txt";

 	/** Column separator of the posts file. */
 	private static final String FIELD_SEPARATOR = "\t";

 	/** Highest column index read is 4, so a usable row has at least five fields. */
 	private static final int MIN_FIELDS = 5;

 	/** A post is exported only when at least this many of its tags are in the tag list. */
 	private static final int MIN_MATCHING_TAGS = 2;

 	// Patterns are compiled once; they used to be rebuilt on every call / every token.
 	private static final Pattern CODE_BLOCK = Pattern.compile("<code>.+?</code>", Pattern.DOTALL);

 	// With DOTALL, "." already matches newlines, so the former "(.|\n)+?" alternation was
 	// redundant; the grouped form also recurses per character and can overflow the stack
 	// on long inputs.
 	private static final Pattern MARKUP_TAG = Pattern.compile("<.+?>", Pattern.DOTALL);

 	private static final Pattern TRAILING_WHITESPACE = Pattern.compile("\\s+$");

 	private static final Pattern NEWLINE = Pattern.compile("\\n");

 	// "https?" covers both http and https; the former "http?" alternative also matched "htt://".
 	private static final Pattern URL = Pattern.compile("(https?|ftp|file)://[-a-zA-Z0-9+&@#/%?=~_|!:,.;]*[-a-zA-Z0-9+&@#/%=~_|]");

 	private static final Pattern DIGITS_ONLY = Pattern.compile("\\d+");

 	private static Class<?> stemClass = null;
 	private static SnowballStemmer stemmer = null;
 	private static Set<String> stopWords;

 	/**
 	 * Loads the stop-word list and instantiates the Snowball English stemmer.
 	 *
 	 * @throws IllegalStateException if the stemmer class cannot be loaded or instantiated;
 	 *         failing here replaces the NullPointerException that would otherwise surface
 	 *         later, on the first token to be stemmed
 	 */
 	private static void setup() {
 		stopWords = loadWords(STOP_WORDS_FILE);

 		try {
 			stemClass = Class.forName("org.tartarus.snowball.ext." + "english" + "Stemmer");
 			stemmer = (SnowballStemmer) stemClass.getDeclaredConstructor().newInstance();
 		} catch (ReflectiveOperationException e) {
 			throw new IllegalStateException("Unable to create the Snowball English stemmer", e);
 		}
 	}

 	/**
 	 * Reads every whitespace-separated token of a file into a set, echoing each token to
 	 * standard output.
 	 *
 	 * <p>Reading is best-effort: if the file cannot be opened, the problem is reported on
 	 * standard output and an empty set is returned.
 	 *
 	 * @param path file to read
 	 * @return the distinct tokens found in the file; never {@code null}
 	 */
 	private static Set<String> loadWords(String path) {
 		Set<String> words = new HashSet<String>();
 		try (Scanner sc = new Scanner(new File(path))) {
 			// hasNext() pairs with next(): testing hasNextLine() instead made next() throw
 			// NoSuchElementException whenever the file ended with a newline or blank line.
 			while (sc.hasNext()) {
 				String word = sc.next();
 				words.add(word);
 				System.out.println(word);
 			}
 		} catch (IOException e) {
 			System.out.println("Problem reading file: " + e.getMessage());
 		}
 		return words;
 	}

 	/**
 	 * Entry point: exports the stemmed text of every post that carries at least two of
 	 * the configured tags, then prints how many files were written.
 	 *
 	 * @param args ignored
 	 */
 	public static void main(String[] args) {
 		setup();
 		Set<String> tagSet = loadWords(TAGS_FILE);
 		int filesWritten = 0;

 		try (BufferedReader br = new BufferedReader(new FileReader(POSTS_FILE))) {
 			String line;
 			while ((line = br.readLine()) != null) {
 				// Fields are tab-separated; limit -1 keeps trailing empty fields so a row
 				// with an empty last column still has all of its columns.
 				String[] post = line.split(FIELD_SEPARATOR, -1);
 				if (post.length < MIN_FIELDS) {
 					System.err.println("Skipping malformed row with " + post.length + " field(s)");
 					continue;
 				}
 				System.out.println(post[2]);

 				if (countMatchingTags(post[2], tagSet) >= MIN_MATCHING_TAGS) {
 					String stemmedText = runStemmer(post[1] + " " + sanitizeBodyText(post[4]));
 					writeFile(stemmedText, "sec-" + post[0] + ".txt");
 					filesWritten++;
 				}
 			}
 		} catch (IOException e) {
 			e.printStackTrace();
 		}

 		System.out.println("Total files written: " + filesWritten);
 	}

 	/**
 	 * Counts how many tags of a {@code <tag1><tag2>...} string are contained in a set.
 	 *
 	 * @param rawTags tag column as read from the posts file
 	 * @param tagSet tags of interest
 	 * @return number of entries of {@code rawTags} present in {@code tagSet}
 	 */
 	private static int countMatchingTags(String rawTags, Set<String> tagSet) {
 		int count = 0;
 		for (String tag : rawTags.split("><")) {
 			// Only the first and last entries still carry an angle bracket after the split.
 			String name = tag.replace("<", "").replace(">", "");
 			if (tagSet.contains(name)) {
 				count++;
 			}
 		}
 		return count;
 	}

 	/**
 	 * Strips {@code <code>...</code>} sections and all remaining markup tags from a post
 	 * body, then removes trailing whitespace and every newline character.
 	 *
 	 * @param body raw HTML body; {@code null} is treated as empty
 	 * @return the plain text, or an empty string for a {@code null} body
 	 */
 	public static String sanitizeBodyText(String body) {
 		if (body == null) {
 			return "";
 		}
 		String withoutCode = CODE_BLOCK.matcher(body).replaceAll("").trim();
 		String withoutTags = MARKUP_TAG.matcher(withoutCode).replaceAll("");
 		String trimmed = TRAILING_WHITESPACE.matcher(withoutTags).replaceAll("");
 		return NEWLINE.matcher(trimmed).replaceAll("");
 	}

 	/**
 	 * Writes text to a file, creating the file or replacing its previous content.
 	 *
 	 * @param text content to write
 	 * @param fileName path of the file to write
 	 * @throws IOException if the file cannot be opened or written
 	 */
 	public static void writeFile(String text, String fileName) throws IOException {
 		// try-with-resources closes (and thereby flushes) the writer even if write() fails.
 		try (FileWriter writer = new FileWriter(new File(fileName))) {
 			writer.write(text);
 		}
 	}

 	/**
 	 * Normalizes text and stems its tokens.
 	 *
 	 * <p>The text is lower-cased, double quotes and URLs are removed, and the remainder is
 	 * split on whitespace and the punctuation {@code ,.:;?![]'}. Pure-digit tokens, the
 	 * tokens {@code *}, {@code --} and {@code +}, and stop words are dropped; every other
 	 * token is stemmed. The result is also echoed to standard output.
 	 *
 	 * @param value text to process
 	 * @return the stemmed tokens, each followed by a single space
 	 */
 	private static String runStemmer(String value) {
 		// Locale.ROOT keeps lower-casing independent of the JVM's default locale.
 		String line = value.toLowerCase(java.util.Locale.ROOT).replace("\"", "");
 		line = URL.matcher(line).replaceAll("");

 		StringTokenizer tokenizer = new StringTokenizer(line, " \t\n\r\f,.:;?![]'");
 		StringBuilder builder = new StringBuilder();

 		while (tokenizer.hasMoreTokens()) {
 			// StringTokenizer never yields empty tokens, so no emptiness check is needed.
 			String token = tokenizer.nextToken();
 			if (isNoise(token) || stopWords.contains(token)) {
 				continue;
 			}
 			stemmer.setCurrent(token);
 			stemmer.stem();
 			builder.append(stemmer.getCurrent()).append(' ');
 		}

 		String stemmed = builder.toString();
 		System.out.println(stemmed);
 		return stemmed;
 	}

 	/** Returns whether a token is a pure number or one of the symbols {@code *}, {@code --}, {@code +}. */
 	private static boolean isNoise(String token) {
 		return DIGITS_ONLY.matcher(token).matches()
 				|| token.equals("*")
 				|| token.equals("--")
 				|| token.equals("+");
 	}

 }

	import java.io.BufferedReader;
	import java.io.File;
	import java.io.FileOutputStream;
	import java.io.FileReader;
	import java.io.FileWriter;
	import java.io.IOException;
	import java.util.HashSet;
	import java.util.Scanner;
	import java.util.Set;
	import java.util.StringTokenizer;
	import java.util.regex.Matcher;
	import java.util.regex.Pattern;

	import org.tartarus.snowball.SnowballStemmer;

	public class CSVReader {

	private static Class stemClass = null;
	private static SnowballStemmer stemmer = null;
	private static Set<String> stopWords;

	private static void setup(){

	stopWords = new HashSet<String>();
	try {
	// input the file content to the String "input"
	File filepath = new File("/Users/sajid/Desktop/stopwords.txt");
	Scanner sc = new Scanner(filepath);

	while (sc.hasNextLine()) {
	String temp = sc.next();
	if(temp != null && !temp.isEmpty()){
	stopWords.add(temp);
	}
	System.out.println(temp);
	}
	sc.close();

	} catch (Exception e) {
	System.out.println("Problem reading file.");
	}
	try {
	stemClass = Class.forName("org.tartarus.snowball.ext." + "english" + "Stemmer");
	} catch (ClassNotFoundException e1) {
	// TODO Auto-generated catch block
	e1.printStackTrace();
	}

	try {
	stemmer = (SnowballStemmer) stemClass.newInstance();
	} catch (InstantiationException \| IllegalAccessException e) {
	// TODO Auto-generated catch block
	e.printStackTrace();
	}
	}

	public static void main(String[] args) {
	setup();
	String csvFile = "/Users/sajid/Desktop/security-posts.txt";
	String line = "";
	String cvsSplitBy = "\t";
	int i = 1;
	Set<String> tagSet = new HashSet<String>();

	try {
	// input the file content to the String "input"
	File filepath = new File("/Users/sajid/Desktop/tags.txt");
	Scanner sc = new Scanner(filepath);

	while (sc.hasNextLine()) {
	String temp = sc.next();
	tagSet.add(temp);
	System.out.println(temp);
	}
	sc.close();

	} catch (Exception e) {
	System.out.println("Problem reading file.");
	}


	try (BufferedReader br = new BufferedReader(new FileReader(csvFile))) {
	String fileName = "sec-";

	while ((line = br.readLine()) != null) {

	// use comma as separator
	String[] post = line.split(cvsSplitBy);
	System.out.println(post[2]);

	String[] tags = post[2].split("><");
	int count = 0;

	for(String tag: tags){
	if(tag.contains("<")){
	tag = tag.replace("<", "");
	}
	if(tag.contains(">")){
	tag = tag.replace(">","");
	}

	if(tagSet.contains(tag))
	count++;
	}

	if(count>=2){
	//write to file
	fileName = "sec-"+post[0]+".txt";
	String stemmedText = runStemmer(post[1]+" "+sanitizeBodyText(post[4]));
	writeFile(stemmedText, fileName);
	i++;
	}
	// System.out.println("SecurityPost [title= " + post[1] + " , body=" + sanitizeBodyText(post[4]) + "]");


	}

	} catch (IOException e) {
	e.printStackTrace();
	}

	System.out.println("Total files written: "+i);
	}

	public static String sanitizeBodyText(String body) {
	String REGEX1 = "<code>.+?</code>";
	String REGEX2 = "<(.\|\n)+?>";
	String REPLACE = "";
	Pattern p1 = Pattern.compile(REGEX1, Pattern.DOTALL);
	Pattern p2 = Pattern.compile(REGEX2, Pattern.DOTALL);
	// get a matcher object
	Matcher m1 = p1.matcher(body);

	StringBuffer sb1 = new StringBuffer();
	while (m1.find()) {
	m1.appendReplacement(sb1, REPLACE);
	}
	m1.appendTail(sb1);
	// System.out.println(sb1.toString());
	String tempBody = sb1.toString().trim();
	Matcher m2 = p2.matcher(tempBody);
	StringBuffer sb2 = new StringBuffer();
	while (m2.find()) {
	m2.appendReplacement(sb2, REPLACE);
	}
	m2.appendTail(sb2);

	tempBody = sb2.toString().replaceAll("\\s+$", "");
	tempBody = tempBody.replaceAll("\\n", "");
	return tempBody;
	}

	public static void writeFile(String text, String fileName)throws IOException {
	File file = new File(fileName);

	// creates the file
	file.createNewFile();

	// creates a FileWriter Object
	FileWriter writer = new FileWriter(file);

	// Writes the content to the file
	writer.write(text);
	writer.flush();
	writer.close();
	}


	private static String runStemmer(String value) {

	String url = "(http?\|https?\|ftp\|file)://[-a-zA-Z0-9+&@#/%?=~_\|!:,.;]*[-a-zA-Z0-9+&@#/%=~_\|]";
	String line = value.toString().toLowerCase().replace("\"", "");
	line = line.replaceAll(url, "");

	StringTokenizer tokenizer = new StringTokenizer(line, " \t\n\r\f,.:;?![]'");

	StringBuilder builder = new StringBuilder();

	while (tokenizer.hasMoreTokens()) {

	String token = tokenizer.nextToken();
	//remove digits from the beginning or from any other place of the word
	// String regex = "\\d+\|\\+\|-\|=\|\\*";
	// token = Pattern.compile(regex, Pattern.DOTALL).matcher(token).replaceAll(REPLACE);
	// token = token.replaceAll("--", REPLACE);
	// token = token.replaceAll("=", REPLACE);
	// token = token.replaceAll("-", REPLACE);
	String REGEX4digits = "\\d+";
	Pattern p1 = Pattern.compile(REGEX4digits);

	// get a matcher object
	Matcher m1 = p1.matcher(token);
	if (m1.matches()\|\|token.equalsIgnoreCase("*")
	\|\| token.equalsIgnoreCase("--")\|\|
	token.equalsIgnoreCase("+"))
	continue;
	else if (stopWords.contains(token))
	continue;
	else if(token.length()==0)
	continue;
	else {
	stemmer.setCurrent(token);
	stemmer.stem();
	String stemmed_word = stemmer.getCurrent();
	builder.append(stemmed_word);
	builder.append(" ");
	// builder.append(docname.toString());
	}


	}

	System.out.println(builder.toString());
	return builder.toString();

	}

	}
No results found