Skip to content

Instantly share code, notes, and snippets.

@sajidrahman
Created April 4, 2017 16:20
Show Gist options
  • Save sajidrahman/17ab0674f5b708dfdd93428fea45a9b0 to your computer and use it in GitHub Desktop.
Save sajidrahman/17ab0674f5b708dfdd93428fea45a9b0 to your computer and use it in GitHub Desktop.
Read, clean up, and stem Stack Overflow posts from a CSV file
import java.io.BufferedReader;
import java.io.File;
import java.io.FileOutputStream;
import java.io.FileReader;
import java.io.FileWriter;
import java.io.IOException;
import java.util.HashSet;
import java.util.Locale;
import java.util.Scanner;
import java.util.Set;
import java.util.StringTokenizer;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.tartarus.snowball.SnowballStemmer;
/**
 * Reads tab-separated Stack Overflow posts, keeps the posts that carry at least
 * {@value #MIN_MATCHING_TAGS} tags from a known tag list, strips markup from
 * their body, removes stop words, stems the remaining words with the Snowball
 * English stemmer and writes one {@code sec-<id>.txt} file per kept post.
 *
 * <p>Each input line is expected to hold at least {@value #MIN_FIELDS}
 * tab-separated fields: field 0 is used in the output file name, field 1 is
 * treated as the title, field 2 as the tag list in {@code <a><b>} form and
 * field 4 as the HTML body.
 */
public class CSVReader {

    /** Whitespace-separated list of stop words that are dropped before stemming. */
    private static final String STOP_WORDS_FILE = "/Users/sajid/Desktop/stopwords.txt";

    /** Whitespace-separated list of the tags a post is matched against. */
    private static final String TAGS_FILE = "/Users/sajid/Desktop/tags.txt";

    /** Tab-separated posts file, one post per line. */
    private static final String POSTS_FILE = "/Users/sajid/Desktop/security-posts.txt";

    private static final String FIELD_SEPARATOR = "\t";
    private static final int ID_FIELD = 0;
    private static final int TITLE_FIELD = 1;
    private static final int TAGS_FIELD = 2;
    private static final int BODY_FIELD = 4;
    private static final int MIN_FIELDS = BODY_FIELD + 1;

    /** A post is kept when at least this many of its tags are in the tag list. */
    private static final int MIN_MATCHING_TAGS = 2;

    private static final Pattern CODE_BLOCK = Pattern.compile("<code>.+?</code>", Pattern.DOTALL);

    // Same matches as the former "<(.|\n)+?>" under DOTALL, but without the
    // group alternation, whose per-character recursion can overflow the stack
    // on long bodies.
    private static final Pattern HTML_TAG = Pattern.compile("<.+?>", Pattern.DOTALL);

    private static final Pattern TRAILING_WHITESPACE = Pattern.compile("\\s+$");

    private static final Pattern URL = Pattern.compile(
            "(https?|ftp|file)://[-a-zA-Z0-9+&@#/%?=~_|!:,.;]*[-a-zA-Z0-9+&@#/%=~_|]");

    private static final Pattern DIGITS_ONLY = Pattern.compile("\\d+");

    private static final String TOKEN_DELIMITERS = " \t\n\r\f,.:;?![]'";

    private static SnowballStemmer stemmer = null;
    private static Set<String> stopWords;

    /**
     * Loads the stop-word list and instantiates the English Snowball stemmer.
     *
     * <p>A missing or unreadable stop-word file is reported and leaves the
     * stop-word set empty (best effort). A stemmer that cannot be created is
     * fatal, because nothing can be stemmed without it.
     *
     * @throws IllegalStateException if the stemmer class cannot be loaded or
     *     instantiated
     */
    private static void setup() {
        stopWords = loadWords(STOP_WORDS_FILE);
        try {
            // Loaded by name so this class has no compile-time dependency on the
            // generated stemmer classes.
            stemmer = Class.forName("org.tartarus.snowball.ext." + "english" + "Stemmer")
                    .asSubclass(SnowballStemmer.class)
                    .getDeclaredConstructor()
                    .newInstance();
        } catch (ReflectiveOperationException e) {
            throw new IllegalStateException("Cannot create the English Snowball stemmer", e);
        }
    }

    /**
     * Reads every whitespace-separated token of a file into a set, echoing each
     * token to standard output.
     *
     * @param path file to read
     * @return the tokens found; empty or partial if the file could not be read
     */
    private static Set<String> loadWords(String path) {
        Set<String> words = new HashSet<>();
        try (Scanner scanner = new Scanner(new File(path))) {
            // hasNext() pairs with next(); testing hasNextLine() instead throws
            // NoSuchElementException when the file ends with a blank line.
            while (scanner.hasNext()) {
                String word = scanner.next();
                words.add(word);
                System.out.println(word);
            }
            // Scanner hides read errors; surface them instead of silently
            // returning a truncated list.
            if (scanner.ioException() != null) {
                throw scanner.ioException();
            }
        } catch (IOException e) {
            System.out.println("Problem reading file. (" + path + ": " + e.getMessage() + ")");
        }
        return words;
    }

    /**
     * Processes the posts file and writes one stemmed text file per post that
     * matches enough known tags, then prints how many files were written.
     *
     * @param args ignored
     */
    public static void main(String[] args) {
        setup();
        Set<String> tagSet = loadWords(TAGS_FILE);

        int filesWritten = 0;
        int lineNumber = 0;
        try (BufferedReader br = new BufferedReader(new FileReader(POSTS_FILE))) {
            String line;
            while ((line = br.readLine()) != null) {
                lineNumber++;
                // Limit -1 keeps trailing empty fields, so a post with an empty
                // body is not mistaken for a malformed line.
                String[] post = line.split(FIELD_SEPARATOR, -1);
                if (post.length < MIN_FIELDS) {
                    System.err.println("Skipping line " + lineNumber + ": expected at least "
                            + MIN_FIELDS + " tab-separated fields but found " + post.length);
                    continue;
                }
                System.out.println(post[TAGS_FIELD]);
                if (countKnownTags(post[TAGS_FIELD], tagSet) >= MIN_MATCHING_TAGS) {
                    String fileName = "sec-" + post[ID_FIELD] + ".txt";
                    String stemmedText =
                            runStemmer(post[TITLE_FIELD] + " " + sanitizeBodyText(post[BODY_FIELD]));
                    writeFile(stemmedText, fileName);
                    filesWritten++;
                }
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
        System.out.println("Total files written: " + filesWritten);
    }

    /**
     * Counts how many tags of a {@code <tag1><tag2>} style field are contained in
     * the given tag set.
     */
    private static int countKnownTags(String tagField, Set<String> knownTags) {
        int count = 0;
        for (String rawTag : tagField.split("><")) {
            String tag = rawTag.replace("<", "").replace(">", "");
            if (knownTags.contains(tag)) {
                count++;
            }
        }
        return count;
    }

    /**
     * Removes {@code <code>...</code>} sections and all remaining markup tags
     * from a post body.
     *
     * <p>Trailing whitespace is stripped and each remaining newline is replaced
     * by a single space, so that words on adjacent lines are not fused together.
     *
     * @param body raw body text; {@code null} is treated as empty
     * @return the body without code sections, tags or newlines
     */
    public static String sanitizeBodyText(String body) {
        if (body == null) {
            return "";
        }
        String withoutCode = CODE_BLOCK.matcher(body).replaceAll("").trim();
        String withoutTags = HTML_TAG.matcher(withoutCode).replaceAll("");
        String trimmed = TRAILING_WHITESPACE.matcher(withoutTags).replaceAll("");
        return trimmed.replace('\n', ' ');
    }

    /**
     * Writes text to a file, creating the file or replacing its content.
     *
     * @param text content to write
     * @param fileName path of the file to write
     * @throws IOException if the file cannot be opened or written
     */
    public static void writeFile(String text, String fileName) throws IOException {
        // try-with-resources closes (and thereby flushes) the writer even when
        // write() fails.
        try (FileWriter writer = new FileWriter(new File(fileName))) {
            writer.write(text);
        }
    }

    /**
     * Lower-cases the text, drops double quotes and URLs, tokenizes it, skips
     * noise tokens and stop words, and stems every remaining token.
     *
     * <p>Requires {@link #setup()} to have run, since it reads the stop-word set
     * and the stemmer. The result is also echoed to standard output.
     *
     * @param value text to process
     * @return the stemmed tokens, each followed by a single space
     */
    private static String runStemmer(String value) {
        // Locale.ROOT keeps lower-casing independent of the machine's default
        // locale, so tokens still match the stop-word list everywhere.
        String unquoted = value.toLowerCase(Locale.ROOT).replace("\"", "");
        String line = URL.matcher(unquoted).replaceAll("");

        StringTokenizer tokenizer = new StringTokenizer(line, TOKEN_DELIMITERS);
        StringBuilder builder = new StringBuilder();
        while (tokenizer.hasMoreTokens()) {
            String token = tokenizer.nextToken();
            if (isNoise(token) || stopWords.contains(token)) {
                continue;
            }
            stemmer.setCurrent(token);
            stemmer.stem();
            builder.append(stemmer.getCurrent()).append(' ');
        }
        String stemmed = builder.toString();
        System.out.println(stemmed);
        return stemmed;
    }

    /** Tells whether a token is a pure number or one of the bare symbols that are dropped. */
    private static boolean isNoise(String token) {
        return DIGITS_ONLY.matcher(token).matches()
                || token.equals("*")
                || token.equals("--")
                || token.equals("+");
    }
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment