Created
April 4, 2017 16:20
-
-
Save sajidrahman/17ab0674f5b708dfdd93428fea45a9b0 to your computer and use it in GitHub Desktop.
Read, clean up, and stem Stack Overflow posts from a CSV file
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.FileReader;
import java.io.FileWriter;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.nio.charset.StandardCharsets;
import java.util.HashSet;
import java.util.Locale;
import java.util.Scanner;
import java.util.Set;
import java.util.StringTokenizer;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.tartarus.snowball.SnowballStemmer;
public class CSVReader { | |
private static Class stemClass = null; | |
private static SnowballStemmer stemmer = null; | |
private static Set<String> stopWords; | |
private static void setup(){ | |
stopWords = new HashSet<String>(); | |
try { | |
// input the file content to the String "input" | |
File filepath = new File("/Users/sajid/Desktop/stopwords.txt"); | |
Scanner sc = new Scanner(filepath); | |
while (sc.hasNextLine()) { | |
String temp = sc.next(); | |
if(temp != null && !temp.isEmpty()){ | |
stopWords.add(temp); | |
} | |
System.out.println(temp); | |
} | |
sc.close(); | |
} catch (Exception e) { | |
System.out.println("Problem reading file."); | |
} | |
try { | |
stemClass = Class.forName("org.tartarus.snowball.ext." + "english" + "Stemmer"); | |
} catch (ClassNotFoundException e1) { | |
// TODO Auto-generated catch block | |
e1.printStackTrace(); | |
} | |
try { | |
stemmer = (SnowballStemmer) stemClass.newInstance(); | |
} catch (InstantiationException | IllegalAccessException e) { | |
// TODO Auto-generated catch block | |
e.printStackTrace(); | |
} | |
} | |
public static void main(String[] args) { | |
setup(); | |
String csvFile = "/Users/sajid/Desktop/security-posts.txt"; | |
String line = ""; | |
String cvsSplitBy = "\t"; | |
int i = 1; | |
Set<String> tagSet = new HashSet<String>(); | |
try { | |
// input the file content to the String "input" | |
File filepath = new File("/Users/sajid/Desktop/tags.txt"); | |
Scanner sc = new Scanner(filepath); | |
while (sc.hasNextLine()) { | |
String temp = sc.next(); | |
tagSet.add(temp); | |
System.out.println(temp); | |
} | |
sc.close(); | |
} catch (Exception e) { | |
System.out.println("Problem reading file."); | |
} | |
try (BufferedReader br = new BufferedReader(new FileReader(csvFile))) { | |
String fileName = "sec-"; | |
while ((line = br.readLine()) != null) { | |
// use comma as separator | |
String[] post = line.split(cvsSplitBy); | |
System.out.println(post[2]); | |
String[] tags = post[2].split("><"); | |
int count = 0; | |
for(String tag: tags){ | |
if(tag.contains("<")){ | |
tag = tag.replace("<", ""); | |
} | |
if(tag.contains(">")){ | |
tag = tag.replace(">",""); | |
} | |
if(tagSet.contains(tag)) | |
count++; | |
} | |
if(count>=2){ | |
//write to file | |
fileName = "sec-"+post[0]+".txt"; | |
String stemmedText = runStemmer(post[1]+" "+sanitizeBodyText(post[4])); | |
writeFile(stemmedText, fileName); | |
i++; | |
} | |
// System.out.println("SecurityPost [title= " + post[1] + " , body=" + sanitizeBodyText(post[4]) + "]"); | |
} | |
} catch (IOException e) { | |
e.printStackTrace(); | |
} | |
System.out.println("Total files written: "+i); | |
} | |
public static String sanitizeBodyText(String body) { | |
String REGEX1 = "<code>.+?</code>"; | |
String REGEX2 = "<(.|\n)+?>"; | |
String REPLACE = ""; | |
Pattern p1 = Pattern.compile(REGEX1, Pattern.DOTALL); | |
Pattern p2 = Pattern.compile(REGEX2, Pattern.DOTALL); | |
// get a matcher object | |
Matcher m1 = p1.matcher(body); | |
StringBuffer sb1 = new StringBuffer(); | |
while (m1.find()) { | |
m1.appendReplacement(sb1, REPLACE); | |
} | |
m1.appendTail(sb1); | |
// System.out.println(sb1.toString()); | |
String tempBody = sb1.toString().trim(); | |
Matcher m2 = p2.matcher(tempBody); | |
StringBuffer sb2 = new StringBuffer(); | |
while (m2.find()) { | |
m2.appendReplacement(sb2, REPLACE); | |
} | |
m2.appendTail(sb2); | |
tempBody = sb2.toString().replaceAll("\\s+$", ""); | |
tempBody = tempBody.replaceAll("\\n", ""); | |
return tempBody; | |
} | |
public static void writeFile(String text, String fileName)throws IOException { | |
File file = new File(fileName); | |
// creates the file | |
file.createNewFile(); | |
// creates a FileWriter Object | |
FileWriter writer = new FileWriter(file); | |
// Writes the content to the file | |
writer.write(text); | |
writer.flush(); | |
writer.close(); | |
} | |
private static String runStemmer(String value) { | |
String url = "(http?|https?|ftp|file)://[-a-zA-Z0-9+&@#/%?=~_|!:,.;]*[-a-zA-Z0-9+&@#/%=~_|]"; | |
String line = value.toString().toLowerCase().replace("\"", ""); | |
line = line.replaceAll(url, ""); | |
StringTokenizer tokenizer = new StringTokenizer(line, " \t\n\r\f,.:;?![]'"); | |
StringBuilder builder = new StringBuilder(); | |
while (tokenizer.hasMoreTokens()) { | |
String token = tokenizer.nextToken(); | |
//remove digits from the beginning or from any other place of the word | |
// String regex = "\\d+|\\+*|-*|=|\\*"; | |
// token = Pattern.compile(regex, Pattern.DOTALL).matcher(token).replaceAll(REPLACE); | |
// token = token.replaceAll("--", REPLACE); | |
// token = token.replaceAll("=", REPLACE); | |
// token = token.replaceAll("-", REPLACE); | |
String REGEX4digits = "\\d+"; | |
Pattern p1 = Pattern.compile(REGEX4digits); | |
// get a matcher object | |
Matcher m1 = p1.matcher(token); | |
if (m1.matches()||token.equalsIgnoreCase("*") | |
|| token.equalsIgnoreCase("--")|| | |
token.equalsIgnoreCase("+")) | |
continue; | |
else if (stopWords.contains(token)) | |
continue; | |
else if(token.length()==0) | |
continue; | |
else { | |
stemmer.setCurrent(token); | |
stemmer.stem(); | |
String stemmed_word = stemmer.getCurrent(); | |
builder.append(stemmed_word); | |
builder.append(" "); | |
// builder.append(docname.toString()); | |
} | |
} | |
System.out.println(builder.toString()); | |
return builder.toString(); | |
} | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment