Skip to content

Instantly share code, notes, and snippets.

@dnfehren
Created July 12, 2012 13:35
Show Gist options
  • Save dnfehren/3098144 to your computer and use it in GitHub Desktop.
Save dnfehren/3098144 to your computer and use it in GitHub Desktop.
Java URL finder: reads a text file and outputs a CSV of the identified URLs and the number of times each domain appeared.
/* Url Finder by Daniel Fehrenbach August 2009
*
* Notes
*
* The regular expression "\\b((https?)://|(www))\\S+" does not match URLs enclosed in parentheses (e.g. "(http...."). As a workaround, a find+replace in a text editor was used to remove the parentheses from the input before running.
*
*
*
* */
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileWriter;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.List;
import java.util.ListIterator;
import java.util.Scanner;
import java.util.regex.Pattern;
/**
 * Command-line URL finder.
 *
 * <p>Prompts on standard input for an input text file and an output path, collects every
 * whitespace-delimited token of the input file that looks like a URL, reduces each URL to its
 * host part, and writes a CSV with a {@code url,count} header followed by one row per distinct
 * host and the number of times that host occurred.
 */
public class url_search {

    /** Every token of the input file that matched {@link #URL_PATTERN}, in file order. */
    public static ArrayList<String> fullUrlList;

    /** Host part of each entry of {@link #fullUrlList}, sorted in natural String order. */
    public static ArrayList<String> cleanUrlList;

    /** One {@code "host,count"} row per distinct host, in the same order as the CSV body. */
    public static List<String> countedUrlList;

    /** Never assigned by this program; kept so existing references to the field still compile. */
    public static ArrayList<String> finalUrlList;

    /** A token counts as a URL when it starts with http://, https:// or www. Compiled once. */
    private static final Pattern URL_PATTERN = Pattern.compile("\\b((https?)://|(www))\\S+");

    /** Leading scheme that is dropped before the host is extracted. */
    private static final Pattern SCHEME_PREFIX = Pattern.compile("^https?://");

    /**
     * Runs the interactive URL search.
     *
     * @param args ignored
     * @throws FileNotFoundException if the input file named at the first prompt cannot be opened
     */
    public static void main(String[] args) throws FileNotFoundException {
        fullUrlList = new ArrayList<String>();
        cleanUrlList = new ArrayList<String>();

        // One Scanner for both answers: a Scanner reads ahead into its own buffer, so a second
        // Scanner on System.in can miss input the first one already consumed. It is deliberately
        // not closed, because closing it would close System.in as well.
        Scanner console = new Scanner(System.in);
        System.out.println("Please Enter Your Full File Name (example - document.txt) : ");
        String fileChoice = console.nextLine().trim();
        System.out.println("Now enter the full path and file name for the location of the output of your list as shown");
        System.out.println("two backslashes are required between folder names");
        System.out.println("for example - C:\\\\Documents and Settings\\\\NAME\\\\Desktop\\\\apaurls.csv");
        System.out.println("Please type the name and location of your final list? : ");
        // The typed text is handed to FileWriter unchanged; no escape processing is applied.
        String location = console.nextLine().trim();

        fullUrlList.addAll(readUrls(new File(fileChoice)));

        for (String url : fullUrlList) {
            String host = hostOf(url);
            // A token such as "http:///" has no host at all; skip it instead of failing.
            if (!host.isEmpty()) {
                cleanUrlList.add(host);
            }
        }
        // Sorting puts equal hosts next to each other, which countHosts relies on.
        Collections.sort(cleanUrlList);
        countedUrlList = countHosts(cleanUrlList);

        try {
            writeCsv(location, countedUrlList);
        } catch (IOException e) {
            // Report the failure and do not claim success below.
            System.err.println("Could not write " + location + ": " + e.getMessage());
            return;
        }
        System.out.println("File now available at above location");
    }

    /** Returns every whitespace-delimited token of {@code inFile} that matches URL_PATTERN. */
    private static List<String> readUrls(File inFile) throws FileNotFoundException {
        List<String> urls = new ArrayList<String>();
        try (Scanner sc = new Scanner(inFile)) {
            while (sc.hasNextLine()) {
                for (String word : sc.nextLine().split("\\s")) {
                    if (URL_PATTERN.matcher(word).matches()) {
                        urls.add(word);
                    }
                }
            }
        }
        return urls;
    }

    /** Strips a leading http:// or https:// and everything from the first "/" onward. */
    private static String hostOf(String url) {
        String rest = SCHEME_PREFIX.matcher(url).replaceFirst("");
        int slash = rest.indexOf('/');
        return slash < 0 ? rest : rest.substring(0, slash);
    }

    /** Collapses a sorted host list into one "host,count" row per run of equal hosts. */
    private static List<String> countHosts(List<String> sortedHosts) {
        List<String> rows = new ArrayList<String>();
        int start = 0;
        while (start < sortedHosts.size()) {
            String host = sortedHosts.get(start);
            int end = start;
            while (end < sortedHosts.size() && sortedHosts.get(end).equals(host)) {
                end++;
            }
            rows.add(host + "," + (end - start));
            start = end;
        }
        return rows;
    }

    /** Writes the "url,count" header and then each row on its own line to {@code location}. */
    private static void writeCsv(String location, List<String> rows) throws IOException {
        try (FileWriter writer = new FileWriter(location)) {
            writer.append("url");
            writer.append(',');
            writer.append("count");
            writer.append('\n');
            for (String row : rows) {
                writer.append(row);
                writer.append('\n');
            }
        }
    }
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment