dnfehren · July 12, 2012 13:35
diff --git a/url_search.java b/url_search.java

 /* Url Finder by Daniel Fehrenbach August 2009
 * 
 * Notes
 * 
 * The regular expression "\\b((https?)://|(www))\\S+" does not handle URLs enclosed in parentheses (i.e. "(http...."); a find+replace in a text editor was used to remove the parentheses from the input beforehand.
 * 
 * 
 * 
 * */

 import java.io.File;
 import java.io.FileNotFoundException;
 import java.io.FileWriter;
 import java.io.IOException;
 import java.util.ArrayList;
 import java.util.Arrays;
 import java.util.Collections;
 import java.util.List;
 import java.util.ListIterator;
 import java.util.Scanner;

 /**
  * Command-line tool that scans a text file for URLs and writes a CSV file
  * listing each distinct host together with the number of times it occurred.
  *
  * <p>The program prompts on standard input for the input file name and then
  * for the output file location. The output starts with a {@code url,count}
  * header row followed by one {@code host,count} row per distinct host, in
  * alphabetical order.
  */
 public class url_search {

 	/** Every whitespace-delimited word of the input that matched the URL pattern, in file order. */
 	public static ArrayList<String> fullUrlList;
 	/** Host part of each usable entry of {@link #fullUrlList}, sorted alphabetically (duplicates kept). */
 	public static ArrayList<String> cleanUrlList;
 	/** One {@code host,count} row per distinct host, in the order of {@link #cleanUrlList}. */
 	public static List<String> countedUrlList;
 	/** Never assigned by this program; retained so existing references to the field still compile. */
 	public static ArrayList<String> finalUrlList;

 	/**
 	 * A word counts as a URL when the whole word matches this pattern.
 	 * Compiled once instead of once per word; the type is fully qualified so
 	 * the file's import list does not have to change.
 	 */
 	private static final java.util.regex.Pattern URL_PATTERN =
 			java.util.regex.Pattern.compile("\\b((https?)://|(www))\\S+");

 	/**
 	 * Prompts for the input and output locations, then extracts, counts and
 	 * writes the hosts.
 	 *
 	 * @param args unused
 	 * @throws FileNotFoundException if the input file cannot be opened
 	 */
 	public static void main(String[] args) throws FileNotFoundException {

 		fullUrlList = new ArrayList<String>();
 		cleanUrlList = new ArrayList<String>();

 		// A single Scanner serves both prompts: a second Scanner on System.in
 		// can miss input that the first one has already buffered. It is not
 		// closed because closing it would close System.in as well.
 		Scanner console = new Scanner(System.in);
 		System.out.println("Please Enter Your Full File Name (example - document.txt) : ");
 		// nextLine() rather than next() so file names containing spaces work.
 		String fileChoice = console.nextLine().trim();

 		System.out.println("Now enter the full path and file name for the location of the output of your list as shown");
 		System.out.println("two backslashes are required between folder names");
 		System.out.println("for example - C:\\\\Documents and Settings\\\\NAME\\\\Desktop\\\\apaurls.csv");
 		System.out.println("Please type the name and location of your final list? : ");
 		String location = console.nextLine().trim();

 		collectUrls(new File(fileChoice));

 		for (String url : fullUrlList) {
 			String host = extractHost(url);
 			if (host != null) {
 				cleanUrlList.add(host);
 			}
 		}
 		Collections.sort(cleanUrlList); // equal hosts become adjacent, which countSortedHosts relies on

 		countedUrlList = countSortedHosts(cleanUrlList);

 		try {
 			writeCsv(location, countedUrlList);
 		} catch (IOException e) {
 			// Report the failure and do not claim that the file was written.
 			System.err.println("Unable to write output file '" + location + "': " + e);
 			return;
 		}

 		System.out.println("File now available at above location");
 	}

 	/** Appends every word of {@code inFile} that matches {@link #URL_PATTERN} to {@link #fullUrlList}. */
 	private static void collectUrls(File inFile) throws FileNotFoundException {
 		Scanner sc = new Scanner(inFile);
 		try {
 			while (sc.hasNextLine()) {
 				for (String word : sc.nextLine().split("\\s")) {
 					if (URL_PATTERN.matcher(word).matches()) {
 						fullUrlList.add(word);
 					}
 				}
 			}
 		} finally {
 			sc.close();
 		}
 	}

 	/**
 	 * Returns the host of {@code url}: the text between {@code http://} or
 	 * {@code https://} and the next slash, or the text before the first slash
 	 * when there is no scheme. Returns {@code null} when a scheme is present
 	 * but no host follows it (for example {@code http:///}).
 	 */
 	private static String extractHost(String url) {
 		String[] parts = url.split("/");
 		if (parts.length == 0) {
 			return null;
 		}
 		if (parts[0].equals("http:") || parts[0].equals("https:")) {
 			// "scheme://host/..." splits into [scheme:, "", host, ...]
 			return (parts.length > 2 && parts[2].length() > 0) ? parts[2] : null;
 		}
 		return parts[0];
 	}

 	/**
 	 * Collapses each run of equal neighbours in {@code sortedHosts} into a
 	 * single {@code host,count} row. The list must already be sorted.
 	 */
 	private static List<String> countSortedHosts(List<String> sortedHosts) {
 		List<String> rows = new ArrayList<String>();
 		int start = 0;
 		while (start < sortedHosts.size()) {
 			String host = sortedHosts.get(start);
 			int end = start + 1;
 			while (end < sortedHosts.size() && host.equals(sortedHosts.get(end))) {
 				end++;
 			}
 			rows.add(host + "," + (end - start));
 			start = end;
 		}
 		return rows;
 	}

 	/** Writes the {@code url,count} header and then {@code rows}, one per line, to {@code location}. */
 	private static void writeCsv(String location, List<String> rows) throws IOException {
 		FileWriter writer = new FileWriter(location);
 		try {
 			writer.append("url");
 			writer.append(',');
 			writer.append("count");
 			writer.append('\n');
 			for (String row : rows) {
 				writer.append(row);
 				writer.append('\n');
 			}
 			writer.flush();
 		} finally {
 			writer.close();
 		}
 	}
 }

	/* Url Finder by Daniel Fehrenbach August 2009
	*
	* Notes
	*
	* The regular expression "\\b((https?)://|(www))\\S+" does not handle URLs enclosed in parentheses (i.e. "(http...."); a find+replace in a text editor was used to remove the parentheses from the input beforehand.
	*
	*
	*
	* */

	import java.io.File;
	import java.io.FileNotFoundException;
	import java.io.FileWriter;
	import java.io.IOException;
	import java.util.ArrayList;
	import java.util.Arrays;
	import java.util.Collections;
	import java.util.List;
	import java.util.ListIterator;
	import java.util.Scanner;

	/**
	 * Command-line tool that scans a text file for URLs and writes a CSV file
	 * listing each distinct host together with the number of times it occurred.
	 *
	 * <p>The program prompts on standard input for the input file name and then
	 * for the output file location. The output starts with a {@code url,count}
	 * header row followed by one {@code host,count} row per distinct host, in
	 * alphabetical order.
	 */
	public class url_search {

		/** Every whitespace-delimited word of the input that matched the URL pattern, in file order. */
		public static ArrayList<String> fullUrlList;
		/** Host part of each usable entry of {@link #fullUrlList}, sorted alphabetically (duplicates kept). */
		public static ArrayList<String> cleanUrlList;
		/** One {@code host,count} row per distinct host, in the order of {@link #cleanUrlList}. */
		public static List<String> countedUrlList;
		/** Never assigned by this program; retained so existing references to the field still compile. */
		public static ArrayList<String> finalUrlList;

		/**
		 * A word counts as a URL when the whole word matches this pattern.
		 * Compiled once instead of once per word; the type is fully qualified so
		 * the file's import list does not have to change.
		 */
		private static final java.util.regex.Pattern URL_PATTERN =
				java.util.regex.Pattern.compile("\\b((https?)://|(www))\\S+");

		/**
		 * Prompts for the input and output locations, then extracts, counts and
		 * writes the hosts.
		 *
		 * @param args unused
		 * @throws FileNotFoundException if the input file cannot be opened
		 */
		public static void main(String[] args) throws FileNotFoundException {

			fullUrlList = new ArrayList<String>();
			cleanUrlList = new ArrayList<String>();

			// A single Scanner serves both prompts: a second Scanner on System.in
			// can miss input that the first one has already buffered. It is not
			// closed because closing it would close System.in as well.
			Scanner console = new Scanner(System.in);
			System.out.println("Please Enter Your Full File Name (example - document.txt) : ");
			// nextLine() rather than next() so file names containing spaces work.
			String fileChoice = console.nextLine().trim();

			System.out.println("Now enter the full path and file name for the location of the output of your list as shown");
			System.out.println("two backslashes are required between folder names");
			System.out.println("for example - C:\\\\Documents and Settings\\\\NAME\\\\Desktop\\\\apaurls.csv");
			System.out.println("Please type the name and location of your final list? : ");
			String location = console.nextLine().trim();

			collectUrls(new File(fileChoice));

			for (String url : fullUrlList) {
				String host = extractHost(url);
				if (host != null) {
					cleanUrlList.add(host);
				}
			}
			Collections.sort(cleanUrlList); // equal hosts become adjacent, which countSortedHosts relies on

			countedUrlList = countSortedHosts(cleanUrlList);

			try {
				writeCsv(location, countedUrlList);
			} catch (IOException e) {
				// Report the failure and do not claim that the file was written.
				System.err.println("Unable to write output file '" + location + "': " + e);
				return;
			}

			System.out.println("File now available at above location");
		}

		/** Appends every word of {@code inFile} that matches {@link #URL_PATTERN} to {@link #fullUrlList}. */
		private static void collectUrls(File inFile) throws FileNotFoundException {
			Scanner sc = new Scanner(inFile);
			try {
				while (sc.hasNextLine()) {
					for (String word : sc.nextLine().split("\\s")) {
						if (URL_PATTERN.matcher(word).matches()) {
							fullUrlList.add(word);
						}
					}
				}
			} finally {
				sc.close();
			}
		}

		/**
		 * Returns the host of {@code url}: the text between {@code http://} or
		 * {@code https://} and the next slash, or the text before the first slash
		 * when there is no scheme. Returns {@code null} when a scheme is present
		 * but no host follows it (for example {@code http:///}).
		 */
		private static String extractHost(String url) {
			String[] parts = url.split("/");
			if (parts.length == 0) {
				return null;
			}
			if (parts[0].equals("http:") || parts[0].equals("https:")) {
				// "scheme://host/..." splits into [scheme:, "", host, ...]
				return (parts.length > 2 && parts[2].length() > 0) ? parts[2] : null;
			}
			return parts[0];
		}

		/**
		 * Collapses each run of equal neighbours in {@code sortedHosts} into a
		 * single {@code host,count} row. The list must already be sorted.
		 */
		private static List<String> countSortedHosts(List<String> sortedHosts) {
			List<String> rows = new ArrayList<String>();
			int start = 0;
			while (start < sortedHosts.size()) {
				String host = sortedHosts.get(start);
				int end = start + 1;
				while (end < sortedHosts.size() && host.equals(sortedHosts.get(end))) {
					end++;
				}
				rows.add(host + "," + (end - start));
				start = end;
			}
			return rows;
		}

		/** Writes the {@code url,count} header and then {@code rows}, one per line, to {@code location}. */
		private static void writeCsv(String location, List<String> rows) throws IOException {
			FileWriter writer = new FileWriter(location);
			try {
				writer.append("url");
				writer.append(',');
				writer.append("count");
				writer.append('\n');
				for (String row : rows) {
					writer.append(row);
					writer.append('\n');
				}
				writer.flush();
			} finally {
				writer.close();
			}
		}
	}