Created
July 12, 2012 13:35
-
-
Save dnfehren/3098144 to your computer and use it in GitHub Desktop.
Java URL finder, reads a text file and outputs a csv with identified urls and the number of times the domain appeared.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/* Url Finder by Daniel Fehrenbach, August 2009
 *
 * Notes
 *
 * The regular expression "\\b((https?)://|(www))\\S+" does not handle URLs enclosed in
 * parentheses (i.e. "(http...."). A find-and-replace in a text editor was used to strip
 * those parentheses from the input text before running this program.
 */
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileWriter;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.List;
import java.util.ListIterator;
import java.util.Map;
import java.util.Scanner;
import java.util.TreeMap;
import java.util.regex.Pattern;
/**
 * Command-line URL finder.
 *
 * <p>Prompts on standard input for the name of a text file and for the path of a CSV file to
 * create, collects every whitespace-delimited token of the text file that looks like a URL,
 * reduces each URL to its host part, and writes one {@code host,count} row per distinct host
 * (sorted alphabetically) under a {@code url,count} header.
 */
public class url_search {

    /** Every token of the input file that matched the URL pattern, in file order. */
    public static ArrayList<String> fullUrlList;

    /** Host part of each entry of {@link #fullUrlList}, sorted alphabetically, duplicates kept. */
    public static ArrayList<String> cleanUrlList;

    /** One {@code "host,count"} row per distinct host, in alphabetical order of host. */
    public static List<String> countedUrlList;

    /** Never assigned by this class; retained so that existing references keep compiling. */
    public static ArrayList<String> finalUrlList;

    /** A token is treated as a URL when it starts with http://, https:// or www. Compiled once. */
    private static final Pattern URL_TOKEN = Pattern.compile("\\b((https?)://|(www))\\S+");

    /**
     * Runs the interactive URL count.
     *
     * @param args ignored
     * @throws FileNotFoundException if the input file named by the user cannot be opened
     */
    public static void main(String[] args) throws FileNotFoundException {
        fullUrlList = new ArrayList<String>();
        cleanUrlList = new ArrayList<String>();

        // A single Scanner serves both prompts: a Scanner reads ahead and buffers, so a second
        // Scanner on System.in would miss input that the first one has already consumed
        // (always the case when stdin is piped or redirected). It is deliberately not closed,
        // because closing it would close System.in itself.
        Scanner console = new Scanner(System.in);
        System.out.println("Please Enter Your Full File Name (example - document.txt) : ");
        // Whole-line read so that file names containing spaces are accepted.
        String fileChoice = console.nextLine().trim();
        System.out.println("Now enter the full path and file name for the location of the output of your list as shown");
        System.out.println("two backslashes are required between folder names");
        System.out.println("for example - C:\\\\Documents and Settings\\\\NAME\\\\Desktop\\\\apaurls.csv");
        System.out.println("Please type the name and location of your final list? : ");
        String location = console.nextLine().trim();

        // Pass 1: collect every URL-looking token of the input file.
        Scanner sc = new Scanner(new File(fileChoice));
        try {
            while (sc.hasNextLine()) {
                collectUrls(sc.nextLine(), fullUrlList);
            }
        } finally {
            sc.close();
        }

        // Pass 2: reduce each URL to its host and sort the hosts alphabetically.
        for (String url : fullUrlList) {
            String host = hostOf(url);
            // A token such as "http:///" has no host at all; it is not a domain, so skip it.
            if (!host.isEmpty()) {
                cleanUrlList.add(host);
            }
        }
        Collections.sort(cleanUrlList);

        // Pass 3: count occurrences per host. TreeMap keeps the rows in the same alphabetical
        // order as the sorted list and yields exactly one row per distinct host.
        Map<String, Integer> counts = new TreeMap<String, Integer>();
        for (String host : cleanUrlList) {
            Integer seen = counts.get(host);
            counts.put(host, seen == null ? 1 : seen + 1);
        }
        countedUrlList = new ArrayList<String>(counts.size());
        for (Map.Entry<String, Integer> entry : counts.entrySet()) {
            countedUrlList.add(entry.getKey() + "," + entry.getValue());
        }

        // Pass 4: write the CSV. Success is only reported when the file was actually written.
        try {
            writeCsv(location, countedUrlList);
            System.out.println("File now available at above location");
        } catch (IOException e) {
            System.err.println("Could not write " + location + ": " + e.getMessage());
            e.printStackTrace();
        }
    }

    /** Appends to {@code sink} every whitespace-delimited token of {@code line} that looks like a URL. */
    private static void collectUrls(String line, List<String> sink) {
        for (String token : line.split("\\s+")) {
            if (URL_TOKEN.matcher(token).matches()) {
                sink.add(token);
            }
        }
    }

    /** Returns {@code url} without a leading http:// or https:// and without anything from the first '/' on. */
    private static String hostOf(String url) {
        String rest = url;
        if (rest.startsWith("https://")) {
            rest = rest.substring("https://".length());
        } else if (rest.startsWith("http://")) {
            rest = rest.substring("http://".length());
        }
        int slash = rest.indexOf('/');
        return slash < 0 ? rest : rest.substring(0, slash);
    }

    /** Writes the {@code url,count} header followed by {@code rows}, one per line, closing the file even on failure. */
    private static void writeCsv(String location, List<String> rows) throws IOException {
        FileWriter writer = new FileWriter(location);
        try {
            writer.append("url");
            writer.append(',');
            writer.append("count");
            writer.append('\n');
            for (String row : rows) {
                writer.append(row);
                writer.append('\n');
            }
            writer.flush();
        } finally {
            writer.close();
        }
    }
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment