Extra credit stub for Monday, February 9th
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.net.URL;
import java.nio.channels.Channels;
import java.nio.channels.ReadableByteChannel;
import java.util.ArrayList;
import java.util.NoSuchElementException;
import java.util.Scanner;

/**
 * Crawls the web.
 */
public class Spider {
  public static void main(String[] args) {
    WebCrawler crawler = new WebCrawler("http://www.google.com", 3);
    crawler.crawl();
  }
}

class WebCrawler {
  // List of URLs to crawl. This only contains startingUrl when created; more URLs are added by
  // addUrls().
  private ArrayList<String> urls;

  // How many links to follow.
  private int depth;

  // Used to give unique names to downloaded files.
  private int counter;

  public WebCrawler(String startingUrl, int depth) {
    this.depth = depth;
    this.urls = new ArrayList<String>();
    urls.add(startingUrl);
    this.counter = 0;
  }
  public void crawl() {
    // TODO: fill in. See the sketch below the file for one possible approach.
  }
  /**
   * Downloads the page at a given URL. Returns the local filename that the page was saved under.
   * Returns null if there was an error downloading the file.
   *
   * The downloaded files are saved in the top-level folder of your project (i.e., as siblings to
   * the src/ directory).
   */
  private String downloadPage(String url) {
    String filename = "page" + counter + ".html";
    counter = counter + 1;
    try {
      URL website = new URL(url);
      ReadableByteChannel rbc = Channels.newChannel(website.openStream());
      FileOutputStream fos = new FileOutputStream(filename);
      System.out.println("At depth " + depth + ", downloading " + url);
      // Download the page at the given URL to the file named above (page0.html, page1.html, ...).
      fos.getChannel().transferFrom(rbc, 0, Long.MAX_VALUE);
      fos.close();
    } catch (IOException e) {
      System.out.println("Failed to download URL " + url + ": " + e.getMessage());
      return null;
    }
    return filename;
  }
  /**
   * Adds any URLs found in the given file to the list of URLs to crawl.
   */
  private void addUrls(String filename) {
    // Read the contents of the file into a String.
    String content;
    try {
      content = new Scanner(new File(filename)).useDelimiter("\\Z").next();
    } catch (FileNotFoundException e) {
      System.out.println("Could not read file " + filename + ": " + e.getMessage());
      return;
    } catch (NoSuchElementException e) {
      System.out.println(filename + " was empty, skipping (" + e.getMessage() + ")");
      return;
    }

    // Look for quoted absolute URLs (e.g. href="http://...") and add each one to the list.
    int urlPosition = content.indexOf("\"http://");
    while (urlPosition >= 0) {
      // The URL runs from just after the opening quote up to the closing quote.
      int endOfUrlPosition = content.indexOf("\"", urlPosition + 1);
      String nextUrl = content.substring(urlPosition + 1, endOfUrlPosition);
      urls.add(nextUrl);
      // Continue scanning the remainder of the file after this URL.
      content = content.substring(endOfUrlPosition + 1);
      urlPosition = content.indexOf("\"http://");
    }
  }
}
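For reference, here is one possible way the crawl() stub could be filled in. This is only a sketch under an assumed interpretation of depth (the number of link-following rounds), not the assignment's intended solution: each round downloads every URL currently in the list with downloadPage(), then refills the list with the URLs harvested from those pages via addUrls().

  // A possible sketch only; the intended semantics of "depth" may differ.
  public void crawl() {
    for (int level = 0; level < depth; level++) {
      // Snapshot the current frontier; addUrls() will refill "urls" with newly found links.
      ArrayList<String> frontier = new ArrayList<String>(urls);
      urls.clear();
      for (String url : frontier) {
        String filename = downloadPage(url);
        if (filename != null) {
          addUrls(filename);
        }
      }
    }
  }

A real crawler would also remember which URLs it has already visited (for example in a HashSet<String>) so the same page is not downloaded twice; that bookkeeping is omitted here to keep the sketch short.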