Created
October 8, 2010 17:04
-
-
Save alexzhan/617128 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import java.io.BufferedReader; | |
import java.io.BufferedWriter; | |
import java.io.FileWriter; | |
import java.io.IOException; | |
import java.io.InputStreamReader; | |
import java.net.MalformedURLException; | |
import java.net.URL; | |
import java.net.URLConnection; | |
import java.util.ArrayList; | |
import java.util.Date; | |
/**
 * Fetches a single HTML page and collects the URLs of the links found on it.
 *
 * <p>Absolute links (starting with {@code http}) are kept as they are,
 * site-relative links (starting with {@code /}) are prefixed with the
 * scheme-and-host part of the page URL, and every other link (bare {@code /},
 * document-relative paths, {@code javascript:}, {@code mailto:}, ...) is
 * ignored. The collected URLs are printed to standard error and written to a
 * file, one per line.
 *
 * <p>Usage: {@code java YangInitial [pageUrl [charset [outputFile]]]}. Each
 * omitted argument falls back to the value that used to be hard-coded:
 * {@code http://www.swjtu.edu.cn}, {@code gb2312} and
 * {@code /home/alex/Desktop/urlList}.
 *
 * @author alexzhan
 */
public class YangInitial {

    /** Page fetched when no URL is given on the command line. */
    private static final String DEFAULT_URL = "http://www.swjtu.edu.cn";

    /** Charset used to decode the page when none is given on the command line. */
    private static final String DEFAULT_CHARSET = "gb2312";

    /** File the URL list is written to when none is given on the command line. */
    private static final String DEFAULT_OUTPUT_FILE = "/home/alex/Desktop/urlList";

    private static final String ANCHOR_OPEN = "<a";
    private static final String ANCHOR_CLOSE = "</a";
    private static final String HREF_ATTR = "href=";

    /** Every URL collected so far; grows with each call to {@code generateURL}. */
    private static final ArrayList<String> urlList = new ArrayList<String>();

    /**
     * Downloads the document at {@code urlString} and returns it as text with
     * every line terminated by {@code '\n'}.
     *
     * <p>This is best-effort: on a malformed URL or an I/O failure a message is
     * printed and whatever was read up to that point (possibly the empty
     * string) is returned.
     *
     * @param urlString   address of the page to fetch
     * @param charsetName name of the charset used to decode the response body
     * @return the text read from the page, never {@code null}
     */
    private static String getDocumentAt(String urlString, String charsetName) {
        StringBuilder htmlText = new StringBuilder();
        // Track the raw stream (not the reader) so it is still released when
        // constructing the InputStreamReader fails on an unknown charset.
        java.io.InputStream stream = null;
        try {
            URL url = new URL(urlString);
            URLConnection conn = url.openConnection();
            stream = conn.getInputStream();
            BufferedReader reader = new BufferedReader(new InputStreamReader(stream, charsetName));
            String line;
            while ((line = reader.readLine()) != null) {
                htmlText.append(line).append('\n');
            }
        } catch (MalformedURLException e) {
            System.out.println("invalid URL: " + urlString);
        } catch (IOException e) {
            System.err.println("failed to read " + urlString + ": " + e);
        } finally {
            if (stream != null) {
                try {
                    stream.close();
                } catch (IOException ignored) {
                    // Nothing useful can be done about a failed close of a read-only stream.
                }
            }
        }
        return htmlText.toString();
    }

    /**
     * Writes the given URLs to {@code outputFile}, one per line, each line
     * terminated by {@code "\r\n"}. An existing file is overwritten.
     *
     * @param list       URLs to write
     * @param outputFile path of the file to create
     * @throws IOException if the file cannot be opened or written
     */
    private static void writeToFile(ArrayList<String> list, String outputFile) throws IOException {
        BufferedWriter writer = new BufferedWriter(new FileWriter(outputFile));
        try {
            for (String entry : list) {
                writer.write(entry + "\r\n");
            }
        } finally {
            // Always release the file handle, even when a write fails part-way.
            writer.close();
        }
    }

    /**
     * Extracts the link targets from {@code html}.
     *
     * <p>The text between each {@code <a} and the following {@code </a} (or the
     * end of the document when the anchor is never closed) is searched for an
     * {@code href=} attribute. Double-quoted, single-quoted and unquoted values
     * are understood. Anchors without an {@code href=} are skipped.
     *
     * <p>Package-private so the parsing can be exercised without touching the
     * network or the file system.
     *
     * @param html    markup to scan; {@code null} is treated as empty
     * @param pageUrl URL of the page the markup came from, used to resolve
     *                links that start with {@code /}
     * @return the links in document order, never {@code null}
     */
    static ArrayList<String> extractLinks(String html, String pageUrl) {
        ArrayList<String> links = new ArrayList<String>();
        if (html == null || pageUrl == null) {
            return links;
        }
        String siteRoot = siteRootOf(pageUrl);
        int start = html.indexOf(ANCHOR_OPEN);
        while (start != -1) {
            int end = html.indexOf(ANCHOR_CLOSE, start);
            if (end == -1) {
                // Unclosed anchor: examine the rest of the document instead of failing.
                end = html.length();
            }
            String href = hrefOf(html.substring(start, end));
            if (href != null) {
                if (href.startsWith("http")) {
                    links.add(href);
                } else if (href.startsWith("/") && !href.equals("/")) {
                    links.add(siteRoot + href);
                }
            }
            // end > start always holds, so the scan position strictly advances.
            start = html.indexOf(ANCHOR_OPEN, end);
        }
        return links;
    }

    /**
     * Returns the scheme-and-host prefix of {@code pageUrl}, i.e. everything
     * before the first {@code /} of the path, or the whole string when there
     * is no path.
     */
    private static String siteRootOf(String pageUrl) {
        int schemeEnd = pageUrl.indexOf("://");
        int hostStart = (schemeEnd == -1) ? 0 : schemeEnd + 3;
        int pathStart = pageUrl.indexOf('/', hostStart);
        return (pathStart == -1) ? pageUrl : pageUrl.substring(0, pathStart);
    }

    /**
     * Returns the value of the first {@code href=} attribute in {@code anchor},
     * or {@code null} when there is none or its quoted value is unterminated.
     */
    private static String hrefOf(String anchor) {
        int attr = anchor.indexOf(HREF_ATTR);
        if (attr == -1) {
            return null;
        }
        int limit = anchor.length();
        int valueStart = attr + HREF_ATTR.length();
        while (valueStart < limit && Character.isWhitespace(anchor.charAt(valueStart))) {
            valueStart++;
        }
        if (valueStart >= limit) {
            return null;
        }
        char first = anchor.charAt(valueStart);
        if (first == '"' || first == '\'') {
            int valueEnd = anchor.indexOf(first, valueStart + 1);
            return (valueEnd == -1) ? null : anchor.substring(valueStart + 1, valueEnd);
        }
        // Unquoted value: it runs until whitespace or the end of the opening tag.
        int valueEnd = valueStart;
        while (valueEnd < limit
                && !Character.isWhitespace(anchor.charAt(valueEnd))
                && anchor.charAt(valueEnd) != '>') {
            valueEnd++;
        }
        return anchor.substring(valueStart, valueEnd);
    }

    /**
     * Collects the links of {@code string}, adds them to the running URL list,
     * prints the list to standard error and writes it to {@code outputFile}.
     *
     * @param string     HTML text of the page
     * @param urlString  URL the page was fetched from
     * @param outputFile path of the file the URL list is written to
     * @throws IOException if the URL list cannot be written
     */
    private static void generateURL(String string, String urlString, String outputFile)
            throws IOException {
        urlList.addAll(extractLinks(string, urlString));
        System.err.println("the number of url of this page:" + urlList.size());
        for (String collected : urlList) {
            System.err.println(collected);
        }
        writeToFile(urlList, outputFile);
    }

    /** Returns {@code args[index]} when present, otherwise {@code fallback}. */
    private static String argOrDefault(String[] args, int index, String fallback) {
        return (args != null && args.length > index) ? args[index] : fallback;
    }

    /**
     * Fetches the page, extracts its links, writes them out and reports the
     * elapsed wall-clock time in milliseconds.
     *
     * @param args optional {@code [pageUrl [charset [outputFile]]]}
     * @throws IOException if the URL list cannot be written
     */
    public static void main(String[] args) throws IOException {
        String url = argOrDefault(args, 0, DEFAULT_URL);
        String charsetName = argOrDefault(args, 1, DEFAULT_CHARSET);
        String outputFile = argOrDefault(args, 2, DEFAULT_OUTPUT_FILE);

        long startedAt = System.currentTimeMillis();
        String html = getDocumentAt(url, charsetName);
        generateURL(html, url, outputFile);
        long processtime = System.currentTimeMillis() - startedAt;
        System.out.println("Done with time(" + processtime + ")ms");
    }
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment