Skip to content

Instantly share code, notes, and snippets.

@nickgrealy
Created November 25, 2016 03:16
Show Gist options
  • Save nickgrealy/bde1c63fc689f5c7b7725d86f1a61b81 to your computer and use it in GitHub Desktop.
Save nickgrealy/bde1c63fc689f5c7b7725d86f1a61b81 to your computer and use it in GitHub Desktop.
import java.io.BufferedWriter;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStreamWriter;
import java.io.UncheckedIOException;
import java.net.InetSocketAddress;
import java.net.MalformedURLException;
import java.net.Socket;
import java.net.URL;
import java.nio.charset.StandardCharsets;
import java.util.HashSet;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.stream.Collectors;
import javax.net.ssl.SSLParameters;
import javax.net.ssl.SSLSocket;
import javax.net.ssl.SSLSocketFactory;
/**
 * Minimal web "spider": downloads a page over a raw socket, extracts references to static
 * resources (.js/.css/.png/.jpg/.gif) and follows referenced stylesheets to find more.
 *
 * <p>All members are static and no state is kept between calls.
 */
public class SpiderWeb {

    /**
     * Regular expression to match file types - .js/.css/.png/.jpg/.gif (case-insensitive).
     *
     * <p>Group 1 is the whole reference, group 2 the extension. The trailing negative lookahead
     * stops the extension from being a mere prefix of a longer one, so {@code data.json} is not
     * reported as {@code data.js}.
     */
    public static final Pattern resources = Pattern.compile(
            "([^\"'\n({}]+\\.(js|css|png|jpg|gif))(?![A-Za-z0-9_])",
            Pattern.CASE_INSENSITIVE | Pattern.MULTILINE);

    /** Page crawled by {@link #main(String[])} when no URL is passed on the command line. */
    private static final String DEFAULT_URL =
            "http://stackoverflow.com/questions/40468319/java-filter-images-from-an-http-response";

    /** Connect and read timeout, in milliseconds, applied to every socket. */
    private static final int TIMEOUT_MILLIS = 3000;

    /** Upper bound on stylesheet nesting; guarantees the crawl terminates. */
    private static final int MAX_STYLESHEET_DEPTH = 10;

    /**
     * Prints out all .png/.jpg/.gif resources referenced in the page and its css content.
     *
     * @param args optional; {@code args[0]} is the page URL, otherwise a default page is used
     * @throws Exception if the URL is malformed or the page itself cannot be fetched
     */
    public static void main(String[] args) throws Exception {
        URL url = new URL(args != null && args.length > 0 ? args[0] : DEFAULT_URL);
        System.out.println(recurseResourcesFromHtml(url)
                .stream()
                .filter(r -> hasExtension(r, ".png", ".jpg", ".gif"))
                .collect(Collectors.joining("\n")));
    }

    /**
     * Looks through the document at {@code url} for file resources (.js/.css/.png/.jpg/.gif).
     * It then looks through the referenced .css files for more file resources.
     * This can easily be extended to include '.js' files, or recursively enter 'href' links.
     *
     * <p>Each stylesheet is fetched at most once and nesting is bounded, so stylesheets that
     * reference each other cannot cause unbounded recursion. A stylesheet that cannot be
     * fetched is reported on {@code System.err} and skipped.
     *
     * @param url the page to start from
     * @return mutable set of fully-qualified resource URLs found in the page and its stylesheets
     * @throws UncheckedIOException if {@code url} itself cannot be fetched
     */
    public static Set<String> recurseResourcesFromHtml(URL url) {
        Set<String> visited = new HashSet<>();
        visited.add(url.toString());
        return crawl(url, visited, 0);
    }

    /** Fetches {@code url}, collects its resources and descends into not-yet-visited stylesheets. */
    private static Set<String> crawl(URL url, Set<String> visited, int depth) {
        // get resources from the source document...
        Set<String> found = findResources(url, httpGet(url));
        if (depth >= MAX_STYLESHEET_DEPTH) {
            return found;
        }
        // search css resources for resources...
        Set<String> nested = new HashSet<>();
        for (String resource : found) {
            // visited.add() returns false when this stylesheet has been handled already
            if (!hasExtension(resource, ".css") || !visited.add(resource)) {
                continue;
            }
            try {
                nested.addAll(crawl(new URL(resource), visited, depth + 1));
            } catch (MalformedURLException e) {
                System.err.println("Ignoring incompatible url - '" + resource + "' reason: "
                        + e.getLocalizedMessage());
            } catch (UncheckedIOException e) {
                // one broken stylesheet must not discard everything found so far
                System.err.println("Ignoring unreachable url - '" + resource + "' reason: "
                        + e.getLocalizedMessage());
            }
        }
        found.addAll(nested);
        return found;
    }

    /**
     * Pulls out "resources" from the provided text and resolves them against {@code url}.
     *
     * <ul>
     *   <li>{@code //host/x.png} gets the protocol of {@code url};</li>
     *   <li>{@code http://...} / {@code https://...} is kept as is;</li>
     *   <li>{@code /x.png} is appended to protocol, host and port of {@code url};</li>
     *   <li>anything else is appended to the parent directory of {@code url}.</li>
     * </ul>
     * Everything from the first {@code ?} onwards is removed from the resolved reference.
     * Relative segments such as {@code ../} are not normalised.
     *
     * @param url  location the text was loaded from
     * @param text text to scan; {@code null} or empty yields an empty set
     * @return mutable set of resolved references
     */
    public static Set<String> findResources(URL url, String text) {
        Set<String> found = new HashSet<>();
        if (text == null || text.isEmpty()) {
            return found;
        }
        String baseUrl = getBaseUrl(url);
        String parentPath = getParentPath(url);
        Matcher matcher = resources.matcher(text);
        while (matcher.find()) {
            // unquoted references such as "url( x.png )" carry leading whitespace
            String resource = matcher.group(1).trim();
            String fqResource;
            if (resource.startsWith("//")) {
                fqResource = url.getProtocol() + ":" + resource;
            } else if (isAbsoluteHttpUrl(resource)) {
                fqResource = resource;
            } else if (resource.startsWith("/")) {
                fqResource = baseUrl + resource;
            } else {
                fqResource = parentPath + resource;
            }
            int queryStart = fqResource.indexOf('?');
            if (queryStart >= 0) {
                fqResource = fqResource.substring(0, queryStart);
            }
            found.add(fqResource);
        }
        return found;
    }

    /**
     * Performs a plain HTTP/1.0 GET and returns the raw response decoded as UTF-8.
     * Your own socket code/http implementation goes here...
     *
     * <p>The returned text is everything the server sent, i.e. status line and headers
     * followed by the body. Redirects are not followed. For {@code https} URLs the connection
     * is wrapped in TLS with host name verification.
     *
     * @param url the resource to fetch
     * @return the complete response
     * @throws UncheckedIOException if connecting, writing or reading fails or times out
     */
    public static String httpGet(URL url) {
        System.err.println("Connecting to " + url + "...");
        try (Socket socket = openSocket(url);
                InputStream inputStream = socket.getInputStream();
                BufferedWriter writer = new BufferedWriter(
                        new OutputStreamWriter(socket.getOutputStream(), StandardCharsets.UTF_8))) {
            // send request - getFile() keeps the query string, and the target must start with '/'
            String target = url.getFile();
            if (!target.startsWith("/")) {
                target = "/" + target;
            }
            String hostHeader = url.getPort() == -1 ? url.getHost() : url.getHost() + ":" + url.getPort();
            writer.write("GET " + target + " HTTP/1.0\r\nHost: " + hostHeader + "\r\n\r\n");
            writer.flush();
            // get response - HTTP/1.0 servers close the connection at the end of the response
            ByteArrayOutputStream result = new ByteArrayOutputStream();
            byte[] buffer = new byte[1024];
            int length;
            while ((length = inputStream.read(buffer)) != -1) {
                result.write(buffer, 0, length);
            }
            return new String(result.toByteArray(), StandardCharsets.UTF_8);
        } catch (IOException e) {
            throw new UncheckedIOException("GET " + url + " failed", e);
        }
    }

    /** Opens a connected socket to the host of {@code url}, layered with TLS for https. */
    private static Socket openSocket(URL url) throws IOException {
        String host = url.getHost();
        int port = getPort(url);
        Socket socket = new Socket();
        try {
            socket.setSoTimeout(TIMEOUT_MILLIS);
            socket.connect(new InetSocketAddress(host, port), TIMEOUT_MILLIS);
            if (!"https".equals(url.getProtocol())) {
                return socket;
            }
            SSLSocketFactory factory = (SSLSocketFactory) SSLSocketFactory.getDefault();
            SSLSocket tlsSocket = (SSLSocket) factory.createSocket(socket, host, port, true);
            // raw SSLSockets do not verify the host name unless asked to
            SSLParameters parameters = tlsSocket.getSSLParameters();
            parameters.setEndpointIdentificationAlgorithm("HTTPS");
            tlsSocket.setSSLParameters(parameters);
            return tlsSocket;
        } catch (IOException | RuntimeException e) {
            // do not leak the socket when connecting or the TLS set-up fails
            try {
                socket.close();
            } catch (IOException closeFailure) {
                e.addSuppressed(closeFailure);
            }
            throw e;
        }
    }

    /** Returns true if {@code value} ends with one of {@code extensions}, ignoring case. */
    private static boolean hasExtension(String value, String... extensions) {
        for (String extension : extensions) {
            int offset = value.length() - extension.length();
            if (offset >= 0 && value.regionMatches(true, offset, extension, 0, extension.length())) {
                return true;
            }
        }
        return false;
    }

    /** Returns true if {@code resource} starts with {@code http://} or {@code https://}. */
    private static boolean isAbsoluteHttpUrl(String resource) {
        return resource.regionMatches(true, 0, "http://", 0, 7)
                || resource.regionMatches(true, 0, "https://", 0, 8);
    }

    /**
     * Returns {@code url} up to and including the last '/' of its path, ignoring query and
     * fragment. A URL without any path yields the URL followed by '/'.
     */
    private static String getParentPath(URL url) {
        String urlStr = url.toString();
        int end = urlStr.length();
        int queryStart = urlStr.indexOf('?');
        if (queryStart >= 0) {
            end = Math.min(end, queryStart);
        }
        int fragmentStart = urlStr.indexOf('#');
        if (fragmentStart >= 0) {
            end = Math.min(end, fragmentStart);
        }
        urlStr = urlStr.substring(0, end);
        // the slashes of "://" belong to the scheme separator, not to the path
        int separator = urlStr.indexOf("://");
        int firstPathIndex = separator >= 0 ? separator + 3 : 0;
        int lastSlash = urlStr.lastIndexOf('/');
        return lastSlash >= firstPathIndex ? urlStr.substring(0, lastSlash + 1) : urlStr + "/";
    }

    /** Returns {@code protocol://host:port} of {@code url}; the port is always spelled out. */
    private static String getBaseUrl(URL url) {
        return String.format("%s://%s:%s", url.getProtocol(), url.getHost(), getPort(url));
    }

    /** Returns the explicit port of {@code url}, or 443 for https and 80 for everything else. */
    private static int getPort(URL url) {
        int port = url.getPort();
        return port != -1 ? port : "https".equals(url.getProtocol()) ? 443 : 80;
    }
}
http://cdn.sstatic.net/Sites/stackoverflow/../../img/developer-story/announcement_banner/bg.png
http://cdn.sstatic.net/Sites/stackoverflow/../../img/hero-careers-import.png
http://pixel.quantserve.com/pixel/p-c1rF4kxgLUzNc.gif
http://cdn.sstatic.net/Sites/stackoverflow/../../img/fade.png
http://cdn.sstatic.net/Sites/stackoverflow/../../img/img-upload.png
http://cdn.sstatic.net/Sites/stackoverflow/../../img/documentation/doc-sprites.png
http://cdn.sstatic.net/Sites/stackoverflow/../../img/share-sprite.png
http://cdn.sstatic.net/Sites/stackoverflow/../../img/user-profile-hero.png
http://cdn.sstatic.net/Sites/stackoverflow/../../Img/openid/openid-logos.png
http://cdn.sstatic.net/Sites/stackoverflow/../../Img/diff-icons/sidebyside-markdown.png
http://cdn.sstatic.net/Sites/stackoverflow/../../img/open-graph/checkmark.png
http://cdn.sstatic.net/Sites/stackoverflow/../../Img/jobs/ico-hero-02.png
http://stackoverflow.com/questions/40468319/img[src$=.png
http://cdn.sstatic.net/Sites/stackoverflow/img/[email protected]
http://cdn.sstatic.net/Sites/stackoverflow/../../img/openid-large.png
http://cdn.sstatic.net/Sites/stackoverflow/../../img/documentation/bg-hero.png
http://cdn.sstatic.net/Sites/stackoverflow/img/sprites.png
http://cdn.sstatic.net/Sites/stackoverflow/img/10m-pattern.png
http://cdn.sstatic.net/Sites/stackoverflow/../../img/signup_careers_bg.png
http://cdn.sstatic.net/Sites/stackoverflow/img/apple-touch-icon.png
http://cdn.sstatic.net/Sites/stackoverflow/../../Img/user-profile-graph.png
http://cdn.sstatic.net/Sites/stackoverflow/../../Img/diff-icons/inline-html.png
http://cdn.sstatic.net/Sites/stackoverflow/../../img/developer-story/announcement_banner/story-mockup.png
http://cdn.sstatic.net/Sites/stackoverflow/../../Img/battleship.png
http://sstatic.net/img/progress-dots.gif
http://cdn.sstatic.net/Sites/stackoverflow/../../Img/helpcenter/icon-pushpin.png
http://cdn.sstatic.net/Sites/stackoverflow/../../img/filter-sprites.png
http://cdn.sstatic.net/Sites/stackoverflow/img/wmd-buttons.png
http://cdn.sstatic.net/Sites/stackoverflow/../../Img/diff-icons/sidebyside-html.png
http://cdn.sstatic.net/Sites/stackoverflow/../../img/favicons-sprite32.png
http://cdn.sstatic.net/Sites/stackoverflow/../../img/openid/new-login-sprite.png
https://i.stack.imgur.com/RdhGt.jpg
http://cdn.sstatic.net/Sites/stackoverflow/../../Img/about/sprite-about-shadow.png
http://cdn.sstatic.net/Sites/stackoverflow/../../Img/share-sprite.png
http://cdn.sstatic.net/Sites/stackoverflow/../../img/shared-icons.png
http://cdn.sstatic.net/Sites/stackoverflow/../../Img/user-profile-no-data.png
http://cdn.sstatic.net/Sites/stackoverflow/../../img/favicons-sprite16.png
http://cdn.sstatic.net/Sites/stackoverflow/../../Img/user-profile-sprite.png
http://cdn.sstatic.net/Sites/stackoverflow/../../img/user-profile-sprite.png
http://cdn.sstatic.net/Sites/stackoverflow/../../img/documentation/hero-mockup.png
http://cdn.sstatic.net/Sites/stackoverflow/../../img/fatarrows.png
http://cdn.sstatic.net/Sites/stackoverflow/../../img/signup.png
http://cdn.sstatic.net/Sites/stackoverflow/../../Img/about/sprite-about.png
http://cdn.sstatic.net/Sites/stackoverflow/../../img/timeline.png
http://sstatic.net/Img/progress-dots.gif
http://cdn.sstatic.net/Sites/stackoverflow/../../Img/so-profile-icons-01.png
http://cdn.sstatic.net/Sites/stackoverflow/../../img/share-sprite-new.png
http://cdn.sstatic.net/Sites/stackoverflow/../../img/mini-hero-bg.png
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment