Skip to content

Instantly share code, notes, and snippets.

@Viacheslav77
Created March 19, 2016 02:02
Show Gist options
  • Select an option

  • Save Viacheslav77/07200d5f6e2dd363fd91 to your computer and use it in GitHub Desktop.

Select an option

Save Viacheslav77/07200d5f6e2dd363fd91 to your computer and use it in GitHub Desktop.
Скачать все HTML файлы, доступные по ссылкам в отдельный каталог.
package ParseHTML;
// Download every HTML file reachable through the links on a page into a separate directory.
import java.io.BufferedReader;
import java.io.File;
import java.io.FileWriter;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.net.HttpURLConnection;
import java.net.URL;
import java.net.URLConnection;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.nio.file.StandardCopyOption;
import java.util.ArrayList;
import java.util.LinkedHashSet;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
 * Fetches a start page, collects every double-quoted {@code http://} link found in it and
 * saves the resource behind each link as a separate file in a target directory.
 */
public class MainDownload {
    /** Links collected by {@code getTextURL}; {@code setSave} downloads each of them. */
    static List<String> list = new ArrayList<String>();

    /** Maximum time to wait for a connection; without it a dead host blocks forever. */
    private static final int CONNECT_TIMEOUT_MS = 10000;

    /** Maximum time to wait for data on an open connection. */
    private static final int READ_TIMEOUT_MS = 30000;

    /**
     * Downloads the hard-coded start page, extracts its links and saves every linked
     * resource under the hard-coded output prefix.
     *
     * @param args ignored
     * @throws Exception if the start page cannot be read or the output directory
     *                   cannot be created
     */
    public static void main(String args[]) throws Exception {
        String path = "http://meta.ua";
        String pathToList = "d:\\1\\";
        String html = getHTML(path);
        // A double quote, "http://", then the shortest run of characters up to the next
        // double quote. Only plain-http links are matched.
        String text = "\"http://.*?\\\"";
        getTextURL(html, text);
        setSave(pathToList);
    }

    /**
     * Appends every match of {@code text} in {@code html} to {@link #list}, with all
     * double quotes stripped from the match.
     *
     * @param html page source to scan
     * @param text regular expression describing a link; matched case-insensitively
     */
    private static void getTextURL(String html, String text) {
        Pattern p = Pattern.compile(text, Pattern.CASE_INSENSITIVE);
        Matcher m = p.matcher(html);
        while (m.find()) {
            list.add(m.group().replace("\"", ""));
        }
    }

    /**
     * Opens the resource at {@code urlStr} for reading, with connect and read timeouts set.
     * No cast to a protocol-specific connection is made, so any URL scheme the JVM
     * supports can be read.
     *
     * @param urlStr absolute URL of the resource
     * @return the resource's byte stream; the caller must close it
     * @throws IOException if the URL is malformed or the resource cannot be opened
     */
    private static InputStream openStream(String urlStr) throws IOException {
        URLConnection connection = new URL(urlStr).openConnection();
        connection.setConnectTimeout(CONNECT_TIMEOUT_MS);
        connection.setReadTimeout(READ_TIMEOUT_MS);
        return connection.getInputStream();
    }

    /**
     * Reads the resource at {@code urlStr} as text, one line at a time, terminating every
     * line with {@code '\n'}.
     *
     * @param urlStr absolute URL of the page
     * @return the page text decoded as UTF-8
     * @throws IOException if the page cannot be opened or read
     */
    private static String getHTML(String urlStr) throws IOException {
        StringBuilder sb = new StringBuilder();
        // Decode with a fixed charset so the result does not depend on the platform default.
        try (BufferedReader in = new BufferedReader(
                new InputStreamReader(openStream(urlStr), StandardCharsets.UTF_8))) {
            String inputLine;
            while ((inputLine = in.readLine()) != null) {
                sb.append(inputLine).append('\n');
            }
        }
        return sb.toString();
    }

    /**
     * Turns a link into a flat file name: the {@code http://} prefix is dropped, path and
     * query punctuation is replaced or removed, and {@code .html} is appended.
     *
     * @param url link as stored in {@link #list}
     * @return file name without any directory part
     */
    private static String toFileName(String url) {
        return url.replace("http://", "")
                .replace("/", "_")
                .replace(";", "_")
                .replace("&", "_")
                .replace("?", "")
                .replace(">", "")
                .replace("=", "")
                .replace(".", "_")
                .replace("\"", "")
                // Remaining characters that are illegal in Windows file names
                // (e.g. the ':' of "host:8080").
                .replaceAll("[\\\\:*<|]", "_")
                + ".html";
    }

    // Write every link found on the page to its own file.
    /**
     * Downloads each distinct link in {@link #list} and stores it byte-for-byte in a file
     * named {@code pathToList + <sanitised link> + ".html"}. A link that cannot be
     * downloaded is reported on {@code System.err} and skipped, so one dead link does not
     * abort the remaining downloads.
     *
     * @param pathToList prefix prepended to every file name, normally a directory path
     *                   ending with a separator; the directory is created when missing
     * @throws IOException if the output directory cannot be created
     */
    public static void setSave(String pathToList) throws IOException {
        // LinkedHashSet drops repeated links (pages often link the same URL several
        // times) while keeping the order in which they were found.
        for (String s : new LinkedHashSet<String>(list)) {
            String fileName = pathToList + toFileName(s);
            Path target = Paths.get(fileName);
            Path parent = target.getParent();
            if (parent != null && !Files.isDirectory(parent)) {
                Files.createDirectories(parent);
            }
            // Copy raw bytes: decoding and re-encoding as text would corrupt images and
            // pages served in another charset.
            try (InputStream in = openStream(s)) {
                Files.copy(in, target, StandardCopyOption.REPLACE_EXISTING);
                System.out.println("File ... " + fileName + " ... save");
            } catch (IOException e) {
                System.err.println("File ... " + fileName + " ... skipped (" + s + "): " + e);
            }
        }
    }
}
File ... d:\1\www_w3_org_TR_html4_strict_dtd.html ... save
File ... d:\1\meta_ua_favicon_small_ico.html ... save
File ... d:\1\static_meta_ua_css_meta_cssv0_4.html ... save
File ... d:\1\static_meta_ua_js_cook_js.html ... save
File ... d:\1\meta_ua_addons_search_ie-opensearch_xml.html ... save
File ... d:\1\meta_ua_addons_search_market-ie-opensearch_xml.html ... save
File ... d:\1\meta_ua_addons_search_news-ie-opensearch_xml.html ... save
File ... d:\1\static_meta_ua_js_meta_pack_plus_jsv1_99.html ... save
File ... d:\1\static_meta_ua_js_ajax_h_js.html ... save
File ... d:\1\static_meta_ua_js_swfobject_jsv0_1.html ... save
File ... d:\1\static_meta_ua_js_dd_js.html ... save
File ... d:\1\static_meta_ua_css_boxes1_cssv0_2.html ... save
File ... d:\1\static_meta_ua_css_blocks2_cssv0_2.html ... save
File ... d:\1\static_meta_ua_js_blocks_jsv0_1.html ... save
File ... d:\1\meta_ua.html ... save
File ... d:\1\search_meta_ua.html ... save
File ... d:\1\static_meta_ua_js_z_jsv0_2.html ... save
File ... d:\1\meta_ua_.html ... save
File ... d:\1\meta_ua_img_m_logo_t_gif.html ... save
File ... d:\1\search_meta_ua_.html ... save
File ... d:\1\search_meta_ua_search_asp.html ... save
File ... d:\1\meta_ua_reestr_asp.html ... save
File ... d:\1\dir_meta_ua_search_php.html ... save
File ... d:\1\news_meta_ua_.html ... save
File ... d:\1\news_meta_ua_.html ... save
File ... d:\1\map_meta_ua_.html ... save
File ... d:\1\map_meta_ua_.html ... save
File ... d:\1\edu_meta_ua_.html ... save
File ... d:\1\edu_meta_ua_.html ... save
File ... d:\1\metamarket_ua_.html ... save
File ... d:\1\metamarket_ua_search_php.html ... save
File ... d:\1\search_meta_ua_search_asp.html ... save
File ... d:\1\static_meta_ua_js_keyb_constant_jsv02.html ... save
File ... d:\1\static_meta_ua_js_keyb_keyb_jsv02.html ... save
File ... d:\1\static_meta_ua_img_ico_barv35_gif.html ... save
File ... d:\1\ad4sell_com_js_mtb_js.html ... save
File ... d:\1\translate_meta_ua_ru_noredir.html ... save
File ... d:\1\translate_meta_ua_ru_.html ... save
File ... d:\1\translate_meta_ua_images_btn-translate-ru_png.html ... save
File ... d:\1\meta_ua_sitemap_asp.html ... save
File ... d:\1\news_meta_ua_slices_news_html.html ... save
File ... d:\1\news_meta_ua.html ... save
File ... d:\1\news_meta_ua_topnews_.html ... save
File ... d:\1\metamarket_ua_.html ... save
File ... d:\1\dir_meta_ua_.html ... save
File ... d:\1\sport_ua_.html ... save
File ... d:\1\auto_meta_ua_.html ... save
File ... d:\1\auto_meta_ua_search_full_.html ... save
File ... d:\1\auto_meta_ua_.html ... save
File ... d:\1\veslo_org.html ... save
File ... d:\1\veslo_org_v_13952_html.html ... save
File ... d:\1\veslo_org_meta_hair09_jpg.html ... save
File ... d:\1\veslo_org_v_13949_html.html ... save
File ... d:\1\veslo_org_v_13950_html.html ... save
File ... d:\1\veslo_org_v_13951_html.html ... save
File ... d:\1\horo_meta_ua.html ... save
File ... d:\1\pogoda_meta_ua_slice_.html ... save
File ... d:\1\meta_ua_feeds_finance_asp.html ... save
File ... d:\1\meta_ua_finance_.html ... save
File ... d:\1\tv_meta_ua_.html ... save
File ... d:\1\forum_meta_ua.html ... save
File ... d:\1\meta_ua_feeds_forum_asp.html ... save
File ... d:\1\dating_meta_ua_.html ... save
File ... d:\1\dating_meta_ua_.html ... save
File ... d:\1\chat_meta_ua_.html ... save
File ... d:\1\chat_meta_ua.html ... save
File ... d:\1\meta_ua_img_b4_1000x180_1_jpg.html ... save
File ... d:\1\cobrand_ria_com_js_ria_informer_jsriacode2797873d94a545312fd5ad7361d9b1e6.html ... save
File ... d:\1\juke_mmi_bemobile_ua_bug_pic_gifsiteidmeta_ua.html ... save
File ... d:\1\juke_mmi_bemobile_ua_bug_pic_gifsiteidmeta_ua_j1_nocache.html ... save
File ... d:\1\source_mmi_bemobile_ua_cm_cm_js.html ... save
File ... d:\1\passport_meta_ua_get_main_info_php.html ... save
File ... d:\1\meta_ua_img_ico_barv35_anim_gif.html ... save
File ... d:\1\passport_meta_ua_get_main_info_php.html ... save
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment