Created
March 19, 2016 02:02
-
-
Save Viacheslav77/07200d5f6e2dd363fd91 to your computer and use it in GitHub Desktop.
Скачать все HTML файлы, доступные по ссылкам в отдельный каталог.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| package ParseHTML; | |
| //Скачать все HTML файлы, доступные по ссылкам в отдельный каталог. | |
| import java.io.BufferedReader; | |
| import java.io.File; | |
| import java.io.FileWriter; | |
| import java.io.IOException; | |
| import java.io.InputStreamReader; | |
| import java.net.HttpURLConnection; | |
| import java.net.URL; | |
| import java.net.URLConnection; | |
| import java.util.ArrayList; | |
| import java.util.List; | |
| import java.util.regex.Matcher; | |
| import java.util.regex.Pattern; | |
/**
 * Downloads every page that a start page links to and stores each one as a
 * separate {@code .html} file in a target directory.
 *
 * <p>Flow: {@link #main} fetches the start page, {@code getTextURL} collects the
 * quoted {@code http://} links found in it into {@link #list}, and
 * {@link #setSave} downloads each collected link into its own file.
 */
public class MainDownload {

    /** Links collected by {@code getTextURL}: first-seen order, no repeats. */
    static List<String> list = new ArrayList<String>();

    /**
     * Fetches the start page, collects its links and saves every linked page.
     *
     * @param args command-line arguments (not used)
     * @throws Exception if the start page itself cannot be fetched
     */
    public static void main(String[] args) throws Exception {
        String path = "http://meta.ua";
        String pathToList = "d:\\1\\";
        String html = getHTML(path);
        // Regex: a double quote, "http://", then the shortest run up to the next double quote.
        String text = "\"http://.*?\\\"";
        getTextURL(html, text);
        setSave(pathToList);
    }

    /**
     * Appends every match of {@code text} in {@code html} to {@link #list},
     * with the double quotes stripped.
     *
     * <p>A link that is already in the list is skipped, so a page that repeats
     * the same URL does not cause it to be downloaded and written again.
     *
     * @param html page content to scan
     * @param text regular expression (matched case-insensitively) that selects the links
     */
    private static void getTextURL(String html, String text) {
        Matcher m = Pattern.compile(text, Pattern.CASE_INSENSITIVE).matcher(html);
        while (m.find()) {
            String link = m.group().replace("\"", "");
            if (!list.contains(link)) {
                list.add(link);
            }
        }
    }

    /**
     * Reads the body served at {@code urlStr} line by line and returns it with
     * every line terminated by {@code '\n'}.
     *
     * @param urlStr address to fetch
     * @return the response body as text
     * @throws IOException if the URL is malformed or the body cannot be read
     */
    private static String getHTML(String urlStr) throws IOException {
        StringBuilder sb = new StringBuilder();
        URL u = new URL(urlStr);
        HttpURLConnection uc = (HttpURLConnection) u.openConnection();
        // NOTE(review): no charset is given, so the platform default decodes the
        // bytes; confirm that this matches the encoding of the pages fetched.
        try (BufferedReader in = new BufferedReader(new InputStreamReader(uc.getInputStream()))) {
            String inputLine;
            while ((inputLine = in.readLine()) != null) {
                sb.append(inputLine).append('\n');
            }
        }
        return sb.toString();
    }

    /**
     * Downloads every link in {@link #list} and writes each one to
     * {@code pathToList + <file name derived from the link>}.
     *
     * <p>A link that cannot be fetched or written is reported on
     * {@code System.err} and skipped; the remaining links are still processed.
     * Missing parent directories of a target file are created first.
     *
     * @param pathToList prefix (normally a directory ending in a separator) put
     *                   in front of every generated file name
     * @throws IOException kept for compatibility with existing callers; per-link
     *                     I/O failures are handled inside the loop
     */
    public static void setSave(String pathToList) throws IOException {
        for (String s : list) {
            String target = pathToList + toFileName(s);
            try {
                String html = getHTML(s);
                File f = new File(target);
                File parent = f.getParentFile();
                if (parent != null && !parent.isDirectory()) {
                    // If this fails, opening the writer below reports the error.
                    parent.mkdirs();
                }
                // FileWriter creates (or truncates) the file itself.
                try (FileWriter fw = new FileWriter(f)) {
                    fw.write(html);
                    System.out.println("File ... " + target + " ... save");
                }
            } catch (IOException e) {
                // One dead link must not abort the rest of the batch.
                System.err.println("File ... " + target + " ... failed (" + s + "): " + e);
            }
        }
    }

    /**
     * Turns a link into a flat file name ending in {@code .html}.
     *
     * <p>The {@code http://} prefix is dropped, separators and punctuation are
     * replaced or removed exactly as before, and any backslash, colon, asterisk,
     * pipe or {@code <} left over is replaced by {@code _} because those
     * characters are not allowed in Windows file names.
     */
    private static String toFileName(String url) {
        String name = url.replace("http://", "")
                .replace("/", "_")
                .replace(";", "_")
                .replace("&", "_")
                .replace("?", "")
                .replace(">", "")
                .replace("=", "")
                .replace(".", "_")
                .replace("\"", "");
        return name.replaceAll("[\\\\:*|<]", "_") + ".html";
    }
}
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| File ... d:\1\www_w3_org_TR_html4_strict_dtd.html ... save | |
| File ... d:\1\meta_ua_favicon_small_ico.html ... save | |
| File ... d:\1\static_meta_ua_css_meta_cssv0_4.html ... save | |
| File ... d:\1\static_meta_ua_js_cook_js.html ... save | |
| File ... d:\1\meta_ua_addons_search_ie-opensearch_xml.html ... save | |
| File ... d:\1\meta_ua_addons_search_market-ie-opensearch_xml.html ... save | |
| File ... d:\1\meta_ua_addons_search_news-ie-opensearch_xml.html ... save | |
| File ... d:\1\static_meta_ua_js_meta_pack_plus_jsv1_99.html ... save | |
| File ... d:\1\static_meta_ua_js_ajax_h_js.html ... save | |
| File ... d:\1\static_meta_ua_js_swfobject_jsv0_1.html ... save | |
| File ... d:\1\static_meta_ua_js_dd_js.html ... save | |
| File ... d:\1\static_meta_ua_css_boxes1_cssv0_2.html ... save | |
| File ... d:\1\static_meta_ua_css_blocks2_cssv0_2.html ... save | |
| File ... d:\1\static_meta_ua_js_blocks_jsv0_1.html ... save | |
| File ... d:\1\meta_ua.html ... save | |
| File ... d:\1\search_meta_ua.html ... save | |
| File ... d:\1\static_meta_ua_js_z_jsv0_2.html ... save | |
| File ... d:\1\meta_ua_.html ... save | |
| File ... d:\1\meta_ua_img_m_logo_t_gif.html ... save | |
| File ... d:\1\search_meta_ua_.html ... save | |
| File ... d:\1\search_meta_ua_search_asp.html ... save | |
| File ... d:\1\meta_ua_reestr_asp.html ... save | |
| File ... d:\1\dir_meta_ua_search_php.html ... save | |
| File ... d:\1\news_meta_ua_.html ... save | |
| File ... d:\1\news_meta_ua_.html ... save | |
| File ... d:\1\map_meta_ua_.html ... save | |
| File ... d:\1\map_meta_ua_.html ... save | |
| File ... d:\1\edu_meta_ua_.html ... save | |
| File ... d:\1\edu_meta_ua_.html ... save | |
| File ... d:\1\metamarket_ua_.html ... save | |
| File ... d:\1\metamarket_ua_search_php.html ... save | |
| File ... d:\1\search_meta_ua_search_asp.html ... save | |
| File ... d:\1\static_meta_ua_js_keyb_constant_jsv02.html ... save | |
| File ... d:\1\static_meta_ua_js_keyb_keyb_jsv02.html ... save | |
| File ... d:\1\static_meta_ua_img_ico_barv35_gif.html ... save | |
| File ... d:\1\ad4sell_com_js_mtb_js.html ... save | |
| File ... d:\1\translate_meta_ua_ru_noredir.html ... save | |
| File ... d:\1\translate_meta_ua_ru_.html ... save | |
| File ... d:\1\translate_meta_ua_images_btn-translate-ru_png.html ... save | |
| File ... d:\1\meta_ua_sitemap_asp.html ... save | |
| File ... d:\1\news_meta_ua_slices_news_html.html ... save | |
| File ... d:\1\news_meta_ua.html ... save | |
| File ... d:\1\news_meta_ua_topnews_.html ... save | |
| File ... d:\1\metamarket_ua_.html ... save | |
| File ... d:\1\dir_meta_ua_.html ... save | |
| File ... d:\1\sport_ua_.html ... save | |
| File ... d:\1\auto_meta_ua_.html ... save | |
| File ... d:\1\auto_meta_ua_search_full_.html ... save | |
| File ... d:\1\auto_meta_ua_.html ... save | |
| File ... d:\1\veslo_org.html ... save | |
| File ... d:\1\veslo_org_v_13952_html.html ... save | |
| File ... d:\1\veslo_org_meta_hair09_jpg.html ... save | |
| File ... d:\1\veslo_org_v_13949_html.html ... save | |
| File ... d:\1\veslo_org_v_13950_html.html ... save | |
| File ... d:\1\veslo_org_v_13951_html.html ... save | |
| File ... d:\1\horo_meta_ua.html ... save | |
| File ... d:\1\pogoda_meta_ua_slice_.html ... save | |
| File ... d:\1\meta_ua_feeds_finance_asp.html ... save | |
| File ... d:\1\meta_ua_finance_.html ... save | |
| File ... d:\1\tv_meta_ua_.html ... save | |
| File ... d:\1\forum_meta_ua.html ... save | |
| File ... d:\1\meta_ua_feeds_forum_asp.html ... save | |
| File ... d:\1\dating_meta_ua_.html ... save | |
| File ... d:\1\dating_meta_ua_.html ... save | |
| File ... d:\1\chat_meta_ua_.html ... save | |
| File ... d:\1\chat_meta_ua.html ... save | |
| File ... d:\1\meta_ua_img_b4_1000x180_1_jpg.html ... save | |
| File ... d:\1\cobrand_ria_com_js_ria_informer_jsriacode2797873d94a545312fd5ad7361d9b1e6.html ... save | |
| File ... d:\1\juke_mmi_bemobile_ua_bug_pic_gifsiteidmeta_ua.html ... save | |
| File ... d:\1\juke_mmi_bemobile_ua_bug_pic_gifsiteidmeta_ua_j1_nocache.html ... save | |
| File ... d:\1\source_mmi_bemobile_ua_cm_cm_js.html ... save | |
| File ... d:\1\passport_meta_ua_get_main_info_php.html ... save | |
| File ... d:\1\meta_ua_img_ico_barv35_anim_gif.html ... save | |
| File ... d:\1\passport_meta_ua_get_main_info_php.html ... save |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment