Created
November 23, 2017 16:22
-
-
Save Terryhung/3fe1b8d697f85849f0dcf2a70ec1711e to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
public static String AddDocHead(String s) throws Exception { | |
String _s = "<style> .article{padding-left: 18px;padding-right: 18px;padding-top: 15px;word-break: break-all}.article-header{font-size: 27px;color: #1a1a1a}.article-info{font-size: 16px;color: #c2c2c2;margin-top: 18px;margin-bottom: 18px}.article-content{margin-top: 18px;font-size: 22px;color: #818181}.main-image{width: 100%;max-height: 500px;margin-top: 20px;margin-bottom: 20px}hr{margin-top: 20px;color: #f5f5f5} img{width: 100%}</style>".concat(s); | |
return _s; | |
} | |
public static void Parsing() throws Exception { | |
String url = "http://www.cna.com.tw/news/aloc/201710030325-1.aspx"; | |
Document doc = Jsoup.connect(url) | |
.header("User-Agent", "Mozilla/5.0 (Linux; Android 7.0; SAMSUNG SM-G950U Build/NRD90M) AppleWebKit/537.36 (KHTML, like Gecko) SamsungBrowser/5.2 Chrome/51.0.2704.106 Mobile Safari/537.36") | |
.get(); | |
String target = "div.news_article"; | |
Elements divs = doc.select(target); | |
String remove = "script, button"; | |
Elements removed = doc.select(remove); | |
String image_url = "http://img.appledaily.com.tw/images/twapple/640pix/20170925/BN02/BN02_005.jpg"; | |
String news_title = "苗栗縣鼓勵青年創業苗栗縣鼓勵青年創業苗栗縣鼓勵青年創業"; | |
String news_source = "CNA"; | |
String news_source_date = "2017-11-11"; | |
removed.remove(); | |
Whitelist wl = Whitelist.relaxed(); | |
wl.addTags("div", "span", "p", "br", "article", "section", "style"); | |
wl.addAttributes("div", "class"); | |
wl.addAttributes("img", "class", "src"); | |
/* To html*/ | |
String mProcessedHtml = Jsoup.clean(divs.outerHtml(),wl); | |
mProcessedHtml = String.format("<div class='article'><div class='article-header'>%s</div><div class='article-info'>%s, %s</div><div class='article-content'>%s</div></div>", news_title, news_source, news_source_date, mProcessedHtml); | |
/* Add Title to first position*/ | |
/*Wrap New Tag*/ | |
mProcessedHtml = AddDocHead(mProcessedHtml); | |
CreateHTML(Arrays.asList(mProcessedHtml)); | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment