Created
July 16, 2013 03:21
-
-
Save ansjsun/6005498 to your computer and use it in GitHub Desktop.
html文章正文抽取类.计算正文
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
package com.kuyun.nlp; | |
import java.util.ArrayList;
import java.util.List;
import java.util.Locale;
import java.util.regex.Pattern;

import com.kuyun.nlp.util.PageDown;
public class HtmlExtraction { | |
public static void main(String[] args) { | |
String html = PageDown | |
.getHtml("http://sports.sina.com.cn/g/pl/2013-07-15/12276671936.shtml"); | |
// String html = "<aaa>bbbb<aaaa><aaa>bbbb<aaaa><aaa>bbbb<aaaa><aaa>bbbb<aaaa>" ; | |
long start = System.currentTimeMillis(); | |
for (int i = 0; i < 1000; i++) { | |
paser(html); | |
} | |
System.out.println(System.currentTimeMillis() - start); | |
} | |
public static void paser(String html) { | |
html = html.replaceAll("\\s+", " ").replaceAll("<", "<").replaceAll(">", ">") | |
.replaceAll(" ", " ").replaceAll("&[a-zA-Z]{2,5};", " ") | |
.replaceAll("(?is)<wbr\\s*/>", "").replaceAll("(?is)<!--.*?-->", "") | |
.replaceAll("(?is)<script.*?>.*?</script>", "") | |
.replaceAll("(?is)<style.*?>.*?</style>", "") | |
.replaceAll("(?is)(<a.*?>.*?</a>\\s*[^<]{0,6}\\s*(<[^a].*?>[\\|/]?\\s*)*){2,}+", ""); | |
; | |
List<Node> list = new ArrayList<HtmlExtraction.Node>(); | |
html = html.toLowerCase(); | |
int start = html.indexOf("<bod"); | |
if (start == -1) | |
start = 0; | |
else | |
start = findRightindex(start, html); | |
int length = html.length(); | |
char c = 0; | |
Node node = new Node(); | |
int right; | |
boolean flag = false; | |
for (int i = start; i < length; i++) { | |
c = html.charAt(i); | |
if (Character.isWhitespace(c)) { | |
continue; | |
} | |
if (c == '<') { | |
right = findRightindex(i, html); | |
node.updateLength(right - i); | |
i = right; | |
flag = true; | |
} else { | |
if (flag == true) { | |
list.add(node); | |
node = new Node(); | |
flag = false; | |
} | |
node.append(c); | |
} | |
} | |
list.add(node); | |
findMaxPath(list); | |
} | |
private static void findMaxPath(List<Node> list) { | |
for (Node node : list) { | |
System.out.println(node.score+"\t"+node.text); | |
} | |
// TODO Auto-generated method stub | |
Node from = null; | |
int maxScore = 0; | |
int index = 0; | |
int tempScore = 0; | |
Node node = null; | |
for (int i = 0; i < list.size(); i++) { | |
node = list.get(i); | |
if (from == null) { | |
from = node; | |
} else { | |
tempScore = node.walk(from); | |
if (maxScore < tempScore) { | |
maxScore = tempScore; | |
index = i; | |
} | |
from = node; | |
} | |
} | |
node = list.get(index); | |
while (node.maxFrom != node) { | |
node.maxFrom.to = node; | |
node = node.maxFrom; | |
} | |
StringBuilder sb = new StringBuilder(node.text); | |
while ((node = node.to) != null) { | |
sb.append(node.text); | |
} | |
System.out.println(sb); | |
} | |
private static int findRightindex(int start, String html) { | |
// TODO Auto-generated method stub | |
int length = html.length(); | |
for (; start < length; start++) { | |
if (html.charAt(start) == '>') | |
return start; | |
} | |
return 0; | |
} | |
static class Node { | |
private StringBuilder text = new StringBuilder(); | |
private int tagLength; | |
private int score; | |
private Node maxFrom; | |
private Node to; | |
public void updateLength(int length) { | |
tagLength += length; | |
} | |
public void append(char c) { | |
text.append(c); | |
this.score++; | |
} | |
public int walk(Node from) { | |
if (from.score + this.score - from.tagLength > this.score) { | |
this.maxFrom = from; | |
this.score = from.score + this.score - from.tagLength; | |
} else { | |
this.maxFrom = this; | |
} | |
return this.score; | |
} | |
} | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment