Created
November 23, 2010 01:14
-
-
Save osima/711067 to your computer and use it in GitHub Desktop.
fetch title from URL
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| @Grab(group='nekohtml', module='nekohtml', version='1.9.6') | |
| import org.cyberneko.html.parsers.SAXParser | |
| import java.util.regex.Pattern | |
| // URLからタイトルを取得する. | |
/**
 * Fetches the contents of the HTML title element of a page identified by a URL.
 *
 * The page is downloaded twice: once to look for a charset declared in a
 * meta tag, and once more to parse the document with that charset.
 */
class TitleUtil {

    /**
     * Extracts the charset name from a meta declaration. Handles both
     * {@code <meta charset="x">} and {@code content="text/html; charset=x"},
     * with or without quotes, and stops at the first delimiter so that
     * trailing attributes on the same line are not captured.
     */
    private static final java.util.regex.Pattern CHARSET_PATTERN =
            ~/(?i)charset\s*=\s*["']?([^"'\s;\/>]+)/

    /** Charset used when the page declares none, or declares one the JVM cannot decode. */
    private static final String DEFAULT_CHARSET = 'UTF-8'

    /**
     * Instance-level convenience wrapper around {@link #getTitleFromUrl(URL)}.
     *
     * @param url page to read
     * @return the page title, or null when it could not be obtained
     */
    String getTitle(URL url)
            throws FileNotFoundException, Exception {
        getTitleFromUrl(url)
    }

    /**
     * Detects the character encoding declared in the page's meta tag.
     *
     * @param url page to inspect
     * @return the declared charset name exactly as written in the page, or
     *         null when no line containing both "meta" and a charset
     *         declaration was found
     * @throws FileNotFoundException or another Exception when the URL cannot be opened or read
     */
    static def getEncode(URL url)
            throws FileNotFoundException, Exception {
        // ISO-8859-1 maps every byte to a char, so the ASCII meta declaration is
        // readable regardless of the platform default charset.
        // withReader closes the stream even when reading fails.
        url.withReader('ISO-8859-1') { reader ->
            String line
            while ((line = reader.readLine()) != null) {
                if (line =~ /(?i)meta/ && line =~ /(?i)charset/) {
                    def m = CHARSET_PATTERN.matcher(line)
                    if (m.find()) {
                        return m.group(1)
                    }
                    // No usable declaration on this line; keep scanning.
                }
            }
            return null
        }
    }

    /**
     * Returns the text of the first TITLE element of the page.
     * Line breaks inside the title are removed and surrounding whitespace is trimmed.
     *
     * @param url page to read
     * @return the title, or null when it could not be obtained (no TITLE
     *         element, or the document could not be read or parsed)
     * @throws FileNotFoundException or another Exception when the charset-detection
     *         pass cannot open or read the URL
     */
    static def getTitleFromUrl(URL url)
            throws FileNotFoundException, Exception {
        // 1) determine the encoding
        def enc = resolveCharsetName(getEncode(url))

        // 2) parse the document and pick up the title
        try {
            url.withReader(enc) { reader ->
                def root = new XmlSlurper(new SAXParser()).parse(reader)
                // The first TITLE in document order is the page title; later
                // TITLE elements must not override it.
                def titleNode = root.depthFirst().find { 'TITLE'.equalsIgnoreCase(it.name()) }
                if (titleNode == null) {
                    return null
                }
                // Joining the lines with no separator drops embedded line breaks.
                return titleNode.text().readLines().join('').trim()
            }
        }
        catch (Exception ignored) {
            // Best effort by contract: a failure in the parsing pass yields null.
            null
        }
    }

    /**
     * Maps the charset name declared by the page to one the JVM can decode.
     *
     * @param declared value returned by getEncode, possibly null
     * @return 'UTF-8' when nothing was declared, when the declaration mentions
     *         UTF-8, or when the declared name is illegal or unsupported;
     *         otherwise the trimmed declared name
     */
    private static String resolveCharsetName(def declared) {
        if (declared == null) {
            return DEFAULT_CHARSET
        }
        String name = declared.toString().trim()
        if (name =~ /(?i)UTF-8/) {
            return DEFAULT_CHARSET
        }
        try {
            return java.nio.charset.Charset.isSupported(name) ? name : DEFAULT_CHARSET
        }
        catch (IllegalArgumentException ignored) {
            // Thrown for syntactically illegal charset names.
            return DEFAULT_CHARSET
        }
    }
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment