Skip to content

Instantly share code, notes, and snippets.

@osima
Created November 23, 2010 01:14
Show Gist options
  • Select an option

  • Save osima/711067 to your computer and use it in GitHub Desktop.

Select an option

Save osima/711067 to your computer and use it in GitHub Desktop.
fetch title from URL
@Grab(group='nekohtml', module='nekohtml', version='1.9.6')
import org.cyberneko.html.parsers.SAXParser
import java.util.regex.Pattern
/**
 * Fetches the text of the HTML title element of the page behind a URL.
 *
 * The page is requested twice: once to sniff the character encoding declared
 * in a meta tag, and once more to parse the document with that encoding.
 */
class TitleUtil {

    /**
     * Extracts the charset name from a meta declaration. Handles both
     * content="text/html; charset=NAME" and charset="NAME" (double, single or
     * no quotes). The name stops at the first quote, whitespace, ';', '/' or
     * '>' so nothing following the declaration on the same line is captured.
     */
    private static final Pattern CHARSET_PATTERN =
            Pattern.compile('(?i)charset\\s*=\\s*["\']?([^"\'\\s;/>]+)')

    /** Charset used when the page declares none, or declares an unusable one. */
    private static final String DEFAULT_CHARSET = 'UTF-8'

    /**
     * Instance-level convenience wrapper around {@link #getTitleFromUrl(URL)}.
     *
     * @param url page to read
     * @return the page title, or null when it could not be obtained
     * @throws FileNotFoundException if the URL cannot be opened while sniffing the encoding
     */
    String getTitle(URL url)
            throws FileNotFoundException, Exception {
        getTitleFromUrl(url)
    }

    /**
     * Detects the character encoding a page declares in a meta tag.
     *
     * The page is scanned line by line; the first line that mentions "meta"
     * and contains a parsable charset declaration wins.
     *
     * @param url page to inspect
     * @return the declared charset name exactly as written in the page,
     *         or null when no declaration is found
     * @throws FileNotFoundException if the URL cannot be opened
     */
    static def getEncode(URL url)
            throws FileNotFoundException, Exception {
        def encode = null
        // ISO-8859-1 maps every byte to a character, so sniffing cannot fail on
        // undecodable input and does not depend on the platform default charset.
        // withReader closes the stream even when reading throws.
        url.withReader('ISO-8859-1') { reader ->
            def line
            while ((line = reader.readLine()) != null) {
                if (!(line =~ /(?i)meta/)) {
                    continue
                }
                def m = CHARSET_PATTERN.matcher(line)
                if (m.find()) {
                    encode = m.group(1)
                    break
                }
            }
        }
        encode
    }

    /**
     * Fetches the page and returns the text of its first TITLE element.
     *
     * Line breaks inside the title are removed and surrounding whitespace is
     * trimmed. Failures while downloading or parsing the document are treated
     * as "no title" (best effort); failures while sniffing the encoding
     * propagate to the caller.
     *
     * @param url page to read
     * @return the title text, or null when it could not be obtained
     * @throws FileNotFoundException if the URL cannot be opened while sniffing the encoding
     */
    static def getTitleFromUrl(URL url)
            throws FileNotFoundException, Exception {
        // 1) Decide which charset to decode the page with.
        def enc = resolveCharset(getEncode(url))

        // 2) Parse the page and pick up the title.
        def title = null
        try {
            url.withReader(enc) { reader ->
                // NekoHTML reports element names in upper case by default.
                // The depth-first walk visits nodes in document order, so the
                // first match is the head title rather than e.g. an SVG title.
                def titleNode = new XmlSlurper(new SAXParser()).parse(reader).'**'.find {
                    it.name() == 'TITLE'
                }
                if (titleNode != null) {
                    // Join the lines of a multi-line title, then trim.
                    title = titleNode.text().readLines().join('').trim()
                }
            }
        }
        catch (Exception ignored) {
            // Deliberate best effort: an unreadable or unparsable page simply
            // has no title, which callers see as null.
            title = null
        }
        title
    }

    /**
     * Turns the charset name declared by a page into one the JVM can decode.
     * Any spelling containing "UTF-8" is normalised to UTF-8; a missing,
     * illegal or unsupported name falls back to UTF-8.
     */
    private static String resolveCharset(def declared) {
        if (declared == null) {
            return DEFAULT_CHARSET
        }
        def name = declared.toString().trim()
        if (name.toUpperCase(Locale.ROOT).contains('UTF-8')) {
            return DEFAULT_CHARSET
        }
        try {
            return java.nio.charset.Charset.isSupported(name) ? name : DEFAULT_CHARSET
        }
        catch (IllegalArgumentException ignored) {
            // The name is empty or contains characters illegal in a charset name.
            return DEFAULT_CHARSET
        }
    }
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment