Skip to content

Instantly share code, notes, and snippets.

@hdemon
Created August 17, 2011 17:52
Show Gist options
  • Save hdemon/1152142 to your computer and use it in GitHub Desktop.
Save hdemon/1152142 to your computer and use it in GitHub Desktop.
ゆっくり動画クローラ v0.1のコア部分 2011/8/17
# -*- encoding: utf-8 -*-
require 'rubygems'
require 'mechanize'
require 'kconv'
require 'mysql'
require 'date'
require 'damerau-levenshtein'
require 'cgi'
load 'unicode.rb'
# Trailing text expected at the end of page titles (presumably appended by the
# site to every <title> -- confirm). MovieHTML#extrTitle and MylistHTML#extrTitle
# cut $nikoTitle.length characters off the end of the page title.
$nikoTitle = " ‐ ニコニコ動画(原宿)"
# Reopens Numeric to add a decimal-place rounding helper to every number.
class Numeric
  # Rounds the receiver at +valid_digit+ decimal places: the value is scaled
  # up by 10 ** valid_digit, rounded to an integer, then multiplied by
  # 10 ** -valid_digit to scale it back down.
  def round_to(valid_digit)
    scale_up = 10 ** valid_digit
    whole = (self * scale_up).round
    scale_down = 10 ** -valid_digit
    whole * scale_down
  end
end
# Thin wrapper around a Mechanize agent. It pauses before every request and
# remembers a caller-supplied label ("holder") for the page fetched last, so
# that callers can skip re-fetching a page that is already loaded.
class Mech
  # low    - the underlying Mechanize agent.
  # holder - label of the page fetched most recently.
  attr_accessor :low, :holder

  # interval - seconds to sleep before each request (defaults to 3, the
  #            delay that used to be hard-coded).
  def initialize(interval = 3)
    @low = Mechanize.new
    @interval = interval
    # Previously this placeholder was stored in an unused @mechHolder, which
    # left holder nil until somebody assigned it from outside.
    @holder = "mech isn't initialized yet."
    puts "Mechanize initialized."
  end

  # Fetches +url+ with the wrapped agent and records +holder+ as the label of
  # the page now loaded. The label is only updated after the fetch returned.
  # Returns the label.
  def get(url, holder)
    sleep @interval
    puts "connect to " + url
    @low.get(url)
    @holder = holder
  end
end
# Logs in to Niconico through the global $mech agent.
#
# mail - account e-mail address.
# pass - account password.
#
# Raises RuntimeError when the login page contains no form with both a
# "mail" and a "password" field.
#
# The previous version handed a block to page.search; Nokogiri's search does
# not yield to a block, so the fields were never filled in and nothing was
# submitted. The form is now looked up through Mechanize's own form objects,
# which provide []= and click_button.
def login(mail, pass)
  $mech.get("https://secure.nicovideo.jp/secure/login_form", "loginForm")
  form = $mech.low.page.forms.find do |candidate|
    candidate.has_field?('mail') && candidate.has_field?('password')
  end
  raise "Login form was not found on the login page." unless form

  form['mail'] = mail
  form['password'] = pass
  form.click_button
  # NOTE(review): the response is not inspected, so this message only means
  # that the form was submitted, not that the credentials were accepted.
  puts "Logged in Niconico."
end
# Runs one crawl and reports its findings on standard output.
class Extractor
  # Collects the videos for the given date window through SearchMylistHTML,
  # extracts the mylist ids found on each video page and prints, for every id,
  # whether the video was judged to belong to that mylist's series.
  def extrMylistId(startDate, endDate)
    searcher = SearchMylistHTML.new
    searcher.extrVideoInfo(startDate, endDate)

    searcher.movieHTML.each_key do |videoId|
      movie = searcher.movieHTML[videoId]
      # The page is scraped before the progress line is printed.
      movie.mlIdLikeAry = movie.extrMylistId
      puts "Extracting mylist-id in " + videoId

      candidates = movie.mlIdLikeAry
      puts videoId.to_s + " isn't contained mylist-id." if candidates.length == 0

      candidates.each do |candidate|
        puts candidate.to_s + " is extracted."
        verdict = movie.isSeriesof(candidate.to_s)
        @mlTitle = verdict['mlTitle']
        suffix = verdict['judge'] ? "\sis series." : "\sis not series."
        puts candidate.to_s + "/" + @mlTitle + suffix
        puts
      end
      puts
    end
    puts
  end
end
# Manages the search-result pages and collects the videos whose posting date
# falls inside a date window.
class SearchMylistHTML
  # Hash of video id => MovieHTML filled by extrVideoInfo.
  attr_accessor :movieHTML

  # Posting date as printed on the result page, e.g. "2011年08月17日".
  # Month and day may have one or two digits.
  DATE_PATTERN = /([0-9]{4})年([0-9]{1,})月([0-9]{1,})日/

  def initialize
    # URL-encoded search keyword ("ゆっくり実況").
    @word = "%E3%82%86%E3%81%A3%E3%81%8F%E3%82%8A%E5%AE%9F%E6%B3%81"
    @url = "http://www.nicovideo.jp/search/"
    @page = 0
    # XPath selecting one node per video on a result page.
    @eachMovie = "//div[@id='PAGEBODY']//div[@style='clear:both;']"
    @mylist = []
    @movieHTML = {}
  end

  # Collects the video id and the title of every result at the same time.
  # A MovieHTML is registered in movieHTML for each video posted before
  # +startDate+; scanning stops at the first video posted on or before the
  # day preceding +endDate+.
  def extrVideoInfo(startDate, endDate)
    id_nodes = nil
    title_nodes = nil
    current_page = 0

    on_page = lambda do |page|
      id_nodes = $mech.low.page.search(@eachMovie + "/div/table")
      title_nodes = $mech.low.page.search(
        @eachMovie + "/div/table" + "/tr/td[2]/div/p[2]" + "/nobr/a"
      )
      current_page = page
    end

    on_movie = lambda do |no|
      video_id = id_nodes[no]['summary']
      title = title_nodes[no]['title']
      if startDate > getItsDate(no)
        movie = MovieHTML.new
        movie.videoId = video_id
        movie.title = title
        @movieHTML[video_id] = movie
        puts "#{current_page}-#{no}: #{video_id} #{title}"
      end
    end

    itelator(endDate, on_page, on_movie)
  end

  private

  # Loads the current result page (unless $mech already holds it) and records
  # how many videos it lists.
  def get
    @reqUrl = @url + @word + "?page=#{@page+1}&sort=f&order=d"
    @holder = "search/" + @reqUrl
    $mech.get(@reqUrl, @holder) if $mech.holder != @holder
    @movNum = $mech.low.page.search(@eachMovie).size
  end

  # Returns the posting date (a Date) of the +no+-th video on the loaded page.
  # The date is parsed with capture groups instead of fixed character
  # positions, so one-digit months and days are handled as well.
  # Raises RuntimeError when the entry or its date cannot be found.
  def getItsDate(no)
    node = $mech.low.page.search(@eachMovie + "/div/table/tr/td[2]/div/p/strong")[no]
    found = node && DATE_PATTERN.match(node.text)
    raise "No posting date found for search result no. #{no}." unless found

    Date.new(found[1].to_i, found[2].to_i, found[3].to_i)
  end

  # Shared iteration skeleton: an outer loop over result pages and an inner
  # loop over the videos of each page. +cb_eachPage+ is called with the page
  # index once per page, +cb_eachMov+ with the index of each video.
  #
  # The inner loop runs over the number of videos actually present on the
  # page rather than a fixed 32, and iteration also ends when a page has no
  # results. The stop test is "on or before endDate - 1" so that a day
  # without any upload cannot make the loop run forever.
  # NOTE(review): this assumes the results are ordered newest first
  # (sort=f&order=d) -- confirm.
  def itelator(endDate, cb_eachPage, cb_eachMov)
    cutoff = endDate - 1
    @page = 0
    finished = false
    until finished
      get
      break if @movNum == 0

      cb_eachPage.call(@page)
      @movNum.times do |no|
        if getItsDate(no) <= cutoff
          finished = true
          break
        end
        cb_eachMov.call(no)
      end
      @page += 1
    end
  end
end
# Manages the watch page of a single video.
class MovieHTML
  attr_accessor :videoId, :title, :mylistId, :mlIdLikeAry,
                :viewNum, :resNum, :mylNum

  # Reads the page title of the watch page, stores it in title without the
  # $nikoTitle suffix and returns it. A title that does not end with the
  # suffix is kept unchanged instead of being truncated blindly.
  def extrTitle
    get
    @title = strip_site_suffix($mech.low.page.title)
  end

  # Returns the unique mylist ids (as digit strings, in order of first
  # appearance) mentioned as "mylist/<digits>" in the font12 paragraphs of the
  # watch page.
  def extrMylistId
    get
    text = $mech.low.page.search(
      "//div[@id='PAGECONTAINER']//p[@class='font12']"
    ).text
    text.scan(/mylist\/([0-9]{1,})/).flatten.uniq
  end

  # Decides whether this video is a member of the series represented by the
  # given mylist. Two-stage test: the video's title must appear in the mylist,
  # and the similarity with the titles in the mylist must exceed 0.5.
  #
  # Returns a Hash with 'judge' (true/false) and 'mlTitle' (the mylist title).
  def isSeriesof(mylistId)
    match = MylistHTML.new(mylistId).match(@title)
    judge = (match['match'] && match['similarity'] > 0.5) ? true : false
    {
      'judge' => judge,
      'mlTitle' => match['title']
    }
  end

  private

  # Loads this video's watch page unless $mech already holds it.
  def get
    @holder = "movie/" + @videoId
    if $mech.holder != @holder
      $mech.get('http://www.nicovideo.jp/watch/' + @videoId, @holder)
    end
  end

  # Removes the trailing $nikoTitle from +str+ when it is present.
  def strip_site_suffix(str)
    str = str.to_s
    return str unless str.end_with?($nikoTitle)

    str[0, str.length - $nikoTitle.length]
  end
end
# Manages the page of a single mylist.
class MylistHTML
  # mylistId - id of the mylist; it is appended to the mylist URL.
  def initialize(mylistId)
    @mylistId = mylistId
  end

  # Returns the page title of the mylist page without the $nikoTitle suffix.
  # A title that does not end with the suffix is returned unchanged instead of
  # being truncated blindly.
  def extrTitle
    get
    @title = strip_site_suffix($mech.low.page.title)
  end

  # Returns the raw (still escaped) value of every "title":"..." entry found
  # in the text of the seventh <script> element of the page body.
  def extrEachTitle
    get
    script = $mech.low.page.search(
      "/html/body" +
      "/div[@id='PAGECONTAINER']" +
      "/div[@id='PAGEMAIN']" +
      "/div[@id='PAGEBODY']" +
      "/script[7]"
    ).text
    script.scan(/\"title\"\:\"([^\"]{1,})/).flatten
  end

  # Compares +title+ with every title in the mylist and prints one line per
  # entry.
  #
  # Returns a Hash with
  #   'match'      - true when some entry equals +title+ exactly,
  #   'similarity' - 1 - (mean edit distance / title length); 0.0 when the
  #                  mylist has no titles or +title+ is empty (this used to
  #                  evaluate to NaN),
  #   'title'      - the mylist's own title.
  def match(title)
    # Unescape each entry once instead of three times per entry.
    titles = extrEachTitle.map { |raw| Unicode.unescape(raw) }
    matched = false
    total = 0.0
    puts "matching..."
    puts title
    titles.each do |candidate|
      distance = DamerauLevenshtein.distance(title, candidate)
      total += distance
      print "\t" + candidate
      if candidate == title
        puts "\t[matched]\tlvd:" + distance.to_s
        matched = true
      else
        puts "\t[unmatched]\tlvd:" + distance.to_s
      end
    end
    similarity =
      if titles.empty? || title.empty?
        0.0
      else
        1 - (total / titles.length / title.length)
      end
    puts "Similarity: " + similarity.to_s
    {
      'match' => matched,
      'similarity' => similarity,
      'title' => extrTitle
    }
  end

  private

  # Loads the mylist page unless $mech already holds it.
  def get
    @reqUrl = 'http://www.nicovideo.jp/mylist/' + @mylistId.to_s
    @holder = "mylist/" + @reqUrl
    $mech.get(@reqUrl, @holder) if $mech.holder != @holder
  end

  # Removes the trailing $nikoTitle from +str+ when it is present.
  def strip_site_suffix(str)
    str = str.to_s
    return str unless str.end_with?($nikoTitle)

    str[0, str.length - $nikoTitle.length]
  end
end
# Script entry point: create the shared agent, log in, then crawl.
$mech = Mech.new()
$mech.holder = "mech isn't initialized yet."
# '***' are placeholders for the account e-mail address and password.
login('***', '***')
extr = Extractor.new()
# Date.today reads the clock once; building the date from three separate
# Time.now calls could mix fields from two different days around midnight.
today = Date.today
yesterday = today - 1
twoDaysAgo = today - 2
extr.extrMylistId(yesterday, twoDaysAgo)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment