Created
August 17, 2011 17:52
-
-
Save hdemon/1152142 to your computer and use it in GitHub Desktop.
ゆっくり動画クローラ v0.1のコア部分 2011/8/17
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# -*- encoding: utf-8 -*- | |
require 'rubygems' | |
require 'mechanize' | |
require 'kconv' | |
require 'mysql' | |
require 'date' | |
require 'damerau-levenshtein' | |
require 'cgi' | |
load 'unicode.rb' | |
# Page-title suffix. The extrTitle methods in this file trim this text from
# the end of fetched page titles to recover the bare video/mylist title.
# Frozen because it is shared global state that is only ever read.
$nikoTitle = " ‐ ニコニコ動画(原宿)".freeze
# Extends Numeric with rounding to a given number of decimal places.
class Numeric
  # Rounds the receiver to +valid_digit+ decimal places.
  #
  # The previous implementation multiplied by +10 ** -valid_digit+; with an
  # Integer base and a negative exponent Ruby evaluates that to a Rational,
  # so callers received values such as (157/50) instead of 3.14. Scaling by
  # a Float factor keeps the result a Float.
  #
  # @param valid_digit [Integer] number of digits to keep after the decimal point
  # @return [Float] the rounded value
  def round_to(valid_digit)
    factor = 10.0**valid_digit
    (self * factor).round / factor
  end
end
# Thin wrapper around a Mechanize agent. It throttles every request and
# remembers a caller-supplied label ("holder") for the page fetched last, so
# that page classes can skip re-fetching a page that is already loaded.
class Mech
  # low    - the underlying Mechanize agent.
  # holder - label of the most recently fetched page.
  attr_accessor :low
  attr_accessor :holder

  # Creates the Mechanize agent and seeds +holder+ with a placeholder.
  # The placeholder used to be stored in @mechHolder, which nothing reads,
  # leaving +holder+ nil until the caller assigned it by hand.
  def initialize
    @low = Mechanize.new
    @holder = "mech isn't initialized yet."
    puts "Mechanize initialized."
  end

  # Fetches +url+ after a fixed 3-second pause and records +holder+ as the
  # label of the page now loaded in the agent.
  #
  # @param url [String] address to fetch
  # @param holder [String] label identifying the fetched page
  # @return [String] the +holder+ label that was stored
  def get(url, holder)
    sleep 3
    puts "connect to " + url
    @low.get(url)
    # Only reached when the request did not raise, so the label always
    # describes a page that was actually loaded.
    @holder = holder
  end
end
# Logs in to Niconico through the shared $mech agent.
#
# The previous version passed a block to page.search and called form methods
# on the yielded node. A Nokogiri-backed search does not run that block, and
# XML nodes have no click_button, so the credentials were never submitted.
# This version goes through Mechanize's form objects instead.
#
# @param mail [String] account mail address
# @param pass [String] account password
# @raise [RuntimeError] when no form with a "mail" field is on the login page
def login(mail, pass)
  $mech.get("https://secure.nicovideo.jp/secure/login_form", "loginForm")

  # Pick the form by the field it must contain rather than by page position.
  form = $mech.low.page.forms.find { |candidate| candidate.has_field?('mail') }
  raise "login form not found" unless form

  form['mail'] = mail
  form['password'] = pass
  form.click_button

  # NOTE(review): the response is not inspected, so this message does not
  # prove the credentials were accepted.
  puts "Logged in Niconico."
end
# Drives one crawl pass: gathers videos from the search results, pulls the
# mylist ids mentioned on each video page, and prints whether the video is
# judged to be part of a series in each of those mylists.
class Extractor
  # @param startDate [Date] forwarded to SearchMylistHTML#extrVideoInfo
  # @param endDate [Date] forwarded to SearchMylistHTML#extrVideoInfo
  # @return [nil] results are only printed
  def extrMylistId(startDate, endDate)
    search = SearchMylistHTML.new
    search.extrVideoInfo(startDate, endDate)

    search.movieHTML.each do |video_id, movie|
      movie.mlIdLikeAry = movie.extrMylistId
      puts "Extracting mylist-id in " + video_id

      candidates = movie.mlIdLikeAry
      puts video_id.to_s + " isn't contained mylist-id." if candidates.length == 0

      candidates.each do |candidate|
        id_text = candidate.to_s
        puts id_text + " is extracted."

        verdict = movie.isSeriesof(id_text)
        @mlTitle = verdict['mlTitle']
        tail = verdict['judge'] ? "\sis series." : "\sis not series."
        puts id_text + "/" + @mlTitle + tail
        puts
      end
      puts
    end
    puts
  end
end
# Manages the search-result pages for a fixed query and collects the videos
# listed there into +movieHTML+.
class SearchMylistHTML
  # Date as printed in a result row, e.g. "2011年8月17日". Month and day may
  # be one or two digits.
  POSTED_ON = /([0-9]{4})年([0-9]{1,2})月([0-9]{1,2})日/

  # Hash of video id => MovieHTML, filled by #extrVideoInfo.
  attr_accessor :movieHTML

  def initialize
    # URL-encoded search term ("ゆっくり実況").
    @word = "%E3%82%86%E3%81%A3%E3%81%8F%E3%82%8A%E5%AE%9F%E6%B3%81"
    @url = "http://www.nicovideo.jp/search/"
    @page = 0
    # XPath selecting one node per video row on a result page.
    @eachMovie = "//div[@id='PAGEBODY']//div[@style='clear:both;']"
    @movieHTML = {}
  end

  # Collects the id and title of every listed video whose post date lies in
  # the range endDate <= date < startDate, walking result pages until an
  # older video is met or the results run out.
  #
  # @param startDate [Date] exclusive upper bound on the post date
  # @param endDate [Date] inclusive lower bound; the first older video stops the walk
  # @return [nil] results are stored in +movieHTML+
  def extrVideoInfo(startDate, endDate)
    id_nodes = nil
    title_nodes = nil
    current_page = 0

    itelator(
      endDate,
      lambda { |page|
        # Re-read the per-row nodes once for each freshly loaded page.
        id_nodes = $mech.low.page.search(@eachMovie + "/div/table")
        title_nodes = $mech.low.page.search(
          @eachMovie + "/div/table" + "/tr/td[2]/div/p[2]" + "/nobr/a"
        )
        current_page = page
      },
      lambda { |no, posted|
        next unless startDate > posted

        video_id = id_nodes[no]['summary']
        title = title_nodes[no]['title']
        movie = MovieHTML.new
        movie.videoId = video_id
        movie.title = title
        @movieHTML[video_id] = movie
        puts current_page.to_s + "-" + no.to_s + ": " + video_id.to_s + " " + title.to_s
      }
    )
    nil
  end

  private

  # Loads the current result page (unless the agent already holds it) and
  # records how many video rows it contains.
  def get
    @reqUrl = @url + @word + "?page=#{@page + 1}&sort=f&order=d"
    @holder = "search/" + @reqUrl
    $mech.get(@reqUrl, @holder) if $mech.holder != @holder
    @movNum = $mech.low.page.search(@eachMovie).size
  end

  # Returns the post date of the +no+-th video row on the loaded page.
  # The date is built from captured year/month/day numbers; the earlier
  # fixed-index string patching broke on one-digit months or days.
  #
  # @param no [Integer] zero-based row index
  # @return [Date]
  # @raise [RuntimeError] when the row has no recognisable date
  def getItsDate(no)
    rows = $mech.low.page.search(@eachMovie + "/div/table/tr/td[2]/div/p/strong")
    found = rows[no] && POSTED_ON.match(rows[no].text)
    raise "post date not found for row #{no}" unless found

    Date.new(found[1].to_i, found[2].to_i, found[3].to_i)
  end

  # Shared double loop over result pages and the video rows on each page.
  # Calls +cb_eachPage+ with the page index after a page is loaded and
  # +cb_eachMov+ with (row index, post date) for every row visited.
  #
  # The loop ends at the first video posted before +endDate+, or when a page
  # has no rows. It iterates over the real row count instead of a fixed 32,
  # and compares with < rather than testing for exactly endDate - 1, so a
  # short page or a day without uploads can no longer crash or loop forever.
  # NOTE(review): assumes the results arrive newest first - confirm that
  # "sort=f&order=d" means that.
  def itelator(endDate, cb_eachPage, cb_eachMov)
    @page = 0
    loop do
      get
      break if @movNum.zero?

      cb_eachPage.call(@page)
      @movNum.times do |no|
        posted = getItsDate(no)
        return if posted < endDate

        cb_eachMov.call(no, posted)
      end
      @page += 1
    end
  end
end
# Manages the watch page of a single video.
class MovieHTML
  attr_accessor :videoId
  attr_accessor :title
  attr_accessor :mylistId
  attr_accessor :mlIdLikeAry
  attr_accessor :viewNum
  attr_accessor :resNum
  attr_accessor :mylNum

  # Reads the video title from the page <title>, dropping the $nikoTitle
  # suffix when it is present. The suffix used to be cut off by length alone,
  # which truncated the real title whenever the page title did not end with
  # it, and a missing <title> raised on nil.
  #
  # @return [String] the extracted title (also stored in +title+)
  def extrTitle
    get
    page_title = $mech.low.page.title.to_s
    @title =
      if page_title.end_with?($nikoTitle)
        page_title[0, page_title.length - $nikoTitle.length]
      else
        page_title
      end
  end

  # Collects the distinct numeric ids of every "mylist/<digits>" reference
  # found in the text of the page's "font12" paragraphs.
  #
  # @return [Array<String>] unique ids in order of first appearance
  def extrMylistId
    get
    text = $mech.low.page.search(
      "//div[@id='PAGECONTAINER']//p[@class='font12']"
    ).text
    text.scan(/mylist\/([0-9]{1,})/).flatten.uniq
  end

  # Decides whether this video belongs to the series held in the given
  # mylist. Two conditions must both hold: a title in the mylist equals this
  # video's title, and the overall title similarity exceeds 0.5.
  #
  # @param mylistId [String] id of the mylist to test against
  # @return [Hash] 'judge' => true/false, 'mlTitle' => title of the mylist
  def isSeriesof(mylistId)
    result = MylistHTML.new(mylistId).match(@title)
    judge = result['match'] ? result['similarity'] > 0.5 : false
    {
      'judge' => judge,
      'mlTitle' => result['title']
    }
  end

  private

  # Loads this video's watch page unless the agent already holds it.
  def get
    @holder = "movie/" + @videoId
    $mech.get('http://www.nicovideo.jp/watch/' + @videoId, @holder) if $mech.holder != @holder
  end
end
# Manages the page of a single mylist and compares a video title against the
# titles it lists.
class MylistHTML
  # @param mylistId [String] numeric id of the mylist
  def initialize(mylistId)
    @mylistId = mylistId
  end

  # Reads the mylist title from the page <title>, dropping the $nikoTitle
  # suffix only when the page title really ends with it (it used to be cut
  # off by length alone, truncating titles without the suffix).
  #
  # @return [String] the extracted title
  def extrTitle
    get
    page_title = $mech.low.page.title.to_s
    @title =
      if page_title.end_with?($nikoTitle)
        page_title[0, page_title.length - $nikoTitle.length]
      else
        page_title
      end
  end

  # Extracts every "title":"..." value from the text of the seventh <script>
  # element inside PAGEBODY.
  # NOTE(review): a value stops at the first double quote, so a title with an
  # escaped quote would be cut short - confirm against real page data.
  #
  # @return [Array<String>] raw (still escaped) titles in page order
  def extrEachTitle
    get
    script = $mech.low.page.search(
      "/html/body" \
      "/div[@id='PAGECONTAINER']" \
      "/div[@id='PAGEMAIN']" \
      "/div[@id='PAGEBODY']" \
      "/script[7]"
    ).text
    script.scan(/\"title\"\:\"([^\"]{1,})/).flatten
  end

  # Compares +title+ with every title in the mylist, printing each
  # comparison as it goes.
  #
  # Similarity is 1 - (mean edit distance / title length). When the mylist
  # has no titles, or +title+ is nil or empty, the similarity is reported as
  # 0.0; the division used to produce NaN in that case.
  #
  # @param title [String] title of the video being tested
  # @return [Hash] 'match' => whether an identical title exists,
  #   'similarity' => Float, 'title' => title of the mylist
  def match(title)
    raw_titles = extrEachTitle
    matched = false
    total_distance = 0.0

    puts "matching..."
    puts title
    raw_titles.each do |raw|
      # Unescape once per entry; the result is used three times below.
      candidate = Unicode.unescape(raw)
      distance = DamerauLevenshtein.distance(title, candidate)
      total_distance += distance
      print "\t" + candidate
      if candidate == title
        puts "\t[matched]\tlvd:" + distance.to_s
        matched = true
      else
        puts "\t[unmatched]\tlvd:" + distance.to_s
      end
    end

    similarity =
      if raw_titles.empty? || title.nil? || title.empty?
        0.0
      else
        1 - (total_distance / raw_titles.length / title.length)
      end
    puts "Similarity: " + similarity.to_s

    {
      'match' => matched,
      'similarity' => similarity,
      'title' => extrTitle
    }
  end

  private

  # Loads the mylist page unless the agent already holds it.
  def get
    @reqUrl = 'http://www.nicovideo.jp/mylist/' + @mylistId
    @holder = "mylist/" + @reqUrl
    $mech.get(@reqUrl, @holder) if $mech.holder != @holder
  end
end
# --- Script entry point -----------------------------------------------------
# Shared agent used by every page class in this file.
$mech = Mech.new
$mech.holder = "mech isn't initialized yet."

# NOTE(review): '***' looks like redacted credentials - supply real values
# before running.
login('***', '***')

extr = Extractor.new

# Take the date once. Building it from three separate Time.now calls could
# mix the year, month and day of two different days around midnight.
today = Date.today
yesterday = today - 1
twoDaysAgo = today - 2

# Crawl videos posted on or after twoDaysAgo and before yesterday.
extr.extrMylistId(yesterday, twoDaysAgo)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment