Created
May 19, 2014 15:46
-
-
Save panmari/fbf657b85fccce60708c to your computer and use it in GitHub Desktop.
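Patch for lib/tasks/update_wiki/Downloader.rb: when fetching an article raises a SocketError, wait 10 seconds and retry until the page's row exists in the pages table.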
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
diff --git a/lib/tasks/update_wiki/Downloader.rb b/lib/tasks/update_wiki/Downloader.rb | |
index bb58022..93a8ba5 100644 | |
--- a/lib/tasks/update_wiki/Downloader.rb | |
+++ b/lib/tasks/update_wiki/Downloader.rb | |
@@ -32,20 +32,24 @@ class Downloader | |
#calculate percentage | |
@pct = (@c.to_f/@length.to_f*100).round(3) | |
#check if article already exists | |
- check = @client.query("SELECT * FROM pages WHERE page_id = #{name["page_id"]}").count | |
- unless check == 1 | |
+ while @client.query("SELECT * FROM pages WHERE page_id = #{name["page_id"]}").first.nil? | |
#get raw text contents of article | |
url = "http://de.wikipedia.org/w/index.php?curid=#{name["page_id"]}" | |
- doc = Nokogiri::HTML(openURL(url)) | |
- text = '' | |
- doc.css('p,h1').each do |e| | |
- text << e.content | |
+ begin | |
+ doc = Nokogiri::HTML(openURL(url)) | |
+ text = '' | |
+ doc.css('p,h1').each do |e| | |
+ text << e.content | |
+ end | |
+ #insert into database | |
+ pquery = "INSERT INTO pages (page_id, page_title, text_id) VALUES(#{name["page_id"]}, '#{name["page_title"].gsub("'", %q(\\\'))}', #{name["text_id"]})" | |
+ tquery = "INSERT INTO texts (page_id, content, text_id) VALUES(#{name["page_id"]}, '#{text.gsub("'", %q(\\\'))}', #{name["text_id"]})" | |
+ @client.query(pquery) | |
+ @client.query(tquery) | |
+ rescue SocketError | |
+ puts 'Oooops, wikipedia blocked request, retrying after 10 seconds' | |
+ sleep(10) | |
end | |
- #insert into database | |
- pquery = "INSERT INTO pages (page_id, page_title, text_id) VALUES(#{name["page_id"]}, '#{name["page_title"].gsub("'", %q(\\\'))}', #{name["text_id"]})" | |
- tquery = "INSERT INTO texts (page_id, content, text_id) VALUES(#{name["page_id"]}, '#{text.gsub("'", %q(\\\'))}', #{name["text_id"]})" | |
- @client.query(pquery) | |
- @client.query(tquery) | |
end | |
@c+=1 | |
end |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment