Created
April 17, 2020 20:45
-
-
Save igneus/97c4e9a85ac02fcc5c1ee4694438b8b3 to your computer and use it in GitHub Desktop.
signaly blog scraper
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
require 'mechanize'
require 'csv'
# Logger is referenced below; load it explicitly instead of relying on
# another library having required it already.
require 'logger'

# Base URL of the scraped blog; paths taken from page links are appended to it.
blog = 'https://pokus-4.signaly.cz'

# A single Mechanize agent is reused for every request; it logs to mech.log.
agent = Mechanize.new
agent.log = Logger.new "mech.log"
# Fetch the blog front page and gather the href of every link in its archive
# list (each archive page lists the blog posts of a single month).
index_page = agent.get(blog)
months = index_page.parser.css('#archive ul li a').map { |link| link['href'] }
#p months
#months = months[0..0] # testing subset
# Walk through the monthly archive pages and gather the href of every post
# title link found on them, as one flat list.
posts = months.flat_map do |month_path|
  month_page = agent.get(blog + month_path)
  sleep 1 # pause after each archive page request
  month_page.parser.css('.post-title h2 a').map { |link| link['href'] }
end
#p posts
#posts = posts[0..3] # testing subset
# Build the CSV report: a header row followed by one row per blog post.
# If anything fails part-way through, the rows gathered so far are printed
# before the error is re-raised.
#
# The block parameter is deliberately NOT called `csv`: it is the CSV writer
# object, while the outer `csv` receives the finished String.
csv = CSV.generate do |out|
  begin
    out << ['url','title','views','comments_count','unique_commenters_count','unique_commenters']

    # visit each blog post, collect interesting contents
    posts.each do |post_path|
      post_uri = blog + post_path
      page = agent.get post_uri + '?do=comments-showAllComments'
      sleep(rand(3)) # wait 0, 1 or 2 seconds between post requests

      parser = page.parser

      unique_commenters = parser.css('#snippet-comments-comments .one__comment .comment__author__name a').map do |a|
        # get nick from the URL, because some users don't use nick as displayed name
        URI.parse(a['href']).path.sub(%r{\A/}, '')
      end.uniq

      title_link = parser.css('.post-title h2 a').first
      share_text = parser.css('.section-post-share > div > div:last-child > div').text

      out << [
        # URL
        post_uri,
        # post title; cell is left empty when the page has no title link
        title_link && title_link.text,
        # view count; cell is left empty when no "Zobrazeno N" text is found
        share_text[/Zobrazeno (\d+)/, 1],
        # comments count
        parser.css('#snippet-comments-comments .one__comment').size,
        # unique commenters: how many, then their nicks
        unique_commenters.size,
        unique_commenters.join(','),
      ]
    end
  rescue StandardError
    # so we don't completely lose the data collected so far:
    # print the text written to the underlying buffer, not the CSV object itself
    puts out.string
    raise
  end
end
puts csv
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment