Last active
November 9, 2015 13:56
-
-
Save tadast/defd1b6fc1e2eccd77f3 to your computer and use it in GitHub Desktop.
Service manual prototype hierarchy crawler
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
require 'nokogiri'
require 'open-uri'
require 'pry'
require 'securerandom'
require 'time'
require 'yaml'
# Crawls the service manual prototype site starting from its index page and
# writes two YAML files: the topic ("section") hierarchy and the guide pages
# linked from those topics.
#
# Every page is fetched through #fetch_document, so all network access lives
# in one place.
class Crawl
  # @param start_url [String] root URL of the site to crawl; section and guide
  #   hrefs found in the pages are joined onto it
  def initialize(start_url = "http://sm-11.herokuapp.com")
    @start_url = start_url
    @guides = []
  end

  # Runs the crawl and writes 'section_hierarchy.yml' and 'guides.yml' into the
  # current working directory.
  def go
    # #sections must run first: crawling the sections is what fills @guides.
    File.write('section_hierarchy.yml', sections.to_yaml)
    File.write('guides.yml', @guides.to_yaml)
  end

  # Builds one topic hash per ".topic-block" element on the start page.
  # Blocks that have no ".heading-small a" link with an href are skipped,
  # because without a path there is nothing to crawl.
  #
  # @return [Array<Hash>] topic hashes, in page order
  def sections
    topics = fetch_document(@start_url).css(".topic-block").map do |section_block|
      heading_link = section_block.at_css(".heading-small a")
      section_path = heading_link && heading_link['href']
      next unless section_path

      base_path = service_manual_path(section_path)
      groups = link_groups(section_path)

      {
        content_id: SecureRandom.uuid,
        title: section_block.css(".heading-small").text.strip,
        base_path: base_path,
        description: section_block.css("p").text.strip,
        format: "service_manual_topic",
        publishing_app: "service-manual-publisher",
        rendering_app: "government-frontend",
        need_ids: [],
        locale: "en",
        update_type: 'minor',
        public_updated_at: Time.now.iso8601,
        details: {
          link_groups: groups
        },
        links: {
          # Flat list of every guide id in this topic, derived from the groups
          # rather than collected through shared mutable state.
          linked_items: groups.flat_map { |group| group[:linked_items] }
        },
        routes: [
          { type: "exact", path: base_path }
        ],
      }
    end
    topics.compact
  end

  # Fetches a section page and returns one group per
  # ".collapsible-subsections ul" list. The i-th list is paired with the i-th
  # ".collapsible-subsections h2" and the i-th ".topic-description"; a missing
  # heading or description is treated as empty text.
  #
  # @param section_path [String] href of the section, relative to the start URL
  # @return [Array<Hash>] hashes with :title, :description and :linked_items
  def link_groups(section_path)
    doc = fetch_document(File.join(@start_url, section_path))
    headings = doc.css(".collapsible-subsections h2")
    descriptions = doc.css(".topic-description")

    doc.css(".collapsible-subsections ul").each_with_index.map do |block, idx|
      description = node_text(descriptions[idx])
      {
        # both title and description live under h2, so the description is
        # removed from the heading text and the leftover whitespace trimmed
        title: node_text(headings[idx]).gsub(description, '').strip,
        description: description,
        linked_items: linked_items(block)
      }
    end
  end

  # Crawls every link inside +block+ and returns the content ids of the guides
  # that could be fetched; links that yield no guide are left out.
  #
  # @param block [#css] element containing the guide links
  # @return [Array<String>] guide content ids, in link order
  def linked_items(block)
    guides = block.css("a").map { |link| add_guide(link) }
    guides.compact.map { |guide| guide[:content_id] }
  end

  # Prefixes +path+ with "/service-manual", dropping one trailing slash.
  #
  # @param path [String]
  # @return [String]
  def service_manual_path(path)
    File.join(
      "/service-manual",
      path.chomp("/")
    )
  end

  # Fetches the page behind +link+, records it in @guides and returns it.
  #
  # @param link [#[], #text] anchor element pointing at a guide page
  # @return [Hash, nil] the guide hash, or nil when the link has no href or the
  #   page responds with an HTTP error (the error message is printed)
  def add_guide(link)
    href = link['href']
    return unless href

    page_url = File.join(@start_url, href)
    puts page_url

    body_block = fetch_document(page_url).css(".markdown")
    base_path = service_manual_path(href)
    # One timestamp so updated_at and public_updated_at always agree.
    timestamp = Time.now.iso8601

    guide = {
      content_id: SecureRandom.uuid,
      title: link.text,
      description: "-",
      format: "service_manual_guide",
      publishing_app: "service-manual-publisher",
      rendering_app: "government-frontend",
      need_ids: [],
      locale: "en",
      updated_at: timestamp,
      public_updated_at: timestamp,
      update_type: "minor",
      phase: "beta",
      base_path: base_path,
      routes: [
        { type: "exact", path: base_path }
      ],
      details: {
        body: body_block.to_s,
        header_links: [],
        publisher: {
          name: "Agile Community",
          href: "http://sm-11.herokuapp.com/agile-delivery/agile-and-government-services"
        }
      }
    }
    @guides << guide
    guide
  rescue OpenURI::HTTPError => e
    puts e.message
    nil
  end

  private

  # Downloads +url+ and parses it as HTML. URI.open is used because plain
  # Kernel#open no longer opens URLs on Ruby 3.0+; the block form closes the
  # handle once the document has been parsed.
  def fetch_document(url)
    URI.open(url) { |io| Nokogiri::HTML(io) }
  end

  # Stripped text of +node+, or an empty string when the node is missing.
  def node_text(node)
    node ? node.text.strip : ""
  end
end
# Start the crawl only when this file is executed directly, so the class can be
# required (e.g. from a console or a test) without hitting the network or
# writing the YAML files.
Crawl.new.go if __FILE__ == $PROGRAM_NAME
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment