Created
June 9, 2017 08:56
-
-
Save muhammadyana/a9fbc4f94e6aedf1b3c2c8e33028b22c to your computer and use it in GitHub Desktop.
parsing HTML in Ruby
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
require 'net/http'
require 'open-uri'
require 'socket'
require 'nokogiri'
# Send a bare OPTIONS request over a raw TCP socket and print the raw reply.
# The block form of TCPSocket.open closes the socket when the block exits,
# including when one of the I/O calls raises.
TCPSocket.open('indoexchanger.co.id', 'www') do |client|
  # HTTP header lines end with CRLF; the empty line terminates the request.
  client.write("OPTIONS /~dave/ HTTP/1.0\r\n\r\n")
  puts client.readlines
end
# Scrape image links from a page whose address is read from standard input.
#
# NOTE(review): the original passed an undefined `url` as the host and used the
# input line as the request path, so it always raised NameError. The input line
# is now taken as one absolute http(s) URL that supplies both host and path;
# confirm this matches the intended usage.
page_url = gets.to_s.chomp # `to_s` keeps EOF (nil) from raising NoMethodError
target = URI.parse(page_url)
unless target.is_a?(URI::HTTP) && target.host
  raise ArgumentError, "expected an absolute http(s) URL, got #{page_url.inspect}"
end

response = Net::HTTP.get_response(target)
# Judge success by response class rather than by the free-form reason phrase.
fetched = response.is_a?(Net::HTTPSuccess)
body = response.body.to_s # body can be nil for responses without content

# First three distinct sources of <img> tags written as alt followed by src.
if fetched
  puts body.scan(/<img alt=".*?" src="(.*?)"/m).uniq.first(3)
else
  puts "Failed"
end

puts response.message

# Every source of <img> tags whose first attribute is src.
if fetched
  puts body.scan(/<img src="(.*?)"/m)
else
  puts "Failed"
end
# Another way to get image links: fetch the page through open-uri.
# URI.open is required here because Kernel#open stopped opening URLs in
# Ruby 3.0. The block form closes the response stream when the block returns.
URI.open('http://muhammadyana.web.id') do |f|
  # Print the first three distinct sources of <img> tags that start with src.
  puts f.read.scan(/<img src="(.*?)"/m).uniq.first(3)
end
# Parsing HTML: pull the <title> text out of the page with a regular expression.
# URI.open replaces Kernel#open, which stopped opening URLs in Ruby 3.0;
# passing a block closes the stream once the body has been read.
page = URI.open('http://muhammadyana.web.id', &:read)

# String#[] with a capture index returns the first group, or nil when the
# page has no <title> element, without touching the $1 global.
title = page[%r{<title>(.*?)</title>}m, 1]
puts "Title is #{title.inspect}" if title
# Parse the page with Nokogiri and query it using both XPath and CSS selectors.
# URI.open replaces Kernel#open, which stopped opening URLs in Ruby 3.0;
# the block form closes the stream after Nokogiri has consumed it.
doc = URI.open('http://pragprog.com/') { |io| Nokogiri::HTML(io) }

puts "Page title is #{doc.xpath('//title').inner_html}"
puts doc.css('div#copyright p')

# The same kind of lookup expressed once as XPath and once as a CSS selector.
puts "\nSecond hyperlink is"
puts doc.xpath('id("site-links")//a[2]')
puts doc.css('#site-links a:nth-of-type(2)')
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment