Skip to content

Instantly share code, notes, and snippets.

@nowa
Created August 4, 2010 04:19
Show Gist options
  • Save nowa/507634 to your computer and use it in GitHub Desktop.
Save nowa/507634 to your computer and use it in GitHub Desktop.
iOS App Store Crawler
#!/usr/bin/env ruby
# iOS App Store Crawler by Nowa <[email protected]>
# 2010-08-04
require 'rubygems'
require 'hpricot'
require 'open-uri'
# User-Agent header value sent with every HTTP request the crawler makes.
# Frozen so the shared constant cannot be mutated in place.
USERAGENT = 'Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_4; en-US) AppleWebKit/533.4 (KHTML, like Gecko) Chrome/5.0.375.99 Safari/533.4'.freeze
# Values substituted into the "letter" query parameter of each category
# listing URL: A-Z plus "*".
LETTERS = %w[A B C D E F G H I J K L M N O P Q R S T U V W X Y Z *].freeze
# Maps each root category name to the href of its listing page.
cates = {}

genre_index_url = "http://itunes.apple.com/us/genre/mobile-software-applications/id36"
parse_page = lambda { |f| Hpricot(f) }

# Fetch and parse the genre index. The block form closes the response stream
# once Hpricot has consumed it. URI.open is preferred when open-uri provides
# it, because Kernel#open stopped accepting URLs in Ruby 3.0; older rubies
# fall back to the open-uri patched Kernel#open.
doc =
  if URI.respond_to?(:open)
    URI.open(genre_index_url, 'User-Agent' => USERAGENT, &parse_page)
  else
    open(genre_index_url, 'User-Agent' => USERAGENT, &parse_page)
  end

# Print the page title when there is one; a page without <title> is not fatal.
page_title = doc.search("//title")[0]
puts page_title.innerText if page_title

# get root categories (the first link seen for a given name wins)
doc.search("//a[@class='top-level-genre']").each do |root_cate|
  cates[root_cate.innerText] ||= root_cate.attributes["href"]
end
puts "Got #{cates.size} categories."
# Download +url+ with the crawler's User-Agent header and return the Hpricot
# document parsed from the response.
#
# The block form closes the response stream once parsing is done instead of
# leaving it open. URI.open is used when open-uri provides it (Kernel#open
# stopped accepting URLs in Ruby 3.0); older rubies fall back to the open-uri
# patched Kernel#open.
def fetch_doc(url)
  if URI.respond_to?(:open)
    URI.open(url, 'User-Agent' => USERAGENT) { |f| Hpricot(f) }
  else
    open(url, 'User-Agent' => USERAGENT) { |f| Hpricot(f) }
  end
end

# Announce and fetch one page of a category's per-letter listing.
# Returns the app anchors found under the "selectedcontent" div; the crawl
# loop below stops at the first page that yields none.
def fetch_app_links(cate_title, cate_url, letter, page)
  the_url = "#{cate_url}&letter=#{letter}&page=#{page}"
  puts "\tFetching Letter: #{letter} of <#{cate_title}>, Page: #{page}, URL: #{the_url}"
  fetch_doc(the_url).search("//div[@id='selectedcontent']//li/a")
end

# Build the record for one app, pre-filled from its listing anchor; every
# other field starts empty and is filled in from the detail page.
def blank_app_detail(app)
  {
    :title => app.innerText,
    :detail_url => app.attributes['href'],
    :developer => {},
    :desc => "",
    :links => {},
    :iphone_screenshots => [],
    :ipad_screenshots => [],
    :icon_175 => "",
    :is_free => nil,
    :price => "",
    :updated_date => "",
    :version => "",
    :version_desc => "",
    :pkg_size => "",
    :languages => "",
    :seller => "",
    :requirements => {
      :devices => [],
      :base_ios => ""
    }
  }
end

# Text of the list item at +index+, or "" when the list is shorter than
# expected, so a page with fewer entries does not abort the crawl.
def item_text(items, index)
  item = items[index]
  item ? item.innerText : ""
end

# Fill in title and developer from the div with id "title".
def extract_title_and_developer(app_doc, app_detail)
  title_box = app_doc.search("//div[@id='title']")
  # app title
  title_box.search("/h1").each { |h1| app_detail[:title] = h1.innerText }
  # app developer
  title_box.search("/h2").each { |h2| app_detail[:developer][:display_name] = h2.innerText }
  title_box.search("/a").each do |a|
    # The nick is the second-to-last path segment of the link;
    # to_s keeps an anchor without href from raising.
    app_detail[:developer][:nick] = a.attributes["href"].to_s.split("/")[-2]
  end
end

# Fill in description, links and screenshots from the "center-stack" div.
def extract_center_stack(app_doc, app_detail)
  center = app_doc.search("//div[@class='center-stack']")
  # app desc
  center.search("/div[@metrics-loc='Titledbox_Description']/p").each do |p|
    app_detail[:desc] = p.innerHTML
  end
  # links: anything whose text does not mention "Web Site" is stored as the support link
  center.search("/div[@class='app-links']/a").each do |a|
    symbol = a.innerText.include?("Web Site") ? :website : :support
    app_detail[:links][symbol] = a.attributes["href"]
  end
  # iPhone screenshots
  center.search("//div[@class='content iphone-screen-shots']//img").each do |img|
    app_detail[:iphone_screenshots].push(img.attributes['src'])
  end
  # iPad screenshots
  center.search("//div[@class='content ipad-screen-shots']//img").each do |img|
    app_detail[:ipad_screenshots].push(img.attributes['src'])
  end
end

# Split the requirements paragraph text into device names and the minimum
# iOS version, writing them into +requirements+. Parts missing from the text
# are left at their defaults instead of raising on nil.
def parse_requirements(text, requirements)
  devices_part, ios_part = text.gsub("\nRequirements:", "").split('. Requires ')
  if devices_part
    # Stored as an array to match the field's [] default: split on commas
    # and on the joining "and".
    requirements[:devices] = devices_part.gsub('Compatible with ', '').
      split(/\s*,\s*(?:and\s+)?|\s+and\s+/).
      map { |device| device.strip }.
      reject { |device| device.empty? }
  end
  if ios_part
    requirements[:base_ios] = ios_part.gsub(' or later.', '').gsub(/(iPhone OS|iOS)/i, '').strip
  end
end

# Fill in icon, price, version info and requirements from the "left-stack" div.
def extract_left_stack(app_doc, app_detail)
  left = app_doc.search("//div[@id='left-stack']")
  # icon 175x175
  left.search("//img[@class='artwork']").each { |img| app_detail[:icon_175] = img.attributes['src'] }
  items = left.search("//ul[@class='list']/li")
  if items.size > 0
    # is_free & price (the first list item carries the price text)
    price_text = item_text(items, 0)
    app_detail[:is_free] = price_text.include?("Free")
    app_detail[:price] = price_text.gsub('$', '') unless app_detail[:is_free]
    # Index 1 is not read; the remaining fields are taken by fixed position.
    app_detail[:updated_date] = item_text(items, 2).gsub("\nUpdated:", "").gsub("\nReleased:", "")
    app_detail[:version] = item_text(items, 3).gsub("\nVersion:", "").gsub("\nCurrent Version:", "")
    app_detail[:version_desc] = item_text(items, 4)
    app_detail[:pkg_size] = item_text(items, 5)
    app_detail[:languages] = item_text(items, 6).gsub("\nLanguage:", "").gsub("\nLanguages:", "")
    app_detail[:seller] = item_text(items, 7).gsub("\nSeller:", "")
  end
  # requirements
  left.search("/div[@class='lockup product application']/p").each do |p|
    parse_requirements(p.innerText, app_detail[:requirements])
  end
end

# Print the short per-app summary.
def print_app_summary(app_detail)
  puts "\t\t\tTitle: #{app_detail[:title]}"
  puts "\t\t\tUpdated: #{app_detail[:updated_date]}"
  puts "\t\t\tVersion: #{app_detail[:version]}"
  puts "\t\t\tSize: #{app_detail[:pkg_size]}"
  puts "\t\t\tLanguages: #{app_detail[:languages]}"
  puts "\t\t\tSeller: #{app_detail[:seller]}"
  puts ""
end

# Fetch the detail page behind one listing anchor, extract its fields and
# print the summary. Returns the populated detail hash.
def crawl_app(app)
  puts "\t\tGot App <#{app.innerText}>, URL: #{app.attributes['href']}"
  app_detail = blank_app_detail(app)
  app_doc = fetch_doc(app.attributes['href'])
  extract_title_and_developer(app_doc, app_detail)
  extract_center_stack(app_doc, app_detail)
  extract_left_stack(app_doc, app_detail)
  print_app_summary(app_detail)
  app_detail
end

# Walk every category, letter by letter and page by page, until a listing
# page comes back with no app links.
cates.each do |cate_title, cate_url|
  puts "Get in <#{cate_title}>"
  LETTERS.each do |letter|
    page = 1
    apps = fetch_app_links(cate_title, cate_url, letter, page)
    while apps.size > 0
      apps.each { |app| crawl_app(app) }
      page += 1
      puts ""
      apps = fetch_app_links(cate_title, cate_url, letter, page)
      puts ""
    end
    puts ""
  end
end
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment