Created
January 23, 2017 17:10
-
-
Save lucianghinda/c467156fec6e89a4cad5f0e5651ec3d5 to your computer and use it in GitHub Desktop.
Count domains and paths
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Script to count subdomains and URL paths from the entries of an OPML file.
#
# Usage: ruby <this script> <file.opml>
#
# Writes two CSV files, named after the input file, relative to the current
# working directory:
#   <name>-subdomains-<YYYYmmddHHMM>.csv
#   <name>-paths-<YYYYmmddHHMM>.csv
# Each file lists the distinct values and how often they occur, with the
# highest counts first.
#
# Gem List Used:
# https://github.com/gilliek/opml-parser
# https://github.com/mhuggins/dominatrix
require 'opml-parser'
require 'domainatrix'
require 'pp'
require 'csv'
include OpmlParser

filename = ARGV[0]
# Fail early with a usage hint instead of an opaque TypeError from File.
abort "Usage: ruby #{$PROGRAM_NAME} <file.opml>" if filename.nil?

# File.read opens, reads and closes the file in one call, so the handle
# cannot leak if reading raises.
content = File.read(filename)
extension = File.extname(filename)
name = File.basename(filename, extension)

outlines = OpmlParser.import(content)

# Hash.new(0) makes unseen keys start at zero, so they can be incremented
# directly.
subdomains = Hash.new(0)
paths = Hash.new(0)

outlines.each do |line|
  # Skip outlines that carry no :htmlUrl attribute (the original note says
  # this is the case for the first OPML line).
  next unless line.attributes.key?(:htmlUrl)

  # A trailing slash is stripped before parsing the URL.
  domain = Domainatrix.parse(line.attributes[:htmlUrl].chomp("/"))

  paths[domain.path] += 1 unless domain.path.empty?
  subdomains[domain.subdomain] += 1 unless domain.subdomain.empty?
end

# Computed once so both output files share the same timestamp.
timestamp = Time.now.strftime('%Y%m%d%H%M')

# Writes one "<name>-<kind>-<timestamp>.csv" file containing a header row
# followed by the [value, count] pairs of +counts+, highest count first.
write_counts = lambda do |kind, header, counts|
  rows = counts.sort_by { |_value, count| count }.reverse
  CSV.open("#{name}-#{kind}-#{timestamp}.csv", "w+") do |csv|
    csv << [header, "Count"]
    rows.each { |row| csv << row }
  end
end

write_counts.call("subdomains", "Subdomain", subdomains)
write_counts.call("paths", "Path", paths)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment