jronallo · September 29, 2017 23:12
diff --git a/common_crawl_hostname_count.rb b/common_crawl_hostname_count.rb
 #!/usr/bin/env ruby

 # A quick, simple script to partially parse output from
 # https://github.com/trivio/common_crawl_index/blob/master/bin/remote_read
 # and output hostnames in order of count (most frequent first).
 #
 # Usage: common_crawl_hostname_count.rb path/to/url_index_output.txt

 # Tallies how many index lines belong to each hostname.
 #
 # Only the first whitespace-separated field of a line is inspected, and of
 # that only the text before the first "/". That text is treated as a
 # hostname whose dot-separated labels are in reverse order
 # ("com.example.www") and is flipped back ("www.example.com").
 # Blank and whitespace-only lines are skipped instead of raising.
 #
 # @param lines [Enumerable<String>] raw index lines
 # @return [Hash{String => Integer}] hostname => number of lines seen for it
 def count_hostnames(lines)
   lines.each_with_object(Hash.new(0)) do |line, counts|
     url = line.split(' ').first
     next if url.nil? # blank or whitespace-only line: nothing to count

     # A limit of 2 keeps a (possibly empty) first field even for a bare "/",
     # where a plain split('/') would return an empty array.
     reverse_hostname = url.split('/', 2).first
     hostname = reverse_hostname.split('.').reverse.join('.')
     counts[hostname] += 1
   end
 end

 if ARGV.empty?
   warn 'Usage: common_crawl_hostname_count.rb path/to/url_index_output.txt'
 else
   # File.foreach streams the file line by line instead of loading it whole.
   url_counts = count_hostnames(File.foreach(ARGV[0]))
   urls_sorted_by_count = url_counts.sort_by { |_hostname, count| count }.reverse

   urls_sorted_by_count.each do |hostname, count|
     puts "#{count.to_s.rjust(5)}: #{hostname}"
   end

   puts "Total URLs: #{url_counts.values.reduce(0, :+)}"
   puts "Total hostnames: #{urls_sorted_by_count.length}"
 end
diff --git a/common_crawl_hostname_search.rb b/common_crawl_hostname_search.rb
 #!/usr/bin/env ruby

 # common_crawl_hostname_search.rb
 # A simple script to take the output from the Common Crawl URL index and query
 # for full URLs matching a particular hostname.

 # Note that queries are done in reverse order like queries on the URL index itself,
 # but the matching is more exact for the subdomain. URLs are output in normal order.
 # The query is matched literally against the start of each line (it is not a
 # regular expression, so "." only matches a dot).
 # common_crawl_hostname_search.rb path/to/url_index_output.txt com.example

 require 'json'
 require 'date'

 # Turns one index line into "<date>  <protocol>://<hostname>/<path>".
 #
 # The first whitespace-separated field is read as
 # "<reversed.host.name>/<path>:<protocol>"; everything after it is read as a
 # single-quoted, JSON-like record that has an integer 'arcFileDate' in
 # milliseconds since the epoch.
 #
 # @param line [String] one line of URL index output
 # @return [String] formatted date and URL
 # @raise [JSON::ParserError] if the metadata cannot be parsed
 def format_index_line(line)
   reversed_url, *metadata_parts = line.split(' ')

   # url
   reverse_hostname, *path_segments = reversed_url.split('/')
   hostname = reverse_hostname.split('.').reverse.join('.')
   # The protocol is whatever follows the last ":"; rpartition splits on it
   # without building a regexp out of the data.
   path, _separator, protocol = path_segments.join('/').rpartition(':')

   # segment date
   # Single quotes are swapped for double quotes so the JSON parser accepts
   # the record. NOTE(review): this breaks on values containing quotes.
   json = metadata_parts.join(' ').gsub("'", '"')
   data = JSON.parse(json)
   epoch_date = data['arcFileDate'] / 1000 # milliseconds to seconds
   date = DateTime.strptime(epoch_date.to_s, '%s')

   "#{date}  #{protocol}://#{hostname}/#{path}"
 end

 # Formats every line that literally starts with +query+.
 #
 # @param lines [Enumerable<String>] raw index lines
 # @param query [String, nil] reversed hostname prefix; nil or "" matches all
 # @return [Array<String>] formatted entries, in input order
 def search_index(lines, query)
   prefix = query.to_s
   lines.select { |line| line.start_with?(prefix) }
        .map { |line| format_index_line(line) }
 end

 if ARGV.empty?
   warn 'Usage: common_crawl_hostname_search.rb path/to/url_index_output.txt com.example'
 else
   matching_urls = search_index(File.foreach(ARGV[0]), ARGV[1])

   puts matching_urls
   puts "Total matching URLs: #{matching_urls.length}"
 end

diff --git a/nquad_context_extractor.rb b/nquad_context_extractor.rb
 #! /usr/bin/env ruby

 require 'rdf'
 require 'rdf/nquads'

 # Collects the graph name ("context") of every statement in the N-Quads
 # file given as the first argument and prints them sorted, one per line.
 # Statements without a graph name are ignored; repeated names are kept.
 contexts = []

 RDF::NQuads::Reader.open(ARGV[0]) do |reader|
   reader.each_statement do |statement|
     # RDF.rb 2.0 renamed Statement#context to #graph_name and removed the
     # old accessor; prefer the new name and fall back for older releases.
     graph_name = statement.respond_to?(:graph_name) ? statement.graph_name : statement.context
     contexts << graph_name unless graph_name.nil?
   end
 end

 puts contexts.sort
	#!/usr/bin/env ruby

	# A quick, simple script to partially parse output from
	# https://github.com/trivio/common_crawl_index/blob/master/bin/remote_read
	# and output hostnames in order of count (most frequent first).
	#
	# Usage: common_crawl_hostname_count.rb path/to/url_index_output.txt

	# Tallies how many index lines belong to each hostname.
	#
	# Only the first whitespace-separated field of a line is inspected, and of
	# that only the text before the first "/". That text is treated as a
	# hostname whose dot-separated labels are in reverse order
	# ("com.example.www") and is flipped back ("www.example.com").
	# Blank and whitespace-only lines are skipped instead of raising.
	#
	# @param lines [Enumerable<String>] raw index lines
	# @return [Hash{String => Integer}] hostname => number of lines seen for it
	def count_hostnames(lines)
	  lines.each_with_object(Hash.new(0)) do |line, counts|
	    url = line.split(' ').first
	    next if url.nil? # blank or whitespace-only line: nothing to count

	    # A limit of 2 keeps a (possibly empty) first field even for a bare "/",
	    # where a plain split('/') would return an empty array.
	    reverse_hostname = url.split('/', 2).first
	    hostname = reverse_hostname.split('.').reverse.join('.')
	    counts[hostname] += 1
	  end
	end

	if ARGV.empty?
	  warn 'Usage: common_crawl_hostname_count.rb path/to/url_index_output.txt'
	else
	  # File.foreach streams the file line by line instead of loading it whole.
	  url_counts = count_hostnames(File.foreach(ARGV[0]))
	  urls_sorted_by_count = url_counts.sort_by { |_hostname, count| count }.reverse

	  urls_sorted_by_count.each do |hostname, count|
	    puts "#{count.to_s.rjust(5)}: #{hostname}"
	  end

	  puts "Total URLs: #{url_counts.values.reduce(0, :+)}"
	  puts "Total hostnames: #{urls_sorted_by_count.length}"
	end
	#!/usr/bin/env ruby

	# common_crawl_hostname_search.rb
	# A simple script to take the output from the Common Crawl URL index and query
	# for full URLs matching a particular hostname.

	# Note that queries are done in reverse order like queries on the URL index itself,
	# but the matching is more exact for the subdomain. URLs are output in normal order.
	# The query is matched literally against the start of each line (it is not a
	# regular expression, so "." only matches a dot).
	# common_crawl_hostname_search.rb path/to/url_index_output.txt com.example

	require 'json'
	require 'date'

	# Turns one index line into "<date>  <protocol>://<hostname>/<path>".
	#
	# The first whitespace-separated field is read as
	# "<reversed.host.name>/<path>:<protocol>"; everything after it is read as a
	# single-quoted, JSON-like record that has an integer 'arcFileDate' in
	# milliseconds since the epoch.
	#
	# @param line [String] one line of URL index output
	# @return [String] formatted date and URL
	# @raise [JSON::ParserError] if the metadata cannot be parsed
	def format_index_line(line)
	  reversed_url, *metadata_parts = line.split(' ')

	  # url
	  reverse_hostname, *path_segments = reversed_url.split('/')
	  hostname = reverse_hostname.split('.').reverse.join('.')
	  # The protocol is whatever follows the last ":"; rpartition splits on it
	  # without building a regexp out of the data.
	  path, _separator, protocol = path_segments.join('/').rpartition(':')

	  # segment date
	  # Single quotes are swapped for double quotes so the JSON parser accepts
	  # the record. NOTE(review): this breaks on values containing quotes.
	  json = metadata_parts.join(' ').gsub("'", '"')
	  data = JSON.parse(json)
	  epoch_date = data['arcFileDate'] / 1000 # milliseconds to seconds
	  date = DateTime.strptime(epoch_date.to_s, '%s')

	  "#{date}  #{protocol}://#{hostname}/#{path}"
	end

	# Formats every line that literally starts with +query+.
	#
	# @param lines [Enumerable<String>] raw index lines
	# @param query [String, nil] reversed hostname prefix; nil or "" matches all
	# @return [Array<String>] formatted entries, in input order
	def search_index(lines, query)
	  prefix = query.to_s
	  lines.select { |line| line.start_with?(prefix) }
	       .map { |line| format_index_line(line) }
	end

	if ARGV.empty?
	  warn 'Usage: common_crawl_hostname_search.rb path/to/url_index_output.txt com.example'
	else
	  matching_urls = search_index(File.foreach(ARGV[0]), ARGV[1])

	  puts matching_urls
	  puts "Total matching URLs: #{matching_urls.length}"
	end
	#! /usr/bin/env ruby

	require 'rdf'
	require 'rdf/nquads'

	# Collects the graph name ("context") of every statement in the N-Quads
	# file given as the first argument and prints them sorted, one per line.
	# Statements without a graph name are ignored; repeated names are kept.
	contexts = []

	RDF::NQuads::Reader.open(ARGV[0]) do |reader|
	  reader.each_statement do |statement|
	    # RDF.rb 2.0 renamed Statement#context to #graph_name and removed the
	    # old accessor; prefer the new name and fall back for older releases.
	    graph_name = statement.respond_to?(:graph_name) ? statement.graph_name : statement.context
	    contexts << graph_name unless graph_name.nil?
	  end
	end

	puts contexts.sort