jronallo · December 11, 2015 18:59
diff --git a/get_and_process_webdatacommons_data.sh b/get_and_process_webdatacommons_data.sh
 #!/usr/bin/env bash
 # Downloads the WebDataCommons 2012-08 microdata N-Quads, extracts every quad
 # that mentions schema.org/Book, and builds a per-host CSV report from them.
 # The download is large, so these steps take a long time.

 # Stop at the first failing step so the report is never built from a partial
 # download or an empty selection.
 set -euo pipefail

 # First, get the list of available NQuad files to download.
 wget http://webdatacommons.org/2012-08/stats/files.list

 # We're only interested in the microdata set right now since that seems to be
 # where schema.org/Book is used more. So create a file list.
 # (grep exits non-zero when nothing matches, which aborts the script here.)
 grep html-microdata files.list > microdata_files.list

 # OK, this will take a while depending on your connection. Let it run overnight.
 # -c resumes partially downloaded files if the script has to be restarted.
 wget -c -i microdata_files.list

 # Gunzip all the compressed files.
 gunzip -- *.gz

 # Use grep to filter through all the NQuads and select only those that have
 # schema.org/Book on the line. This can be changed to get any type.
 # -h keeps file names out of the output, -F matches the text literally, and
 # '>' (rather than '>>') keeps a re-run from duplicating every quad.
 grep -hF schema.org/Book html-microdata* > microdata_books_nquads.nq

 # Then run the script to create the report.
 # NOTE(review): the script must be on PATH under exactly this name; the Ruby
 # file shipped alongside appears to be called quad_context_count_per_host.rb
 # -- confirm which name is correct.
 nquad_context_count_per_host.rb microdata_books_nquads.nq

 # Open the CSV file with the appropriate program. This works on Ubuntu.
 xdg-open microdata_books_nquads.csv
diff --git a/quad_context_count_per_host.rb b/quad_context_count_per_host.rb
 #! /usr/bin/env ruby

 # Counts statements and lists properties per host for an N-Quads file.
 #
 # Usage: <this script> FILE.nq
 #
 # Every statement that carries a graph name (context) is attributed to the
 # host of that graph name. The result is written to "<basename of FILE>.csv"
 # in the current directory with the columns host, count and properties,
 # ordered by descending statement count. Predicates under http://schema.org/
 # are listed without that prefix.

 require 'rdf'
 require 'rdf/nquads'
 require 'csv'
 require 'set'

 SCHEMA_ORG_PREFIX = 'http://schema.org/'.freeze

 filename = ARGV[0]
 abort "Usage: #{File.basename($PROGRAM_NAME)} FILE.nq" if filename.nil?
 basename = File.basename(filename, File.extname(filename))

 # host => { 'count' => Integer, 'props' => Set of predicate strings }.
 # A Set keeps the properties unique without re-scanning an array after
 # every statement.
 contexts = Hash.new do |hash, host|
   hash[host] = { 'count' => 0, 'props' => Set.new }
 end

 RDF::NQuads::Reader.open(filename) do |reader|
   reader.each_statement do |statement|
     # Older RDF.rb releases expose the fourth quad element as #context, newer
     # ones as #graph_name; support whichever this installation provides.
     graph = statement.respond_to?(:graph_name) ? statement.graph_name : statement.context

     # Skips statements without a graph name, and graph names that have no
     # host to report (presumably blank nodes -- verify against your data).
     next unless graph.respond_to?(:host)

     entry = contexts[graph.host]
     entry['count'] += 1
     entry['props'] << statement.predicate.to_s.sub(SCHEMA_ORG_PREFIX, '')
   end
 end

 # Busiest hosts first.
 sorted_contexts = contexts.sort_by { |_host, data| data['count'] }.reverse

 CSV.open("#{basename}.csv", 'wb') do |csv|
   csv << %w[host count properties]
   sorted_contexts.each do |host, data|
     csv << [host, data['count'], data['props'].sort.join(' ')]
   end
 end
	#!/usr/bin/env bash
	# Downloads the WebDataCommons 2012-08 microdata N-Quads, extracts every quad
	# that mentions schema.org/Book, and builds a per-host CSV report from them.
	# The download is large, so these steps take a long time.

	# Stop at the first failing step so the report is never built from a partial
	# download or an empty selection.
	set -euo pipefail

	# First, get the list of available NQuad files to download.
	wget http://webdatacommons.org/2012-08/stats/files.list

	# We're only interested in the microdata set right now since that seems to be
	# where schema.org/Book is used more. So create a file list.
	# (grep exits non-zero when nothing matches, which aborts the script here.)
	grep html-microdata files.list > microdata_files.list

	# OK, this will take a while depending on your connection. Let it run overnight.
	# -c resumes partially downloaded files if the script has to be restarted.
	wget -c -i microdata_files.list

	# Gunzip all the compressed files.
	gunzip -- *.gz

	# Use grep to filter through all the NQuads and select only those that have
	# schema.org/Book on the line. This can be changed to get any type.
	# -h keeps file names out of the output, -F matches the text literally, and
	# '>' (rather than '>>') keeps a re-run from duplicating every quad.
	grep -hF schema.org/Book html-microdata* > microdata_books_nquads.nq

	# Then run the script to create the report.
	# NOTE(review): the script must be on PATH under exactly this name -- confirm
	# it matches the file name the Ruby script is actually saved under.
	nquad_context_count_per_host.rb microdata_books_nquads.nq

	# Open the CSV file with the appropriate program. This works on Ubuntu.
	xdg-open microdata_books_nquads.csv
	#! /usr/bin/env ruby

	# Counts statements and lists properties per host for an N-Quads file.
	#
	# Usage: <this script> FILE.nq
	#
	# Every statement that carries a graph name (context) is attributed to the
	# host of that graph name. The result is written to "<basename of FILE>.csv"
	# in the current directory with the columns host, count and properties,
	# ordered by descending statement count. Predicates under http://schema.org/
	# are listed without that prefix.

	require 'rdf'
	require 'rdf/nquads'
	require 'csv'
	require 'set'

	SCHEMA_ORG_PREFIX = 'http://schema.org/'.freeze

	filename = ARGV[0]
	abort "Usage: #{File.basename($PROGRAM_NAME)} FILE.nq" if filename.nil?
	basename = File.basename(filename, File.extname(filename))

	# host => { 'count' => Integer, 'props' => Set of predicate strings }.
	# A Set keeps the properties unique without re-scanning an array after
	# every statement.
	contexts = Hash.new do |hash, host|
	  hash[host] = { 'count' => 0, 'props' => Set.new }
	end

	RDF::NQuads::Reader.open(filename) do |reader|
	  reader.each_statement do |statement|
	    # Older RDF.rb releases expose the fourth quad element as #context, newer
	    # ones as #graph_name; support whichever this installation provides.
	    graph = statement.respond_to?(:graph_name) ? statement.graph_name : statement.context

	    # Skips statements without a graph name, and graph names that have no
	    # host to report (presumably blank nodes -- verify against your data).
	    next unless graph.respond_to?(:host)

	    entry = contexts[graph.host]
	    entry['count'] += 1
	    entry['props'] << statement.predicate.to_s.sub(SCHEMA_ORG_PREFIX, '')
	  end
	end

	# Busiest hosts first.
	sorted_contexts = contexts.sort_by { |_host, data| data['count'] }.reverse

	CSV.open("#{basename}.csv", 'wb') do |csv|
	  csv << %w[host count properties]
	  sorted_contexts.each do |host, data|
	    csv << [host, data['count'], data['props'].sort.join(' ')]
	  end
	end