hunj · July 7, 2015 04:28
diff --git a/path_strip.rb b/path_strip.rb
 # path_strip(input_file, domain, output_file)
 # reads an xml sitemap file and writes one CSV row per <loc> entry whose URL
 # starts with "http://<domain>/", keeping only the path that follows it.
 # Each row has the form "link_<n>,<path>", with n counting up from 1.
 # Only http:// URLs are matched.
 ## Parameters:
 # +input_file+ name of the input xml file
 # +domain+ the domain of the URL to exclude in the result
 # +output_file+ name of the output file, ending in .csv (preferred)
 ## Returns:
 # the number of rows written (the count is also printed with +p+)
 ## Raises:
 # RuntimeError when any of the three arguments is not a String
 def path_strip(input_file, domain, output_file)
   raise "domain must be string form" unless domain.is_a? String
   raise "invalid input file name" unless input_file.is_a? String
   raise "invalid output file name" unless output_file.is_a? String

   # Capture the path directly instead of slicing by fixed offsets, so that
   # indentation, CRLF line endings, a missing trailing newline, or several
   # <loc> tags on one line cannot corrupt the extracted path.
   loc_pattern = %r{<loc>http://#{Regexp.quote(domain)}/(.*?)</loc>}

   # The input is read completely before the output is opened, so a missing
   # input file never creates or truncates the output file.
   paths = File.read(input_file).scan(loc_pattern).flatten

   # Block form guarantees the handle is closed even if a write fails.
   File.open(output_file, "w") do |result_file|
     paths.each.with_index(1) do |path, num|
       result_file.puts "link_#{num},#{path}"
     end
   end

   p paths.size
 end

 # example: extract the paths under www.example.com from ./sitemap.xml
 # and write them to ./result.csv
 path_strip("./sitemap.xml", "www.example.com", "./result.csv")
	# path_strip(input_file, domain, output_file)
	# reads an xml sitemap file and writes one CSV row per <loc> entry whose URL
	# starts with "http://<domain>/", keeping only the path that follows it.
	# Each row has the form "link_<n>,<path>", with n counting up from 1.
	# Only http:// URLs are matched.
	## Parameters:
	# +input_file+ name of the input xml file
	# +domain+ the domain of the URL to exclude in the result
	# +output_file+ name of the output file, ending in .csv (preferred)
	## Returns:
	# the number of rows written (the count is also printed with +p+)
	## Raises:
	# RuntimeError when any of the three arguments is not a String
	def path_strip(input_file, domain, output_file)
	  raise "domain must be string form" unless domain.is_a? String
	  raise "invalid input file name" unless input_file.is_a? String
	  raise "invalid output file name" unless output_file.is_a? String

	  # Capture the path directly instead of slicing by fixed offsets, so that
	  # indentation, CRLF line endings, a missing trailing newline, or several
	  # <loc> tags on one line cannot corrupt the extracted path.
	  loc_pattern = %r{<loc>http://#{Regexp.quote(domain)}/(.*?)</loc>}

	  # The input is read completely before the output is opened, so a missing
	  # input file never creates or truncates the output file.
	  paths = File.read(input_file).scan(loc_pattern).flatten

	  # Block form guarantees the handle is closed even if a write fails.
	  File.open(output_file, "w") do |result_file|
	    paths.each.with_index(1) do |path, num|
	      result_file.puts "link_#{num},#{path}"
	    end
	  end

	  p paths.size
	end

	# example: extract the paths under www.example.com from ./sitemap.xml
	# and write them to ./result.csv
	path_strip("./sitemap.xml", "www.example.com", "./result.csv")
No results found