ibanez270dx · May 19, 2015 01:15
diff --git a/shkip.rb b/shkip.rb
 require 'io/console'
 require 'optparse'
 require 'ostruct'
 require 'fileutils'

 NAME = "Safari Keyword History Index Parser"
 VERSION = "v0.0.1"
 TIME = Time.now

 # Draws a three-line box-drawing frame around +str+.
 #
 # The frame spans the terminal width reported by STDOUT.winsize. When STDOUT
 # is not a terminal (output piped or redirected) winsize raises, and a
 # reported width of 0 is unusable; both cases fall back to 80 columns. The
 # frame is widened when +str+ is longer than the terminal so the borders
 # always line up with the text row.
 #
 # @param str [String] text to centre inside the box
 # @return [String] top border, text row and bottom border, each ending in "\n"
 def box_me_up(str)
   columns = begin
     STDOUT.winsize[-1].to_i
   rescue SystemCallError, IOError
     0
   end
   columns = 80 unless columns > 0

   inner  = [columns - 4, str.length].max
   border = '═' * (inner + 2)
   "╔#{border}╗\n║ #{str.center(inner, ' ')} ║\n╚#{border}╝\n"
 end

 # Reports the exception currently being handled and terminates the script.
 #
 # Prints the option parser's banner, then the pending exception ($!) together
 # with a pointer to --help, and exits with status 1.
 def show_error_and_exit
   details = "  #{$!}\n    use --help for more information\n\n"
   puts @option_parser.banner, details
   exit 1
 end

 ################################################################################
 # Command Line Options
 ################################################################################

 # Default settings; the command-line switches may override them.
 options = OpenStruct.new
 options.name = "HistoryIndex"
 options.path = "/Users/#{ENV['USER']}/Library/Safari/"
 options.time = TIME.strftime("%Y%m%d%H%M%S")

 # Horizontal rule as wide as the terminal. STDOUT.winsize raises when output
 # is piped or redirected, so fall back to 80 columns in that case.
 columns = begin
   STDOUT.winsize[-1].to_i
 rescue SystemCallError, IOError
   0
 end
 columns = 80 unless columns > 0
 options.line = '─' * columns
 options.stdo = true

 # File names derived from the base name and the start-up timestamp.
 options.input  = "#{options.name}.sk"
 options.output = "#{options.name}-#{options.time}.txt"
 options.backup = "#{options.name}-#{options.time}.backup"
 options.dump   = "#{options.name}-#{options.time}.dump"

 # Build the command-line parser. The switch handlers update +options+ in
 # place and raise when a named file cannot be opened.
 @option_parser = OptionParser.new do |opts|
   opts.program_name = NAME
   opts.version = VERSION
   opts.banner = "#{box_me_up("#{NAME} #{VERSION}")}\n    Usage: ruby skhip.rb [options]\n\n"

   opts.separator "  Specific options:"
   opts.on("-i", "--input FILE", "Path to HistoryIndex.sk") do |input|
     # Open and immediately close the file: an unreadable path still raises
     # here, but no file handle is left open.
     File.open(input) {}
     options.input  = input
     # File.basename also handles paths without a directory part and paths
     # whose directory names contain dots.
     options.name   = File.basename(input, ".*")
     options.output = "#{options.name}-#{options.time}.txt"
   end

   opts.separator ""
   opts.on("-o", "--output FILE", "Relative output location") do |output|
     options.output = output
   end

   opts.separator ""
   opts.on("-d", "--use-dump [FILE]",
           "Skip dumping process by specifying an existing dump file.",
           "Leave blank to use default path.") do |dump|
     # FILE is optional; nil means "keep the default dump path".
     if dump
       File.open(dump) {}
       options.dump = dump
     end
   end

   opts.separator ""
   opts.on_tail("-h", "--help", "What you're looking at :P") do
     puts opts
     exit
   end

   opts.on_tail("--version", "Show version") do
     puts opts.program_name
     puts opts.version
     exit
   end
 end

 # Parse ARGV. Any error raised while parsing, including errors raised by the
 # switch handlers, is reported through show_error_and_exit.
 #
 # The call has to live in the rescue body. Written as
 # `rescue show_error_and_exit` it sits in the exception-class position and is
 # evaluated for every exception passing through, including the SystemExit
 # raised by --help and --version, which turns a clean exit into an error exit.
 begin
   @option_parser.parse!
   raise OptionParser::ParseError, "arguments provided without switches!" unless ARGV.empty?
 rescue StandardError
   show_error_and_exit
 end

 ################################################################################
 # Setup
 ################################################################################

 # Copy the input, dump the copy with xxd and read the dump back in.
 # Any failure along the way is reported through show_error_and_exit.
 begin
   # Make a copy of the HistoryIndex.sk file
   FileUtils.cp options.input, options.backup

   # Dump the copy that was just made (options.backup), appending to the dump
   # file. Passing the arguments as a list avoids the shell, so paths with
   # spaces or shell metacharacters are handled safely.
   dumped_ok = system("xxd", "-b", "-c", "10", options.backup, out: [options.dump, "a"])
   raise "xxd could not dump #{options.backup}" unless dumped_ok

   # Keep the last whitespace-separated field of every dump line.
   # NOTE(review): presumably this is xxd's ASCII column; if that column can
   # contain spaces, everything before the last space is lost -- confirm.
   @acc = []
   IO.foreach(options.dump) { |line| @acc << line.split(' ').last }
 rescue StandardError
   show_error_and_exit
 end

 # Join the fragments into one string (empty when the dump had no lines)
 dumped = @acc.join

 ################################################################################
 # Parsing
 ################################################################################

 # Substitutions applied to the dump, in this order.
 parser = [
   # A run of 256 or more dots that follows a non-dot character and precedes a
   # character other than '.', '|', 'I' or 'A' becomes a segment marker.
   { regexp: /(?<=[^\.])\.{256,}(?=[^\.|IA])/, replacement: '...[SKHIP-PARSER-SEGMENT]...' },

   # Everything from a '~' up to a later "big" (greedy) collapses to '*'.
   { regexp: /~(.)+big/, replacement: '*' },

   # A single dot with a non-dot character on either side is removed.
   { regexp: /(?<=[^\.])\.{1}(?=[^\.])/, replacement: '' }
 ]

 # Apply every rule to the dump in place.
 parser.each { |rule| dumped.gsub!(rule[:regexp], rule[:replacement]) }

 ################################################################################
 # Tokenize that shit
 ################################################################################
 # Split the cleaned dump into tokens, one character at a time.
 # A token ends at the first dot that follows a non-dot character. Tokens of
 # two or more characters are kept (a bare "http" becomes "http://"); shorter
 # ones and all characters outside the accepted set are collected in @rejected.
 last_char = ""
 @token    = ""
 @tokens   = []
 @rejected = []

 # Files the token under construction and starts a new one.
 flush_token = lambda do
   if @token.length > 1
     @tokens << (@token == "http" ? "http://" : @token)
   else
     @rejected << @token
   end
   @token = ""
 end

 dumped.each_char do |raw|
   char = raw.strip # whitespace becomes the empty string

   flush_token.call if char == "." && last_char != "."

   # NOTE(review): inside a character class '|' is a literal pipe, so pipe
   # characters are accepted into tokens as well -- confirm this is intended.
   if char =~ /[\w|\-|\+|&|\=|\?]/
     @token << char
   elsif char != "."
     @rejected << char
   end
   last_char = char
 end

 # A dump that does not end in a dot would otherwise lose its last token.
 flush_token.call unless @token.empty?

 ################################################################################
 # Iterate through tokens to create URL's and newlines
 ################################################################################

 # Group the tokens into output entries. Tokens from an "http://" marker up to
 # the next http-prefixed token or segment marker are collected into an array
 # (one URL). Outside a URL a segment marker becomes a horizontal rule, tokens
 # containing "IADefault" are boxed and everything else is passed through.
 @words = []
 is_url = false
 tmp = []

 @tokens.each do |token|
   if token =~ /^http/ && is_url
     # a new URL starts while one is being collected: emit the finished one
     @words << tmp
     tmp = []
   elsif token == "SKHIP-PARSER-SEGMENT" && is_url
     # a segment marker ends the URL being collected
     @words << tmp
     tmp = []
     is_url = false
   elsif token == "http://"
     is_url = true
   end

   if is_url
     tmp << token
   elsif token == "SKHIP-PARSER-SEGMENT"
     @words << options.line
   elsif token =~ /IADefault/
     @words << box_me_up(token)
   else
     @words << token
   end
 end

 # Emit a URL that was still being collected when the tokens ran out;
 # without this the last URL of the dump is silently lost.
 @words << tmp unless tmp.empty?

 ################################################################################
 # Print it out
 ################################################################################

 # Print every entry. URL fragments (arrays) are joined with dots and the
 # first dot of the result is removed; strings are printed unchanged.
 @words.each do |entry|
   if entry.is_a?(Array)
     puts entry.join('.').sub('.', '')
   else
     puts entry
   end
 end

 elapsed = (Time.now - TIME).round(2)
 puts "\n\nFinished! Parsing took #{elapsed} seconds\n\n"
diff --git a/skhip-original.rb b/skhip-original.rb
 require 'io/console'
 require 'optparse'
 require 'ostruct'
 require 'fileutils'

 NAME = "Safari Keyword History Index Parser"
 VERSION = "v0.0.1"
 TIME = Time.now

 # Draws a three-line box-drawing frame around +str+.
 #
 # The frame spans the terminal width reported by STDOUT.winsize. When STDOUT
 # is not a terminal (output piped or redirected) winsize raises, and a
 # reported width of 0 is unusable; both cases fall back to 80 columns. The
 # frame is widened when +str+ is longer than the terminal so the borders
 # always line up with the text row.
 #
 # @param str [String] text to centre inside the box
 # @return [String] top border, text row and bottom border, each ending in "\n"
 def box_me_up(str)
   columns = begin
     STDOUT.winsize[-1].to_i
   rescue SystemCallError, IOError
     0
   end
   columns = 80 unless columns > 0

   inner  = [columns - 4, str.length].max
   border = '═' * (inner + 2)
   "╔#{border}╗\n║ #{str.center(inner, ' ')} ║\n╚#{border}╝\n"
 end

 # Reports the exception currently being handled and terminates the script.
 #
 # Prints the option parser's banner, then the pending exception ($!) together
 # with a pointer to --help, and exits with status 1.
 def show_error_and_exit
   details = "  #{$!}\n    use --help for more information\n\n"
   puts @option_parser.banner, details
   exit 1
 end

 ################################################################################
 # Command Line Options
 ################################################################################

 # Default settings; the command-line switches may override them.
 options = OpenStruct.new
 options.name = "HistoryIndex"
 options.path = "/Users/#{ENV['USER']}/Library/Safari/"
 options.time = TIME.strftime("%Y%m%d%H%M%S")

 # Horizontal rule as wide as the terminal. STDOUT.winsize raises when output
 # is piped or redirected, so fall back to 80 columns in that case.
 columns = begin
   STDOUT.winsize[-1].to_i
 rescue SystemCallError, IOError
   0
 end
 columns = 80 unless columns > 0
 options.line = '─' * columns
 options.stdo = true

 # File names derived from the base name and the start-up timestamp.
 options.input  = "#{options.name}.sk"
 options.output = "#{options.name}-#{options.time}.txt"
 options.backup = "#{options.name}-#{options.time}.backup"
 options.dump   = "#{options.name}-#{options.time}.dump"

 # Build the command-line parser. The switch handlers update +options+ in
 # place and raise when a named file cannot be opened.
 @option_parser = OptionParser.new do |opts|
   opts.program_name = NAME
   opts.version = VERSION
   opts.banner = "#{box_me_up("#{NAME} #{VERSION}")}\n    Usage: ruby skhip.rb [options]\n\n"

   opts.separator "  Specific options:"
   opts.on("-i", "--input FILE", "Path to HistoryIndex.sk") do |input|
     # Open and immediately close the file: an unreadable path still raises
     # here, but no file handle is left open.
     File.open(input) {}
     options.input  = input
     # File.basename also handles paths without a directory part and paths
     # whose directory names contain dots.
     options.name   = File.basename(input, ".*")
     options.output = "#{options.name}-#{options.time}.txt"
   end

   opts.separator ""
   opts.on("-o", "--output FILE", "Relative output location") do |output|
     options.output = output
   end

   opts.separator ""
   opts.on("-d", "--use-dump [FILE]",
           "Skip dumping process by specifying an existing dump file.",
           "Leave blank to use default path.") do |dump|
     # FILE is optional; nil means "keep the default dump path".
     if dump
       File.open(dump) {}
       options.dump = dump
     end
   end

   opts.separator ""
   opts.on_tail("-h", "--help", "What you're looking at :P") do
     puts opts
     exit
   end

   opts.on_tail("--version", "Show version") do
     puts opts.program_name
     puts opts.version
     exit
   end
 end

 # Parse ARGV. Any error raised while parsing, including errors raised by the
 # switch handlers, is reported through show_error_and_exit.
 #
 # The call has to live in the rescue body. Written as
 # `rescue show_error_and_exit` it sits in the exception-class position and is
 # evaluated for every exception passing through, including the SystemExit
 # raised by --help and --version, which turns a clean exit into an error exit.
 begin
   @option_parser.parse!
   raise OptionParser::ParseError, "arguments provided without switches!" unless ARGV.empty?
 rescue StandardError
   show_error_and_exit
 end

 ################################################################################
 # Setup
 ################################################################################

 # Copy the input, dump the copy with xxd and read the dump back in.
 # Any failure along the way is reported through show_error_and_exit.
 begin
   # Make a copy of the HistoryIndex.sk file
   FileUtils.cp options.input, options.backup

   # Dump the copy that was just made (options.backup), appending to the dump
   # file. Passing the arguments as a list avoids the shell, so paths with
   # spaces or shell metacharacters are handled safely.
   dumped_ok = system("xxd", "-b", "-c", "10", options.backup, out: [options.dump, "a"])
   raise "xxd could not dump #{options.backup}" unless dumped_ok

   # Keep the last whitespace-separated field of every dump line.
   # NOTE(review): presumably this is xxd's ASCII column; if that column can
   # contain spaces, everything before the last space is lost -- confirm.
   @acc = []
   IO.foreach(options.dump) { |line| @acc << line.split(' ').last }
 rescue StandardError
   show_error_and_exit
 end

 # Join the fragments into one string (empty when the dump had no lines)
 dumped = @acc.join

 ################################################################################
 # Parsing
 ################################################################################

 parser = []  # collect regexp's and their corresponding replacements

 # Make a big o' line breaky thing if there's a lot of dots
 parser << { regexp: /(?<=[^\.])\.{256,}(?=[^\.|IA])/, replacement: '...[SKHIP-PARSER-SEGMENT]...' }

 # replace sets of 3 dots with a single dot
 # parser << { regexp: /(?<=[^\.])\.{3}(?=[^\.])/, replacement: ',' }

 # remove single non-word characters (between two dots)
 # parser << { regexp: /(?<=\.)[](?=\.)/, replacement: '..' }

 # Looks like there's some code in there. It's the only place other than the
 # URLs that has single dots. Here we collapse it to a single '*' so it is
 # parsed along with the rest of the data.
 parser << { regexp: /~(.)+big/, replacement: '*' }

 # remove dots between single letters by using regex lookaheads
 parser << { regexp: /(?<=[^\.])\.{1}(?=[^\.])/, replacement: '' }

 # remove single "stand-alone" characters
 # parser << { regexp: /(?:\.{2,}|\n)[^-\.]{1,2}(?=\.{2,}|\n)/, replacement: '' }

 # gonna assume that URL's HTTP part needs some slashes
 # parser << { regexp: /http\.\.(?=[\w])/, replacement: 'http://' }

 # put a newline before each URL
 # parser << { regexp: /(\/?\.{1,})(?=https?)/, replacement: " " }

 # Make a big o' line breaky thing if there's a lot of dots
 # parser << { regexp: /(?<=[^\.])\.{256,}(?=[^\.|IA])/, replacement: options.line }

 # clean up the ends of the URLs
 # parser << { regexp: /\.{2,}(\w|{|}|\\|\d|;)+\n/, replacement: "\n" }


 # replace all dot sequences longer than one with a comma
 # parser << { regexp: /\.{2,}/, replacement: "," }

 # add a newline after "html"... just makes things easier :P
 # parser << { regexp: /(?<=html)()[^\n]/, replacement: "\n" }

 # surround the headers with a border
 # parser << { regexp: /()(?=IA\w+)/, replacement: options.line }
 # parser << { regexp: /(?:IADefault)(?:I\w+|T\w+)(\n)/, replacement: options.line }

 # Put line breaks in the remaining word blocks
 # parser << { regexp: //, replacement: "\n" }

 # Execute!
 parser.each do |r|
  # puts r.inspect
  dumped.gsub!(r[:regexp], r[:replacement])
 end

 puts dumped

 ################################################################################
 # Tokenize that shit
 ################################################################################
 # Split the cleaned dump into tokens, one character at a time.
 # A token ends at the first dot that follows a non-dot character. Tokens of
 # two or more characters are kept (a bare "http" becomes "http://"); shorter
 # ones and all characters outside the accepted set are collected in @rejected.
 last_char = ""
 @token    = ""
 @tokens   = []
 @rejected = []

 # Files the token under construction and starts a new one.
 flush_token = lambda do
   if @token.length > 1
     @tokens << (@token == "http" ? "http://" : @token)
   else
     @rejected << @token
   end
   @token = ""
 end

 dumped.each_char do |raw|
   char = raw.strip # whitespace becomes the empty string

   flush_token.call if char == "." && last_char != "."

   # NOTE(review): inside a character class '|' is a literal pipe, so pipe
   # characters are accepted into tokens as well -- confirm this is intended.
   if char =~ /[\w|\-|\+|&|\=|\?]/
     @token << char
   elsif char != "."
     @rejected << char
   end
   last_char = char
 end

 # A dump that does not end in a dot would otherwise lose its last token.
 flush_token.call unless @token.empty?


 puts @tokens.inspect
 # puts @rejected.inspect

 ################################################################################
 # Iterate through tokens to create URL's and newlines
 ################################################################################

 # Group the tokens into output entries. Tokens from an "http://" marker up to
 # the next http-prefixed token or segment marker are collected into an array
 # (one URL). Outside a URL a segment marker becomes a horizontal rule, tokens
 # containing "IADefault" are boxed and everything else is passed through.
 @words = []
 is_url = false
 tmp = []

 @tokens.each do |token|
   if token =~ /^http/ && is_url
     # a new URL starts while one is being collected: emit the finished one
     @words << tmp
     tmp = []
   elsif token == "SKHIP-PARSER-SEGMENT" && is_url
     # a segment marker ends the URL being collected
     @words << tmp
     tmp = []
     is_url = false
   elsif token == "http://"
     is_url = true
   end

   if is_url
     tmp << token
   elsif token == "SKHIP-PARSER-SEGMENT"
     @words << options.line
   elsif token =~ /IADefault/
     @words << box_me_up(token)
   else
     @words << token
   end
 end

 # Emit a URL that was still being collected when the tokens ran out;
 # without this the last URL of the dump is silently lost.
 @words << tmp unless tmp.empty?

 puts @words.inspect

 # Print every entry. URL fragments (arrays) are joined with dots and the
 # first dot of the result is removed; strings are printed unchanged.
 @words.each do |entry|
   if entry.is_a?(Array)
     puts entry.join('.').sub('.', '')
   else
     puts entry
   end
 end

 ################################################################################
 # Filter Weird Artifacts
 ################################################################################
 # artifacts = []


 # collect regexp's and their corresponding replacements
 # artifacts << { regexp: /\.\=\=/, replacement: '' }
 # artifacts << { regexp: /http0/, replacement: 'http:' }

 # artifacts << /http\n.+\n/
 # artifacts << /z\.{+\n/
 # artifacts << /E\.F\.\w/
 # artifacts << /Bud2/
 # artifacts << /.?\.["|-]/

 # Execute!
 # artifacts.each do |artifact|
  # regexp = Regexp.new artifact
  # words.match(regexp).to_a.each do |match|
    # (@removals || @removals=[]) << match.to_s
  # end
  # words.gsub! regexp, ''
 # end

 # Remove double spaces
 # words.gsub!(/\n{2,}/,"\n")
 #
 # artifacts.each do |a|
 #   dumped.gsub!(r[:regexp], r[:replacement])
 # end

 # puts words

 # puts box_me_up('Artifact Removals:')
 # @removals.each { |x| puts x }

 # output = File.open("history_index_output.txt", 'w+')

 # words.each do |word|
 #   output.puts word
 # end
 #
 # output.close

 puts "\n\nFinished! Parsing took #{(Time.now - TIME).round(2)} seconds\n\n"
	require 'io/console'
	require 'optparse'
	require 'ostruct'
	require 'fileutils'

	NAME = "Safari Keyword History Index Parser"
	VERSION = "v0.0.1"
	TIME = Time.now

	# Draws a three-line box-drawing frame around +str+.
	#
	# The frame spans the terminal width reported by STDOUT.winsize. When STDOUT
	# is not a terminal (output piped or redirected) winsize raises, and a
	# reported width of 0 is unusable; both cases fall back to 80 columns. The
	# frame is widened when +str+ is longer than the terminal so the borders
	# always line up with the text row.
	#
	# @param str [String] text to centre inside the box
	# @return [String] top border, text row and bottom border, each ending in "\n"
	def box_me_up(str)
	  columns = begin
	    STDOUT.winsize[-1].to_i
	  rescue SystemCallError, IOError
	    0
	  end
	  columns = 80 unless columns > 0

	  inner  = [columns - 4, str.length].max
	  border = '═' * (inner + 2)
	  "╔#{border}╗\n║ #{str.center(inner, ' ')} ║\n╚#{border}╝\n"
	end

	# Reports the exception currently being handled and terminates the script.
	#
	# Prints the option parser's banner, then the pending exception ($!) together
	# with a pointer to --help, and exits with status 1.
	def show_error_and_exit
	  details = " #{$!}\n use --help for more information\n\n"
	  puts @option_parser.banner, details
	  exit 1
	end

	################################################################################
	# Command Line Options
	################################################################################

	# Default settings; the command-line switches may override them.
	options = OpenStruct.new
	options.name = "HistoryIndex"
	options.path = "/Users/#{ENV['USER']}/Library/Safari/"
	options.time = TIME.strftime("%Y%m%d%H%M%S")

	# Horizontal rule as wide as the terminal. STDOUT.winsize raises when output
	# is piped or redirected, so fall back to 80 columns in that case.
	columns = begin
	  STDOUT.winsize[-1].to_i
	rescue SystemCallError, IOError
	  0
	end
	columns = 80 unless columns > 0
	options.line = '─' * columns
	options.stdo = true

	# File names derived from the base name and the start-up timestamp.
	options.input  = "#{options.name}.sk"
	options.output = "#{options.name}-#{options.time}.txt"
	options.backup = "#{options.name}-#{options.time}.backup"
	options.dump   = "#{options.name}-#{options.time}.dump"

	# Build the command-line parser. The switch handlers update +options+ in
	# place and raise when a named file cannot be opened.
	# (The block parameters had been mangled to `\|name\|`, which is not valid
	# Ruby; they are plain `|name|` here.)
	@option_parser = OptionParser.new do |opts|
	  opts.program_name = NAME
	  opts.version = VERSION
	  opts.banner = "#{box_me_up("#{NAME} #{VERSION}")}\n\tUsage: ruby skhip.rb [options]\n\n"

	  opts.separator " Specific options:"
	  opts.on("-i", "--input FILE", "Path to HistoryIndex.sk") do |input|
	    # Open and immediately close the file: an unreadable path still raises
	    # here, but no file handle is left open.
	    File.open(input) {}
	    options.input  = input
	    # File.basename also handles paths without a directory part and paths
	    # whose directory names contain dots.
	    options.name   = File.basename(input, ".*")
	    options.output = "#{options.name}-#{options.time}.txt"
	  end

	  opts.separator ""
	  opts.on("-o", "--output FILE", "Relative output location") do |output|
	    options.output = output
	  end

	  opts.separator ""
	  opts.on("-d", "--use-dump [FILE]",
	          "Skip dumping process by specifying an existing dump file.",
	          "Leave blank to use default path.") do |dump|
	    # FILE is optional; nil means "keep the default dump path".
	    if dump
	      File.open(dump) {}
	      options.dump = dump
	    end
	  end

	  opts.separator ""
	  opts.on_tail("-h", "--help", "What you're looking at :P") do
	    puts opts
	    exit
	  end

	  opts.on_tail("--version", "Show version") do
	    puts opts.program_name
	    puts opts.version
	    exit
	  end
	end

	# Parse ARGV. Any error raised while parsing, including errors raised by the
	# switch handlers, is reported through show_error_and_exit.
	#
	# The call has to live in the rescue body. Written as
	# `rescue show_error_and_exit` it sits in the exception-class position and is
	# evaluated for every exception passing through, including the SystemExit
	# raised by --help and --version, which turns a clean exit into an error exit.
	begin
	  @option_parser.parse!
	  raise OptionParser::ParseError, "arguments provided without switches!" unless ARGV.empty?
	rescue StandardError
	  show_error_and_exit
	end

	################################################################################
	# Setup
	################################################################################

	# Copy the input, dump the copy with xxd and read the dump back in.
	# Any failure along the way is reported through show_error_and_exit.
	begin
	  # Make a copy of the HistoryIndex.sk file
	  FileUtils.cp options.input, options.backup

	  # Dump the copy that was just made (options.backup), appending to the dump
	  # file. Passing the arguments as a list avoids the shell, so paths with
	  # spaces or shell metacharacters are handled safely.
	  dumped_ok = system("xxd", "-b", "-c", "10", options.backup, out: [options.dump, "a"])
	  raise "xxd could not dump #{options.backup}" unless dumped_ok

	  # Keep the last whitespace-separated field of every dump line.
	  # NOTE(review): presumably this is xxd's ASCII column; if that column can
	  # contain spaces, everything before the last space is lost -- confirm.
	  @acc = []
	  IO.foreach(options.dump) { |line| @acc << line.split(' ').last }
	rescue StandardError
	  show_error_and_exit
	end

	# Join the fragments into one string (empty when the dump had no lines)
	dumped = @acc.join

	################################################################################
	# Parsing
	################################################################################

	# Substitutions applied to the dump, in this order.
	# (The block parameter of the loop had been mangled to `\|r\|`, which is not
	# valid Ruby; it is a plain block parameter here.)
	parser = [
	  # A run of 256 or more dots that follows a non-dot character and precedes a
	  # character other than '.', '|', 'I' or 'A' becomes a segment marker.
	  { regexp: /(?<=[^\.])\.{256,}(?=[^\.|IA])/, replacement: '...[SKHIP-PARSER-SEGMENT]...' },

	  # Everything from a '~' up to a later "big" (greedy) collapses to '*'.
	  { regexp: /~(.)+big/, replacement: '*' },

	  # A single dot with a non-dot character on either side is removed.
	  { regexp: /(?<=[^\.])\.{1}(?=[^\.])/, replacement: '' }
	]

	# Apply every rule to the dump in place.
	parser.each { |rule| dumped.gsub!(rule[:regexp], rule[:replacement]) }

	################################################################################
	# Tokenize that shit
	################################################################################
	# Split the cleaned dump into tokens, one character at a time.
	# A token ends at the first dot that follows a non-dot character. Tokens of
	# two or more characters are kept (a bare "http" becomes "http://"); shorter
	# ones and all characters outside the accepted set are collected in @rejected.
	last_char = ""
	@token    = ""
	@tokens   = []
	@rejected = []

	# Files the token under construction and starts a new one.
	flush_token = lambda do
	  if @token.length > 1
	    @tokens << (@token == "http" ? "http://" : @token)
	  else
	    @rejected << @token
	  end
	  @token = ""
	end

	dumped.each_char do |raw|
	  char = raw.strip # whitespace becomes the empty string

	  flush_token.call if char == "." && last_char != "."

	  # NOTE(review): inside a character class '|' is a literal pipe, so pipe
	  # characters are accepted into tokens as well -- confirm this is intended.
	  if char =~ /[\w|\-|\+|&|\=|\?]/
	    @token << char
	  elsif char != "."
	    @rejected << char
	  end
	  last_char = char
	end

	# A dump that does not end in a dot would otherwise lose its last token.
	flush_token.call unless @token.empty?

	################################################################################
	# Iterate through tokens to create URL's and newlines
	################################################################################

	# Group the tokens into output entries. Tokens from an "http://" marker up to
	# the next http-prefixed token or segment marker are collected into an array
	# (one URL). Outside a URL a segment marker becomes a horizontal rule, tokens
	# containing "IADefault" are boxed and everything else is passed through.
	@words = []
	is_url = false
	tmp = []

	@tokens.each do |token|
	  if token =~ /^http/ && is_url
	    # a new URL starts while one is being collected: emit the finished one
	    @words << tmp
	    tmp = []
	  elsif token == "SKHIP-PARSER-SEGMENT" && is_url
	    # a segment marker ends the URL being collected
	    @words << tmp
	    tmp = []
	    is_url = false
	  elsif token == "http://"
	    is_url = true
	  end

	  if is_url
	    tmp << token
	  elsif token == "SKHIP-PARSER-SEGMENT"
	    @words << options.line
	  elsif token =~ /IADefault/
	    @words << box_me_up(token)
	  else
	    @words << token
	  end
	end

	# Emit a URL that was still being collected when the tokens ran out;
	# without this the last URL of the dump is silently lost.
	@words << tmp unless tmp.empty?

	################################################################################
	# Print it out
	################################################################################

	# Print every entry. URL fragments (arrays) are joined with dots and the
	# first dot of the result is removed; strings are printed unchanged.
	# (The block parameter had been mangled to `\|word\|`, which is not valid
	# Ruby; it is a plain block parameter here.)
	@words.each do |entry|
	  if entry.is_a?(Array)
	    puts entry.join('.').sub('.', '')
	  else
	    puts entry
	  end
	end

	elapsed = (Time.now - TIME).round(2)
	puts "\n\nFinished! Parsing took #{elapsed} seconds\n\n"