Created
May 19, 2015 01:15
-
-
Save ibanez270dx/ff7a66e26e9be9ea8357 to your computer and use it in GitHub Desktop.
SKHIP work in progress
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
require 'io/console'
require 'optparse'
require 'ostruct'
require 'fileutils'

# Program name shown in the usage banner and by --version.
NAME = "Safari Keyword History Index Parser"
# Version string shown in the usage banner and by --version.
VERSION = "v0.0.1"
# Script start time; used for timestamped file names and the elapsed-time report.
TIME = Time.now
# Draws a three-line box spanning the terminal width with +str+ centred in it.
#
# @param str [String] text to place inside the box
# @param columns [Integer, nil] total box width; when nil the width of the
#   terminal attached to STDOUT is used, falling back to 80 columns if STDOUT
#   is not a terminal (piped or redirected output)
# @return [String] the box, terminated by a newline
def box_me_up(str, columns = nil)
  columns ||= begin
    STDOUT.winsize[-1].to_i
  rescue SystemCallError, NoMethodError
    # winsize raises (e.g. Errno::ENOTTY) when STDOUT is not a TTY, and is
    # missing altogether if io/console was not loaded.
    80
  end
  columns = 80 unless columns.positive?

  # Two columns are taken by the corner glyphs, four by "║ " and " ║".
  border = '═' * [columns - 2, 0].max
  label  = str.center(columns - 4, ' ')
  "╔#{border}╗\n║ #{label} ║\n╚#{border}╝\n"
end
# Prints the usage banner followed by an error message, then terminates the
# process with exit status 1.
#
# Output goes to standard error so it does not mix with parsed results on
# standard output.
#
# @param error [Exception, String, nil] what to report; defaults to the
#   exception currently being handled ($!)
# @raise [SystemExit] always
def show_error_and_exit(error = $!)
  # The parser may not exist yet if the failure happened very early.
  $stderr.puts @option_parser.banner if @option_parser
  $stderr.puts " #{error}\n use --help for more information\n\n"
  exit 1
end
################################################################################
# Command Line Options
################################################################################

# Terminal width for the separator line; falls back to 80 columns when STDOUT
# is not a terminal (winsize raises in that case).
terminal_columns = begin
  STDOUT.winsize[-1].to_i
rescue SystemCallError, NoMethodError
  80
end

# We set default values here.
options = OpenStruct.new
options.name     = "HistoryIndex"
options.path     = "/Users/#{ENV['USER']}/Library/Safari/"
options.time     = TIME.strftime("%Y%m%d%H%M%S")
options.line     = '─' * terminal_columns
options.stdo     = true
options.input    = "#{options.name}.sk"
options.output   = "#{options.name}-#{options.time}.txt"
options.backup   = "#{options.name}-#{options.time}.backup"
options.dump     = "#{options.name}-#{options.time}.dump"
# Set by -d/--use-dump: an existing dump should be read instead of created.
options.use_dump = false

# Start parsing those options
@option_parser = OptionParser.new do |opts|
  opts.program_name = NAME
  opts.version      = VERSION
  opts.banner       = "#{box_me_up("#{NAME} #{VERSION}")}\nUsage: ruby skhip.rb [options]\n\n"

  opts.separator " Specific options:"

  opts.on "-i", "--input FILE", "Path to HistoryIndex.sk" do |input|
    # Fail early (Errno::*) if the file cannot be opened; the block form
    # closes the handle again instead of leaking it.
    File.open(input) { }
    options.input  = input
    # basename also works for paths without a directory part, where the old
    # look-behind regex returned nil.
    options.name   = File.basename(input, ".*")
    options.output = "#{options.name}-#{options.time}.txt"
  end

  opts.separator ""

  opts.on "-o", "--output FILE", "Relative output location" do |output|
    options.output = output
  end

  opts.separator ""

  opts.on "-d", "--use-dump [FILE]",
          "Skip dumping process by specifying an existing dump file.",
          "Leave blank to use default path." do |dump|
    options.use_dump = true
    # With no FILE given the default dump path is kept, as the help text says.
    if dump
      File.open(dump) { }
      options.dump = dump
    end
  end

  opts.separator ""

  opts.on_tail("-h", "--help", "What you're looking at :P") do
    puts opts
    exit
  end

  opts.on_tail("--version", "Show version") do
    puts opts.program_name
    puts opts.version
    exit
  end
end

begin
  @option_parser.parse!
  raise OptionParser::ParseError, "arguments provided without switches!" unless ARGV.empty?
rescue StandardError
  # Only real errors are reported. Writing `rescue show_error_and_exit` made
  # the call the exception-class list, so it also ran for the SystemExit
  # raised by --help/--version and turned a clean exit into an error.
  show_error_and_exit
end
################################################################################
# Setup
################################################################################
begin
  # When an existing dump was requested (-d) there is nothing to copy or dump.
  unless options.use_dump
    # Make a copy of the HistoryIndex.sk file
    FileUtils.cp options.input, options.backup

    # Dump it to ASCII chars. xxd reads the backup that was just written.
    # Arguments are passed as a list so paths are never interpreted by a
    # shell, and the exit status is checked (backticks never raise on failure).
    dumped_ok = system("xxd", "-b", "-c", "10", options.backup, out: [options.dump, "a"])
    raise "xxd failed while dumping #{options.backup}" unless dumped_ok
  end

  # Read in all the fragments: the last whitespace-separated field of every
  # dump line.
  # NOTE(review): if xxd's text column itself contains a space, only the part
  # after the last space is kept - confirm this is intended.
  @acc = IO.foreach(options.dump).map { |input| input.split(' ').last }
rescue StandardError
  show_error_and_exit
end

# Make it one big string
dumped = @acc.join
################################################################################
# Parsing
################################################################################
# Ordered list of substitutions applied to the dump, each a regexp plus the
# text that replaces every match.
parser = [
  # A run of 256 or more dots (preceded by a non-dot, and followed by a
  # character that is none of '.', '|', 'I', 'A') becomes a segment marker.
  { regexp: /(?<=[^\.])\.{256,}(?=[^\.|IA])/, replacement: '...[SKHIP-PARSER-SEGMENT]...' },
  # There seems to be some code in the dump; besides the URLs it is the only
  # place with single dots. Everything from a '~' through the last "big" is
  # collapsed into '*'.
  { regexp: /~(.)+big/, replacement: '*' },
  # Drop a lone dot sitting between two non-dot characters.
  { regexp: /(?<=[^\.])\.{1}(?=[^\.])/, replacement: '' }
]

# Execute! Rules run in order and modify the dump in place.
parser.each do |rule|
  # puts rule.inspect
  dumped.gsub!(rule[:regexp], rule[:replacement])
end
################################################################################
# Tokenize the cleaned-up dump
################################################################################
# Characters are accumulated into @token until the first dot of a dot run is
# seen. Tokens longer than one character go to @tokens (a bare "http" becomes
# "http://"); everything else, including stray characters, goes to @rejected.
last_char = ""
@token    = ""
@tokens   = []
@rejected = []

# Files the current token under @tokens or @rejected and starts a new one.
flush_token = lambda do
  if @token.length > 1
    @tokens << (@token == "http" ? "http://" : @token)
  else
    @rejected << @token
  end
  @token = ""
end

dumped.each_char do |raw|
  char = raw.strip # whitespace becomes "" and ends up in @rejected

  # The first dot after a non-dot marks the end of a word.
  flush_token.call if char == "." && last_char != "."

  if char =~ /[\w|\-|\+|&|\=|\?]/
    @token << char
  elsif char != "."
    @rejected << char
  end
  last_char = char
end

# The input may not end with a dot; do not lose the token still being built.
flush_token.call unless @token.empty?
################################################################################
# Iterate through tokens to create URL's and newlines
################################################################################
# @words receives, in order: plain tokens (String), URLs (Array of the tokens
# that make up one URL), a separator line for every segment marker and a boxed
# header for tokens containing "IADefault".
@words = []
is_url = false # true while the tokens of a URL are being collected
tmp    = []    # tokens of the URL currently being collected

@tokens.each do |token|
  segment = token == "SKHIP-PARSER-SEGMENT"

  if is_url && (segment || token =~ /^http/)
    # A new URL or a segment marker closes the URL collected so far.
    @words << tmp
    tmp = []
    is_url = false if segment
  elsif token == "http://"
    is_url = true
  end

  if is_url
    tmp << token
  elsif segment
    @words << options.line
  elsif token =~ /IADefault/
    @words << box_me_up(token)
  else
    @words << token
  end
end

# A URL still being collected when the tokens run out would otherwise be lost.
@words << tmp if is_url && !tmp.empty?
################################################################################
# Print it out
################################################################################
@words.each do |word|
  if word.is_a?(Array)
    # URL parts are joined with dots; the first dot (the one right after
    # "http://") is then removed again.
    puts word.join('.').sub('.','')
  else
    puts word
  end
end

elapsed = (Time.now - TIME).round(2)
puts "\n\nFinished! Parsing took #{elapsed} seconds\n\n"
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
require 'io/console'
require 'optparse'
require 'ostruct'
require 'fileutils'

# Program name shown in the usage banner and by --version.
NAME = "Safari Keyword History Index Parser"
# Version string shown in the usage banner and by --version.
VERSION = "v0.0.1"
# Script start time; used for timestamped file names and the elapsed-time report.
TIME = Time.now
# Draws a three-line box spanning the terminal width with +str+ centred in it.
#
# @param str [String] text to place inside the box
# @param columns [Integer, nil] total box width; when nil the width of the
#   terminal attached to STDOUT is used, falling back to 80 columns if STDOUT
#   is not a terminal (piped or redirected output)
# @return [String] the box, terminated by a newline
def box_me_up(str, columns = nil)
  columns ||= begin
    STDOUT.winsize[-1].to_i
  rescue SystemCallError, NoMethodError
    # winsize raises (e.g. Errno::ENOTTY) when STDOUT is not a TTY, and is
    # missing altogether if io/console was not loaded.
    80
  end
  columns = 80 unless columns.positive?

  # Two columns are taken by the corner glyphs, four by "║ " and " ║".
  border = '═' * [columns - 2, 0].max
  label  = str.center(columns - 4, ' ')
  "╔#{border}╗\n║ #{label} ║\n╚#{border}╝\n"
end
# Prints the usage banner followed by an error message, then terminates the
# process with exit status 1.
#
# Output goes to standard error so it does not mix with parsed results on
# standard output.
#
# @param error [Exception, String, nil] what to report; defaults to the
#   exception currently being handled ($!)
# @raise [SystemExit] always
def show_error_and_exit(error = $!)
  # The parser may not exist yet if the failure happened very early.
  $stderr.puts @option_parser.banner if @option_parser
  $stderr.puts " #{error}\n use --help for more information\n\n"
  exit 1
end
################################################################################
# Command Line Options
################################################################################

# Terminal width for the separator line; falls back to 80 columns when STDOUT
# is not a terminal (winsize raises in that case).
terminal_columns = begin
  STDOUT.winsize[-1].to_i
rescue SystemCallError, NoMethodError
  80
end

# We set default values here.
options = OpenStruct.new
options.name     = "HistoryIndex"
options.path     = "/Users/#{ENV['USER']}/Library/Safari/"
options.time     = TIME.strftime("%Y%m%d%H%M%S")
options.line     = '─' * terminal_columns
options.stdo     = true
options.input    = "#{options.name}.sk"
options.output   = "#{options.name}-#{options.time}.txt"
options.backup   = "#{options.name}-#{options.time}.backup"
options.dump     = "#{options.name}-#{options.time}.dump"
# Set by -d/--use-dump: an existing dump should be read instead of created.
options.use_dump = false

# Start parsing those options
@option_parser = OptionParser.new do |opts|
  opts.program_name = NAME
  opts.version      = VERSION
  opts.banner       = "#{box_me_up("#{NAME} #{VERSION}")}\nUsage: ruby skhip.rb [options]\n\n"

  opts.separator " Specific options:"

  opts.on "-i", "--input FILE", "Path to HistoryIndex.sk" do |input|
    # Fail early (Errno::*) if the file cannot be opened; the block form
    # closes the handle again instead of leaking it.
    File.open(input) { }
    options.input  = input
    # basename also works for paths without a directory part, where the old
    # look-behind regex returned nil.
    options.name   = File.basename(input, ".*")
    options.output = "#{options.name}-#{options.time}.txt"
  end

  opts.separator ""

  opts.on "-o", "--output FILE", "Relative output location" do |output|
    options.output = output
  end

  opts.separator ""

  opts.on "-d", "--use-dump [FILE]",
          "Skip dumping process by specifying an existing dump file.",
          "Leave blank to use default path." do |dump|
    options.use_dump = true
    # With no FILE given the default dump path is kept, as the help text says.
    if dump
      File.open(dump) { }
      options.dump = dump
    end
  end

  opts.separator ""

  opts.on_tail("-h", "--help", "What you're looking at :P") do
    puts opts
    exit
  end

  opts.on_tail("--version", "Show version") do
    puts opts.program_name
    puts opts.version
    exit
  end
end

begin
  @option_parser.parse!
  raise OptionParser::ParseError, "arguments provided without switches!" unless ARGV.empty?
rescue StandardError
  # Only real errors are reported. Writing `rescue show_error_and_exit` made
  # the call the exception-class list, so it also ran for the SystemExit
  # raised by --help/--version and turned a clean exit into an error.
  show_error_and_exit
end
################################################################################
# Setup
################################################################################
begin
  # When an existing dump was requested (-d) there is nothing to copy or dump.
  unless options.use_dump
    # Make a copy of the HistoryIndex.sk file
    FileUtils.cp options.input, options.backup

    # Dump it to ASCII chars. xxd reads the backup that was just written.
    # Arguments are passed as a list so paths are never interpreted by a
    # shell, and the exit status is checked (backticks never raise on failure).
    dumped_ok = system("xxd", "-b", "-c", "10", options.backup, out: [options.dump, "a"])
    raise "xxd failed while dumping #{options.backup}" unless dumped_ok
  end

  # Read in all the fragments: the last whitespace-separated field of every
  # dump line.
  # NOTE(review): if xxd's text column itself contains a space, only the part
  # after the last space is kept - confirm this is intended.
  @acc = IO.foreach(options.dump).map { |input| input.split(' ').last }
rescue StandardError
  show_error_and_exit
end

# Make it one big string
dumped = @acc.join
################################################################################
# Parsing
################################################################################
# Ordered list of substitutions applied to the dump, each a regexp plus the
# text that replaces every match.
parser = [
  # Make a big line-break marker where there are a lot of dots: a run of 256
  # or more dots (preceded by a non-dot, and followed by a character that is
  # none of '.', '|', 'I', 'A') becomes a segment marker.
  { regexp: /(?<=[^\.])\.{256,}(?=[^\.|IA])/, replacement: '...[SKHIP-PARSER-SEGMENT]...' },
  # There seems to be some code in the dump; besides the URLs it is the only
  # place with single dots. Everything from a '~' through the last "big" is
  # collapsed into '*'.
  { regexp: /~(.)+big/, replacement: '*' },
  # Drop a lone dot sitting between two non-dot characters.
  { regexp: /(?<=[^\.])\.{1}(?=[^\.])/, replacement: '' }
]

# Disabled experiments, kept for reference:
# replace sets of 3 dots with a single dot
# parser << { regexp: /(?<=[^\.])\.{3}(?=[^\.])/, replacement: ',' }
# remove single non-word characters (between two dots)
# parser << { regexp: /(?<=\.)[](?=\.)/, replacement: '..' }
# remove single "stand-alone" characters
# parser << { regexp: /(?:\.{2,}|\n)[^-\.]{1,2}(?=\.{2,}|\n)/, replacement: '' }
# gonna assume that URL's HTTP part needs some slashes
# parser << { regexp: /http\.\.(?=[\w])/, replacement: 'http://' }
# put a newline before each URL
# parser << { regexp: /(\/?\.{1,})(?=https?)/, replacement: " " }
# Make a big line-break marker if there are a lot of dots
# parser << { regexp: /(?<=[^\.])\.{256,}(?=[^\.|IA])/, replacement: options.line }
# clean up the ends of the URLs
# parser << { regexp: /\.{2,}(\w|{|}|\\|\d|;)+\n/, replacement: "\n" }
# replace all dot sequences longer than one with a comma
# parser << { regexp: /\.{2,}/, replacement: "," }
# add a newline after "html"... just makes things easier :P
# parser << { regexp: /(?<=html)()[^\n]/, replacement: "\n" }
# surround the headers with a border
# parser << { regexp: /()(?=IA\w+)/, replacement: options.line }
# parser << { regexp: /(?:IADefault)(?:I\w+|T\w+)(\n)/, replacement: options.line }
# Put line breaks in the remaining word blocks
# parser << { regexp: //, replacement: "\n" }

# Execute! Rules run in order and modify the dump in place.
parser.each do |rule|
  # puts rule.inspect
  dumped.gsub!(rule[:regexp], rule[:replacement])
end

# Debug output: the dump after all substitutions.
puts dumped
################################################################################
# Tokenize the cleaned-up dump
################################################################################
# Characters are accumulated into @token until the first dot of a dot run is
# seen. Tokens longer than one character go to @tokens (a bare "http" becomes
# "http://"); everything else, including stray characters, goes to @rejected.
last_char = ""
@token    = ""
@tokens   = []
@rejected = []

# Files the current token under @tokens or @rejected and starts a new one.
flush_token = lambda do
  if @token.length > 1
    @tokens << (@token == "http" ? "http://" : @token)
  else
    @rejected << @token
  end
  @token = ""
end

dumped.each_char do |raw|
  char = raw.strip # whitespace becomes "" and ends up in @rejected

  # The first dot after a non-dot marks the end of a word.
  flush_token.call if char == "." && last_char != "."

  if char =~ /[\w|\-|\+|&|\=|\?]/
    @token << char
  elsif char != "."
    @rejected << char
  end
  last_char = char
end

# The input may not end with a dot; do not lose the token still being built.
flush_token.call unless @token.empty?

# Debug output: the accepted tokens.
puts @tokens.inspect
# puts @rejected.inspect
################################################################################
# Iterate through tokens to create URL's and newlines
################################################################################
# @words receives, in order: plain tokens (String), URLs (Array of the tokens
# that make up one URL), a separator line for every segment marker and a boxed
# header for tokens containing "IADefault".
@words = []
is_url = false # true while the tokens of a URL are being collected
tmp    = []    # tokens of the URL currently being collected

@tokens.each do |token|
  segment = token == "SKHIP-PARSER-SEGMENT"

  if is_url && (segment || token =~ /^http/)
    # A new URL or a segment marker closes the URL collected so far.
    @words << tmp
    tmp = []
    is_url = false if segment
  elsif token == "http://"
    is_url = true
  end

  if is_url
    tmp << token
  elsif segment
    @words << options.line
  elsif token =~ /IADefault/
    @words << box_me_up(token)
  else
    @words << token
  end
end

# A URL still being collected when the tokens run out would otherwise be lost.
@words << tmp if is_url && !tmp.empty?

# Debug output: the assembled word list.
puts @words.inspect

@words.each do |word|
  # URL parts are joined with dots; the first dot (the one right after
  # "http://") is then removed again.
  w = word.is_a?(Array) ? word.join('.').sub('.','') : word
  puts w
end
################################################################################
# Filter Weird Artifacts (currently disabled)
################################################################################
# artifacts = []
# collect regexp's and their corresponding replacements
# artifacts << { regexp: /\.\=\=/, replacement: '' }
# artifacts << { regexp: /http0/, replacement: 'http:' }
# artifacts << /http\n.+\n/
# artifacts << /z\.{+\n/
# artifacts << /E\.F\.\w/
# artifacts << /Bud2/
# artifacts << /.?\.["|-]/
# Execute!
# artifacts.each do |artifact|
#   regexp = Regexp.new artifact
#   words.match(regexp).to_a.each do |match|
#     (@removals || @removals=[]) << match.to_s
#   end
#   words.gsub! regexp, ''
# end
# Remove double spaces
# words.gsub!(/\n{2,}/,"\n")
#
# artifacts.each do |a|
#   dumped.gsub!(r[:regexp], r[:replacement])
# end
# puts words
# puts box_me_up('Artifact Removals:')
# @removals.each { |x| puts x }
# output = File.open("history_index_output.txt", 'w+')
# words.each do |word|
#   output.puts word
# end
#
# output.close

# Report how long the whole run took.
elapsed = (Time.now - TIME).round(2)
puts "\n\nFinished! Parsing took #{elapsed} seconds\n\n"
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment