Created
February 8, 2015 21:10
-
-
Save epitron/f1d1ea99d39838dfe1a3 to your computer and use it in GitHub Desktop.
Uses the CodeRay JavaScript parser to extract all the strings in JavaScript source code.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
require 'coderay' | |
require 'pp' | |
require 'epitools' | |
#
# Example tokenization:
#
# [
#   ["toggle", :ident], ["(", :operator],
#   [:begin_group, :string],
#   ["'", :delimiter], ["clue_DJ_6_5", :content], ["'", :delimiter],
#   [:end_group, :string],
#   [",", :operator], [" ", :space],
#   [:begin_group, :string],
#   ["'", :delimiter], ["clue_DJ_6_5_stuck", :content], ["'", :delimiter],
#   [:end_group, :string],
#   [",", :operator], [" ", :space],
#   [:begin_group, :string], ["'", :delimiter], ["After the physicist who discovered X-rays, it", :content], ["\\'", :char], ["s another name for a doctor who interprets X-rays", :content], ["'", :delimiter],
#   [:end_group, :string],
#   [")", :operator]
# ]
# Maps the text of a :char token (an escape sequence found inside a string
# literal) to the character it stands for. Only the escaped single quote is
# listed. Frozen so the shared lookup table cannot be modified at runtime.
CHAR_TRANSLATE = { "\\'" => "'" }.freeze
# Tokenizes the JavaScript source +js+ with CodeRay's JavaScript scanner and
# returns whatever parse_toks extracts from the token stream.
#
# The scanner output is consumed two elements at a time, so parse_toks
# receives one [token, type] pair per iteration.
#
# @param js [String] JavaScript source code
# @return [Array<String>] the result of parse_toks for the token pairs
def extract_strings_from_javascript(js)
  scanner = CodeRay::Scanners::JavaScript.new
  pairs = scanner.tokenize(js).each_slice(2)
  # pp pairs.to_a
  parse_toks(pairs)
end
# Collects the contents of every :string group in a stream of token pairs.
#
# A string starts at a [:begin_group, :string] pair and ends at the matching
# [:end_group, :string] pair. Between the two, :content tokens are appended
# verbatim, :char tokens are translated through CHAR_TRANSLATE (and kept
# as-is when no translation is listed), and :delimiter tokens are dropped.
# Any other token type seen inside a string is reported on standard output.
# Tokens outside of string groups are ignored.
#
# @param toks [Enumerable] yields [token, type] pairs
# @return [Array<String>] one entry per completed string group, in order
def parse_toks(toks)
  result = []
  current = []
  in_string = false

  toks.each do |tok, type|
    in_string = true if tok == :begin_group && type == :string
    next unless in_string

    case type
    when :content
      current << tok
    when :char
      # fetch with a default: `current << CHAR_TRANSLATE[tok] || tok` would
      # bind as `(current << CHAR_TRANSLATE[tok]) || tok` and push nil for
      # escapes that have no translation.
      current << CHAR_TRANSLATE.fetch(tok, tok)
    when :delimiter, :string
      # skip: quote characters and the group markers carry no text
    else
      puts "Unknown token type: #{type}"
    end

    next unless tok == :end_group && type == :string

    result << current.join
    current.clear
    in_string = false
  end

  result
end
# Demo: when this file is executed directly, extract the strings from two
# sample JavaScript calls and print each extracted string on its own line.
if $PROGRAM_NAME == __FILE__
  samples = [
    "toggle('clue_DJ_6_5', 'clue_DJ_6_5_stuck', 'After the physicist who discovered X-rays, it\\'s another name for a doctor who interprets X-rays')",
    "toggle('clue_DJ_6_5', 'clue_DJ_6_5_stuck', '<em class=\"correct_response\">a roentgenologist</em><br /><br /><table width=\"100%\"><tr><td class=\"right\">Justin</td></tr></table>')"
  ]

  samples.each do |source|
    extracted = extract_strings_from_javascript(source)
    extracted.each { |text| puts text }
    # Leftover debugging aid (`toks` is not defined in this scope):
    # p toks.map { |tok, type| tok if type == :content }.compact
  end
end
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment