Created
December 2, 2012 06:50
-
-
Save arn-e/4187393 to your computer and use it in GitHub Desktop.
text_analysis_sandbox
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
require 'uri'
require 'net/http'
require 'net/https'
require 'json'
require 'tf_idf'
# Fetches the closed issues of the pengwynn/octokit repository from the
# GitHub API and returns the decoded JSON.
#
# NOTE: despite its name, this method does not obtain an access token; the
# name is kept so existing callers keep working.
#
# @return [Object] whatever JSON.parse yields for the response body
#   (presumably an Array of issue Hashes on success -- confirm against the API)
# @raise [JSON::ParserError] if the response body is not valid JSON
def get_access_token
  request, http = set_connection_parameters("https://api.github.com/repos/pengwynn/octokit/issues?state=closed", 443)
  response = http.request(request)
  JSON.parse(response.body)
end
# Builds a GET request for +url+ together with the Net::HTTP object that
# should send it. Nothing is sent here; the caller runs http.request(request).
#
# @param url [String] absolute URL to fetch
# @param port [Integer] port to connect to; it replaces any port given in
#   +url+. Passing 443 switches the connection to TLS.
# @return [Array(Net::HTTP::Get, Net::HTTP)] the request and the connection
def set_connection_parameters(url, port = 80)
  uri = URI.parse(url)
  uri.port = port
  http = Net::HTTP.new(uri.host, uri.port)
  if port == 443
    http.use_ssl = true
    # Verify the server certificate. VERIFY_NONE would accept any
    # certificate and leave the connection open to interception.
    http.verify_mode = OpenSSL::SSL::VERIFY_PEER
  end
  [Net::HTTP::Get.new(uri.request_uri), http]
end
# Writes one "<index> ::: <title> ::: <body>" record per element of +data+
# to temp_data_file.txt in the current directory, replacing any previous
# content.
#
# Each record starts on its own line. Without a separator the last word of
# one body would fuse with the next record's index, and the line-based
# readers in this file would see wrong record boundaries.
#
# @param data [Enumerable] elements that respond to ["title"] and ["body"]
# @return [nil]
def write_file(data)
  # Block form closes the file even if an element raises part-way through.
  File.open("temp_data_file.txt", "w") do |file|
    data.each_with_index do |issue, idx|
      file.puts("#{idx} ::: #{issue["title"]} ::: #{issue["body"]}")
    end
  end
  nil
end
# Opens temp_data_file.txt (in the current directory) for reading.
#
# The handle is returned open; the caller is responsible for closing it.
#
# @return [File] read-only handle positioned at the start of the file
# @raise [Errno::ENOENT] if the file does not exist
def read_file
  File.open("temp_data_file.txt", "r")
end
# Counts how often each whitespace-separated token occurs in the file
# returned by read_file, ignoring the ":::" field separator, and prints the
# [word, count] pairs in ascending order of count.
#
# @return [Array<Array(String, Integer)>] the printed pairs, least frequent first
def most_frequent_words
  file = read_file
  word_hash = Hash.new(0)
  begin
    file.each do |line|
      line.split(' ').each do |word|
        word_hash[word] += 1 unless word == ":::"
      end
    end
  ensure
    # read_file hands back an open handle; release it once counting is done.
    file.close
  end
  freq = word_hash.sort_by { |_word, count| count }
  freq.each { |pair| p pair }
end
# Scores the words of the file returned by read_file with TfIdf and prints
# each word with its highest score, lowest score first.
#
# Every line of the file becomes one document: its whitespace-separated
# tokens with the ":::" field separator removed.
#
# @return [Array<Array>] the printed [word, score] pairs in ascending score order
def word_importance
  file = read_file
  document = []
  begin
    file.each do |line|
      document << line.split(' ').reject { |token| token == ":::" }
    end
  ensure
    # read_file hands back an open handle; release it once the lines are read.
    file.close
  end

  # NOTE(review): TfIdf comes from the tf_idf gem; #tf_idf presumably yields
  # one word => score Hash per document -- confirm against the gem.
  combined = Hash.new(0)
  TfIdf.new(document).tf_idf.each do |scores|
    scores.each do |term, score|
      # Keep the best score per word. Because the default is 0, words whose
      # score is never positive are left out of the result.
      combined[term] = score if score > combined[term]
    end
  end

  combined = combined.sort_by { |_term, score| score }
  combined.each { |pair| p pair }
end
# Script entry point: score and print the words stored in temp_data_file.txt.
word_importance
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment