jarsen · October 10, 2012 18:03
diff --git a/reddit_miner.rb b/reddit_miner.rb
 #!/usr/bin/env ruby -wKU

 require 'net/http'
 require 'json'

 # Lowercase English function words (plus the chat abbreviations "u" and "ur")
 # that word_frequency leaves out of its histogram. Contractions keep their
 # apostrophe. Frozen so the shared list cannot be modified by accident.
 STOPWORDS = %w(
 a about above after again against all am an and any are aren't as at be because been before
 being below between both but by can't cannot could couldn't did didn't do does doesn't doing
 don't down during each few for from further had hadn't has hasn't have haven't having he he'd
 he'll he's her here here's hers herself him himself his how how's i i'd i'll i'm i've if in
 into is isn't it it's its itself let's me more most mustn't my myself no nor not of off on once
 only or other ought our ours ourselves out over own same shan't she she'd she'll she's should
 shouldn't so some such than that that's the their theirs them themselves then there there's
 these they they'd they'll they're they've this those through to too u under until up ur very
 was wasn't we we'd we'll we're we've were weren't what what's when when's where where's which
 while who who's whom why why's with won't would wouldn't you you'd you'll you're you've your
 yours yourself yourselves will).freeze

 # Builds a word histogram for +text+.
 #
 # The text is lowercased, the punctuation ? . ! ( ) " , is removed and the
 # result is split on whitespace. Tokens listed in STOPWORDS are skipped.
 # Apostrophes are removed only after the stopword lookup, so a contraction
 # such as "don't" matches its STOPWORDS entry instead of being counted as
 # the word "dont".
 #
 # text - the String to analyse.
 #
 # Returns a Hash (default value 0) mapping every remaining word, written
 # without apostrophes, to its number of occurrences.
 #
 # TODO group synonyms together in histogram using treat gem
 def word_frequency text
 	text.gsub(/[?.!()",]/, "").downcase.split.each_with_object(Hash.new(0)) do |token, counts|
 		next if STOPWORDS.include?(token)
 		word = token.delete("'")
 		# A token may consist of apostrophes only, or be a quoted stopword ('the').
 		next if word.empty? || STOPWORDS.include?(word)
 		counts[word] += 1
 	end
 end

 # Ranks the words counted by word_frequency from most to least frequent.
 #
 # text - the String handed on to word_frequency.
 #
 # Returns an Array of words ordered by descending count, or [""] when
 # word_frequency counts nothing (for example, the text is all stopwords).
 def words_by_frequency text
 	histogram = word_frequency(text)
 	if histogram.empty?
 		[""]
 	else
 		histogram.sort_by { |_word, occurrences| -occurrences }.map(&:first)
 	end
 end

 # Host name that every listing request is sent to.
 HOST = 'www.reddit.com'
 # Value sent as the "limit" query parameter of every listing request.
 LIMIT = 100

 # Downloads one listing and returns the data of its posts.
 #
 # api_url - path prefix of the listing ("" and "/new" are used in this
 #           script); "/.json?limit=LIMIT" is appended to it.
 #
 # Returns an Array holding the 'data' value of every entry found under
 # ['data']['children'] of the parsed response body.
 # Raises the error produced by Net::HTTPResponse#value when the response
 # status is not 2xx, and JSON::ParserError when the body is not valid JSON.
 def fetch_posts api_url
 	url = api_url + "/.json?limit=#{LIMIT}"
 	response = Net::HTTP.get_response(HOST, url)
 	# Fail here with the HTTP status instead of later while parsing an error page.
 	response.value
 	listing = JSON.parse(response.body)
 	listing['data']['children'].map { |child| child['data'] }
 end

 hot_posts = fetch_posts ""
 new_posts = fetch_posts "/new"

 # One row per fetched post. The column order has to match the @ATTRIBUTE
 # lines written to the ARFF file; the last column is the class (hot or not).
 data = []
 [[hot_posts, true], [new_posts, false]].each do |posts, hot|
 	posts.each do |post_json|
 		title = post_json['title']
 		selftext = post_json['selftext']
 		words = words_by_frequency(title + " " + selftext)

 		data << [
 			# score related
 			post_json['ups'],
 			post_json['downs'],
 			post_json['score'],
 			post_json['num_comments'],

 			# post information
 			post_json['subreddit'],
 			post_json['domain'], # the domain wherever the url is pointing
 			post_json['author'],
 			post_json['created_utc'], # the UTC timestamp
 			post_json['over_18'], # mature content
 			# Has a thumbnail image? True when the field is non-empty.
 			# NOTE(review): any non-image placeholder value in this field would
 			# still count as a thumbnail - confirm against the API.
 			!post_json['thumbnail'].empty?,

 			# text
 			title.length, # title length
 			selftext.length, # the length of the text for the post
 			# lexical diversity (number of different words); words_by_frequency
 			# returns [""] when nothing was counted, which must count as 0
 			words.reject(&:empty?).count,
 			words.first, # most common word, "" when there is none

 			# class
 			hot # Hot post? True, False
 		]
 	end
 	# to get more info from comments... hot_post['permalink']+'.json?limit=100'
 end

 # Formats one cell of a data row for the @DATA section.
 #
 # value - a cell taken from a row of data (number, boolean, String or nil).
 #
 # Returns "?" (the ARFF missing-value marker) for nil and for empty strings,
 # a single-quoted literal with backslashes and single quotes escaped for any
 # other String, and value.to_s for everything else.
 def arff_value value
 	return "?" if value.nil? || value == ""
 	return value.to_s unless value.is_a?(String)
 	"'" + value.gsub(/[\\']/) { |char| "\\" + char } + "'"
 end

 # Writes the collected rows to "reddit data <time>.arff" in the current
 # directory: a comment header, the relation name, one @ATTRIBUTE line per
 # column of a row, then one @DATA line per row.
 time = Time.now
 File.open("reddit data #{time}.arff", 'w') do |file|
 	file.puts "% Reddit Mining Project Data"
 	file.puts "% CS 478"
 	file.puts "% Group: Jason Larsen, Tyler Coleman, Kiersten Devenish, Tae Woo Kim"
 	file.puts "% Gathered #{time}"
 	file.puts

 	file.puts "@RELATION reddit"
 	file.puts

 	file.puts "\t@ATTRIBUTE ups 					NUMERIC"
 	file.puts "\t@ATTRIBUTE downs 				NUMERIC"
 	file.puts "\t@ATTRIBUTE score 				NUMERIC"
 	file.puts "\t@ATTRIBUTE num_comments 		NUMERIC"
 	file.puts "\t@ATTRIBUTE subreddit 			STRING"
 	file.puts "\t@ATTRIBUTE domain	 			STRING"
 	file.puts "\t@ATTRIBUTE author 				STRING"
 	file.puts "\t@ATTRIBUTE created_utc			NUMERIC"
 	file.puts "\t@ATTRIBUTE over_18				{true, false}"
 	file.puts "\t@ATTRIBUTE thumbnail			{true, false}"
 	file.puts "\t@ATTRIBUTE title_length 		NUMERIC"
 	file.puts "\t@ATTRIBUTE selftext_length		NUMERIC"
 	file.puts "\t@ATTRIBUTE lexical_diversity 	NUMERIC"
 	file.puts "\t@ATTRIBUTE most_common_word 	STRING"
 	file.puts "\t@ATTRIBUTE hot 					{true, false}"


 	file.puts "\n@DATA"
 	data.each do |row|
 		# Quote strings and mark missing cells so every row stays parseable.
 		file.puts "\t" + row.map { |value| arff_value(value) }.join(', ')
 	end
 end
	#!/usr/bin/env ruby -wKU

	require 'net/http'
	require 'json'

	# Lowercase English function words (plus the chat abbreviations "u" and "ur")
	# that word_frequency leaves out of its histogram. Contractions keep their
	# apostrophe. Frozen so the shared list cannot be modified by accident.
	STOPWORDS = %w(
	a about above after again against all am an and any are aren't as at be because been before
	being below between both but by can't cannot could couldn't did didn't do does doesn't doing
	don't down during each few for from further had hadn't has hasn't have haven't having he he'd
	he'll he's her here here's hers herself him himself his how how's i i'd i'll i'm i've if in
	into is isn't it it's its itself let's me more most mustn't my myself no nor not of off on once
	only or other ought our ours ourselves out over own same shan't she she'd she'll she's should
	shouldn't so some such than that that's the their theirs them themselves then there there's
	these they they'd they'll they're they've this those through to too u under until up ur very
	was wasn't we we'd we'll we're we've were weren't what what's when when's where where's which
	while who who's whom why why's with won't would wouldn't you you'd you'll you're you've your
	yours yourself yourselves will).freeze

	# Builds a word histogram for +text+.
	#
	# The text is lowercased, the punctuation ? . ! ( ) " , is removed and the
	# result is split on whitespace. Tokens listed in STOPWORDS are skipped.
	# Apostrophes are removed only after the stopword lookup, so a contraction
	# such as "don't" matches its STOPWORDS entry instead of being counted as
	# the word "dont".
	#
	# text - the String to analyse.
	#
	# Returns a Hash (default value 0) mapping every remaining word, written
	# without apostrophes, to its number of occurrences.
	#
	# TODO group synonyms together in histogram using treat gem
	def word_frequency text
		text.gsub(/[?.!()",]/, "").downcase.split.each_with_object(Hash.new(0)) do |token, counts|
			next if STOPWORDS.include?(token)
			word = token.delete("'")
			# A token may consist of apostrophes only, or be a quoted stopword ('the').
			next if word.empty? || STOPWORDS.include?(word)
			counts[word] += 1
		end
	end

	# Ranks the words counted by word_frequency from most to least frequent.
	#
	# text - the String handed on to word_frequency.
	#
	# Returns an Array of words ordered by descending count, or [""] when
	# word_frequency counts nothing (for example, the text is all stopwords).
	def words_by_frequency text
		histogram = word_frequency(text)
		return [""] if histogram.empty?
		histogram.sort_by { |_word, occurrences| -occurrences }.map(&:first)
	end

	# Host name that every listing request is sent to.
	HOST = 'www.reddit.com'
	# Value sent as the "limit" query parameter of every listing request.
	LIMIT = 100

	# Downloads one listing and returns the data of its posts.
	#
	# api_url - path prefix of the listing ("" and "/new" are used in this
	#           script); "/.json?limit=LIMIT" is appended to it.
	#
	# Returns an Array holding the 'data' value of every entry found under
	# ['data']['children'] of the parsed response body.
	# Raises the error produced by Net::HTTPResponse#value when the response
	# status is not 2xx, and JSON::ParserError when the body is not valid JSON.
	def fetch_posts api_url
		url = api_url + "/.json?limit=#{LIMIT}"
		response = Net::HTTP.get_response(HOST, url)
		# Fail here with the HTTP status instead of later while parsing an error page.
		response.value
		listing = JSON.parse(response.body)
		listing['data']['children'].map { |child| child['data'] }
	end

	hot_posts = fetch_posts ""
	new_posts = fetch_posts "/new"

	# One row per fetched post. The column order has to match the @ATTRIBUTE
	# lines written to the ARFF file; the last column is the class (hot or not).
	data = []
	[[hot_posts, true], [new_posts, false]].each do |posts, hot|
		posts.each do |post_json|
			title = post_json['title']
			selftext = post_json['selftext']
			words = words_by_frequency(title + " " + selftext)

			data << [
				# score related
				post_json['ups'],
				post_json['downs'],
				post_json['score'],
				post_json['num_comments'],

				# post information
				post_json['subreddit'],
				post_json['domain'], # the domain wherever the url is pointing
				post_json['author'],
				post_json['created_utc'], # the UTC timestamp
				post_json['over_18'], # mature content
				# Has a thumbnail image? True when the field is non-empty.
				# NOTE(review): any non-image placeholder value in this field would
				# still count as a thumbnail - confirm against the API.
				!post_json['thumbnail'].empty?,

				# text
				title.length, # title length
				selftext.length, # the length of the text for the post
				# lexical diversity (number of different words); words_by_frequency
				# returns [""] when nothing was counted, which must count as 0
				words.reject(&:empty?).count,
				words.first, # most common word, "" when there is none

				# class
				hot # Hot post? True, False
			]
		end
		# to get more info from comments... hot_post['permalink']+'.json?limit=100'
	end

	# Formats one cell of a data row for the @DATA section.
	#
	# value - a cell taken from a row of data (number, boolean, String or nil).
	#
	# Returns "?" (the ARFF missing-value marker) for nil and for empty strings,
	# a single-quoted literal with backslashes and single quotes escaped for any
	# other String, and value.to_s for everything else.
	def arff_value value
		return "?" if value.nil? || value == ""
		return value.to_s unless value.is_a?(String)
		"'" + value.gsub(/[\\']/) { |char| "\\" + char } + "'"
	end

	# Writes the collected rows to "reddit data <time>.arff" in the current
	# directory: a comment header, the relation name, one @ATTRIBUTE line per
	# column of a row, then one @DATA line per row.
	time = Time.now
	File.open("reddit data #{time}.arff", 'w') do |file|
		file.puts "% Reddit Mining Project Data"
		file.puts "% CS 478"
		file.puts "% Group: Jason Larsen, Tyler Coleman, Kiersten Devenish, Tae Woo Kim"
		file.puts "% Gathered #{time}"
		file.puts

		file.puts "@RELATION reddit"
		file.puts

		file.puts "\t@ATTRIBUTE ups NUMERIC"
		file.puts "\t@ATTRIBUTE downs NUMERIC"
		file.puts "\t@ATTRIBUTE score NUMERIC"
		file.puts "\t@ATTRIBUTE num_comments NUMERIC"
		file.puts "\t@ATTRIBUTE subreddit STRING"
		file.puts "\t@ATTRIBUTE domain STRING"
		file.puts "\t@ATTRIBUTE author STRING"
		file.puts "\t@ATTRIBUTE created_utc NUMERIC"
		file.puts "\t@ATTRIBUTE over_18 {true, false}"
		file.puts "\t@ATTRIBUTE thumbnail {true, false}"
		file.puts "\t@ATTRIBUTE title_length NUMERIC"
		file.puts "\t@ATTRIBUTE selftext_length NUMERIC"
		file.puts "\t@ATTRIBUTE lexical_diversity NUMERIC"
		file.puts "\t@ATTRIBUTE most_common_word STRING"
		file.puts "\t@ATTRIBUTE hot {true, false}"


		file.puts "\n@DATA"
		data.each do |row|
			# Quote strings and mark missing cells so every row stays parseable.
			file.puts "\t" + row.map { |value| arff_value(value) }.join(', ')
		end
	end