Created
October 10, 2012 18:03
-
-
Save jarsen/3867226 to your computer and use it in GitHub Desktop.
Script for data mining Reddit and producing an ARFF file
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env ruby -wKU | |
require 'net/http' | |
require 'json' | |
# Lowercase words that word_frequency leaves out of its histogram: common
# English function words, their contractions, and the chat abbreviations
# "u" and "ur". Frozen so the shared list cannot be mutated by accident.
STOPWORDS = %w[
  a about above after again against all am an and any are aren't as at be because been before
  being below between both but by can't cannot could couldn't did didn't do does doesn't doing
  don't down during each few for from further had hadn't has hasn't have haven't having he he'd
  he'll he's her here here's hers herself him himself his how how's i i'd i'll i'm i've if in
  into is isn't it it's its itself let's me more most mustn't my myself no nor not of off on once
  only or other ought our ours ourselves out over own same shan't she she'd she'll she's should
  shouldn't so some such than that that's the their theirs them themselves then there there's
  these they they'd they'll they're they've this those through to too u under until up ur very
  was wasn't we we'd we'll we're we've were weren't what what's when when's where where's which
  while who who's whom why why's with won't would wouldn't you you'd you'll you're you've your
  yours yourself yourselves will
].freeze
# TODO group synonyms together in histogram using treat gem | |
# Builds a histogram of the non-stopword tokens in +text+.
#
# The text is lowercased and split on whitespace, and the punctuation
# characters ? . ! ( ) " , and ' are removed from every token.
#
# text - the String to analyse.
#
# Returns a Hash (default value 0) mapping each remaining word to the number
# of times it occurs, in order of first appearance.
def word_frequency(text)
  # Apostrophes are kept until after the stopword lookup: STOPWORDS stores
  # contractions such as "don't", which could never match once the
  # apostrophe had been stripped.
  tokens = text.downcase.gsub(/[?.!()",]/, '').split
  tokens.each_with_object(Hash.new(0)) do |token, counts|
    word = token.delete("'")
    next if word.empty? # the token consisted of apostrophes only
    # Check both spellings so a quoted stopword ('the') is filtered as well.
    next if STOPWORDS.include?(token) || STOPWORDS.include?(word)

    counts[word] += 1
  end
end
# Lists the distinct non-stopwords of +text+, most frequent first.
#
# text - the String to analyse (passed on to word_frequency).
#
# Returns an Array of Strings. Words with equal counts keep the order in
# which they first appear in the text, so the result is deterministic.
# Returns [""] when word_frequency finds no words (for example when the text
# consists only of stopwords).
def words_by_frequency(text)
  freq = word_frequency(text)
  return [''] if freq.empty?

  # sort_by alone is not stable; the insertion index is added as a secondary
  # key so that ties are resolved by first occurrence.
  freq.keys.each_with_index.sort_by { |word, position| [-freq[word], position] }.map(&:first)
end
# Host name the listing requests are sent to.
HOST = 'www.reddit.com'.freeze
# Value sent as the `limit` query parameter of every listing request.
LIMIT = 100

# Downloads one listing and returns the data of its entries.
#
# api_url - path prefix of the listing, e.g. "" or "/new"; "/.json?limit=LIMIT"
#           is appended to it.
#
# Returns an Array holding the 'data' value of every element found under
# data -> children in the JSON response body.
# Raises RuntimeError when the server does not answer with a 2xx status, and
# JSON::ParserError when the body is not valid JSON.
def fetch_posts(api_url)
  path = "#{api_url}/.json?limit=#{LIMIT}"
  response = Net::HTTP.get_response(HOST, path)

  # Fail with the HTTP status instead of letting an error page reach the JSON
  # parser, where it would surface as an unrelated parse/nil error.
  unless response.is_a?(Net::HTTPSuccess)
    raise "Request for #{HOST}#{path} failed: #{response.code} #{response.message}"
  end

  listing = JSON.parse(response.body)
  listing['data']['children'].map { |child| child['data'] }
end
# Download both listings; posts from the first one form the positive class.
hot_posts = fetch_posts ""
new_posts = fetch_posts "/new"

# One row per post. The column order must match the @ATTRIBUTE declarations
# written to the ARFF file.
data = []
[hot_posts, new_posts].each do |source|
  hot = source.equal?(hot_posts) # class label: did the post come from the hot listing?

  source.each do |post_json|
    # to_s keeps a missing title/selftext from raising on + and length.
    title = post_json['title'].to_s
    selftext = post_json['selftext'].to_s
    words = words_by_frequency("#{title} #{selftext}")

    data << [
      # score related
      post_json['ups'],
      post_json['downs'],
      post_json['score'],
      post_json['num_comments'],
      # post information
      post_json['subreddit'],
      post_json['domain'],      # the domain wherever the url is pointing
      post_json['author'],
      post_json['created_utc'], # the UTC timestamp
      post_json['over_18'],     # mature content
      # Has a thumbnail image? The previous `.empty?` answered the opposite
      # question, so the column was inverted.
      !post_json['thumbnail'].to_s.empty?,
      # text
      title.length,
      selftext.length,
      # lexical diversity (number of different words); the [""] placeholder
      # returned for word-less text must not be counted as a word.
      words.reject(&:empty?).size,
      words.first,              # most common word ("" when there is none)
      # class
      hot
    ]
  end
  # to get more info from comments... hot_post['permalink']+'.json?limit=100'
end
# Renders one cell of a data row as an ARFF value.
#
# value - any object; nil and values whose to_s is empty are treated as missing.
#
# Returns "?" for missing values, the plain text for simple tokens, and a
# single-quoted, backslash-escaped string when the text contains whitespace,
# commas, quotes, braces, percent signs or backslashes.
def arff_value(value)
  text = value.to_s
  return '?' if text.empty?
  return text unless text =~ /[\s,'"{}%\\]/

  "'" + text.gsub(/[\\']/) { |ch| "\\#{ch}" } + "'"
end

# Write the collected rows (the `data` array built above) to a timestamped
# ARFF file in the current directory.
time = Time.now
File.open("reddit data #{time}.arff", 'w') do |file|
  # Comment header
  file.puts "% Reddit Mining Project Data"
  file.puts "% CS 478"
  file.puts "% Group: Jason Larsen, Tyler Coleman, Kiersten Devenish, Tae Woo Kim"
  file.puts "% Gathered #{time}"
  file.puts
  file.puts "@RELATION reddit"
  file.puts
  # Attribute declarations, in the same order as the columns of each row.
  file.puts "\t@ATTRIBUTE ups NUMERIC"
  file.puts "\t@ATTRIBUTE downs NUMERIC"
  file.puts "\t@ATTRIBUTE score NUMERIC"
  file.puts "\t@ATTRIBUTE num_comments NUMERIC"
  file.puts "\t@ATTRIBUTE subreddit STRING"
  file.puts "\t@ATTRIBUTE domain STRING"
  file.puts "\t@ATTRIBUTE author STRING"
  file.puts "\t@ATTRIBUTE created_utc NUMERIC"
  file.puts "\t@ATTRIBUTE over_18 {true, false}"
  file.puts "\t@ATTRIBUTE thumbnail {true, false}"
  file.puts "\t@ATTRIBUTE title_length NUMERIC"
  file.puts "\t@ATTRIBUTE selftext_length NUMERIC"
  file.puts "\t@ATTRIBUTE lexical_diversity NUMERIC"
  file.puts "\t@ATTRIBUTE most_common_word STRING"
  file.puts "\t@ATTRIBUTE hot {true, false}"
  file.puts "\n@DATA"
  data.each do |row|
    # Joining the raw values left empty fields for nil/"" and let spaces,
    # commas or quotes inside strings corrupt the row.
    file.puts "\t" + row.map { |value| arff_value(value) }.join(', ')
  end
end
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment