Markov model based on youtube comments
# for more info check out http://webmining.olariu.org/interview-with-a-lady-gaga-fan
# made to be run in the ipython console

import urllib, urllib2, time, random
import re
import simplejson as json

def fetch_url(url, get=None, post=None):
    user_agent = 'Andrei Olariu\'s Web Mining for Dummies'
    headers = {'User-Agent': user_agent}
    if get:
        data = urllib.urlencode(get)
        url = '%s?%s' % (url, data)
    req = urllib2.Request(url, post, headers)
    try:
        response = urllib2.urlopen(req).read()
        response = json.loads(response)
    except Exception, e:
        print 'error in reading %s: %s' % (url, e)
        return None
    return response
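
# a minimal sanity check for fetch_url (assumes the gdata endpoint is
# still reachable; the video id is the one used later in the script):
# fetch_url('http://gdata.youtube.com/feeds/api/videos/UzxYlbK2c7E/comments',
#           get={'alt': 'json', 'max-results': 1})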
# fetch comments for a youtube video (given a video id) by doing repeated
# api calls (one call returns up to 50 comments)
def fetch_comments(yid, maxcount=1000):
    url = 'http://gdata.youtube.com/feeds/api/videos/%s/comments' % yid
    COUNT = 50
    values = {
        'alt': 'json',
        'max-results': COUNT,
    }
    results = []
    for i in range(1, maxcount, COUNT):
        values['start-index'] = i
        data = fetch_url(url, get=values)
        if data and 'feed' in data and 'entry' in data['feed'] and \
                len(data['feed']['entry']) > 0:
            results.extend([c['content']['$t'] for c in data['feed']['entry']])
        else:
            break
        time.sleep(0.1)
    return results
# builds the markov model
# every state is defined as a pair of words
# state (word[k], word[k+1]) depends on state (word[k-1], word[k])
def add_to_markov(markov, words):
    if len(words) < 3:
        return
    if words[0] not in markov:
        markov[words[0]] = {}
    if words[1] not in markov[words[0]]:
        markov[words[0]][words[1]] = {}
    if words[2] not in markov[words[0]][words[1]]:
        markov[words[0]][words[1]][words[2]] = 0
    markov[words[0]][words[1]][words[2]] += 1
    add_to_markov(markov, words[1:])
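
# e.g. starting from an empty model,
#   add_to_markov(markov, ['i', 'like', 'turtles', 'too'])
# leaves the nested count dict
#   {'i': {'like': {'turtles': 1}}, 'like': {'turtles': {'too': 1}}}
# (the recursion shifts the two-word window one word at a time)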
# given a state (aka a pair of words (word1, word2)),
# find the next state (aka another pair of words (word2, word3))
def get_next(markov, word1, word2):
    if word1 not in markov or word2 not in markov[word1]:
        return None
    total = sum([c for c in markov[word1][word2].itervalues()])
    choose = random.randint(1, total)
    for w, c in markov[word1][word2].iteritems():
        choose -= c
        if choose <= 0:
            return w
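
# sampling is proportional to the stored counts: with
#   markov == {'i': {'like': {'it': 3, 'you': 1}}}
# get_next(markov, 'i', 'like') returns 'it' with probability 3/4
# and 'you' with probability 1/4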
# given a starting state, find future states in a recursive way
def get_phrase(markov, word1, word2, limit=50):
    if limit == 0:
        return ''
    word3 = get_next(markov, word1, word2)
    if not word3:
        return ''
    return '%s %s' % (word3, get_phrase(markov, word2, word3, limit - 1))
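
# note: limit both caps the phrase length and bounds the recursion
# depth, so the default of 50 stays well under python's recursion limit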
# given a sentence beginning, add words using the markov model
# the starting state is given by the last 2 words in the sentence,
# all other words are not used
def talk(markov, start):
    words = re.findall(r'\w+', start.lower())
    if len(words) < 2:
        return None
    return '%s %s' % (start, get_phrase(markov, words[-2], words[-1]))
# get comments for a video
yid = 'UzxYlbK2c7E' # machine learning
# use 'qrO4YZeyl0I' for lady gaga
comments = fetch_comments(yid)

# split comments in phrases
texts = []
r = re.compile('[.!?;]')
for c in comments:
    for line in c.splitlines():
        texts.extend(r.split(line))
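
# e.g. r.split('great video! thanks') == ['great video', ' thanks']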
# split phrases into words and build the markov model
markov = {}
for t in texts:
    remove_first = t.startswith('@') # remove usernames
    t = t.lower()
    words = re.findall(r'\w+', t)
    if remove_first:
        words = words[1:]
    add_to_markov(markov, words)

# have fun
print talk(markov, 'i like')