Skip to content

Instantly share code, notes, and snippets.

@garbados
Last active December 17, 2015 21:09
Show Gist options
  • Save garbados/5672627 to your computer and use it in GitHub Desktop.
Save garbados/5672627 to your computer and use it in GitHub Desktop.
map function for ngram mapreduce
# Map function for an n-gram map/reduce view.
#
# For a document that has both a `user` and a string `text`, emits a single
# row whose key is `doc.user.screen_name` and whose value is an array of
# n-grams; each n-gram is an array of `size` consecutive whitespace-separated
# tokens taken from `doc.text`. Documents without `user`, or whose `text` is
# missing, empty or not a string, emit nothing.
#
# `emit` is not defined here; it is expected to be supplied by the hosting
# map/reduce runtime (e.g. CouchDB).
map = (doc) ->
  size = 4 # = the n in ngram

  # Split `arr` into consecutive, non-overlapping slices of exactly `len`
  # items. A trailing remainder shorter than `len` is dropped.
  chunk = (arr, len) ->
    chunks = []
    start = 0
    while start + len <= arr.length
      chunks.push arr.slice(start, start + len)
      start += len
    chunks

  # Split `value` on runs of whitespace. Leading/trailing whitespace makes
  # `split` produce empty strings, which must not become part of an n-gram,
  # so they are filtered out.
  tokenize = (value) ->
    (token for token in value.split(/\s+/) when token.length > 0)

  # Build every n-gram of `value`. Chunking the token list once per starting
  # offset 0...size covers every window position; the result is ordered by
  # offset first, then by position within that offset.
  ngrams = (value) ->
    tokens = tokenize value
    grams = []
    for offset in [0...size]
      grams = grams.concat chunk(tokens.slice(offset), size)
    grams

  # Only strings can be tokenized; anything else would throw on `.split`.
  if doc.user and typeof doc.text is 'string' and doc.text
    emit(doc.user.screen_name, ngrams(doc.text))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment