Created
September 19, 2015 22:06
-
-
Save psinger/a65b00daf31dc66fa4fa to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/**
 * BigQuery UDF entry point: emits four readability scores for one row.
 *
 * URLs are removed from the text before scoring so that their punctuation
 * and letter runs do not distort the word, sentence and syllable counts.
 *
 * @param {Object} row - Input row; only `row.body` is read.
 * @param {function(Object)} emit - Called exactly once with an object holding
 *     `flesch_reading_ease`, `flesch_kincaid_grade`, `smog_index` and
 *     `gunning_fog_index`.
 */
function features(row, emit) {
  // A NULL/missing body is scored as empty text instead of throwing on
  // `.match`/`.replace`, which would abort processing of the row.
  var text = (row.body === null || row.body === undefined) ? '' : String(row.body);

  // The dot after "www" is escaped so only a literal "www." prefix starts a
  // URL; unescaped it matched "www" followed by ANY character.
  var urlPattern = /(https:[\/][\/]|http:[\/][\/]|www\.)[a-zA-Z0-9\-\.]+\.[a-zA-Z]{2,3}(:[a-zA-Z0-9]*)?\/?([a-zA-Z0-9\-\._\?\,\'\/\\\+&%\$#\=~])*/g;

  // `replace` is a no-op when nothing matches, so no prior `match` is needed.
  var text_no_url = text.replace(urlPattern, '');

  emit({
    flesch_reading_ease: flesch_reading_ease(text_no_url),
    flesch_kincaid_grade: flesch_kincaid_grade(text_no_url),
    smog_index: smog_index(text_no_url),
    gunning_fog_index: gunning_fog_index(text_no_url)
  });
}
/**
 * Counts the words in a text.
 *
 * The text is split on runs of whitespace (spaces, tabs, newlines); a token
 * counts as a word only if something remains after stripping leading and
 * trailing non-word characters, so bare punctuation is not counted.
 *
 * @param {string} s - Text to analyse.
 * @returns {number} Number of words; 0 for empty or punctuation-only text.
 */
function word_count(s) {
  // All state is local: undeclared assignments would create shared globals
  // and throw a ReferenceError in strict-mode code.
  var tokens = s.split(/\s+/);
  var count = 0;
  for (var idx = 0; idx < tokens.length; idx++) {
    if (tokens[idx].replace(/^\W+|\W+$/g, '').length !== 0) {
      count += 1;
    }
  }
  return count;
}
/**
 * Counts the sentences in a text.
 *
 * A sentence boundary is a '.', '?' or '!' optionally followed by closing
 * quotes/brackets and then at least one whitespace character (space, tab or
 * newline). Empty segments produced by the split are ignored.
 *
 * @param {string} s - Text to analyse.
 * @returns {number} Number of non-empty segments; 0 for the empty string.
 */
function sentence_count(s) {
  // Locals only: the original leaked `counter`, `sentences` and `i` as
  // globals, which fails under strict mode.
  var segments = s.split(/\s*[\.\?!][\'"\)\]]*\s+/);
  var count = 0;
  for (var idx = 0; idx < segments.length; idx++) {
    if (segments[idx].length !== 0) {
      count += 1;
    }
  }
  return count;
}
// Rule-based syllable estimator; algorithm source: http://eayd.in/?p=232
/**
 * Estimates the number of syllables in a single English word.
 *
 * The estimate is `vowel letters - discarded vowels + added syllables`,
 * where the adjustments come from a fixed list of spelling rules and small
 * exception word lists.
 *
 * @param {string} s - One word. The rules compare against lowercase
 *     literals, so callers are expected to pass lowercase text.
 * @returns {number} Estimated syllable count; always 1 for words of three
 *     characters or fewer.
 */
function syllable_count_word(s) {
  var word = s;
  var VOWELS = 'aeoui';

  // Words whose rule-based count is one too low / one too high.
  var exception_add = ['serious', 'crucial'];
  var exception_del = ['fortunately', 'unfortunately'];
  // "co" + vowel prefixes read as one syllable / as two syllables.
  var co_one = ['cool', 'coach', 'coat', 'coal', 'count', 'coin', 'coarse', 'coup', 'coif', 'cook', 'coign', 'coiffe', 'coof', 'court'];
  var co_two = ['coapt', 'coed', 'coinci'];
  var pre_one = ['preach'];
  // "-le" words whose final "e" is nevertheless silent.
  var le_except = ['whole', 'mobile', 'pole', 'male', 'female', 'hale', 'pale', 'tale', 'sale', 'aisle', 'whale', 'while'];
  var negative = ["doesn't", "isn't", "shouldn't", "couldn't", "wouldn't"];

  // Number of non-overlapping matches of a global regex in `word`.
  function countMatches(re) {
    var found = word.match(re);
    return found === null ? 0 : found.length;
  }

  function isVowel(ch) {
    return VOWELS.indexOf(ch) > -1;
  }

  // True if any of the 4-, 5- or 6-character prefixes of `word` is in `list`.
  function prefixIn(list) {
    return list.indexOf(word.slice(0, 4)) > -1 ||
      list.indexOf(word.slice(0, 5)) > -1 ||
      list.indexOf(word.slice(0, 6)) > -1;
  }

  if (word.length <= 3) {
    return 1;
  }

  var syls = 0; // syllables added by the rules
  var disc = 0; // vowel letters that do not form their own syllable

  var lastChar = word[word.length - 1];
  var last2 = word.slice(-2);
  var last3 = word.slice(-3);
  var last4 = word.slice(-4);

  // Silent "-es"/"-ed": discard unless the ending is pronounced
  // (ted/tes/ses/ied/ies) or the word has only a single vowel group.
  if (last2 === 'es' || last2 === 'ed') {
    var severalVowelGroups =
      countMatches(/[eaoui][eaoui]/g) > 1 ||
      countMatches(/[eaoui][^eaoui]/g) > 1;
    var pronouncedEnding = ['ted', 'tes', 'ses', 'ied', 'ies'].indexOf(last3) > -1;
    if (severalVowelGroups && !pronouncedEnding) {
      disc += 1;
    }
  }

  // Silent trailing "e", except "-le" endings that are not in le_except.
  if (lastChar === 'e') {
    if (last2 !== 'le' || le_except.indexOf(word) > -1) {
      disc += 1;
    }
  }

  // Consecutive vowels count once: discard one per pair and one more per triple.
  disc += countMatches(/[eaoui][eaoui]/g) + countMatches(/[eaoui][eaoui][eaoui]/g);

  var numVowels = countMatches(/[eaoui]/g);

  if (word.slice(0, 2) === 'mc') {
    syls += 1;
  }

  // Final "y" after a consonant acts as a vowel.
  if (lastChar === 'y' && !isVowel(word[word.length - 2])) {
    syls += 1;
  }

  // Interior "y" between two non-vowels acts as a vowel.
  for (var i = 1; i < word.length - 1; i++) {
    if (word[i] === 'y' && !isVowel(word[i - 1]) && !isVowel(word[i + 1])) {
      syls += 1;
    }
  }

  // "tri-" / "bi-" followed by a vowel: the prefix is its own syllable.
  if (word.slice(0, 3) === 'tri' && isVowel(word[3])) {
    syls += 1;
  }
  if (word.slice(0, 2) === 'bi' && isVowel(word[2])) {
    syls += 1;
  }

  // "-ian" is two syllables, except "-cian" and "-tian".
  if (last3 === 'ian' && last4 !== 'cian' && last4 !== 'tian') {
    syls += 1;
  }

  // "co" + vowel: two syllables unless the prefix is a known one-syllable
  // form (co_two is checked first and wins over co_one).
  if (word.slice(0, 2) === 'co' && isVowel(word[2])) {
    if (prefixIn(co_two) || !prefixIn(co_one)) {
      syls += 1;
    }
  }

  // "pre" + vowel: two syllables unless listed in pre_one.
  if (word.slice(0, 3) === 'pre' && isVowel(word[3])) {
    if (pre_one.indexOf(word.slice(0, 6)) === -1) {
      syls += 1;
    }
  }

  // Listed "n't" contractions get an extra syllable.
  if (last3 === "n't" && negative.indexOf(word) > -1) {
    syls += 1;
  }

  // Exception lists. exception_del REMOVES a syllable; the previous code
  // added one, yielding 6 for "fortunately" instead of 4.
  if (exception_del.indexOf(word) > -1) {
    disc += 1;
  }
  if (exception_add.indexOf(word) > -1) {
    syls += 1;
  }

  return numVowels - disc + syls;
}
/**
 * Sums the estimated syllables of every word in a text.
 *
 * The text is split on runs of whitespace; each token is lowercased and
 * stripped of leading/trailing non-word characters before being passed to
 * `syllable_count_word`. Tokens that end up empty are skipped.
 *
 * @param {string} s - Text to analyse.
 * @returns {number} Total syllable estimate; 0 when the text has no words.
 */
function syllable_count(s) {
  // Locals only: the original leaked `syllables`, `i` and `word` as globals,
  // which throws a ReferenceError in strict-mode code.
  var tokens = s.split(/\s+/);
  var total = 0;
  for (var idx = 0; idx < tokens.length; idx++) {
    var cleaned = tokens[idx].toLowerCase().replace(/^\W+|\W+$/g, '');
    if (cleaned.length !== 0) {
      total += syllable_count_word(cleaned);
    }
  }
  return total;
}
/**
 * Counts the words in a text that have three or more estimated syllables.
 *
 * Tokenisation matches the other counters here: split on runs of whitespace,
 * lowercase, strip leading/trailing non-word characters, skip empty tokens.
 *
 * @param {string} s - Text to analyse.
 * @returns {number} Number of words for which `syllable_count_word` is >= 3.
 */
function polysyllable_count(s) {
  // Locals only: the original leaked `polysyllables`, `i` and `word` as
  // globals, which throws a ReferenceError in strict-mode code.
  var tokens = s.split(/\s+/);
  var total = 0;
  for (var idx = 0; idx < tokens.length; idx++) {
    var cleaned = tokens[idx].toLowerCase().replace(/^\W+|\W+$/g, '');
    if (cleaned.length !== 0 && syllable_count_word(cleaned) >= 3) {
      total += 1;
    }
  }
  return total;
}
/**
 * Average number of words per sentence.
 *
 * @param {string} s - Text to analyse.
 * @returns {number} `word_count(s) / sentence_count(s)`. The division is not
 *     guarded, so the result is NaN when both counts are 0; callers handle
 *     that with `isNaN`.
 */
function avg_sentence_length(s) {
  // Locals instead of the shared globals `wc`/`sc`; `sc` was also reused
  // with a different meaning (syllable count) by avg_syllables_per_word.
  var words = word_count(s);
  var sentences = sentence_count(s);
  return words / sentences;
}
/**
 * Average number of estimated syllables per word.
 *
 * @param {string} s - Text to analyse.
 * @returns {number} `syllable_count(s) / word_count(s)`. The division is not
 *     guarded, so the result is NaN when both counts are 0; callers handle
 *     that with `isNaN`.
 */
function avg_syllables_per_word(s) {
  // Locals instead of the shared globals `sc`/`wc`; `sc` meant "sentence
  // count" in avg_sentence_length but "syllable count" here.
  var syllables = syllable_count(s);
  var words = word_count(s);
  return syllables / words;
}
/**
 * Flesch Reading Ease score of a text, clamped to the range [0, 120].
 *
 * Computed as `206.835 - 1.015 * ASL - 84.6 * ASW`, where ASL is the average
 * sentence length in words and ASW the average syllables per word.
 *
 * @param {string} s - Text to analyse.
 * @returns {number} Score in [0, 120]; 120 when the score is NaN (e.g. the
 *     text has no words, making the averages 0/0).
 */
function flesch_reading_ease(s) {
  // Locals only: the original assigned undeclared `asl`, `asw` and `flesch`,
  // which leaks globals and throws in strict-mode code.
  var asl = avg_sentence_length(s);
  var asw = avg_syllables_per_word(s);
  var score = 206.835 - 1.015 * asl - 84.6 * asw;

  // NaN compares false against every bound, so it gets its own check.
  if (isNaN(score)) {
    return 120.0;
  }
  if (score > 120.0) {
    return 120.0;
  }
  if (score < 0.0) {
    return 0.0;
  }
  return score;
}
/**
 * Flesch-Kincaid Grade Level of a text, clamped to the range [0, 22].
 *
 * Computed as `0.39 * ASL + 11.8 * ASW - 15.59`, where ASL is the average
 * sentence length in words and ASW the average syllables per word.
 *
 * @param {string} s - Text to analyse.
 * @returns {number} Grade in [0, 22]; 0 when the grade is NaN (e.g. the text
 *     has no words, making the averages 0/0).
 */
function flesch_kincaid_grade(s) {
  // Locals only: the original assigned undeclared `asl`, `asw` and
  // `flesch_kincaid`, which leaks globals and throws in strict-mode code.
  var asl = avg_sentence_length(s);
  var asw = avg_syllables_per_word(s);
  var grade = 0.39 * asl + 11.8 * asw - 15.59;

  // NaN compares false against every bound, so it gets its own check.
  if (isNaN(grade)) {
    return 0.0;
  }
  if (grade > 22.0) {
    return 22.0;
  }
  if (grade < 0.0) {
    return 0.0;
  }
  return grade;
}
/**
 * SMOG index of a text, capped at 22.
 *
 * Computed as `1.043 * sqrt(P * (30 / N)) + 3.1291`, where P is the number
 * of words with three or more syllables and N the number of sentences.
 *
 * @param {string} s - Text to analyse.
 * @returns {number} Index of at most 22; 0 when the index is NaN, which
 *     happens for text with no sentences (0 * (30 / 0)).
 */
function smog_index(s) {
  // Locals only: the original assigned undeclared `polys`, `ns` and `smog`,
  // which leaks globals and throws in strict-mode code.
  var polys = polysyllable_count(s);
  var sentences = sentence_count(s);
  var index = 1.043 * Math.sqrt(polys * (30 / sentences)) + 3.1291;

  // NaN compares false against the cap, so it gets its own check.
  if (isNaN(index)) {
    return 0.0;
  }
  if (index > 22.0) {
    return 22.0;
  }
  return index;
}
/**
 * Gunning fog index of a text, capped at 22.
 *
 * Computed as `0.4 * (ASL + 100 * (P / W))`, where ASL is the average
 * sentence length in words, P the number of words with three or more
 * syllables and W the total number of words.
 *
 * @param {string} s - Text to analyse.
 * @returns {number} Index of at most 22; 0 when the index is NaN (e.g. the
 *     text has no words, making P / W equal 0/0).
 */
function gunning_fog_index(s) {
  // Locals only: the original assigned undeclared `polys`, `nw`, `asl` and
  // `gunning_fog`, which leaks globals and throws in strict-mode code.
  var polys = polysyllable_count(s);
  var words = word_count(s);
  var asl = avg_sentence_length(s);
  var index = 0.4 * (asl + 100 * (polys / words));

  // NaN compares false against the cap, so it gets its own check.
  if (isNaN(index)) {
    return 0.0;
  }
  if (index > 22.0) {
    return 22.0;
  }
  return index;
}
// Register `features` with BigQuery so it can be called from SQL.
bigquery.defineFunction(
  'features',  // function name exported to SQL
  ['body'],    // input column names
  [            // output columns: one float per readability score
    {name: 'flesch_reading_ease', type: 'float'},
    {name: 'flesch_kincaid_grade', type: 'float'},
    {name: 'smog_index', type: 'float'},
    {name: 'gunning_fog_index', type: 'float'}
  ],
  features     // the JavaScript UDF itself
);
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment