Blog writing analysis script, referenced in thesephist.com/posts/blog-analysis/ ✍️
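The gist holds two Ink files: the analysis script below, followed by the small quicksort module it pulls in with load('quicksort'). Assuming the script is saved at the root of the blog's Hugo-style source tree (the ./content/posts path below implies that layout) alongside a quicksort.ink copy of the module, it can be run with the Ink interpreter, e.g. ink analyze.ink, where analyze.ink is a stand-in for whatever name the script is saved under.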
` Count sentence length, word size distribution over past posts `

std := load('std')
str := load('str')
quicksort := load('quicksort')

log := std.log
f := std.format
append := std.append
cat := std.cat
slice := std.slice
flatten := std.flatten
reduce := std.reduce
map := std.map
each := std.each
filter := std.filter
readFile := std.readFile
writeFile := std.writeFile

hasPrefix? := str.hasPrefix?
split := str.split
trim := str.trim
trimS := s => trim(s, ' ')

sortBy := quicksort.sortBy
sort := quicksort.sort
` Constants `
PostsDir := './content/posts'
Newline := char(10)

blank? := s => trimS(s) = ''
` find all blog posts on the site and callback with all file names `
withAllPosts := cb => (
    postFiles := dir(PostsDir, evt => evt.type :: {
        'error' -> log('error: could not read posts directory!')
        'data' -> cb(filter(
            map(evt.data, entry => entry.name)
            ` filter out hidden files and the _index.md file `
            fName => ~(hasPrefix?(fName, '.') | hasPrefix?(fName, '_'))
        ))
    })
)
` given a potentially double-quoted string, strip the quotes `
stripQuotes := s => s.0 :: {
    '"' -> slice(s, 1, len(s) - 1)
    _ -> s
}
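` for reference, withPostRecord below expects each post file to open with
    Hugo-style front matter delimited by --- lines, along the lines of this
    hypothetical example:
    ---
    title: "An example post"
    date: 2020-06-08T10:00:00-04:00
    ---
    with the Markdown body following the closing --- `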
` given a file name to a blog post, parse it completely
    and return a PostRecord structure with parsed metadata and body `
withPostRecord := (fileName, cb) => (
    readFile(PostsDir + '/' + fileName, bytes => (
        lines := filter(
            split(bytes, Newline)
            ` remove raw HTML lines `
            line => ~(blank?(line) | hasPrefix?(line, '<'))
        )

        ` sanitize lines: blank out Markdown punctuation, scanning each line
            from its last character down to index 1 `
        lines := map(lines, line => (sub := i => i :: {
            0 -> line
            _ -> (
                line.(i) :: {
                    '_' -> line.(i) := ' '
                    '*' -> line.(i) := ' '
                    '[' -> line.(i) := ' '
                    ']' -> line.(i) := ' '
                    '(' -> line.(i) := ' '
                    ')' -> line.(i) := ' '
                }
                sub(i - 1)
            )
        })(len(line) - 1))

        record := {
            ` parse state:
                0 -> start
                1 -> inside front matter
                2 -> after front matter
                3 -> error, stop parsing `
            parseState: 0
            title: ()
            date: ()
            body: []
        }
        each(lines, line => record.parseState :: {
            0 -> line :: {
                '---' -> record.parseState := 1
                _ -> (
                    log(f('error: unexpected line in post file, {{0}}', [line]))
                    record.parseState := 3
                )
            }
            1 -> line :: {
                '---' -> record.parseState := 2
                _ -> split(line, ':').0 :: {
                    'title' -> record.title := stripQuotes(trimS(split(line, 'title:').1))
                    'date' -> record.date := trimS(split(line, 'date:').1)
                }
            }
            2 -> record.body.len(record.body) := line
            3 -> ()
        })

        cb(record)
    ))
)
` mean of an array `
mean := xs => len(xs) :: {
    0 -> ~1
    _ -> reduce(xs, (a, b) => a + b, 0) / len(xs)
}

` median of an array `
median := xs => xs :: {
    [] -> ~1
    _ -> (
        sorted := sort(xs)
        mid := floor(len(sorted) / 2)
        (len(sorted) % 2) :: {
            0 -> (sorted.(mid) + sorted.(mid - 1)) / 2
            1 -> sorted.(mid)
        }
    )
}
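` for example, mean([1, 2, 3, 4]) and median([1, 2, 3, 4]) both evaluate
    to 2.5, median([1, 2, 3]) evaluates to 2, and both functions return
    ~1 (negative one) as a sentinel for empty inputs `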
` split up a blog post body into a flat list of words
    includes doing some sanitization `
getWords := record => filter(
    flatten(map(record.body, line => split(line, ' ')))
    ` try to remove links and empty words `
    word => blank?(word) :: {
        true -> false
        _ -> ~(hasPrefix?(word, 'http') | hasPrefix?(word, '/'))
    }
)

` split up a blog post body into a flat list of sentences
    includes doing some sanitization `
getSentences := record => flatten(map(record.body, line => split(line, '. ')))
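` hypothetical example: for a body line 'see https://example.com for details',
    getWords keeps ['see', 'for', 'details'] and drops the link-like token;
    getSentences splits each body line on '. ', so 'One. Two. Three.'
    yields ['One', 'Two', 'Three.'] `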
` main analysis function that works per-PostRecord, computing
    statistics over the post body and publishing a CSV `
analyze := records => (
    sorted := sortBy(records, r => r.date)

    log('Serializing word list...')
    wordLengths := map(sorted, r => map(getWords(r), len))

    log('Serializing sentence list...')
    sentenceLengths := map(sorted, r => map(
        getSentences(r)
        sent => len(filter(split(sent, ' '), w => ~blank?(w)))
    ))

    log('Computing mean word lengths')
    meanWordLengths := map(wordLengths, mean)

    log('Computing median word lengths')
    medianWordLengths := map(wordLengths, median)

    log('Computing median sentence lengths')
    medianSentenceLengths := map(sentenceLengths, median)

    log('Computing median paragraph lengths')
    paragraphLengths := map(sorted, record => map(
        record.body
        para => len(filter(split(para, ' '), w => ~blank?(w)))
    ))
    medianParagraphLengths := map(paragraphLengths, median)

    results := {
        dates: map(sorted, r => r.date)
        meanWordLengths: meanWordLengths
        medianWordLengths: medianWordLengths
        medianSentenceLengths: medianSentenceLengths
        medianParagraphLengths: medianParagraphLengths
    }

    csv := renderCSV(results)
    log(csv)
    writeFile('./analysis.csv', csv, done => done :: {
        true -> log('File saved to ./analysis.csv successfully!')
        () -> log('error: failed to save analysis results csv!')
    })
)
` render results into a CSV for importing into Google Sheets `
renderCSV := results => (
    csvLines := []
    each(keys(results), key => (
        rowData := append([key], map(results.(key), string))
        csvLines.len(csvLines) := cat(rowData, ',')
    ))
    cat(csvLines, Newline)
)
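` the resulting CSV holds one row per metric, keyed by its first column,
    with one value per post in date order, e.g. (hypothetical values):
    dates,2019-01-01,2019-02-14
    meanWordLengths,4.8,5.1 `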
` main analysis routine: read every post, then run the analysis
    once the last record has arrived `
postRecords := []
withAllPosts(fileNames => each(
    fileNames
    fName => withPostRecord(fName, record => (
        log(f('read: [{{ date }}] {{ title }}', record))
        postRecords.len(postRecords) := record

        ` reads are asynchronous, so only analyze once all posts are in `
        len(postRecords) :: {
            len(fileNames) -> analyze(postRecords)
        }
    ))
))
The second file in the gist is the sorting module loaded above with load('quicksort'):
` minimal quicksort implementation
    using hoare partition `

std := load('std')

map := std.map
clone := std.clone

sortBy := (v, pred) => (
    vPred := map(v, pred)

    partition := (v, lo, hi) => (
        pivot := vPred.(lo)
        lsub := i => (vPred.(i) < pivot) :: {
            true -> lsub(i + 1)
            false -> i
        }
        rsub := j => (vPred.(j) > pivot) :: {
            true -> rsub(j - 1)
            false -> j
        }
        (sub := (i, j) => (
            i := lsub(i)
            j := rsub(j)
            (i < j) :: {
                false -> j
                true -> (
                    ` inlined swap! `
                    tmp := v.(i)
                    tmpPred := vPred.(i)
                    v.(i) := v.(j)
                    v.(j) := tmp
                    vPred.(i) := vPred.(j)
                    vPred.(j) := tmpPred
                    sub(i + 1, j - 1)
                )
            }
        ))(lo, hi)
    )

    (quicksort := (v, lo, hi) => len(v) :: {
        0 -> v
        _ -> (lo < hi) :: {
            false -> v
            true -> (
                p := partition(v, lo, hi)
                quicksort(v, lo, p)
                quicksort(v, p + 1, hi)
            )
        }
    })(v, 0, len(v) - 1)
)
` sort! sorts its argument in place; sort leaves the input untouched
    and returns a sorted copy `
sort! := v => sortBy(v, x => x)
sort := v => sort!(clone(v))
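` usage sketch: sort([3, 1, 2]) evaluates to [1, 2, 3] and leaves its argument
    untouched, sort!([3, 1, 2]) sorts the list in place, and sortBy orders
    composites by a derived key, as the analysis script does with
    sortBy(records, r => r.date) `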