Repo is here: https://github.com/aozorabunko/aozorabunko
$ git clone https://github.com/aozorabunko/aozorabunko.git
$ du -sh .
7.7G .
$ du -sh * | grep '[0-9][GM]' | sort --human-numeric-sort --reverse
2.0G cards
50M index_pages
25M access_ranking
18M gaiji
17M kizokeikaku
9.2M soramoyou
6.8M gaiji0213
6.2M kaikei
6.0M banner
4.4M readingTEXT
3.6M hosetsu_kijyun
3.3M kanji_table
2.8M KOSAKU
2.6M KOSAKU1
2.6M gaiji_chuki
2.5M guide
2.3M newJIS-Kanji
2.3M lib
2.1M annotation
1.3M KOSAKU2
1.1M shomei
$ find cards/00* -name '*.html' -print0 | xargs -0 grep --no-filename -io 'charset=.*"' | tr '[:upper:]' '[:lower:]' | sort | uniq -c
11 charset=euc-jp"
12801 charset=shift_jis"
13102 charset=utf-8"
294 charset=x-sjis"
$ find ./* -size +1M | xargs ls -lhS | head -30
-rw-r--r-- 1 root root 27M May 30 20:31 ./cards/001727/files/55345_txt_53433.zip
-rw-r--r-- 1 root root 26M May 30 20:31 ./cards/001728/files/55346_txt_53428.zip
-rw-r--r-- 1 root root 21M May 30 20:31 ./cards/001687/files/54927_txt_53431.zip
-rw-r--r-- 1 root root 20M May 30 20:31 ./cards/001505/files/55889_txt_50049.zip
-rw-r--r-- 1 root root 18M May 30 20:31 ./cards/001686/files/54925_txt_53435.zip
-rw-r--r-- 1 root root 17M May 30 20:31 ./cards/001505/files/54923_txt_47242.zip
-rw-r--r-- 1 root root 14M May 30 20:31 ./cards/001505/files/56841_txt_53387.zip
-rw-r--r-- 1 root root 14M May 30 20:31 ./cards/001505/files/54919_txt_47234.zip
-rw-r--r-- 1 root root 13M May 30 20:31 ./cards/001505/files/51344_txt_38341.zip
-rw-r--r-- 1 root root 13M May 30 20:31 ./cards/001727/files/56840_txt_53393.zip
-rw-r--r-- 1 root root 12M May 30 20:31 ./cards/001756/files/55884_txt_50047.zip
-rw-r--r-- 1 root root 12M May 30 20:31 ./cards/001505/files/56843_txt_53391.zip
-rw-r--r-- 1 root root 11M May 30 20:31 ./cards/001505/files/54917_txt_47251.zip
-rw-r--r-- 1 root root 9.5M May 30 20:31 ./cards/001505/files/51347_txt_38347.zip
-rw-r--r-- 1 root root 9.2M May 30 20:32 ./kizokeikaku/aozorabunko_zen.zip
-rw-r--r-- 1 root root 8.7M May 30 20:31 ./cards/001505/files/54921_txt_47238.zip
-rw-r--r-- 1 root root 8.5M May 30 20:31 ./cards/001505/files/55890_txt_50051.zip
-rw-r--r-- 1 root root 8.5M May 30 20:31 ./cards/001505/files/54920_txt_47236.zip
-rw-r--r-- 1 root root 8.5M May 30 20:31 ./cards/001505/files/51346_txt_38345.zip
-rw-r--r-- 1 root root 8.1M May 30 20:31 ./cards/001505/files/54922_txt_47240.zip
-rw-r--r-- 1 root root 7.5M May 30 20:31 ./cards/001505/files/54916_txt_47230.zip
-rw-r--r-- 1 root root 7.3M May 30 20:31 ./cards/001505/files/51345_txt_38343.zip
-rw-r--r-- 1 root root 7.2M May 30 20:31 ./cards/001686/files/54924_txt_47244.zip
-rw-r--r-- 1 root root 6.6M May 30 20:31 ./cards/001393/files/54926_txt_47247.zip
-rw-r--r-- 1 root root 6.0M May 30 20:31 ./cards/001265/files/46817_ruby_24669.zip
-rw-r--r-- 1 root root 5.4M May 30 20:31 ./cards/001393/files/55880_ruby_50041.zip
-rw-r--r-- 1 root root 5.2M May 30 20:31 ./cards/001505/files/56842_txt_53389.zip
-rw-r--r-- 1 root root 4.7M May 30 20:31 ./cards/000212/files/33229_ruby_19203.zip
-rw-r--r-- 1 root root 4.5M May 30 20:31 ./cards/001569/files/52468_ruby_49668.zip
-rw-r--r-- 1 root root 4.1M May 30 20:31 ./cards/001688/files/54928_txt_53429.zip
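# Note: the greedy sed pattern strips everything up to the LAST dot in the full path, so files whose
# name has no extension show up as a path fragment (e.g. ./README -> /README, ./.git/HEAD -> git/HEAD).
$ find . -type f | sed 's/^.*\.\(.*\)$/\1/' | sort | uniq -c | sort -n -k1 -r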
30835 html
13179 zip
7198 png
2421 gif
1489 ebk
123 GIF
114 book
51 jpg
13 pdf
9 sample
9 css
6 js
6 JPG
6 DS_Store
5 txt
5 ttz
4 so
3 swf
3 Parent
2 idx
2 cgi
2 2
1 sit
1 /README
1 pl
1 pict
1 pack
1 /kizokeikaku/Icon_
1 /kanji_table/touyoukanji_jitaihyou/fig/table7
1 /kanji_table/touyoukanji_jitaihyou/fig/table6
1 /kanji_table/touyoukanji_jitaihyou/fig/table5
1 /kanji_table/touyoukanji_jitaihyou/fig/table4
1 /kanji_table/touyoukanji_jitaihyou/fig/table3
1 /kanji_table/touyoukanji_jitaihyou/fig/table2
1 /kanji_table/touyoukanji_jitaihyou/fig/table1
1 ini
1 ico
1 htm
1 git/refs/remotes/origin/HEAD
1 git/refs/heads/master
1 git/packed-refs
1 git/logs/refs/remotes/origin/HEAD
1 git/logs/refs/heads/master
1 git/logs/HEAD
1 git/info/exclude
1 git/index
1 git/HEAD
1 git/description
1 git/config
1 /gaiji/README
1 doc
1 /cards/README
1 /annotation/fig/Icon_
1 6
1 1