Last active
December 20, 2015 08:49
-
-
Save coderiot/6103574 to your computer and use it in GitHub Desktop.
Download and process the ngram dataset from http://books.google.com/ngrams, filtered by language and ngram length, using GNU Parallel.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/bash
# Download Google Books ngram dataset files for a given language and ngram
# length, running several downloads in parallel with GNU parallel's `sem`.
#
# Usage: download_ngrams.sh LANGUAGE N
#   LANGUAGE - one of the corpora listed in `langs` below
#   N        - ngram length, 1-5
#
# Exits 1 with a usage message on invalid arguments.
set -u

langs=('eng' 'eng-1M' 'eng-us' 'eng-gb' 'eng-fiction' 'chi-sim' 'fre' 'ger' 'heb' 'ita' 'rus' 'spa')

usage() {
  echo "Usage: $0 [LANGUAGE] [N]" >&2
}

# Require both arguments.
if [ $# -le 1 ]; then
  usage
  exit 1
fi

# Validate the language against the supported list. Fixed-string (-F),
# full-line (-x) matching so regex metacharacters in $1 cannot cause
# false positives (the original used `grep "^$1$"`, treating $1 as a regex).
if ! printf '%s\n' "${langs[@]}" | grep -Fxq -- "$1"; then
  usage
  echo "supported LANGUAGES: ${langs[*]}" >&2
  exit 1
fi

# Validate N numerically. The original `[[ $2 < 1 || $2 > 5 ]]` compared
# lexicographically, so e.g. N=10 was accepted ("10" sorts before "5").
if ! [[ "$2" =~ ^[0-9]+$ ]] || (( $2 < 1 || $2 > 5 )); then
  usage
  echo "supported NGRAMS length: 1-5" >&2
  exit 1
fi

# Scrape the dataset index page for download links matching the requested
# language and ngram length. The index uses single-quoted href attributes.
# sort -u (not bare uniq) removes duplicates even when non-adjacent.
urls=$(wget -qO - http://storage.googleapis.com/books/ngrams/books/datasetsv2.html \
  | grep -Eo "href='[^']*$1-all-${2}gram-[0-9]+-[a-z0-9_]+\.gz'" \
  | sed -r "s/^href='(.+\.gz)'$/\1/" \
  | sort -u)

# Run one download job per CPU core (-j+0) via GNU parallel's sem.
# $urls is deliberately unquoted: URLs contain no whitespace and we rely
# on word splitting to iterate the newline-separated list.
for url in $urls; do
  echo "processing ${url##*/}" >&2
  # download, get first column, delete duplicates and remove tags like _NOUN
  # sem -j+0 "wget -qO - $url | gzip -c -d | cut -f1 | uniq | sed 's/_[A-Z]*//g' | uniq"
  sem -j+0 "wget -q $url"
done

# Wait for all background download jobs to finish.
sem --wait
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment