Created
April 24, 2021 16:52
-
-
Save hrishikeshrt/14297fc93b612050915bb93942c64460 to your computer and use it in GitHub Desktop.
Download stardict dictionaries from indic-dict
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3 | |
# -*- coding: utf-8 -*- | |
""" | |
Created on Sat Apr 24 19:25:34 2021 | |
@author: Hrishikesh Terdalkar | |
Original: https://github.com/sanskrit-coders/pydictupdater | |
""" | |
############################################################################### | |
import os | |
import tarfile | |
from urllib.request import urlopen | |
from urllib.error import URLError, HTTPError | |
############################################################################### | |
# Common URL prefix shared by every index location below.
INDEX_BASE = "https://raw.githubusercontent.com/indic-dict/"

# Path appended to an index location to reach the file listing the archives.
INDEX_SUFFIX = "tars/tars.MD"

# Language name -> index locations, each relative to INDEX_BASE.
INDEX_LIST = {
    "sanskrit": [
        "stardict-sanskrit/gh-pages/sa-head/sa-entries/",
        "stardict-sanskrit/gh-pages/sa-head/en-entries/",
        "stardict-sanskrit/gh-pages/en-head/",
        "stardict-sanskrit-vyAkaraNa/gh-pages/",
        "stardict-sanskrit-kAvya/gh-pages/",
    ],
    "marathi": [
        "stardict-marathi/gh-pages/ma-head/ma-entries/",
        "stardict-marathi/gh-pages/ma-head/other-entries/",
    ],
    "bengali": [
        "stardict-bengali/gh-pages/bn-head/bn-entries/",
        "stardict-bengali/gh-pages/bn-head/en-entries/",
    ],
    "hindi": [
        "stardict-hindi/gh-pages/hi-head/hi-entries/",
        "stardict-hindi/gh-pages/hi-head/en-entries/",
        "stardict-hindi/gh-pages/en-head/",
    ],
    "english": [
        "stardict-english/gh-pages/en-head/en-entries/",
        "stardict-english/gh-pages/en-head/other-entries/",
        "stardict-english/gh-pages/other-head/",
    ],
}
############################################################################### | |
def vprint(*args, **kwargs):
    """Accept any arguments and do nothing.

    This is the quiet stand-in for ``print``; the script entry point rebinds
    the name to ``print`` when verbose output is requested.
    """
    return None
############################################################################### | |
def download_file(url, dir, force_download=True):
    """Download ``url`` into the directory ``dir``.

    ``dir`` is created if it does not exist, and the file is stored under the
    last path component of ``url``.

    :param url: address of the file to fetch.
    :param dir: destination directory (name kept for existing callers, even
        though it shadows the builtin).
    :param force_download: when false and the destination file already
        exists, nothing is fetched and a notice is printed instead.
    :return: ``None``.  HTTP and URL errors are printed, not raised.
    """
    import shutil  # local import: only this function streams a response

    os.makedirs(dir, exist_ok=True)
    localpath = os.path.join(dir, os.path.basename(url))

    # Decide whether to skip before touching the network, so an existing
    # file costs no request at all.
    if not force_download and os.path.isfile(localpath):
        print(f"Skipped '{localpath}' as it already exists")
        return

    # Stream into a temporary name and rename on success, so an interrupted
    # transfer never leaves a truncated file that a later non-forced run
    # would take for a finished download.
    partial_path = localpath + ".part"
    try:
        with urlopen(url) as response, open(partial_path, "wb") as local_file:
            shutil.copyfileobj(response, local_file)
        os.replace(partial_path, localpath)
    except HTTPError as e:
        print("HTTP Error:", e.code, url)
    except URLError as e:
        print("URL Error:", e.reason, url)
def get_list_of_download_files(index_url):
    """Return the archive URLs listed in the index at ``index_url``.

    The index is fetched and read line by line; every non-blank line is
    decoded as UTF-8, stripped of surrounding whitespace and taken to be one
    URL (expected to point to a .tar.gz file).

    :param index_url: address of the index file.
    :return: list of URL strings, in index order.

    Errors raised by ``urlopen`` are not caught here.
    """
    encoding = 'utf-8'
    vprint(f"Processing index '{index_url}' ...")

    # The context manager closes the HTTP response once the index is read.
    with urlopen(index_url) as response:
        decoded_lines = (
            raw_line.decode(encoding).strip() for raw_line in response
        )
        # Blank lines carry no URL; passing "" on would make the later
        # download fail on an empty address.
        return [dict_url for dict_url in decoded_lines if dict_url]
def download_and_extract_dictionary(dict_url, download_dir, extract_dir,
                                    force_download=False):
    """Download the archive at ``dict_url`` and unpack it under ``extract_dir``.

    The archive is saved in ``download_dir``.  Its contents are extracted
    into a sub-directory of ``extract_dir`` named after the archive, with the
    ``.tar.gz`` suffix and everything from the first ``__`` onwards removed,
    e.g. ``kRdanta-rUpa-mAlA__2016-02-20_23-22-27.tar.gz`` is extracted into
    ``kRdanta-rUpa-mAlA``.

    :param dict_url: address of the archive.
    :param download_dir: directory in which the archive is stored.
    :param extract_dir: directory under which the archive is unpacked.
    :param force_download: passed through to ``download_file``.
    :return: ``None``.  If the archive is not present after the download
        attempt, a notice is printed and nothing is extracted.
    """
    archive_suffix = ".tar.gz"
    dictfilename = os.path.basename(dict_url)
    vprint(f"Downloading '{dictfilename}', to '{download_dir}' ...")
    download_file(dict_url, download_dir, force_download)

    archive_path = os.path.join(download_dir, dictfilename)
    if not os.path.isfile(archive_path):
        # download_file reports HTTP/URL errors by printing rather than
        # raising, so a missing archive here means there is nothing to open.
        print(f"Skipped extraction of '{dictfilename}' as it was not downloaded")
        return

    # Remove exactly the archive suffix; slicing a fixed number of characters
    # would mangle names that carry no "__<timestamp>" part.
    stem = dictfilename
    if stem.endswith(archive_suffix):
        stem = stem[:-len(archive_suffix)]
    # Handle filenames like: kRdanta-rUpa-mAlA__2016-02-20_23-22-27
    sub_dirname_to_extract = stem.split("__")[0]
    full_path_of_subdir = os.path.join(extract_dir, sub_dirname_to_extract)
    vprint(f"Extracting to '{full_path_of_subdir}' ...")

    with tarfile.open(archive_path, 'r') as archive:
        # The archive comes from the network: where the interpreter provides
        # extraction filters, use the "data" filter so members cannot escape
        # the destination directory.
        if hasattr(tarfile, "data_filter"):
            archive.extractall(full_path_of_subdir, filter="data")
        else:
            archive.extractall(full_path_of_subdir)
def download_dictionaries(index_base, index_list, index_suffix, language,
                          tgz_download_dir, dict_extract_dir,
                          maxcount=1, force_download=False):
    """Download and extract the dictionaries listed for ``language``.

    For every index location registered under ``language`` in ``index_list``
    the index URL is built as ``index_base + location + index_suffix``, the
    archive URLs in it are read, and each archive is downloaded and
    extracted.

    :param index_base: common URL prefix of all index locations.
    :param index_list: mapping of language name to index locations.
    :param index_suffix: path appended to each location to reach the index.
    :param language: key of ``index_list`` to process; an unknown key raises
        ``KeyError``.
    :param tgz_download_dir: directory in which archives are stored.
    :param dict_extract_dir: directory under which archives are unpacked.
    :param maxcount: maximum number of dictionaries to process, counted
        across all indices of the language; zero or a negative value
        (e.g. ``-1``) means no limit.
    :param force_download: passed through to the download step.
    :return: ``None``.
    """
    # The limit only applies to positive values; the count itself can never
    # be negative, so the "no limit" decision has to look at maxcount.
    unlimited = maxcount <= 0
    count = 0
    for index_url in index_list[language]:
        full_index_path = index_base + index_url + index_suffix
        vprint("============================================")
        vprint(f"Downloading index '{full_index_path}'.")
        for adict in get_list_of_download_files(full_index_path):
            download_and_extract_dictionary(adict, tgz_download_dir,
                                            dict_extract_dir, force_download)
            count += 1
            if not unlimited and count >= maxcount:
                return
    vprint("============================================")
def get_master_list_to_download(base, index_list, index_suffix=""):
    """Return the archive URLs listed by every index of every language.

    Each index URL is built as ``base + location + index_suffix`` for all
    locations in ``index_list``; the URLs found in the indices are
    concatenated in iteration order.

    :param base: common URL prefix of all index locations.
    :param index_list: mapping of language name to index locations.
    :param index_suffix: path appended to each location to reach the index
        file.  Defaults to ``""`` so that existing calls build the same URLs
        as before; pass the same suffix used with ``download_dictionaries``
        when the locations are directories.
    :return: list of URL strings.
    """
    masterlist = []
    for language_urls in index_list.values():
        for index_url in language_urls:
            full_index_path = base + index_url + index_suffix
            vprint("============================================")
            vprint(f"Fetching index '{full_index_path}' ...")
            masterlist.extend(get_list_of_download_files(full_index_path))
    vprint("============================================")
    return masterlist
############################################################################### | |
if __name__ == '__main__':
    # Command-line entry point: download and extract the dictionaries of the
    # requested languages.
    import argparse
    import sys
    import tempfile

    # Defaults: archives are stored in the system temporary directory,
    # extracted dictionaries in the user's home directory.
    default_download = os.path.join(tempfile.gettempdir(), "dictdata")
    default_install = os.path.join(os.path.expanduser('~'), "dictdata")
    default_languages = ["sanskrit", "english"]

    parser = argparse.ArgumentParser()
    parser.add_argument("-d", "--download", default=default_download,
                        help="Download Location")
    parser.add_argument("-i", "--install", default=default_install,
                        help="Install Location")
    parser.add_argument("-l", "--languages", nargs="+",
                        default=default_languages, help="Languages")
    parser.add_argument("-v", "--verbose", action='store_true',
                        help="Verbose")
    parser.add_argument("-f", "--force", action='store_true',
                        help="Force Download")
    args = parser.parse_args()

    # ----------------------------------------------------------------------- #

    install_dir = args.install
    download_dir = args.download
    languages = args.languages
    force_download = args.force
    verbose = args.verbose

    # ----------------------------------------------------------------------- #

    if verbose:
        # Rebinding the module-level name is enough: the functions above look
        # `vprint` up at call time.
        vprint = print

    for language in languages:
        if language not in INDEX_LIST:
            # An error must be visible even without --verbose, so it does not
            # go through vprint.
            print(f"Error: Language '{language}' not found.", file=sys.stderr)
            continue

        vprint("============================================")
        vprint(f"Language: {language.title()}")
        # maxcount=-1: no limit on the number of dictionaries per language.
        download_dictionaries(INDEX_BASE, INDEX_LIST, INDEX_SUFFIX, language,
                              download_dir, install_dir, maxcount=-1,
                              force_download=force_download)
        vprint("============================================")
############################################################################### |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment