Last active
March 18, 2024 03:35
-
-
Save probonopd/49b2cb67aef4370ba4c7 to your computer and use it in GitHub Desktop.
Convert Babylon dictionary to QuickDic. It is unclear under which license the Babylon "free" dictionaries are published. They appear to be crowd-sourced and there is no apparent license.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/sh
# Convert Babylon .BGL glossaries into QuickDic dictionaries.
#
# For each glossary: download the .BGL, convert it to a tab-separated TXT file
# with PyGlossary, turn that into a "::"-separated chemnitz word list with
# cleanup.py, and feed the word list to DictionaryBuilder.

# Stop at the first failing step instead of building from missing files.
set -e

sudo apt-get update
sudo apt-get install -y git python-tk tix default-jre
git clone https://github.com/ilius/pyglossary.git
cd pyglossary/
# NOTE(review): cleanup.py is run from this directory, but no command here
# fetches it -- presumably it has to be copied in beforehand; confirm.

# Get stoplists from http://members.unine.ch/jacques.savoy/clef/
wget http://members.unine.ch/jacques.savoy/clef/germanST.txt
wget http://members.unine.ch/jacques.savoy/clef/spanishST.txt
wget http://members.unine.ch/jacques.savoy/clef/frenchST.txt
wget http://dictionarypc.quickdic-dictionary.googlecode.com/git/custom_dictionary/DictionaryBuilder.jar

# build_quickdic BGL_URL LANG1 LANG2 LANG1_STOPLIST LANG2_STOPLIST
#
# Downloads the glossary at BGL_URL and writes <name>.quickdic, where <name>
# is the file name of BGL_URL without its .BGL suffix. The intermediate
# <name>.txt and <name>.chemnitz files are left in the current directory.
build_quickdic() {
    bgl_url=$1
    lang1=$2
    lang2=$3
    lang1_stoplist=$4
    lang2_stoplist=$5
    name=$(basename "$bgl_url" .BGL)

    wget "$bgl_url"
    # Convert to TXT tabfile
    python pyglossary.pyw "$name.BGL" "$name.txt"
    # Create "::" separated chemnitz format wordlist
    python cleanup.py "$name.txt" > "$name.chemnitz"
    # Cleanup: put a space after every semicolon
    sed -i -e 's|;|; |g' "$name.chemnitz"
    # Convert
    java -Xmx512m -jar DictionaryBuilder.jar \
        --dictOut="$name.quickdic" \
        --lang1="$lang1" \
        --lang2="$lang2" \
        --lang1Stoplist="$lang1_stoplist" \
        --lang2Stoplist="$lang2_stoplist" \
        --dictInfo="" \
        --input1="$name.chemnitz" \
        --input1Name="" \
        --input1Charset=UTF8 \
        --input1Format=chemnitz \
        --input1FlipColumns=false
}

# Dictionary Details
# Created by: ADO
# Submitted to Babylon's Dictionary, Translation and Information
# Platform under the title: ADO's SPANISCH-DEUTSCH
# Number of definitions found in this dictionary: 65693
# Source Language: Spanish
# Target Language: German
# http://www.babylon.com/free-dictionaries/reference/dictionaries-thesauri/ADO%27s-SPANISCH-DEUTSCH/47378.html
build_quickdic http://dl.babylon.com/info/glossaries/B912/ADO_s_SPANISCH_DEUTSCH.BGL \
    ES DE spanishST.txt germanST.txt

#####
# Similarly for French:
# http://www.babylon.com/free-dictionaries/reference/dictionaries-thesauri/ADO%27s-FRENCH-GERMAN/45238.html
build_quickdic http://dl.babylon.com/info/glossaries/B0B6/ADO_s_FRENCH_GERMAN.BGL \
    FR DE frenchST.txt germanST.txt
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/python
# -*- coding: utf-8 -*-
"""Turn a PyGlossary tab-separated export into a "::"-separated chemnitz word list.

Usage: cleanup.py FILENAME > wordlist.chemnitz

Entries are written to stdout, one per line. Diagnostics go to stderr so that
they never end up in the redirected word list.
"""
# This is specifically for DE-EN and EN-DE so far; needs to be extended manually for other languages
import argparse
import re
import sys

# Placeholder substituted for life dates so the whole entry can be dropped later.
_PERSON_MARKER = 'XXXPERSONXXX'

# Life dates / birth years that identify an entry as being about a person.
_PERSON_PATTERNS = tuple(re.compile(pattern) for pattern in (
    r'\d\d\d\d-\d\d\d\d\)',
    r'\(geboren \d\d\d\d\)',
    r'\(geb\. \d\d\d\d\)',
    r'\(\d\d\d\d geboren\)',
))

# Each pattern matches one innermost <...>, [...] or (...) group. The negated
# character classes keep a match from running across a second group (and the
# "::" separator between them) or across a line break.
_TAG = re.compile(r'<[^<>\n]*>')
_SQUARE_BRACKETS = re.compile(r'\[[^\[\]\n]*\]')
_ROUND_BRACKETS = re.compile(r'\([^()\n]*\)')

# An entry containing any of these substrings is dropped.
_SKIP_MARKERS = (
    # Surnames and persons.
    # NOTE(review): a bare 'geboren' also drops entries whose translation is
    # the ordinary word "geboren" -- kept as in the original filter; confirm.
    "Nachname", _PERSON_MARKER, 'geboren', "family name",
    # First names
    "Vorname", "first name",
    # Brand names
    "Markenname", "®", "Inc.", "Ltd.", " Corporation", " Limited",
    "sches Unternehmen",
)


def _remove_all(pattern, text):
    """Delete matches of *pattern* repeatedly so nested groups vanish too."""
    previous = None
    while previous != text:
        previous = text
        text = pattern.sub('', text)
    return text


def clean_text(text):
    """Return *text* with separators normalised and markup removed.

    Tabs become " :: ", escaped "<br>\\n" line breaks and "|" become list
    separators, person dates are replaced by a marker, and tags as well as
    square/round bracket groups are deleted.
    """
    text = text.replace("\t", " :: ")
    text = text.replace(" <br>\\n", ", ")
    text = text.replace("<br>\\n", " ")
    text = text.replace("|", ", ")
    # Remove all tags
    text = _remove_all(_TAG, text)
    # Mark all persons for deletion. This has to happen before the brackets
    # are removed, because the dates sit inside parentheses.
    for pattern in _PERSON_PATTERNS:
        text = pattern.sub(_PERSON_MARKER, text)
    # Remove all brackets
    text = _remove_all(_SQUARE_BRACKETS, text)
    text = _remove_all(_ROUND_BRACKETS, text)
    # Tidy up the whitespace the removals leave behind.
    text = re.sub(r" {2,}", " ", text)
    return text.replace(" , ", ", ")


def filter_entries(text, log=None):
    """Yield the lines of cleaned *text* that should go into the word list.

    Lines without "::" are skipped, as are Babylon comments ("##") and names
    of persons and brands. Unclosed "(" are closed at the end of the line.
    Long-winded explanations are kept unchanged.

    log: optional file-like object that receives a note for every repaired line.
    """
    for line in text.split("\n"):
        if "::" not in line:
            continue
        # We do not want Babylon comments
        if "##" in line:
            continue
        if any(marker in line for marker in _SKIP_MARKERS):
            continue
        # There must be as many closing as opening parentheses
        missing = line.count('(') - line.count(')')
        if missing > 0:
            if log is not None:
                log.write("added ) --> " + line + "\n")
            line += ")" * missing
        yield line


def _read_text(path):
    """Return the content of *path* as native text (UTF-8 on Python 3)."""
    if sys.version_info[0] < 3:
        # Python 2: byte strings throughout, exactly as the file stores them.
        with open(path) as handle:
            return handle.read()
    with open(path, encoding="utf-8") as handle:
        return handle.read()


def main(argv=None):
    """Clean the file named on the command line and print its entries."""
    parser = argparse.ArgumentParser(
        description='Convert a PyGlossary tabfile into a "::"-separated '
                    'chemnitz word list on stdout.')
    parser.add_argument("filename")
    args = parser.parse_args(argv)

    # Emit UTF-8 regardless of the locale where Python allows choosing it.
    if hasattr(sys.stdout, "reconfigure"):
        sys.stdout.reconfigure(encoding="utf-8")

    cleaned = clean_text(_read_text(args.filename))
    for line in filter_entries(cleaned, log=sys.stderr):
        print(line)


if __name__ == "__main__":
    main()
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Convert a Lingoes .ld2 dictionary into a QuickDic dictionary.
#
# http://www.lingoes.net/en/dictionary/dict_down.php?id=79ACB1ECC9C3D044BFAE9961C2B9E1B5
# The downloaded ADO*ld2 file is expected in ../Downloads, relative to the
# directory the clone below is made in.

# Extract lingoes format
git clone https://github.com/librehat/kdictionary-lingoes.git
sudo apt-get install -y cmake qt5-default g++
# Build inside the clone; "cmake ." needs the project's own directory.
cd kdictionary-lingoes
cmake .
make
# "y" answers the extractor's confirmation prompt. The result goes straight
# to the file name that the cleanup and conversion steps below work on.
echo "y" | ./kdictionary-lingoes -i ../../Downloads/ADO*ld2 -o ../output.chemnitz
cd ..

# Cleanup: " = " separates the two columns in the extractor output; the
# chemnitz format wants " :: ". Also put a space after every semicolon.
sed -i -e 's| = | :: |g' output.chemnitz
sed -i -e 's|;|; |g' output.chemnitz

# Convert
# NOTE(review): DictionaryBuilder.jar, italianST.txt and germanST.txt are not
# fetched by any command here -- they must already be in the current
# directory; confirm where they are expected to come from.
java -Xmx512m -jar DictionaryBuilder.jar \
    --dictOut=ADO_s_ITALIAN_GERMAN.quickdic \
    --lang1=IT \
    --lang2=DE \
    --lang1Stoplist=italianST.txt \
    --lang2Stoplist=germanST.txt \
    --dictInfo="" \
    --input1=output.chemnitz \
    --input1Name="" \
    --input1Charset=UTF8 \
    --input1Format=chemnitz \
    --input1FlipColumns=false
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment