probonopd · March 18, 2024 03:35
diff --git a/babylon2quickdic.sh b/babylon2quickdic.sh
 # Convert Babylon .BGL glossaries into QuickDic dictionaries.
 #
 # Steps per glossary: download the .BGL, convert it to a tab-separated text
 # file with pyglossary, turn that into a "::"-separated chemnitz wordlist
 # with cleanup.py, then hand the wordlist to DictionaryBuilder.jar.

 # Stop at the first failing step instead of building from missing files.
 set -e

 # cleanup.py lives next to this script, not inside the pyglossary checkout,
 # so remember the script's directory before changing into pyglossary/.
 SCRIPT_DIR=$(cd "$(dirname "$0")" && pwd)

 sudo apt-get update
 sudo apt-get install -y git python-tk tix default-jre
 # Re-use an existing checkout so a second run does not abort on "git clone".
 [ -d pyglossary ] || git clone https://github.com/ilius/pyglossary.git
 cd pyglossary/

 # Get stoplists from http://members.unine.ch/jacques.savoy/clef/
 wget http://members.unine.ch/jacques.savoy/clef/germanST.txt
 wget http://members.unine.ch/jacques.savoy/clef/spanishST.txt
 wget http://members.unine.ch/jacques.savoy/clef/frenchST.txt

 wget http://dictionarypc.quickdic-dictionary.googlecode.com/git/custom_dictionary/DictionaryBuilder.jar

 # build_quickdic BGL_URL LANG1 LANG1_STOPLIST
 #
 # Downloads the glossary at BGL_URL and writes <name>.quickdic, where <name>
 # is the file name of the URL without ".BGL". LANG1 is the source language
 # code passed to DictionaryBuilder; the target language is always DE.
 build_quickdic() {
     bgl_url=$1
     lang1=$2
     lang1_stoplist=$3
     name=$(basename "$bgl_url" .BGL)

     wget "$bgl_url"

     # Convert to TXT tabfile
     python pyglossary.pyw "$name.BGL" "$name.txt"

     # Create "::" separated chemnitz format wordlist
     python "$SCRIPT_DIR/cleanup.py" "$name.txt" > "$name.chemnitz"

     # Cleanup: put a space after every semicolon
     sed -i -e 's|;|; |g' "$name.chemnitz"

     # Convert
     java -Xmx512m -jar DictionaryBuilder.jar \
      --dictOut="$name.quickdic" \
      --lang1="$lang1" \
      --lang2=DE \
      --lang1Stoplist="$lang1_stoplist" \
      --lang2Stoplist=germanST.txt \
      --dictInfo="" \
      --input1="$name.chemnitz" \
      --input1Name="" \
      --input1Charset=UTF8 \
      --input1Format=chemnitz \
      --input1FlipColumns=false
 }

 # Dictionary Details
 # Created by: ADO
 # Submitted to Babylon's Dictionary, Translation and Information
 # Platform under the title: ADO's SPANISCH-DEUTSCH
 # Number of definitions found in this dictionary: 65693
 # Source Language: Spanish
 # Target Language: German

 # http://www.babylon.com/free-dictionaries/reference/dictionaries-thesauri/ADO%27s-SPANISCH-DEUTSCH/47378.html

 build_quickdic http://dl.babylon.com/info/glossaries/B912/ADO_s_SPANISCH_DEUTSCH.BGL ES spanishST.txt

 #####

 # Similarly for French:

 # http://www.babylon.com/free-dictionaries/reference/dictionaries-thesauri/ADO%27s-FRENCH-GERMAN/45238.html

 build_quickdic http://dl.babylon.com/info/glossaries/B0B6/ADO_s_FRENCH_GERMAN.BGL FR frenchST.txt
diff --git a/cleanup.py b/cleanup.py
 #!/usr/bin/python
 # -*- coding: utf-8 -*-
 """Turn a tab-separated glossary export into a "::"-separated chemnitz wordlist.

 Usage: cleanup.py FILENAME > wordlist.chemnitz

 The input is read as UTF-8 and the wordlist is written to stdout as UTF-8.
 Diagnostics go to stderr so they never end up in the redirected wordlist.
 The module avoids the print statement so it parses on Python 3; io.open and
 the u"" literal keep it usable on Python 2.7 as well.
 """

 # This is specifically for DE-EN and EN-DE so far; needs to be extended manually for other languages

 import argparse
 import io
 import re
 import sys

 # Placeholder substituted for life dates; any line containing it is dropped.
 PERSON_MARK = "XXXPERSONXXX"

 # Life-date annotations that identify an entry as a person. They all end in
 # ")" and therefore have to be applied BEFORE parentheses are stripped.
 PERSON_PATTERNS = (
     re.compile(r"\d{4}-\d{4}\)"),
     re.compile(r"\(geboren \d{4}\)"),
     re.compile(r"\(geb\. \d{4}\)"),
     re.compile(r"\d{4} geboren\)"),
 )

 # Innermost <...>, [...] and (...) groups. Newlines are excluded so a stray
 # opener can never swallow the following entries.
 TAG_RE = re.compile(r"<[^<>\n]*>")
 SQUARE_RE = re.compile(r"\[[^\[\]\n]*\]")
 ROUND_RE = re.compile(r"\([^()\n]*\)")

 # A line containing any of these substrings is not wanted in the wordlist.
 SKIP_MARKERS = (
     # Babylon comments
     "##",
     # surnames and persons
     "Nachname", PERSON_MARK, "geboren", "family name",
     # first names
     "Vorname", "first name",
     # brand names
     "Markenname", u"®", "Inc.", "Ltd.", " Corporation", " Limited",
     "sches Unternehmen",
 )


 def _strip_enclosed(text, pattern):
     """Delete all matches of *pattern*, repeating until nested groups are gone."""
     count = 1
     while count:
         text, count = pattern.subn("", text)
     return text


 def _normalize(text):
     """Apply the whole-text substitutions and return the cleaned text."""
     text = text.replace("\t", " :: ")
     # In-definition breaks arrive as "<br>" followed by a literal backslash-n.
     text = text.replace(" <br>\\n", ", ")
     text = text.replace("<br>\\n", " ")
     text = text.replace("|", ", ")

     # Remove all tags (but keep the text between an opening and closing tag)
     text = _strip_enclosed(text, TAG_RE)

     # Mark all persons for deletion
     for pattern in PERSON_PATTERNS:
         text = pattern.sub(PERSON_MARK, text)

     # Remove all brackets together with their content
     text = _strip_enclosed(text, SQUARE_RE)
     text = _strip_enclosed(text, ROUND_RE)

     # Removing groups leaves runs of spaces and " , " behind.
     text = re.sub(r" {2,}", " ", text)
     return text.replace(" , ", ", ")


 def convert(text):
     """Return the list of chemnitz lines produced from the export *text*.

     Lines without a "::" separator and lines containing one of SKIP_MARKERS
     are dropped. A line with more "(" than ")" gets the missing ")" appended.
     """
     result = []
     for line in _normalize(text).split("\n"):
         if "::" not in line:
             continue
         if any(marker in line for marker in SKIP_MARKERS):
             continue

         # There must be as many closing as opening parentheses
         missing = line.count("(") - line.count(")")
         if missing > 0:
             sys.stderr.write("added ) -->\n")
             line = line + ")" * missing

         result.append(line)
     return result


 def main(argv=None):
     """Parse the command line, convert the named file and print the wordlist."""
     parser = argparse.ArgumentParser()
     parser.add_argument("filename")
     args = parser.parse_args(argv)

     with io.open(args.filename, encoding="utf-8") as f:
         text = f.read()

     # closefd=False: flush our wrapper on exit without closing real stdout.
     with io.open(sys.stdout.fileno(), "w", encoding="utf-8", closefd=False) as out:
         for line in convert(text):
             out.write(line + u"\n")


 if __name__ == "__main__":
     main()
diff --git a/lingoes2quickdic.sh b/lingoes2quickdic.sh
 # Convert a Lingoes .ld2 dictionary into a QuickDic dictionary.
 # http://www.lingoes.net/en/dictionary/dict_down.php?id=79ACB1ECC9C3D044BFAE9961C2B9E1B5
 #
 # All relative paths are relative to the directory the script is started
 # from: the .ld2 file is read from ../Downloads, and DictionaryBuilder.jar
 # plus both stoplists must already be present in the current directory.

 # Stop at the first failing step instead of converting a missing file.
 set -e

 # This script does not download its prerequisites; fail early if one is missing.
 for required in DictionaryBuilder.jar italianST.txt germanST.txt; do
     [ -f "$required" ] || { echo "missing $required" >&2; exit 1; }
 done

 # Extract lingoes format
 [ -d kdictionary-lingoes ] || git clone https://github.com/librehat/kdictionary-lingoes.git
 sudo apt-get install -y cmake qt5-default g++
 # Build inside the checkout; the subshell keeps the working directory unchanged.
 (cd kdictionary-lingoes && cmake . && make)
 # "y" is piped in to answer the tool's prompt (presumably an overwrite
 # confirmation - verify against the tool). The output is written straight to
 # the file name used by the steps below.
 echo "y" | ./kdictionary-lingoes/kdictionary-lingoes -i ../Downloads/ADO*ld2 -o output.chemnitz

 # Cleanup: " = " becomes the chemnitz separator, semicolons get a trailing space
 sed -i -e 's| = | :: |g' output.chemnitz
 sed -i -e 's|;|; |g' output.chemnitz

 # Convert
 java -Xmx512m -jar DictionaryBuilder.jar \
  --dictOut=ADO_s_ITALIAN_GERMAN.quickdic \
  --lang1=IT \
  --lang2=DE \
  --lang1Stoplist=italianST.txt \
  --lang2Stoplist=germanST.txt \
  --dictInfo="" \
  --input1=output.chemnitz \
  --input1Name="" \
  --input1Charset=UTF8 \
  --input1Format=chemnitz \
  --input1FlipColumns=false
	# Convert Babylon .BGL glossaries into QuickDic dictionaries.
	#
	# Steps per glossary: download the .BGL, convert it to a tab-separated text
	# file with pyglossary, turn that into a "::"-separated chemnitz wordlist
	# with cleanup.py, then hand the wordlist to DictionaryBuilder.jar.

	# Stop at the first failing step instead of building from missing files.
	set -e

	# cleanup.py lives next to this script, not inside the pyglossary checkout,
	# so remember the script's directory before changing into pyglossary/.
	SCRIPT_DIR=$(cd "$(dirname "$0")" && pwd)

	sudo apt-get update
	sudo apt-get install -y git python-tk tix default-jre
	# Re-use an existing checkout so a second run does not abort on "git clone".
	[ -d pyglossary ] || git clone https://github.com/ilius/pyglossary.git
	cd pyglossary/

	# Get stoplists from http://members.unine.ch/jacques.savoy/clef/
	wget http://members.unine.ch/jacques.savoy/clef/germanST.txt
	wget http://members.unine.ch/jacques.savoy/clef/spanishST.txt
	wget http://members.unine.ch/jacques.savoy/clef/frenchST.txt

	wget http://dictionarypc.quickdic-dictionary.googlecode.com/git/custom_dictionary/DictionaryBuilder.jar

	# build_quickdic BGL_URL LANG1 LANG1_STOPLIST
	#
	# Downloads the glossary at BGL_URL and writes <name>.quickdic, where <name>
	# is the file name of the URL without ".BGL". LANG1 is the source language
	# code passed to DictionaryBuilder; the target language is always DE.
	build_quickdic() {
	    bgl_url=$1
	    lang1=$2
	    lang1_stoplist=$3
	    name=$(basename "$bgl_url" .BGL)

	    wget "$bgl_url"

	    # Convert to TXT tabfile
	    python pyglossary.pyw "$name.BGL" "$name.txt"

	    # Create "::" separated chemnitz format wordlist
	    python "$SCRIPT_DIR/cleanup.py" "$name.txt" > "$name.chemnitz"

	    # Cleanup: put a space after every semicolon. The delimiter is a plain
	    # "|"; a backslash is not a valid sed delimiter.
	    sed -i -e 's|;|; |g' "$name.chemnitz"

	    # Convert
	    java -Xmx512m -jar DictionaryBuilder.jar \
	     --dictOut="$name.quickdic" \
	     --lang1="$lang1" \
	     --lang2=DE \
	     --lang1Stoplist="$lang1_stoplist" \
	     --lang2Stoplist=germanST.txt \
	     --dictInfo="" \
	     --input1="$name.chemnitz" \
	     --input1Name="" \
	     --input1Charset=UTF8 \
	     --input1Format=chemnitz \
	     --input1FlipColumns=false
	}

	# Dictionary Details
	# Created by: ADO
	# Submitted to Babylon's Dictionary, Translation and Information
	# Platform under the title: ADO's SPANISCH-DEUTSCH
	# Number of definitions found in this dictionary: 65693
	# Source Language: Spanish
	# Target Language: German

	# http://www.babylon.com/free-dictionaries/reference/dictionaries-thesauri/ADO%27s-SPANISCH-DEUTSCH/47378.html

	build_quickdic http://dl.babylon.com/info/glossaries/B912/ADO_s_SPANISCH_DEUTSCH.BGL ES spanishST.txt

	#####

	# Similarly for French:

	# http://www.babylon.com/free-dictionaries/reference/dictionaries-thesauri/ADO%27s-FRENCH-GERMAN/45238.html

	build_quickdic http://dl.babylon.com/info/glossaries/B0B6/ADO_s_FRENCH_GERMAN.BGL FR frenchST.txt
	#!/usr/bin/python
	# -*- coding: utf-8 -*-
	"""Turn a tab-separated glossary export into a "::"-separated chemnitz wordlist.

	Usage: cleanup.py FILENAME > wordlist.chemnitz

	The input is read as UTF-8 and the wordlist is written to stdout as UTF-8.
	Diagnostics go to stderr so they never end up in the redirected wordlist.
	The module avoids the print statement so it parses on Python 3; io.open and
	the u"" literal keep it usable on Python 2.7 as well.
	"""

	# This is specifically for DE-EN and EN-DE so far; needs to be extended manually for other languages

	import argparse
	import io
	import re
	import sys

	# Placeholder substituted for life dates; any line containing it is dropped.
	PERSON_MARK = "XXXPERSONXXX"

	# Life-date annotations that identify an entry as a person. They all end in
	# ")" and therefore have to be applied BEFORE parentheses are stripped.
	PERSON_PATTERNS = (
	    re.compile(r"\d{4}-\d{4}\)"),
	    re.compile(r"\(geboren \d{4}\)"),
	    re.compile(r"\(geb\. \d{4}\)"),
	    re.compile(r"\d{4} geboren\)"),
	)

	# Innermost <...>, [...] and (...) groups. Newlines are excluded so a stray
	# opener can never swallow the following entries.
	TAG_RE = re.compile(r"<[^<>\n]*>")
	SQUARE_RE = re.compile(r"\[[^\[\]\n]*\]")
	ROUND_RE = re.compile(r"\([^()\n]*\)")

	# A line containing any of these substrings is not wanted in the wordlist.
	SKIP_MARKERS = (
	    # Babylon comments
	    "##",
	    # surnames and persons
	    "Nachname", PERSON_MARK, "geboren", "family name",
	    # first names
	    "Vorname", "first name",
	    # brand names
	    "Markenname", u"®", "Inc.", "Ltd.", " Corporation", " Limited",
	    "sches Unternehmen",
	)


	def _strip_enclosed(text, pattern):
	    """Delete all matches of *pattern*, repeating until nested groups are gone."""
	    count = 1
	    while count:
	        text, count = pattern.subn("", text)
	    return text


	def _normalize(text):
	    """Apply the whole-text substitutions and return the cleaned text."""
	    text = text.replace("\t", " :: ")
	    # In-definition breaks arrive as "<br>" followed by a literal backslash-n.
	    text = text.replace(" <br>\\n", ", ")
	    text = text.replace("<br>\\n", " ")
	    # Plain pipe: an escaped "\|" would only match a backslash-pipe pair.
	    text = text.replace("|", ", ")

	    # Remove all tags (but keep the text between an opening and closing tag)
	    text = _strip_enclosed(text, TAG_RE)

	    # Mark all persons for deletion
	    for pattern in PERSON_PATTERNS:
	        text = pattern.sub(PERSON_MARK, text)

	    # Remove all brackets together with their content
	    text = _strip_enclosed(text, SQUARE_RE)
	    text = _strip_enclosed(text, ROUND_RE)

	    # Removing groups leaves runs of spaces and " , " behind.
	    text = re.sub(r" {2,}", " ", text)
	    return text.replace(" , ", ", ")


	def convert(text):
	    """Return the list of chemnitz lines produced from the export *text*.

	    Lines without a "::" separator and lines containing one of SKIP_MARKERS
	    are dropped. A line with more "(" than ")" gets the missing ")" appended.
	    """
	    result = []
	    for line in _normalize(text).split("\n"):
	        if "::" not in line:
	            continue
	        if any(marker in line for marker in SKIP_MARKERS):
	            continue

	        # There must be as many closing as opening parentheses
	        missing = line.count("(") - line.count(")")
	        if missing > 0:
	            sys.stderr.write("added ) -->\n")
	            line = line + ")" * missing

	        result.append(line)
	    return result


	def main(argv=None):
	    """Parse the command line, convert the named file and print the wordlist."""
	    parser = argparse.ArgumentParser()
	    parser.add_argument("filename")
	    args = parser.parse_args(argv)

	    with io.open(args.filename, encoding="utf-8") as f:
	        text = f.read()

	    # closefd=False: flush our wrapper on exit without closing real stdout.
	    with io.open(sys.stdout.fileno(), "w", encoding="utf-8", closefd=False) as out:
	        for line in convert(text):
	            out.write(line + u"\n")


	if __name__ == "__main__":
	    main()
	# Convert a Lingoes .ld2 dictionary into a QuickDic dictionary.
	# http://www.lingoes.net/en/dictionary/dict_down.php?id=79ACB1ECC9C3D044BFAE9961C2B9E1B5
	#
	# All relative paths are relative to the directory the script is started
	# from: the .ld2 file is read from ../Downloads, and DictionaryBuilder.jar
	# plus both stoplists must already be present in the current directory.

	# Stop at the first failing step instead of converting a missing file.
	set -e

	# This script does not download its prerequisites; fail early if one is missing.
	for required in DictionaryBuilder.jar italianST.txt germanST.txt; do
	    [ -f "$required" ] || { echo "missing $required" >&2; exit 1; }
	done

	# Extract lingoes format
	[ -d kdictionary-lingoes ] || git clone https://github.com/librehat/kdictionary-lingoes.git
	sudo apt-get install -y cmake qt5-default g++
	# Build inside the checkout; the subshell keeps the working directory unchanged.
	(cd kdictionary-lingoes && cmake . && make)
	# A real pipe (not an escaped "\|") feeds "y" to the tool's prompt
	# (presumably an overwrite confirmation - verify against the tool). The
	# output is written straight to the file name used by the steps below.
	echo "y" | ./kdictionary-lingoes/kdictionary-lingoes -i ../Downloads/ADO*ld2 -o output.chemnitz

	# Cleanup: " = " becomes the chemnitz separator, semicolons get a trailing
	# space. The sed delimiter is a plain "|"; a backslash is not a valid one.
	sed -i -e 's| = | :: |g' output.chemnitz
	sed -i -e 's|;|; |g' output.chemnitz

	# Convert
	java -Xmx512m -jar DictionaryBuilder.jar \
	 --dictOut=ADO_s_ITALIAN_GERMAN.quickdic \
	 --lang1=IT \
	 --lang2=DE \
	 --lang1Stoplist=italianST.txt \
	 --lang2Stoplist=germanST.txt \
	 --dictInfo="" \
	 --input1=output.chemnitz \
	 --input1Name="" \
	 --input1Charset=UTF8 \
	 --input1Format=chemnitz \
	 --input1FlipColumns=false