alrojo · March 2, 2018 11:54
diff --git a/dl.py b/dl.py
 # made by Alexander Rosenberg Johansen
 # BSD-3 License
 # Hardcoded for German, but should be easily extendable to all other wmt languages
 # notice: you will need handle-sgm.py as well
 from six.moves.urllib.request import urlretrieve
 import json
 import glob
 import subprocess
 import os
 from tqdm import tqdm
 
 # Short alias used by every step of this script. subprocess.call returns the
 # command's exit status, which the script does not inspect.
 sc = subprocess.call

 # Fetch the Perl punctuation normalizer that the normalization stage runs.
 sc(["wget", "-O", "normalize-punctuation.perl", "http://statmt.org/wmt11/normalize-punctuation.perl"])

 # -p makes this a no-op (instead of an error) when data/ already exists,
 # so the script can be re-run in the same directory.
 sc(["mkdir", "-p", "data"])
 
 # Training corpora: download each archive into data/ and unpack it there.
 # Processed in order: europarl, commoncrawl, news commentary.
 for tarball, url in (
     ("data/europarl.tgz", "statmt.org/wmt13/training-parallel-europarl-v7.tgz"),
     ("data/cc.tgz", "statmt.org/wmt13/training-parallel-commoncrawl.tgz"),
     ("data/nc.tgz", "statmt.org/wmt14/training-parallel-nc-v9.tgz"),
 ):
     sc(["wget", "-O", tarball, url])
     sc(["tar", "-xvzf", tarball, "-C", "data/"])

 # Flatten data/training/ into data/, then remove every entry whose name
 # contains fr, es, cs or ru, plus anything ending in "annotation".
 print('moving data around')
 for shell_cmd in (
     "mv ./data/training/* ./data",
     "rm -rf data/training",
     "rm -rf data/*fr*",
     "rm -rf data/*es*",
     "rm -rf data/*cs*",
     "rm -rf data/*ru*",
     "rm -rf data/*annotation",
 ):
     # shell=True is required so the shell expands the wildcards.
     sc(shell_cmd, shell=True)

 # DEV SET
 # Download and unpack the dev archive, keep only its .de/.en files in data/,
 # and delete newsdev2014.en.
 print('dev set')
 dev_tarball = "data/dev.tgz"
 sc(["wget", "-O", dev_tarball, "statmt.org/wmt14/dev.tgz"])
 sc(["tar", "-xvzf", dev_tarball, "-C", "data/"])
 for shell_cmd in (
     "mv ./data/dev/*.de ./data",
     "mv ./data/dev/*.en ./data",
     "rm -rf data/dev",
     "rm data/newsdev2014.en",
 ):
     sc(shell_cmd, shell=True)

 # TEST SET
 # Download and unpack the full test archive, run handle-sgm.py, then keep
 # only the two newstest2014 de-en files.
 # NOTE(review): handle-sgm.py is not shown here; presumably it is what
 # produces the newstest2014.deen.* files moved below -- confirm.
 print('test set')
 test_tarball = "data/test-full.tgz"
 sc(["wget", "-O", test_tarball, "statmt.org/wmt14/test-full.tgz"])
 sc(["tar", "-xvzf", test_tarball, "-C", "data/"])
 sc(["python", "handle-sgm.py"])
 for shell_cmd in (
     "mv ./data/test-full/newstest2014.deen.de ./data",
     "mv ./data/test-full/newstest2014.deen.en ./data",
     "rm -rf data/test-full",
 ):
     sc(shell_cmd, shell=True)

 # NORMALIZE PUNCTUATION
 # Run normalize-punctuation.perl over every .de/.en file in data/ and write
 # the output next to the input as <file>.prenorm.
 paths = glob.glob("data/*.de") + glob.glob("data/*.en")
 for path in paths:
     lang = path[-2:]  # the two-letter extension doubles as the -l argument
     path_norm = path + '.prenorm'
     # Hand the files to perl as stdin/stdout directly rather than building a
     # shell command line, so spaces or shell metacharacters in a path cannot
     # break (or alter) the command.
     with open(path, 'rb') as src, open(path_norm, 'wb') as dst:
         sc(['perl', 'normalize-punctuation.perl', '-l', lang], stdin=src, stdout=dst)
     print(path_norm)

 # Final assembly, executed strictly in this order: delete the un-normalized
 # .en/.de inputs, concatenate the normalized train and dev pieces, rename the
 # test files, then remove the remaining .prenorm intermediates.
 for shell_cmd in (
     "rm -rf data/*.en",
     "rm -rf data/*.de",
     "cat data/commoncrawl.de-en.de.prenorm data/europarl-v7.de-en.de.prenorm data/news-commentary-v9.de-en.de.prenorm > data/WMT2014.train.deen.de.norm",
     "cat data/commoncrawl.de-en.en.prenorm data/europarl-v7.de-en.en.prenorm data/news-commentary-v9.de-en.en.prenorm > data/WMT2014.train.deen.en.norm",
     "cat data/newssyscomb2009.de.prenorm data/news-test2008.de.prenorm data/newstest2009.de.prenorm data/newstest2010.de.prenorm data/newstest2011.de.prenorm data/newstest2012.de.prenorm data/newstest2013.de.prenorm > data/WMT2014.dev.deen.de.norm",
     "cat data/newssyscomb2009.en.prenorm data/news-test2008.en.prenorm data/newstest2009.en.prenorm data/newstest2010.en.prenorm data/newstest2011.en.prenorm data/newstest2012.en.prenorm data/newstest2013.en.prenorm > data/WMT2014.dev.deen.en.norm",
     "mv data/newstest2014.deen.de.prenorm data/WMT2014.test.deen.de.norm",
     "mv data/newstest2014.deen.en.prenorm data/WMT2014.test.deen.en.norm",
     "rm -rf data/*.prenorm",
 ):
     # The shell performs the wildcard expansion and the ">" redirection.
     sc(shell_cmd, shell=True)
	# made by Alexander Rosenberg Johansen
	# BSD-3 License
	# Hardcoded for German, but should be easily extendable to all other wmt languages
	# notice: you will need handle-sgm.py as well
	from six.moves.urllib.request import urlretrieve
	import json
	import glob
	import subprocess
	import os
	from tqdm import tqdm

	# Short alias used by every step of this script. subprocess.call returns the
	# command's exit status, which the script does not inspect.
	sc = subprocess.call

	# Fetch the Perl punctuation normalizer that the normalization stage runs.
	sc(["wget", "-O", "normalize-punctuation.perl", "http://statmt.org/wmt11/normalize-punctuation.perl"])

	# -p makes this a no-op (instead of an error) when data/ already exists,
	# so the script can be re-run in the same directory.
	sc(["mkdir", "-p", "data"])

	# Training corpora: download each archive into data/ and unpack it there.
	# Processed in order: europarl, commoncrawl, news commentary.
	for tarball, url in (
		("data/europarl.tgz", "statmt.org/wmt13/training-parallel-europarl-v7.tgz"),
		("data/cc.tgz", "statmt.org/wmt13/training-parallel-commoncrawl.tgz"),
		("data/nc.tgz", "statmt.org/wmt14/training-parallel-nc-v9.tgz"),
	):
		sc(["wget", "-O", tarball, url])
		sc(["tar", "-xvzf", tarball, "-C", "data/"])

	# Flatten data/training/ into data/, then remove every entry whose name
	# contains fr, es, cs or ru, plus anything ending in "annotation".
	print('moving data around')
	sc("mv ./data/training/* ./data", shell=True)
	sc("rm -rf data/training", shell=True)
	# The wildcards on both sides of the language code are essential: the
	# extracted files are named like <corpus>.<xx>-en.<xx>, so a bare
	# "data/fr" matches nothing and would leave those files in place.
	sc("rm -rf data/*fr*", shell=True)
	sc("rm -rf data/*es*", shell=True)
	sc("rm -rf data/*cs*", shell=True)
	sc("rm -rf data/*ru*", shell=True)
	sc("rm -rf data/*annotation", shell=True)

	# DEV SET
	# Download and unpack the dev archive, keep only its .de/.en files in data/,
	# and delete newsdev2014.en.
	print('dev set')
	dev_tarball = "data/dev.tgz"
	sc(["wget", "-O", dev_tarball, "statmt.org/wmt14/dev.tgz"])
	sc(["tar", "-xvzf", dev_tarball, "-C", "data/"])
	for shell_cmd in (
		"mv ./data/dev/*.de ./data",
		"mv ./data/dev/*.en ./data",
		"rm -rf data/dev",
		"rm data/newsdev2014.en",
	):
		sc(shell_cmd, shell=True)

	# TEST SET
	# Download and unpack the full test archive, run handle-sgm.py, then keep
	# only the two newstest2014 de-en files.
	# NOTE(review): handle-sgm.py is not shown here; presumably it is what
	# produces the newstest2014.deen.* files moved below -- confirm.
	print('test set')
	test_tarball = "data/test-full.tgz"
	sc(["wget", "-O", test_tarball, "statmt.org/wmt14/test-full.tgz"])
	sc(["tar", "-xvzf", test_tarball, "-C", "data/"])
	sc(["python", "handle-sgm.py"])
	for shell_cmd in (
		"mv ./data/test-full/newstest2014.deen.de ./data",
		"mv ./data/test-full/newstest2014.deen.en ./data",
		"rm -rf data/test-full",
	):
		sc(shell_cmd, shell=True)

	# NORMALIZE PUNCTUATION
	# Run normalize-punctuation.perl over every .de/.en file in data/ and write
	# the output next to the input as <file>.prenorm.
	# The "*" wildcard is required: "data/.de" would only match a file literally
	# named ".de", so nothing would be normalized.
	paths = glob.glob("data/*.de") + glob.glob("data/*.en")
	for path in paths:
		lang = path[-2:]  # the two-letter extension doubles as the -l argument
		path_norm = path + '.prenorm'
		# Hand the files to perl as stdin/stdout directly rather than building a
		# shell command line, so spaces or shell metacharacters in a path cannot
		# break (or alter) the command.
		with open(path, 'rb') as src, open(path_norm, 'wb') as dst:
			sc(['perl', 'normalize-punctuation.perl', '-l', lang], stdin=src, stdout=dst)
		print(path_norm)

	# Final assembly, executed strictly in this order: delete the un-normalized
	# .en/.de inputs, concatenate the normalized train and dev pieces, rename the
	# test files, then remove the remaining .prenorm intermediates.
	for shell_cmd in (
		"rm -rf data/*.en",
		"rm -rf data/*.de",
		"cat data/commoncrawl.de-en.de.prenorm data/europarl-v7.de-en.de.prenorm data/news-commentary-v9.de-en.de.prenorm > data/WMT2014.train.deen.de.norm",
		"cat data/commoncrawl.de-en.en.prenorm data/europarl-v7.de-en.en.prenorm data/news-commentary-v9.de-en.en.prenorm > data/WMT2014.train.deen.en.norm",
		"cat data/newssyscomb2009.de.prenorm data/news-test2008.de.prenorm data/newstest2009.de.prenorm data/newstest2010.de.prenorm data/newstest2011.de.prenorm data/newstest2012.de.prenorm data/newstest2013.de.prenorm > data/WMT2014.dev.deen.de.norm",
		"cat data/newssyscomb2009.en.prenorm data/news-test2008.en.prenorm data/newstest2009.en.prenorm data/newstest2010.en.prenorm data/newstest2011.en.prenorm data/newstest2012.en.prenorm data/newstest2013.en.prenorm > data/WMT2014.dev.deen.en.norm",
		"mv data/newstest2014.deen.de.prenorm data/WMT2014.test.deen.de.norm",
		"mv data/newstest2014.deen.en.prenorm data/WMT2014.test.deen.en.norm",
		"rm -rf data/*.prenorm",
	):
		# The shell performs the wildcard expansion and the ">" redirection.
		sc(shell_cmd, shell=True)