aanastasiou · February 13, 2018 11:24
diff --git a/README.md b/README.md
diff --git a/main.py b/main.py
 '''Athanasios Anastasiou Feb 2018

 A brief script that loads word vectors from a text file into a spaCy model and saves the result.
 '''

 import sys
 import spacy
 import numpy

 # The script builds on the en_core_web_md model package; stop early with a
 # pointer to the download page when that package cannot be imported.
 try:
    import en_core_web_md
 except ImportError:
    # Only a failed import means "not installed". Any other exception raised
    # while importing the package is left to propagate with its traceback
    # instead of being misreported here.
    sys.stdout.write("en_core_web_md not installed, it can be obtained from:https://spacy.io/usage/models\n")
    sys.exit(-1)
    


 if __name__ == "__main__":
    # Load the base model whose vector table is going to be replaced.
    nlp = en_core_web_md.load()  # This will take some time

    # Number of numeric fields that end every row of vectors.txt, i.e. the
    # length of each word vector.
    # NOTE(review): the original author observed this "seems to be NVectors+1",
    # where NVectors is the vector-size parameter passed to GloVe -- confirm
    # against the settings used to produce vectors.txt.
    width = 300

    with open("./vectors.txt") as fd:
        # Drop the model's existing vectors and size the new table from
        # `width`, so changing the parameter above actually takes effect.
        nlp.vocab.reset_vectors(width=width)

        # Rows are processed one at a time rather than loading and parsing the
        # whole file up front.
        for line_no, row in enumerate(fd, start=1):
            fields = row.split()
            if not fields:
                # Blank (e.g. trailing) lines carry no vector.
                continue
            if len(fields) < width:
                raise ValueError(
                    "vectors.txt line {}: expected at least {} fields, found {}".format(
                        line_no, width, len(fields)))

            # GloVe can emit keys that contain spaces, which interferes with
            # split(). The last `width` fields are therefore taken as the
            # vector and whatever precedes them is re-joined into the key.
            key = " ".join(fields[:-width])
            values = [float(field) for field in fields[-width:]]
            nlp.vocab.set_vector(key, numpy.array(values))

    # Replace the metadata inherited from the base model with this model's own.
    # NOTE(review): the 'email' value looks like an obfuscation placeholder
    # rather than a real address -- confirm before publishing the model.
    metaData = {'author': 'Athanasios Anastasiou',
                'description': 'Tech vocab',
                'email': '[email protected]',
                'lang': 'en',
                'license': 'CC BY-SA 3.0',
                'name': 'techVocab_test',
                'parent_package': 'spacy',
                'pipeline': ['tagger', 'parser', 'ner'],
                'spacy_version': '>=2.0.0a18',
                'version': '1.0.0'}
    nlp.meta = metaData

    # Now that the model is ready it needs to be saved.
    nlp.to_disk('../ModelInputDir')
	'''Athanasios Anastasiou Feb 2018

	A brief script that loads word vectors from a text file into a spaCy model and saves the result.
	'''

	import sys
	import spacy
	import numpy

	# The script builds on the en_core_web_md model package; stop early with a
	# pointer to the download page when that package cannot be imported.
	try:
	    import en_core_web_md
	except ImportError:
	    # Only a failed import means "not installed". Any other exception raised
	    # while importing the package is left to propagate with its traceback
	    # instead of being misreported here.
	    sys.stdout.write("en_core_web_md not installed, it can be obtained from:https://spacy.io/usage/models\n")
	    sys.exit(-1)



	if __name__ == "__main__":
	    # Load the base model whose vector table is going to be replaced.
	    nlp = en_core_web_md.load()  # This will take some time

	    # Number of numeric fields that end every row of vectors.txt, i.e. the
	    # length of each word vector.
	    # NOTE(review): the original author observed this "seems to be NVectors+1",
	    # where NVectors is the vector-size parameter passed to GloVe -- confirm
	    # against the settings used to produce vectors.txt.
	    width = 300

	    with open("./vectors.txt") as fd:
	        # Drop the model's existing vectors and size the new table from
	        # `width`, so changing the parameter above actually takes effect.
	        nlp.vocab.reset_vectors(width=width)

	        # Rows are processed one at a time rather than loading and parsing
	        # the whole file up front.
	        for line_no, row in enumerate(fd, start=1):
	            fields = row.split()
	            if not fields:
	                # Blank (e.g. trailing) lines carry no vector.
	                continue
	            if len(fields) < width:
	                raise ValueError(
	                    "vectors.txt line {}: expected at least {} fields, found {}".format(
	                        line_no, width, len(fields)))

	            # GloVe can emit keys that contain spaces, which interferes with
	            # split(). The last `width` fields are therefore taken as the
	            # vector and whatever precedes them is re-joined into the key.
	            key = " ".join(fields[:-width])
	            values = [float(field) for field in fields[-width:]]
	            nlp.vocab.set_vector(key, numpy.array(values))

	    # Replace the metadata inherited from the base model with this model's own.
	    # NOTE(review): the 'email' value looks like an obfuscation placeholder
	    # rather than a real address -- confirm before publishing the model.
	    metaData = {'author': 'Athanasios Anastasiou',
	                'description': 'Tech vocab',
	                'email': '[email protected]',
	                'lang': 'en',
	                'license': 'CC BY-SA 3.0',
	                'name': 'techVocab_test',
	                'parent_package': 'spacy',
	                'pipeline': ['tagger', 'parser', 'ner'],
	                'spacy_version': '>=2.0.0a18',
	                'version': '1.0.0'}
	    nlp.meta = metaData

	    # Now that the model is ready it needs to be saved.
	    nlp.to_disk('../ModelInputDir')