jeremy-rutman · June 25, 2020 09:14
diff --git a/torchtext_info.txt b/torchtext_info.txt
 Some notes on torchtext

 You can read a CSV file and build the vocabulary like this:

 tokenize = lambda x: str(x).split()  # see if this fixes float vs. string error
    TEXT = data.Field(sequential=True, tokenize=tokenize, lower=True, include_lengths=True, batch_first=True,
                       fix_length=200)

    LABEL = data.LabelField()  # LABEL = data.LabelField(tensor_type=torch.FloatTensor)
    fields = {'textcol': ('text', TEXT), 'real': ('label',LABEL)}
    train_data, test_data, validate_data = data.TabularDataset.splits(
        path=path,
        train=trainfile,
        test=testfile,
        validation=validationfile,
        format='csv',  #csv_reader_params=
        fields=fields
    )
    TEXT.build_vocab(train_data, vectors=GloVe(name='6B', dim=300))

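 You can check that out-of-vocab words (words with no pretrained GloVe vector) have been mapped to zero vectors by getting the word's index, then looking at the vector stored at that index (22 and 5202 below are simply the indices stoi returned in one run)
 TEXT.vocab.stoi['hello']
 TEXT.vocab.vectors[22]
 TEXT.vocab.stoi['shmoogywoogies']
 TEXT.vocab.vectors[5202]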
	Some notes on torchtext

	You can read a CSV file and build the vocabulary like this:

	tokenize = lambda x: str(x).split() # see if this fixes float vs. string error
	TEXT = data.Field(sequential=True, tokenize=tokenize, lower=True, include_lengths=True, batch_first=True,
	fix_length=200)

	LABEL = data.LabelField() # LABEL = data.LabelField(tensor_type=torch.FloatTensor)
	fields = {'textcol': ('text', TEXT), 'real': ('label',LABEL)}
	train_data, test_data, validate_data = data.TabularDataset.splits(
	path=path,
	train=trainfile,
	test=testfile,
	validation=validationfile,
	format='csv', #csv_reader_params=
	fields=fields
	)
	TEXT.build_vocab(train_data, vectors=GloVe(name='6B', dim=300))

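	You can check that out-of-vocab words (words with no pretrained GloVe vector) have been mapped to zero vectors by getting the word's index, then looking at the vector stored at that index (22 and 5202 below are simply the indices stoi returned in one run)
	TEXT.vocab.stoi['hello']
	TEXT.vocab.vectors[22]
	TEXT.vocab.stoi['shmoogywoogies']
	TEXT.vocab.vectors[5202]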
No results found