vallantin · October 29, 2018 14:19
diff --git a/005-rasa.py b/005-rasa.py
 from rasa_nlu.training_data import load_data
 from rasa_nlu.config import RasaNLUModelConfig
 from rasa_nlu.model import Trainer, Metadata, Interpreter
 from rasa_nlu import config

 from bs4 import BeautifulSoup
 from urllib.request import Request, urlopen
 import jsonpickle

 def remove_html(filename):
     """Extract the visible text of a saved HTML page as a single line.

     The file is parsed with the ``lxml`` parser, every ``<script>``,
     ``<noscript>`` and ``<style>`` element is removed, and the text of the
     first ``div`` matching class ``centerColAlign`` is collapsed so that
     tokens are separated by exactly one space.

     Args:
         filename: Path of the HTML file to read.

     Returns:
         The text of the target ``div`` with runs of spaces and newlines
         collapsed to single spaces and no leading or trailing space.

     Raises:
         ValueError: If no ``div`` with class ``centerColAlign`` is found.
     """
     # Close the file deterministically instead of leaking the handle.
     with open(filename) as page:
         soup = BeautifulSoup(page, 'lxml')

     # Drop non-visible content so it does not end up in get_text().
     for tag in soup.find_all(['script', 'noscript', 'style']):
         tag.extract()

     container = soup.find('div', {'class': 'centerColAlign'})
     if container is None:
         raise ValueError(
             "no <div class='centerColAlign'> found in {!r}".format(filename)
         )
     raw_txt = container.get_text()

     # Only spaces and newlines act as separators (other whitespace such as
     # tabs stays inside tokens); empty pieces from repeated separators are
     # discarded.
     tokens = raw_txt.replace('\n', ' ').split(' ')
     return ' '.join(token for token in tokens if token)

 def train(data, config_file, model_dir):
     """Train a Rasa NLU model and persist it under ``model_dir``.

     Args:
         data: Training-data location handed to ``load_data``.
         config_file: Pipeline configuration location handed to
             ``config.load``.
         model_dir: Target directory handed to ``Trainer.persist``; the model
             is stored under the fixed name ``amazon``.

     Returns:
         The value returned by ``Trainer.persist`` (presumably the path of the
         persisted model directory; confirm against the Rasa NLU version in
         use). Previously this value was computed and discarded.
     """
     print('Training, wait...')
     training_data = load_data(data)
     trainer = Trainer(config.load(config_file))
     trainer.train(training_data)
     model_directory = trainer.persist(model_dir, fixed_model_name='amazon')
     print('training completed.')
     return model_directory

 def predict(filename, model_path='./models/nlu/default/amazon'):
     """Parse the text of a saved HTML page with a persisted Rasa NLU model.

     Args:
         filename: Path of the HTML file; its text is extracted with
             ``remove_html``.
         model_path: Directory handed to ``Interpreter.load``. Defaults to
             the location this script has always used.

     Returns:
         The interpreter's parse result after a jsonpickle encode/decode
         round trip.
     """
     print('Starting to predict...')
     interpreter = Interpreter.load(model_path)

     text = remove_html(filename)

     parsed = interpreter.parse(text)
     # NOTE(review): the round trip is kept from the original; presumably it
     # normalises the result into plain JSON-compatible objects. Confirm it
     # is still needed before removing it.
     return jsonpickle.decode(jsonpickle.encode(parsed))
 
 # Script driver: train on the smartwatch examples, then parse the saved
 # Amazon page and print the result.
 train(
     data='./data/smartwatches_training.json',
     config_file='./config/config.yml',
     model_dir='./models/nlu',
 )
 predictions = predict(filename='amazon.html')
 print(predictions)
	from rasa_nlu.training_data import load_data
	from rasa_nlu.config import RasaNLUModelConfig
	from rasa_nlu.model import Trainer, Metadata, Interpreter
	from rasa_nlu import config

	from bs4 import BeautifulSoup
	from urllib.request import Request, urlopen
	import jsonpickle

	def remove_html(filename):
	    """Extract the visible text of a saved HTML page as a single line.

	    The file is parsed with the ``lxml`` parser, every ``<script>``,
	    ``<noscript>`` and ``<style>`` element is removed, and the text of the
	    first ``div`` matching class ``centerColAlign`` is collapsed so that
	    tokens are separated by exactly one space.

	    Args:
	        filename: Path of the HTML file to read.

	    Returns:
	        The text of the target ``div`` with runs of spaces and newlines
	        collapsed to single spaces and no leading or trailing space.

	    Raises:
	        ValueError: If no ``div`` with class ``centerColAlign`` is found.
	    """
	    # Close the file deterministically instead of leaking the handle.
	    with open(filename) as page:
	        soup = BeautifulSoup(page, 'lxml')

	    # Drop non-visible content so it does not end up in get_text().
	    for tag in soup.find_all(['script', 'noscript', 'style']):
	        tag.extract()

	    container = soup.find('div', {'class': 'centerColAlign'})
	    if container is None:
	        raise ValueError(
	            "no <div class='centerColAlign'> found in {!r}".format(filename)
	        )
	    raw_txt = container.get_text()

	    # Only spaces and newlines act as separators (other whitespace such as
	    # tabs stays inside tokens); empty pieces from repeated separators are
	    # discarded.
	    tokens = raw_txt.replace('\n', ' ').split(' ')
	    return ' '.join(token for token in tokens if token)

	def train(data, config_file, model_dir):
	    """Train a Rasa NLU model and persist it under ``model_dir``.

	    Args:
	        data: Training-data location handed to ``load_data``.
	        config_file: Pipeline configuration location handed to
	            ``config.load``.
	        model_dir: Target directory handed to ``Trainer.persist``; the model
	            is stored under the fixed name ``amazon``.

	    Returns:
	        The value returned by ``Trainer.persist`` (presumably the path of the
	        persisted model directory; confirm against the Rasa NLU version in
	        use). Previously this value was computed and discarded.
	    """
	    print('Training, wait...')
	    training_data = load_data(data)
	    trainer = Trainer(config.load(config_file))
	    trainer.train(training_data)
	    model_directory = trainer.persist(model_dir, fixed_model_name='amazon')
	    print('training completed.')
	    return model_directory

	def predict(filename, model_path='./models/nlu/default/amazon'):
	    """Parse the text of a saved HTML page with a persisted Rasa NLU model.

	    Args:
	        filename: Path of the HTML file; its text is extracted with
	            ``remove_html``.
	        model_path: Directory handed to ``Interpreter.load``. Defaults to
	            the location this script has always used.

	    Returns:
	        The interpreter's parse result after a jsonpickle encode/decode
	        round trip.
	    """
	    print('Starting to predict...')
	    interpreter = Interpreter.load(model_path)

	    text = remove_html(filename)

	    parsed = interpreter.parse(text)
	    # NOTE(review): the round trip is kept from the original; presumably it
	    # normalises the result into plain JSON-compatible objects. Confirm it
	    # is still needed before removing it.
	    return jsonpickle.decode(jsonpickle.encode(parsed))

	# Script driver: train on the smartwatch examples, then parse the saved
	# Amazon page and print the result.
	train(
	    data='./data/smartwatches_training.json',
	    config_file='./config/config.yml',
	    model_dir='./models/nlu',
	)
	predictions = predict(filename='amazon.html')
	print(predictions)