jovianlin · November 26, 2016 07:22
diff --git a/mini_batch_learning.py b/mini_batch_learning.py
 from sklearn.linear_model import SGDRegressor
 
 # https://adventuresindatascience.wordpress.com/2014/12/30/minibatch-learning-for-large-scale-data-using-scikit-learn/
  
 def iter_minibatches(chunksize, numtrainingpoints, fetch_rows=None):
     """Yield ``(X_chunk, y_chunk)`` minibatches over rows ``0 .. numtrainingpoints - 1``.

     Row indices are handed out in consecutive ranges of at most
     ``chunksize`` rows. The final range is clipped to ``numtrainingpoints``
     so the loader is never asked for rows past the end of the data.

     Args:
         chunksize: Maximum number of rows per minibatch; must be a positive
             integer.
         numtrainingpoints: Total number of training rows (an integer).
         fetch_rows: Optional callable that takes a ``range`` of row indices
             and returns an ``(X_chunk, y_chunk)`` pair. When omitted, the
             module-level ``getrows`` function is used; it is not defined in
             this snippet and must be supplied by the surrounding code,
             since loading depends on where the data lives (on disk, in a
             distributed store, behind an interface, ...).

     Yields:
         ``(X_chunk, y_chunk)`` tuples, one per minibatch.

     Raises:
         ValueError: On first iteration, if ``chunksize`` is not positive
             (a non-positive step could never advance through the data).
     """
     if chunksize <= 0:
         raise ValueError("chunksize must be a positive integer")
     if fetch_rows is None:
         # Looked up lazily so callers that inject a loader never need a
         # global ``getrows`` to exist.
         fetch_rows = getrows
     for chunkstart in range(0, numtrainingpoints, chunksize):
         # Clip the last chunk so it does not run past the available rows.
         chunkend = min(chunkstart + chunksize, numtrainingpoints)
         X_chunk, y_chunk = fetch_rows(range(chunkstart, chunkend))
         yield X_chunk, y_chunk
 
 def main(numtrainingpoints, X_test, chunksize=1000):
     """Train an ``SGDRegressor`` on minibatches, then predict on ``X_test``.

     The model is fitted incrementally with ``partial_fit``, one minibatch
     at a time, so the full training set never has to be held in memory.

     Args:
         numtrainingpoints: Total number of training rows to iterate over;
             passed through to ``iter_minibatches``.
         X_test: Feature data to predict on after training.
         chunksize: Number of rows requested per minibatch.

     Returns:
         The result of ``model.predict(X_test)``.
     """
     # SGDClassifier is the classification counterpart; note that its
     # partial_fit needs the ``classes`` argument on the first call.
     model = SGDRegressor()

     # Train model: one incremental update per minibatch.
     for X_chunk, y_chunk in iter_minibatches(chunksize, numtrainingpoints):
         model.partial_fit(X_chunk, y_chunk)

     # Now make predictions with the trained model.
     return model.predict(X_test)
	from sklearn.linear_model import SGDRegressor

	# https://adventuresindatascience.wordpress.com/2014/12/30/minibatch-learning-for-large-scale-data-using-scikit-learn/

	def iter_minibatches(chunksize, numtrainingpoints, fetch_rows=None):
	    """Yield ``(X_chunk, y_chunk)`` minibatches over rows ``0 .. numtrainingpoints - 1``.

	    Row indices are handed out in consecutive ranges of at most
	    ``chunksize`` rows. The final range is clipped to ``numtrainingpoints``
	    so the loader is never asked for rows past the end of the data.

	    Args:
	        chunksize: Maximum number of rows per minibatch; must be a positive
	            integer.
	        numtrainingpoints: Total number of training rows (an integer).
	        fetch_rows: Optional callable that takes a ``range`` of row indices
	            and returns an ``(X_chunk, y_chunk)`` pair. When omitted, the
	            module-level ``getrows`` function is used; it is not defined in
	            this snippet and must be supplied by the surrounding code,
	            since loading depends on where the data lives (on disk, in a
	            distributed store, behind an interface, ...).

	    Yields:
	        ``(X_chunk, y_chunk)`` tuples, one per minibatch.

	    Raises:
	        ValueError: On first iteration, if ``chunksize`` is not positive
	            (a non-positive step could never advance through the data).
	    """
	    if chunksize <= 0:
	        raise ValueError("chunksize must be a positive integer")
	    if fetch_rows is None:
	        # Looked up lazily so callers that inject a loader never need a
	        # global ``getrows`` to exist.
	        fetch_rows = getrows
	    for chunkstart in range(0, numtrainingpoints, chunksize):
	        # Clip the last chunk so it does not run past the available rows.
	        chunkend = min(chunkstart + chunksize, numtrainingpoints)
	        X_chunk, y_chunk = fetch_rows(range(chunkstart, chunkend))
	        yield X_chunk, y_chunk

	def main(numtrainingpoints, X_test, chunksize=1000):
	    """Train an ``SGDRegressor`` on minibatches, then predict on ``X_test``.

	    The model is fitted incrementally with ``partial_fit``, one minibatch
	    at a time, so the full training set never has to be held in memory.

	    Args:
	        numtrainingpoints: Total number of training rows to iterate over;
	            passed through to ``iter_minibatches``.
	        X_test: Feature data to predict on after training.
	        chunksize: Number of rows requested per minibatch.

	    Returns:
	        The result of ``model.predict(X_test)``.
	    """
	    # SGDClassifier is the classification counterpart; note that its
	    # partial_fit needs the ``classes`` argument on the first call.
	    model = SGDRegressor()

	    # Train model: one incremental update per minibatch.
	    for X_chunk, y_chunk in iter_minibatches(chunksize, numtrainingpoints):
	        model.partial_fit(X_chunk, y_chunk)

	    # Now make predictions with the trained model.
	    return model.predict(X_test)