from pyspark.sql import SparkSession
from pyspark.ml import Pipeline
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.feature import HashingTF, Tokenizer
import random
from random import randint
spark = SparkSession.builder.master("local").appName("Word Count").getOrCreate()
my_list = []
rand_ints = [randint(0,1000), randint(0,1000)]
def randstring(length=10):
    # Build a string of `length` random uppercase letters.
    valid_letters = 'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
    return ''.join(random.choice(valid_letters) for _ in range(length))
for i in range(0, 100000):
    # Label the two randomly chosen ids as positive (1.0); all other rows are negative (0.0).
    my_int = 0.0
    if i in rand_ints:
        my_int = 1.0
    my_list.append((i, randstring(100000), my_int))
# Prepare training documents from a list of (id, text, label) tuples.
training = spark.createDataFrame(my_list, ["id", "text", "label"])
# Configure an ML pipeline, which consists of three stages: tokenizer, hashingTF, and lr.
tokenizer = Tokenizer(inputCol="text", outputCol="words")
hashingTF = HashingTF(inputCol=tokenizer.getOutputCol(), outputCol="features")
lr = LogisticRegression(maxIter=10, regParam=0.001)
pipeline = Pipeline(stages=[tokenizer, hashingTF, lr])
# Fit the pipeline to training documents.
model = pipeline.fit(training)
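# Not part of the original gist: a minimal sketch of how the fitted pipeline could be
# inspected, assuming its last stage is the fitted LogisticRegressionModel.
lr_model = model.stages[-1]
print("intercept: %s" % lr_model.intercept)
print("non-zero coefficients: %d" % lr_model.coefficients.numNonzeros())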
# Prepare test documents, which are unlabeled (id, text) tuples.
my_test_data = []
for i in range(100000, 150000):
    my_test_data.append((i, randstring(100000)))
test = spark.createDataFrame(my_test_data, ["id", "text"])
# Make predictions on test documents and print columns of interest.
prediction = model.transform(test)
selected = prediction.select("id", "text", "probability", "prediction")
for row in selected.collect():
    rid, text, prob, prediction = row
    print("(%d, %s) --> prob=%s, prediction=%f" % (rid, text, str(prob), prediction))