from pyspark.sql import SparkSession
from pyspark.ml import Pipeline
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.feature import HashingTF, Tokenizer
import random
from random import randint
spark = SparkSession.builder.master("local").appName("Word Count").getOrCreate()
my_list = []
rand_ints = [randint(0,1000), randint(0,1000)]
def randstring(length=10):
    # Build a string of `length` random uppercase letters.
    valid_letters = 'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
    return ''.join(random.choice(valid_letters) for _ in range(length))
for i in range(0, 100000):
    # Label the two randomly chosen ids as positive (1.0); all other rows are negative (0.0).
    my_int = 0.0
    if i in rand_ints:
        my_int = 1.0
    my_list.append((i, randstring(100000), my_int))
# Prepare training documents from a list of (id, text, label) tuples.
training = spark.createDataFrame(my_list, ["id", "text", "label"])
# Configure an ML pipeline, which consists of three stages: tokenizer, hashingTF, and lr.
tokenizer = Tokenizer(inputCol="text", outputCol="words")
hashingTF = HashingTF(inputCol=tokenizer.getOutputCol(), outputCol="features")
lr = LogisticRegression(maxIter=10, regParam=0.001)
pipeline = Pipeline(stages=[tokenizer, hashingTF, lr])
# Fit the pipeline to training documents.
model = pipeline.fit(training)
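# Not part of the original gist: a minimal sketch of how the fitted pipeline could be
# inspected, assuming its last stage is the fitted LogisticRegressionModel.
lr_model = model.stages[-1]
print("intercept: %s" % lr_model.intercept)
print("non-zero coefficients: %d" % lr_model.coefficients.numNonzeros())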
# Prepare test documents, which are unlabeled (id, text) tuples.
my_test_data = []
for i in range(100000, 150000):
    my_test_data.append((i, randstring(100000)))
test = spark.createDataFrame(my_test_data, ["id", "text"])
# Make predictions on test documents and print columns of interest.
prediction = model.transform(test)
selected = prediction.select("id", "text", "probability", "prediction")
for row in selected.collect():
    rid, text, prob, prediction = row
    print("(%d, %s) --> prob=%s, prediction=%f" % (rid, text, str(prob), prediction))