Last active
February 2, 2016 09:01
-
-
Save mitallast/c1d779d83a2698a212e8 to your computer and use it in GitHub Desktop.
Apache Spark NN test
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
+--------+--------------------+-----+--------------------+--------------------+----------+ | |
|category| text|label| words| features|prediction| | |
+--------+--------------------+-----+--------------------+--------------------+----------+ | |
| 0|"Мышь беспроводна...| 0.0|["мышь, беспровод...|(10000,[372,634,6...| 3.0| | |
| 9|покрышка Данлоп 2...| 8.0|[покрышка, данлоп...|(10000,[118,1828,...| 0.0| | |
| 0|"Стилус для Nokia...| 0.0|["стилус, для, no...|(10000,[45,290,57...| 1.0| | |
| 9|покрышка Континен...| 8.0|[покрышка, контин...|(10000,[50,121,18...| 0.0| | |
| 833|Alcatel OT-890 St...| 1.0|[alcatel, ot-890,...|(10000,[971,1031,...| 0.0| | |
| 833|"Nokia Asha 200 G...| 1.0|["nokia, asha, 20...|(10000,[544,548,1...| 0.0| | |
| 833|"Samsung Champ Ne...| 1.0|["samsung, champ,...|(10000,[182,325,6...| 0.0| | |
| 833|"Samsung Champ Ne...| 1.0|["samsung, champ,...|(10000,[182,325,6...| 0.0| | |
| 833|"BB-mobile MH315 ...| 1.0|["bb-mobile, mh31...|(10000,[36,268,47...| 4.0| | |
| 0|Ароматизатор Carm...| 0.0|[ароматизатор, ca...|(10000,[554,4377,...| 4.0| | |
| 0|"Бумага Epson C13...| 0.0|["бумага, epson, ...|(10000,[45,53,297...| 3.0| | |
| 833|"Air J UKJ-XP Red...| 1.0|["air, j, ukj-xp,...|(10000,[106,171,2...| 0.0| | |
| 833|"Чехол для Samsun...| 1.0|["чехол, для, sam...|(10000,[50,657,86...| 2.0| | |
| 34|"Explay Onliner3 ...| 16.0|["explay, onliner...|(10000,[306,347,1...| 0.0| | |
| 833|"Intro HSW501 Whi...| 1.0|["intro, hsw501, ...|(10000,[45,372,61...| 0.0| | |
| 833|Ross&Moor PB-LS00...| 1.0|[ross&moor, pb-ls...|(10000,[779,796,1...| 14.0| | |
| 833|Чехол-кобура для ...| 1.0|[чехол-кобура, дл...|(10000,[0,54,335,...| 2.0| | |
| 0|Кабель телефонный...| 0.0|[кабель, телефонн...|(10000,[51,235,23...| 14.0| | |
| 0|Удлинитель-катушк...| 0.0|[удлинитель-катуш...|(10000,[170,171,3...| 4.0| | |
| 833|"Чехол-книжка для...| 1.0|["чехол-книжка, д...|(10000,[45,51,171...| 2.0| | |
+--------+--------------------+-----+--------------------+--------------------+----------+ | |
F1 metric = 0.597259
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from pyspark.mllib.regression import LabeledPoint | |
from pyspark.ml.classification import NaiveBayes, MultilayerPerceptronClassifier | |
from pyspark.ml.evaluation import MulticlassClassificationEvaluator | |
from pyspark.ml.feature import HashingTF, Tokenizer, StringIndexer, Word2Vec, IndexToString | |
from pyspark.ml import Pipeline | |
from pyspark.sql import Row | |
# Text-classification experiment: load "category,text" lines, persist them as
# parquet, train a multilayer perceptron on hashed bag-of-words features and
# print the F1 score measured on a held-out split.
#
# NOTE: `sc` and `sqlContext` are not defined in this script; presumably they
# are supplied by the pyspark shell it is pasted into -- confirm before
# running it as a standalone job.

SELL_CSV_PATH = "/Users/mitallast/Sites/spark/sell.csv"
SELL_PARQUET_PATH = "/Users/mitallast/Sites/spark/sell.parquet"
NUM_FEATURES = 10000
# Fixed seed so the train/test split (and therefore the reported metric) is
# comparable from one run to the next.
SPLIT_SEED = 42

textFile = sc.textFile(SELL_CSV_PATH)

# Split only on the first comma so that commas inside the text are preserved.
# Lines without any comma (e.g. blank lines) yield a single field and would
# raise IndexError on p[1] inside the job, so they are dropped here.
# NOTE(review): surrounding double quotes in the text field are not stripped
# by this split -- switch to a real CSV parser if that matters.
data = (
    textFile
    .map(lambda line: line.split(',', 1))
    .filter(lambda p: len(p) == 2)
    .map(lambda p: Row(category=p[0], text=p[1]))
)

schemaSell = sqlContext.createDataFrame(data)
# mode="overwrite": the default save mode raises if the target already
# exists, which made every run after the first one fail.
schemaSell.write.save(SELL_PARQUET_PATH, format="parquet", mode="overwrite")
schemaSell = sqlContext.read.load(SELL_PARQUET_PATH)

train_data, test_data = schemaSell.randomSplit([0.8, 0.2], seed=SPLIT_SEED)

# Pipeline stages: category string -> numeric label, text -> tokens,
# tokens -> fixed-size term-frequency vector, vector -> predicted label.
categoryIndexer = StringIndexer(inputCol="category", outputCol="label")
tokenizer = Tokenizer(inputCol="text", outputCol="words")
hashingTF = HashingTF(inputCol="words", outputCol="features", numFeatures=NUM_FEATURES)
# Layer sizes run from input to output: the first must equal the feature
# vector size; there is no hidden layer here.
# NOTE(review): 200 is presumably an upper bound on the number of distinct
# categories -- verify against the data set.
mp = MultilayerPerceptronClassifier(maxIter=10, layers=[NUM_FEATURES, 200])

pipeline = Pipeline(stages=[categoryIndexer, tokenizer, hashingTF, mp])
model = pipeline.fit(train_data)
pr = model.transform(test_data)

evaluator = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction", metricName="f1")
metric = evaluator.evaluate(pr)
# Parenthesised single-argument form prints identically on Python 2 and 3.
print("F1 metric = %g" % metric)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment