I want to evaluate a naive Bayes classification model. I can create and train the model, but I am having trouble with the evaluation step. I am using the PySpark mllib library.
This is what my CSV file looks like:
ID,SCALE,PRICE,SUBCATEGORY
30,3,23,1
20,1,45,1
:
:
First I convert the CSV file into the right format, then I create and train my model:
from pyspark.sql import SparkSession
from pyspark.mllib.classification import NaiveBayes
from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.evaluation import MulticlassMetrics

spark = SparkSession \
    .builder \
    .appName("NaiveBayesExample") \
    .getOrCreate()

# Read the CSV with a header row and inferred column types
dat = spark.read \
    .format('csv') \
    .option('header', 'true') \
    .option('inferSchema', 'true') \
    .load("Project_List_Data_Set.csv")
# Convert each Row to a LabeledPoint (label, feature vector)
c = dat.rdd
print(c.take(3))
data = c.map(lambda line: LabeledPoint(line[0], line[1:]))  # arbitrary mapping, it's just an example
# Split data approximately into training (60%) and test (40%)
training, test = data.randomSplit([0.6, 0.4], seed=1234)
# Train a naive Bayes model.
model = NaiveBayes.train(training, 1.0)  # 1.0 is the additive (Laplace) smoothing parameter
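For context, the header is ID,SCALE,PRICE,SUBCATEGORY, so a mapping that treats SUBCATEGORY as the class label would look like the sketch below (an assumption on my part, since the mapping above is arbitrary):
# Hypothetical alternative mapping: SUBCATEGORY (index 3) as label, SCALE and PRICE as features
data = c.map(lambda line: LabeledPoint(line[3], [line[1], line[2]]))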
Then I make predictions and instantiate a metrics object:
# Make predictions on the test set
predictionAndLabel = test.map(lambda p: (model.predict(p.features), p.label))
# Instantiate metrics object
metrics = MulticlassMetrics(predictionAndLabel)
precision = metrics.weightedTruePositiveRate  # a property, not a method; equals weighted recall
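Once this works, these are the other values I plan to read off the same metrics object (in Spark 2.x they are exposed as properties or methods of MulticlassMetrics):
# Additional evaluation values from the same metrics object
print(metrics.accuracy)                     # overall accuracy
print(metrics.weightedPrecision)            # precision weighted by class frequency
print(metrics.weightedRecall)               # recall weighted by class frequency
print(metrics.confusionMatrix().toArray())  # confusion matrix as a NumPy array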
When I try to execute the last line of my code, I get the following error:
org.apache.spark.api.python.PythonException: Traceback (most recent call last):
File "C:\opt\spark\spark-2.1.2-bin-hadoop2.7\python\lib\pyspark.zip\pyspark\worker.py", line 171, in main
File "C:\opt\spark\spark-2.1.2-bin-hadoop2.7\python\lib\pyspark.zip\pyspark\worker.py", line 166, in process
File "C:\opt\spark\spark-2.1.2-bin-hadoop2.7\python\lib\pyspark.zip\pyspark\serializers.py", line 268, in dump_stream
vs = list(itertools.islice(iterator, batch))
File "C:/Users/Tomek/Downloads/docplex-examples-master/examples/cp/visu/job_shop_flexible.py", line 71, in <lambda>
predictionAndLabel = test.map(lambda p: (model.predict(p.features), p.label))
File "C:\opt\spark\spark-2.1.2-bin-hadoop2.7\python\lib\pyspark.zip\pyspark\mllib\classification.py", line 620, in predict
return self.labels[numpy.argmax(self.pi + x.dot(self.theta.transpose()))]
File "C:\opt\spark\spark-2.1.2-bin-hadoop2.7\python\lib\pyspark.zip\pyspark\mllib\linalg\__init__.py", line 366, in dot
assert len(self) == other.shape[0], "dimension mismatch"
AssertionError: dimension mismatch
at org.apache.spark.api.python.PythonRunner$$anon$1.read(PythonRDD.scala:193)
at org.apache.spark.api.python.PythonRunner$$anon$1.<init>(PythonRDD.scala:234)
at org.apache.spark.api.python.PythonRunner.compute(PythonRDD.scala:152)
at org.apache.spark.api.python.PythonRDD.compute(PythonRDD.scala:63)
at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:323)
at org.apache.spark.rdd.RDD.iterator(RDD.scala:287)
at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:38)
... (the previous three frames repeat 9 more times)
at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:323)
at org.apache.spark.rdd.RDD.iterator(RDD.scala:287)
at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:96)
at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:53)
at org.apache.spark.scheduler.Task.run(Task.scala:99)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:325)
at java.util.concurrent.ThreadPoolExecutor.runWorker(Unknown Source)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(Unknown Source)
at java.lang.Thread.run(Unknown Source)
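From what I can tell, the assertion is the generic dimension check inside DenseVector.dot, which can be triggered in isolation (illustrative snippet, not my actual data):
from pyspark.mllib.linalg import DenseVector
import numpy as np

v = DenseVector([1.0, 2.0])  # 2-dimensional vector
m = np.ones((3, 4))          # row count does not match len(v)
v.dot(m)                     # raises AssertionError: dimension mismatch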
After this, I want to deploy the model in the IBM Watson cloud.
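As a first step toward that deployment, I assume the trained mllib model can be persisted and reloaded like this (a sketch; the path is arbitrary):
from pyspark.mllib.classification import NaiveBayesModel

# Save the trained model; the path is illustrative
model.save(spark.sparkContext, "naive_bayes_model")
# Reload it later, e.g. in the scoring environment
sameModel = NaiveBayesModel.load(spark.sparkContext, "naive_bayes_model")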