Could you help me resolve this error in PySpark (Zeppelin)?
java.net.ConnectException: Connection refused (Connection refused)
at java.net.PlainSocketImpl.socketConnect(Native Method)
at java.net.AbstractPlainSocketImpl.doConnect(AbstractPlainSocketImpl.java:350)
at java.net.AbstractPlainSocketImpl.connectToAddress(AbstractPlainSocketImpl.java:206)
at java.net.AbstractPlainSocketImpl.connect(AbstractPlainSocketImpl.java:188)
at java.net.SocksSocketImpl.connect(SocksSocketImpl.java:392)
at java.net.Socket.connect(Socket.java:589)
at org.apache.thrift.transport.TSocket.open(TSocket.java:182)
at org.apache.zeppelin.interpreter.remote.ClientFactory.create(ClientFactory.java:51)
at org.apache.zeppelin.interpreter.remote.ClientFactory.create(ClientFactory.java:37)
at org.apache.commons.pool2.BasePooledObjectFactory.makeObject(BasePooledObjectFactory.java:60)
at org.apache.commons.pool2.impl.GenericObjectPool.create(GenericObjectPool.java:861)
at org.apache.commons.pool2.impl.GenericObjectPool.borrowObject(GenericObjectPool.java:435)
at org.apache.commons.pool2.impl.GenericObjectPool.borrowObject(GenericObjectPool.java:363)
at org.apache.zeppelin.interpreter.remote.RemoteInterpreterProcess.getClient(RemoteInterpreterProcess.java:62)
at org.apache.zeppelin.interpreter.remote.RemoteInterpreterProcess.callRemoteFunction(RemoteInterpreterProcess.java:133)
at org.apache.zeppelin.interpreter.remote.RemoteInterpreter.open(RemoteInterpreter.java:139)
at org.apache.zeppelin.interpreter.remote.RemoteInterpreter.getFormType(RemoteInterpreter.java:299)
at org.apache.zeppelin.notebook.Paragraph.jobRun(Paragraph.java:408)
at org.apache.zeppelin.scheduler.Job.run(Job.java:188)
at org.apache.zeppelin.scheduler.RemoteScheduler$JobRunner.run(RemoteScheduler.java:315)
at java.util.concurrent.Executors$RunnableAdapter.call(Executors.java:511)
at java.util.concurrent.FutureTask.run(FutureTask.java:266)
at java.util.concurrent.ScheduledThreadPoolExecutor$ScheduledFutureTask.access$201(ScheduledThreadPoolExecutor.java:180)
at java.util.concurrent.ScheduledThreadPoolExecutor$ScheduledFutureTask.run(ScheduledThreadPoolExecutor.java:293)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
at java.lang.Thread.run(Thread.java:748)
I am trying to run this PySpark code:
%pyspark
import pandas as pd

df = pd.read_csv('/datos/cite75_99.txt.bz2', compression='bz2', header=0, sep=',', quotechar='"')
df.show()
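For what it's worth, a Spark-native version of the same read (a minimal sketch, assuming the %pyspark interpreter exposes the `spark` session and that /datos is mounted inside the container) would look like this, and would make `df.show()` valid since `df` becomes a Spark DataFrame:
%pyspark
# Sketch only: read the same file through Spark's CSV reader instead of pandas.
df = (spark.read
      .option("header", "true")   # first row holds the column names
      .option("sep", ",")
      .option("quote", '"')
      .csv("/datos/cite75_99.txt.bz2"))  # Spark decompresses .bz2 on the fly
df.show()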
I am running this code in a Zeppelin notebook built with Docker Compose.
Here is what the docker-compose.yml file contains:
version: '2'
services:
  zeppelin:
    build: .
    ports:
      - "8080:8080"
      - "4040:4040"
    volumes:
      - ./logs:/logs
      - ./notebook:/notebook
      - ./datos:/datos
    environment:
      - ZEPPELIN_LOG_DIR=/logs
      - ZEPPELIN_NOTEBOOK_DIR=/notebook
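The ./datos mapping above is where the paragraph reads its data from; a quick sanity check that the mount is visible inside the container (a sketch using the plain Python interpreter, with the path taken from the volume mapping) would be:
%python
import os
# Confirm that the host ./datos directory is mounted at /datos in the container
# and that the bz2 file read by the paragraph above actually exists there.
print(os.listdir('/datos'))
print(os.path.exists('/datos/cite75_99.txt.bz2'))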
The Dockerfile for the Zeppelin service is as follows:
FROM apache/zeppelin:0.8.2
MAINTAINER tf.pena@usc.es
USER root
ENV LANG=es_ES.UTF-8 \
JAVA_HOME=/usr/lib/jvm/java-8-openjdk-amd64
ENV ZEPPELIN_ADDR=0.0.0.0
RUN echo "Instalando locales" && \
apt-get -y update && \
apt-get install -y locales
RUN locale-gen $LANG &&\
update-locale LANG=$LANG
RUN echo "Instalando paquetes adicionales" && \
apt-get install -y netcat-openbsd
ENV SPARK_VERSION=2.2.0 \
HADOOP_VERSION=2.7 \
SPARK_DIR=/usr/local
ENV SPARK_HOME=${SPARK_DIR}/spark
RUN echo "Instalando Spark" && \
mkdir -p ${SPARK_DIR} && \
wget -O /tmp/spark-${SPARK_VERSION}-bin-hadoop${HADOOP_VERSION}.tgz https://archive.apache.org/dist/spark/spark-${SPARK_VERSION}/spark-${SPARK_VERSION}-bin-hadoop${HADOOP_VERSION}.tgz && \
tar -zxf /tmp/spark-${SPARK_VERSION}-bin-hadoop${HADOOP_VERSION}.tgz && \
rm -rf /tmp/spark-${SPARK_VERSION}-bin-hadoop${HADOOP_VERSION}.tgz && \
mv -f spark-${SPARK_VERSION}-bin-hadoop${HADOOP_VERSION} ${SPARK_DIR} && \
ln -s ${SPARK_DIR}/spark-${SPARK_VERSION}-bin-hadoop${HADOOP_VERSION} ${SPARK_HOME}
EXPOSE 8080
EXPOSE 4040
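To check that the Spark build installed above is usable from inside the image (a sketch; the spark-submit path follows from SPARK_HOME=/usr/local/spark set in this Dockerfile), something like this can be run from a Python paragraph in the notebook:
%python
import os, subprocess
# Print SPARK_HOME as the interpreter process sees it, then ask the bundled
# launcher for its version; both paths come from the Dockerfile above.
print(os.environ.get('SPARK_HOME'))
subprocess.call(['/usr/local/spark/bin/spark-submit', '--version'])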
It is worth noting that shell commands run successfully in the same Zeppelin notebook.
I look forward to your feedback.
Thanks!