Несколько месяцев назад у меня была такая же задача, и я написал следующий скрипт PowerShell, который с помощью утилит PuTTY (plink и pscp) загружает JAR и файл конфигурации на кластер, а затем выполняет команду spark-submit.
# Start from a clean console.
Clear-Host

# --- Local PuTTY tooling ---------------------------------------------------
# The trailing backslash matters: the executable names are appended directly.
$PuttyBasePath = 'C:\Program Files\PuTTY\'
$PlinkPath = "${PuttyBasePath}plink.exe"
$PscpPath = "${PuttyBasePath}pscp.exe"

# --- SSH endpoint and credentials ------------------------------------------
$SshHost = 'someCluster-ssh.azurehdinsight.net'
$SshUserName = 'sshuser'
# NOTE(review): placeholder credential kept in plain text; replace it before
# real use and consider loading it from a secure source instead.
$SshPassword = 'somePassword'

# --- Remote paths ----------------------------------------------------------
# $MainPath is not referenced in the visible part of this script.
$MainPath = '/home/sshuser/'
$BaseDirectoryToUploadFiles = '/home/sshuser/testPath'
# The numeric suffix is appended with no separator, giving
# /home/sshuser/testPath20180505235534.
# NOTE(review): the suffix looks like a fixed timestamp - confirm whether it
# was meant to be generated per run.
$DirectoryToUploadFiles = "${BaseDirectoryToUploadFiles}20180505235534"

# --- Local files to upload -------------------------------------------------
$ConfigFilePath = 'C:\config.json'
$JarFilePath = 'C:\somejar.jar'
function UploadFiles
{
    # Creates the remote target directory over SSH (plink) and copies the job
    # JAR and its config file into it (pscp).
    #
    # Reads the script-level settings $PlinkPath, $PscpPath, $SshUserName,
    # $SshHost, $SshPassword, $DirectoryToUploadFiles, $JarFilePath and
    # $ConfigFilePath.
    #
    # Throws when plink or pscp exits with a non-zero code, so the caller does
    # not go on to submit a job whose files never reached the cluster.
    #
    # NOTE(review): the password is handed to the tools on the command line
    # via -pw, exactly as before; consider key-based authentication.

    $SshTarget = "${SshUserName}@${SshHost}"

    # mkdir -p succeeds when the directory already exists, so re-runs are fine.
    & $PlinkPath -v -pw $SshPassword $SshTarget mkdir -p $DirectoryToUploadFiles
    if ($LASTEXITCODE -ne 0)
    {
        throw "Failed to create remote directory '$DirectoryToUploadFiles' on $SshHost (plink exit code $LASTEXITCODE)."
    }

    foreach ($FileToUpload in @($JarFilePath, $ConfigFilePath))
    {
        # Braces keep PowerShell from parsing "name:" as a scope/drive qualifier.
        & $PscpPath -v -pw $SshPassword -r $FileToUpload "${SshTarget}:${DirectoryToUploadFiles}"
        if ($LASTEXITCODE -ne 0)
        {
            throw "Failed to upload '$FileToUpload' to '$DirectoryToUploadFiles' on $SshHost (pscp exit code $LASTEXITCODE)."
        }
    }
}
function RunSparkJob
{
    # Runs spark-submit on the cluster over SSH (plink) for the JAR and config
    # file located in $DirectoryToUploadFiles.
    #
    # Reads the script-level settings $PlinkPath, $SshUserName, $SshHost,
    # $SshPassword, $DirectoryToUploadFiles, $ConfigFilePath and $JarFilePath.
    #
    # Throws when plink exits with a non-zero code.

    # Derive the remote file names from the local paths instead of repeating
    # them as literals, so changing $ConfigFilePath / $JarFilePath cannot leave
    # the submit command pointing at files that were never uploaded.
    $RemoteConfigFile = $DirectoryToUploadFiles + '/' + (Split-Path -Path $ConfigFilePath -Leaf)
    $RemoteJarFile = $DirectoryToUploadFiles + '/' + (Split-Path -Path $JarFilePath -Leaf)

    # One option per element; joined with single spaces into the remote command.
    $SparkSubmitCommand = @(
        '/usr/hdp/current/spark2-client/bin/spark-submit',
        '--master yarn',
        '--deploy-mode cluster',
        '--conf spark.yarn.maxAppAttempts=8',
        '--conf spark.yarn.am.attemptFailuresValidityInterval=1h',
        '--conf spark.yarn.max.executor.failures=240',
        '--conf spark.yarn.executor.failuresValidityInterval=1h',
        '--conf spark.task.maxFailures=4',
        '--conf spark.speculation=true',
        '--conf spark.speculation.multiplier=40',
        '--conf spark.speculation.quantile=0.85',
        '--num-executors 30',
        '--executor-cores 10',
        "--files $RemoteConfigFile",
        '--packages org.apache.spark:spark-streaming_2.11:2.2.0,com.microsoft.azure:azure-eventhubs-spark_2.11:2.2.0',
        "--class com.test.someClass $RemoteJarFile"
    ) -join ' '

    Clear-Host

    # "y" is piped into plink's stdin - presumably to auto-answer its host-key
    # prompt. NOTE(review): confirm this is still needed; pinning the key with
    # plink's -hostkey option would avoid blindly trusting the host.
    echo y | & $PlinkPath -v -pw $SshPassword "${SshUserName}@${SshHost}" $SparkSubmitCommand
    if ($LASTEXITCODE -ne 0)
    {
        throw "spark-submit on $SshHost failed (plink exit code $LASTEXITCODE)."
    }
}
# Entry point. Order matters: the submit command references files inside
# $DirectoryToUploadFiles, so they are uploaded before the job is submitted.
UploadFiles
RunSparkJob