Я прохожу курс «Apache Spark и Scala — практическая работа с большими данными» на Udemy.
В одной из лекций вы должны настроить среду EMR и отправить файл JAR в кластер.
При отправке кода я получаю следующую ошибку.
Правка: на самом деле после этой ошибки код продолжает выполняться.
[hadoop@ip-172-31-27-160 ~]$ spark-submit MovieSimilarities1M-assembly-1.0.jar 250
log4j:ERROR setFile(null,true) call failed.
java.io.FileNotFoundException: /stderr (Permission denied)
at java.io.FileOutputStream.open0(Native Method)
at java.io.FileOutputStream.open(FileOutputStream.java:270)
at java.io.FileOutputStream.<init>(FileOutputStream.java:213)
at java.io.FileOutputStream.<init>(FileOutputStream.java:133)
at org.apache.log4j.FileAppender.setFile(FileAppender.java:294)
at org.apache.log4j.FileAppender.activateOptions(FileAppender.java:165)
at org.apache.log4j.DailyRollingFileAppender.activateOptions(DailyRollingFileAppender.java:223)
at org.apache.log4j.config.PropertySetter.activate(PropertySetter.java:307)
at org.apache.log4j.config.PropertySetter.setProperties(PropertySetter.java:172)
at org.apache.log4j.config.PropertySetter.setProperties(PropertySetter.java:104)
at org.apache.log4j.PropertyConfigurator.parseAppender(PropertyConfigurator.java:842)
at org.apache.log4j.PropertyConfigurator.parseCategory(PropertyConfigurator.java:768)
at org.apache.log4j.PropertyConfigurator.parseCatsAndRenderers(PropertyConfigurator.java:672)
at org.apache.log4j.PropertyConfigurator.doConfigure(PropertyConfigurator.java:516)
at org.apache.log4j.PropertyConfigurator.doConfigure(PropertyConfigurator.java:580)
at org.apache.log4j.helpers.OptionConverter.selectAndConfigure(OptionConverter.java:526)
at org.apache.log4j.LogManager.<clinit>(LogManager.java:127)
at org.apache.spark.internal.Logging$class.initializeLogging(Logging.scala:120)
at org.apache.spark.internal.Logging$class.initializeLogIfNecessary(Logging.scala:108)
at org.apache.spark.deploy.SparkSubmit$.initializeLogIfNecessary(SparkSubmit.scala:71)
at org.apache.spark.deploy.SparkSubmit$.main(SparkSubmit.scala:128)
at org.apache.spark.deploy.SparkSubmit.main(SparkSubmit.scala)
log4j:ERROR Either File or DatePattern options are not set for appender [DRFA-stderr].
log4j:ERROR setFile(null,true) call failed.
java.io.FileNotFoundException: /stdout (Permission denied)
at java.io.FileOutputStream.open0(Native Method)
at java.io.FileOutputStream.open(FileOutputStream.java:270)
at java.io.FileOutputStream.<init>(FileOutputStream.java:213)
at java.io.FileOutputStream.<init>(FileOutputStream.java:133)
at org.apache.log4j.FileAppender.setFile(FileAppender.java:294)
at org.apache.log4j.FileAppender.activateOptions(FileAppender.java:165)
at org.apache.log4j.DailyRollingFileAppender.activateOptions(DailyRollingFileAppender.java:223)
at org.apache.log4j.config.PropertySetter.activate(PropertySetter.java:307)
at org.apache.log4j.config.PropertySetter.setProperties(PropertySetter.java:172)
at org.apache.log4j.config.PropertySetter.setProperties(PropertySetter.java:104)
at org.apache.log4j.PropertyConfigurator.parseAppender(PropertyConfigurator.java:842)
at org.apache.log4j.PropertyConfigurator.parseCategory(PropertyConfigurator.java:768)
at org.apache.log4j.PropertyConfigurator.parseCatsAndRenderers(PropertyConfigurator.java:672)
at org.apache.log4j.PropertyConfigurator.doConfigure(PropertyConfigurator.java:516)
at org.apache.log4j.PropertyConfigurator.doConfigure(PropertyConfigurator.java:580)
at org.apache.log4j.helpers.OptionConverter.selectAndConfigure(OptionConverter.java:526)
at org.apache.log4j.LogManager.<clinit>(LogManager.java:127)
at org.apache.spark.internal.Logging$class.initializeLogging(Logging.scala:120)
at org.apache.spark.internal.Logging$class.initializeLogIfNecessary(Logging.scala:108)
at org.apache.spark.deploy.SparkSubmit$.initializeLogIfNecessary(SparkSubmit.scala:71)
at org.apache.spark.deploy.SparkSubmit$.main(SparkSubmit.scala:128)
at org.apache.spark.deploy.SparkSubmit.main(SparkSubmit.scala)
log4j:ERROR Either File or DatePattern options are not set for appender [DRFA-stdout].
Команда spark-submit отрабатывает без ошибок, когда я запускаю код локально; ошибка появляется только в среде EMR.
Что может вызвать эту ошибку? Может ли это быть что-то с разрешениями пользователя hadoop в среде?
Объект MovieSimilarities1M выглядит следующим образом:
package com.sundogsoftware.spark
import org.apache.spark._
import org.apache.spark.SparkContext._
import org.apache.log4j._
import scala.io.Source
import java.nio.charset.CodingErrorAction
import scala.io.Codec
import scala.math.sqrt
// To run on EMR successfully + output results for Star Wars:
// aws s3 cp s3://sundog-spark/MovieSimilarities1M.jar ./
// aws s3 cp s3://sundog-spark/ml-1m/movies.dat ./
// spark-submit --executor-memory 1g MovieSimilarities1M.jar 260
object MovieSimilarities1M {
/** Load up a Map of movie IDs to movie names.
  *
  * Reads `movies.dat` from the current working directory. Every line is split
  * on `::`; the first field is parsed as the integer movie ID and the second
  * field is used as the title. Lines with fewer than two fields are skipped.
  * When the same ID occurs more than once, the last occurrence wins.
  *
  * @return an immutable map from movie ID to movie title
  * @throws java.io.FileNotFoundException if `movies.dat` cannot be opened
  * @throws NumberFormatException if the first field of a line is not an integer
  */
def loadMovieNames() : Map[Int, String] = {
  // Handle character encoding issues: substitute a replacement character for
  // malformed or unmappable input instead of throwing.
  implicit val codec: Codec = Codec("UTF-8")
  codec.onMalformedInput(CodingErrorAction.REPLACE)
  codec.onUnmappableCharacter(CodingErrorAction.REPLACE)

  val source = Source.fromFile("movies.dat")
  try {
    // The iterator is lazy, so the map must be fully built here, before the
    // finally block closes the underlying file.
    source.getLines()
      .map(_.split("::"))
      .filter(_.length > 1)
      .map(fields => fields(0).toInt -> fields(1))
      .toMap
  } finally {
    // The file handle was previously never released.
    source.close()
  }
}
type MovieRating = (Int, Double)
type UserRatingPair = (Int, (MovieRating, MovieRating))

/** Re-key a joined rating record by its pair of movie IDs.
  *
  * The user ID in the first tuple slot is discarded.
  *
  * @param userRatings (userID, ((movie1, rating1), (movie2, rating2)))
  * @return ((movie1, movie2), (rating1, rating2))
  */
def makePairs(userRatings:UserRatingPair) = userRatings match {
  case (_, ((firstMovie, firstScore), (secondMovie, secondScore))) =>
    ((firstMovie, secondMovie), (firstScore, secondScore))
}
/** Predicate that keeps a joined rating record only when the first movie ID
  * is strictly smaller than the second one.
  *
  * This rejects records pairing a movie with itself and keeps just one of the
  * two orderings (a, b) / (b, a).
  *
  * @param userRatings (userID, ((movie1, rating1), (movie2, rating2)))
  * @return true when movie1 < movie2
  */
def filterDuplicates(userRatings:UserRatingPair):Boolean = userRatings match {
  case (_, ((firstMovie, _), (secondMovie, _))) => firstMovie < secondMovie
}
type RatingPair = (Double, Double)
type RatingPairs = Iterable[RatingPair]

/** Compute the cosine similarity between the two rating columns of a
  * collection of (ratingX, ratingY) pairs.
  *
  * @param ratingPairs the rating pairs to compare
  * @return (score, numPairs): score is sum(x*y) / (sqrt(sum(x*x)) * sqrt(sum(y*y))),
  *         or 0.0 when that denominator equals zero; numPairs is the number of
  *         pairs consumed
  */
def computeCosineSimilarity(ratingPairs:RatingPairs): (Double, Int) = {
  // Accumulate all three sums and the pair count in a single left-to-right pass.
  val (sumXX, sumYY, sumXY, pairCount) =
    ratingPairs.foldLeft((0.0, 0.0, 0.0, 0)) {
      case ((xx, yy, xy, count), (x, y)) =>
        (xx + x * x, yy + y * y, xy + x * y, count + 1)
    }

  val denominator = sqrt(sumXX) * sqrt(sumYY)
  // Guard against division by zero (e.g. empty input or all-zero ratings).
  val score = if (denominator != 0) sumXY / denominator else 0.0
  (score, pairCount)
}
/** Our main function where the action happens.
  *
  * Builds cosine similarities for every pair of movies rated by the same user
  * and, when a movie ID is supplied as the first argument, prints up to 50
  * movies whose similarity to it passes the score and co-occurrence thresholds.
  *
  * @param args optional; `args(0)` is the integer ID of the movie to report on
  */
def main(args: Array[String]): Unit = {
  // Set the log level to only print errors
  Logger.getLogger("org").setLevel(Level.ERROR)

  // Create a SparkContext without much actual configuration
  // We want EMR's config defaults to be used.
  val conf = new SparkConf()
  conf.setAppName("MovieSimilarities1M")
  val sc = new SparkContext(conf)

  println("\nLoading movie names...")
  val nameDict = loadMovieNames()

  // Resolve a title without throwing: an ID that is absent from movies.dat
  // must not abort the job with NoSuchElementException after the expensive
  // similarity computation has already run.
  def movieName(id: Int): String = nameDict.getOrElse(id, s"Unknown movie (ID $id)")

  val data = sc.textFile("s3n://[MY-BUCKET-NAME]/ml-1m/ratings.dat")

  // Map ratings to key / value pairs: user ID => movie ID, rating
  val ratings = data.map(l => l.split("::")).map(l => (l(0).toInt, (l(1).toInt, l(2).toDouble)))

  // Emit every movie rated together by the same user.
  // Self-join to find every combination.
  val joinedRatings = ratings.join(ratings)

  // At this point our RDD consists of userID => ((movieID, rating), (movieID, rating))

  // Filter out duplicate pairs
  val uniqueJoinedRatings = joinedRatings.filter(filterDuplicates)

  // Now key by (movie1, movie2) pairs.
  val moviePairs = uniqueJoinedRatings.map(makePairs).partitionBy(new HashPartitioner(100))

  // We now have (movie1, movie2) => (rating1, rating2)
  // Now collect all ratings for each movie pair and compute similarity
  val moviePairRatings = moviePairs.groupByKey()

  // We now have (movie1, movie2) = > (rating1, rating2), (rating1, rating2) ...
  // Can now compute similarities.
  val moviePairSimilarities = moviePairRatings.mapValues(computeCosineSimilarity).cache()

  //Save the results if desired
  //val sorted = moviePairSimilarities.sortByKey()
  //sorted.saveAsTextFile("movie-sims")

  // Extract similarities for the movie we care about that are "good".
  if (args.length > 0) {
    val scoreThreshold = 0.97
    val coOccurenceThreshold = 1000.0

    val movieID:Int = args(0).toInt

    // Filter for movies with this sim that are "good" as defined by
    // our quality thresholds above
    val filteredResults = moviePairSimilarities.filter( x =>
      {
        val pair = x._1
        val sim = x._2
        (pair._1 == movieID || pair._2 == movieID) && sim._1 > scoreThreshold && sim._2 > coOccurenceThreshold
      }
    )

    // Sort by quality score (descending) and keep the first 50.
    val results = filteredResults.map( x => (x._2, x._1)).sortByKey(false).take(50)

    println("\nTop 50 similar movies for " + movieName(movieID))
    for (result <- results) {
      val sim = result._1
      val pair = result._2
      // Display the similarity result that isn't the movie we're looking at
      val similarMovieID = if (pair._1 == movieID) pair._2 else pair._1
      println(movieName(similarMovieID) + "\tscore: " + sim._1 + "\tstrength: " + sim._2)
    }
  }
}
}
Правка: если набраться терпения, видно, что код действительно продолжает выполняться.
Поскольку это заняло некоторое время, казалось, что программа ничего не делает и уже завершилась, но на самом деле это не так.
...previous stacktrace
log4j:ERROR Either File or DatePattern options are not set for appender [DRFA-stdout].
Loading movie names...
18/11/06 13:14:11 INFO GPLNativeCodeLoader: Loaded native gpl library 18/11/06 13:14:11 INFO LzoCodec: Successfully loaded & initialized native-lzo library [hadoop-lzo rev 4a14a96f353432301b136f851837191211fcf807]
Top 50 similar movies for Star Wars: Episode IV - A New Hope (1977) Star Wars: Episode V - The Empire Strikes Back (1980) score:
0.9897917106566659 strength: 2355 Raiders of the Lost Ark (1981) score: 0.9855548278565054 strength: 1972 Star Wars: Episode VI
- Return of the Jedi (1983) score: 0.9841248359926177 strength: 2113 Indiana Jones and the Last Crusade (1989) score:
0.9774440028650038 strength: 1397 Shawshank Redemption, The (1994) score: 0.9768332708746131 strength: 1412 Usual Suspects, The (1995) score: 0.9766875136831684 strength: 1194 Godfather, The (1972) score: 0.9759284503618028 strength: 1583 Sixth Sense, The (1999) score: 0.974688767430798 strength: 1480 Schindler's List (1993) score: 0.9746820121947888 strength: 1422 Terminator, The (1984) score: 0.9745821991816754 strength: 1746 Back to the Future (1985) score: 0.9743476892310179 strength: 1845 Fugitive, The (1993) score: 0.9740503810950097 strength: 1429 Princess Bride, The (1987) score: 0.9737384179609926 strength: 1657 Matrix, The (1999) score: 0.9732130645719457 strength: 1908 Butch Cassidy and the Sundance Kid (1969) score: 0.9731825975678353 strength: 1048 Hunt for Red October, The (1990) score: 0.9731286559518592 strength: 1229 Casablanca (1942) score: 0.9730078799612648 strength: 1113 Saving Private Ryan (1998) score: 0.9729484985516464 strength: 1709 Ghostbusters (1984) score: 0.9726721862046535 strength: 1447 Die Hard (1988) score: 0.9724843514829112 strength: 1369 L.A. Confidential (1997) score: 0.9722077641949141 strength: 1416 Toy Story (1995) score: 0.9721270419610062 strength: 1382 Stand by Me (1986) score: 0.9718025936506943 strength: 1212 Close Encounters of the Third Kind (1977) score: 0.9717491756795117 strength: 1242 Monty Python and the Holy Grail (1974) score: 0.9717238750026624 strength: 1248 Silence of the Lambs, The (1991) score:
0.9714472073187363 strength: 1587 Wizard of Oz, The (1939) score: 0.9713633100564869 strength: 1346 Dr. Strangelove or: How I Learned to Stop Worrying and Love the Bomb (1963) score:
0.9713269232938938 strength: 1149 One Flew Over the Cuckoo's Nest (1975) score: 0.9708527915400245 strength: 1125 Ferris Bueller's Day Off (1986) score: 0.9705811698208009 strength: 1073 Godfather: Part II, The (1974) score: 0.9704073574007531 strength: 1246 Terminator 2: Judgment Day (1991) score: 0.9703674024729073 strength: 1889 E.T. the Extra-Terrestrial (1982) score: 0.9702456868065551 strength: 1714
Тем не менее это не значит, что такая ошибка должна появляться.
Я предполагаю, что log4j не работает должным образом, и ошибка не появляется на видео курса. Может, у кого-то есть решение?