I am new to Spark. I wrote some code in Scala and ran it in spark-shell.
However, I now want all of that code in a single Spark application. I have tried to format it as well as I can, but I still get compilation errors. Could someone debug it completely?
Please ignore the comments, as they are the questions I need to solve.
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.functions._

// Renamed from `functions`, which collided with the imported org.apache.spark.sql.functions.
object AadhaarApp {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().appName("test").master("local").getOrCreate()
    import spark.implicits._ // needed for the $"..." column syntax used below

    val df = spark.read
      .option("header", "true")
      .option("inferSchema", "true")
      .csv("aadhaar_data.csv")
      .toDF("date", "registrar", "private_agency", "state", "district", "sub_district",
        "pincode", "gender", "age", "aadhaar_generated", "rejected", "mobile_number", "email_id")
    // registerTempTable is deprecated since Spark 2.0; createOrReplaceTempView is its replacement.
    df.createOrReplaceTempView("data")
    // 1. View the top 25 rows for each private agency.
    spark.sql(
      """select * from (
        |  select *, row_number() over (partition by private_agency order by private_agency desc) as Row_Num
        |  from data
        |) as b where b.Row_Num <= 25""".stripMargin).show(30)
    // Checkpoint 2
    // 1. Describe the schema.
    df.printSchema()
    // 2. Find the count and names of registrars in the table.
    df.select("registrar").distinct().show()
    // In an application (unlike spark-shell) a bare count() is not echoed, so print it.
    println(df.select("registrar").distinct().count())
    // 3. Find the number of states, districts in each state, and sub-districts in each district.
    println(df.select("state").distinct().count())
    // COUNT(DISTINCT ...) so repeated rows for the same district/sub-district are not double-counted.
    spark.sql("SELECT state, COUNT(DISTINCT district) AS district_count FROM data GROUP BY state ORDER BY district_count DESC").show()
    spark.sql("SELECT district, COUNT(DISTINCT sub_district) AS sub_district_count FROM data GROUP BY district ORDER BY sub_district_count DESC").show()
    // 4. Find the names of private agencies for each state.
    spark.sql("SELECT state, private_agency FROM data GROUP BY state, private_agency ORDER BY state").show(2000, false)
    // Checkpoint 3
    // 1. Find the top 3 states generating the most Aadhaar cards.
    spark.sql("SELECT state, SUM(aadhaar_generated) AS aadhaar_count FROM data GROUP BY state ORDER BY aadhaar_count DESC LIMIT 3").show()
    // 2. Find the top 3 districts where enrolment numbers are maximum.
    val generated = df.groupBy("district").sum("aadhaar_generated")
    val rejected = df.groupBy("district").sum("rejected")
    // Join on the district key itself: pairing rows with monotonically_increasing_id is
    // unreliable, because the generated ids are not guaranteed to match across two DataFrames.
    // (Also renamed from `concat`, which shadowed the built-in concat function.)
    val combined = generated.join(rejected, Seq("district"))
    // `final` is a reserved word in Scala and cannot be used as an identifier, so rename it;
    // orderBy is needed, since show(3) alone would return three arbitrary rows, not the top 3.
    val result = combined
      .withColumn("Sum_Value", $"sum(aadhaar_generated)" + $"sum(rejected)")
      .orderBy(desc("Sum_Value"))
    println("top 3 districts where enrolment numbers are maximum along with the number of enrolments")
    result.show(3, false)
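    // The same result as a single SQL query (a sketch; it skips the intermediate DataFrames):
    // spark.sql("SELECT district, SUM(aadhaar_generated) + SUM(rejected) AS Sum_Value FROM data GROUP BY district ORDER BY Sum_Value DESC LIMIT 3").show(false)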
    // 3. Find the number of Aadhaar cards generated in each state.
    spark.sql("SELECT state, SUM(aadhaar_generated) AS aadhaar_count FROM data GROUP BY state").show()
    // Checkpoint 4
    // 1. Find the number of unique pincodes in the data.
    df.select("pincode").distinct().show()
    println(df.select("pincode").distinct().count())
    // 2. Find the number of Aadhaar registrations rejected in Uttar Pradesh and Maharashtra.
    spark.sql("SELECT state, SUM(rejected) AS rejected_count FROM data GROUP BY state HAVING state = 'Uttar Pradesh' OR state = 'Maharashtra'").show()

    spark.stop()
  }
}
PS: I have been trying to build the application in IntelliJ IDEA.
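For reference, a minimal build.sbt along these lines should be enough to compile this in IntelliJ (the Scala and Spark versions are assumptions; match them to your installation):

name := "aadhaar-analysis"
version := "0.1"
scalaVersion := "2.11.12"
libraryDependencies += "org.apache.spark" %% "spark-sql" % "2.3.0"

After sbt package, the application can be launched with something like:

spark-submit --class AadhaarApp --master "local[*]" target/scala-2.11/aadhaar-analysis_2.11-0.1.jar

Note that .master("local") set in the code takes precedence over the --master flag, so for a cluster run you would drop it from the code.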