Если вам нужен размер как в байтах, так и в количестве строк, воспользуйтесь одним из следующих вариантов:
Альтернатива-1
// ### Alternative 1
// Register the JSON test resource as catalog table `df` and print its rows.
/**
 * file content
 * spark-test-data.json
 * --------------------
 * {"id":1,"name":"abc1"}
 * {"id":2,"name":"abc2"}
 * {"id":3,"name":"abc3"}
 */
val fileName = "spark-test-data.json"
// getResource returns null when the resource is missing from the classpath;
// fail with a descriptive error instead of a bare NullPointerException on `.getPath`.
val path = Option(getClass.getResource(s"/$fileName"))
  .map(_.getPath)
  .getOrElse(
    throw new java.io.FileNotFoundException(s"Resource '/$fileName' was not found on the classpath")
  )
// Create table `df` from the file at `path` using the "json" source, then print it untruncated.
spark.catalog.createTable("df", path, "json")
  .show(false)
/**
 * +---+----+
 * |id |name|
 * +---+----+
 * |1 |abc1|
 * |2 |abc2|
 * |3 |abc3|
 * +---+----+
 */
/** Prints only the `Statistics` row of `DESCRIBE EXTENDED df`, without truncating cell values. */
def showDfStatistics(): Unit =
  spark.sql("DESCRIBE EXTENDED df ")
    .where(col("col_name") === "Statistics")
    .show(truncate = false)

// Pass 1: NOSCAN collects only the statistics that need no full table scan (size in bytes).
spark.sql("ANALYZE TABLE df COMPUTE STATISTICS NOSCAN")
showDfStatistics()
/**
 * +----------+---------+-------+
 * |col_name |data_type|comment|
 * +----------+---------+-------+
 * |Statistics|68 bytes | |
 * +----------+---------+-------+
 */
// Pass 2: without NOSCAN the reported statistics also include the row count (see output below).
spark.sql("ANALYZE TABLE df COMPUTE STATISTICS")
showDfStatistics()
/**
 * +----------+----------------+-------+
 * |col_name |data_type |comment|
 * +----------+----------------+-------+
 * |Statistics|68 bytes, 3 rows| |
 * +----------+----------------+-------+
 */
Альтернатива-2
// ### Alternative 2
// Expose a 10-element range as a temp view and print the plan of a query over it,
// including the optimizer's statistics (EXPLAIN COST).
val df = spark.range(10)
df.createOrReplaceTempView("myView")
val explainCostQuery = "explain cost select * from myView"
spark.sql(explainCostQuery).show(truncate = false)
/**
 * +------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
 * |plan |
 * +------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
 * |== Optimized Logical Plan ==
 * Range (0, 10, step=1, splits=Some(2)), Statistics(sizeInBytes=80.0 B, hints=none)
 *
 * == Physical Plan ==
 * *(1) Range (0, 10, step=1, splits=2)|
 * +------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
 */
Альтернатива-3
// ### Alternative 3
// Re-run the session's planner on df's logical plan and print the size estimate
// taken from the optimized plan's statistics.
val optimizedDfPlan = spark.sessionState.executePlan(df.queryExecution.logical).optimizedPlan
val estimatedSizeInBytes = optimizedDfPlan.stats.sizeInBytes
println(estimatedSizeInBytes)
// 80