Разбор Json с использованием Play Json с разными полями - PullRequest
0 голосов
/ 29 мая 2018

Ниже приведён мой JSON, который я разбираю с помощью play-json. По какой-то причине узел "datafeeds/schema/fields" обрабатывается неправильно.

Я написал стандартные Reads для разбора этого JSON, но узел "datafeeds", похоже, разбирается неправильно из-за того, что узел "format" (datafeeds/schema/fields) иногда является строкой (String), а иногда объектом (JsObject); то же самое относится и к узлу "type".

Если я рассматриваю Schema как JsObject, тогда весь Json анализируется правильно, и мне кажется, что мне придется обрабатывать Schema отдельно.

Мой JSON выглядит следующим образом:

{
    "entities": [
        {
            "name": "customers",
            "number_of_buckets": 5,
            "entity_column_name": "customer_id",
            "entity_column_type": "integer"
        },
        {
            "name": "accounts",
            "number_of_buckets": 7,
            "entity_column_name": "account_id",
            "entity_column_type": "string"
        },
        {
            "name": "products",
            "number_of_buckets": 1,
            "entity_column_name": "product_id",
            "entity_column_type": "integer"
        }
    ],
    "datafeeds": [
        {
            "name": "customer_demographics",
            "version": "1",
            "delimiter": "|",
            "filename_re_pattern": ".*(customer_demographics_v1_[0-9]{8}\\.psv)$",
            "frequency": {
                "days": 1
            },
            "from": "2015-07-01",
            "drop_threshold": {
                "rows": null,
                "percentage": 0.05
            },
            "dry_run": false,
            "header": true,
            "text_qualifier": null,
            "landing_path": "landing",
            "schema": {
                "fields": [
                    {
                        "time_key": true,
                        "format": "yyyy-MM-dd",
                        "metadata": {},
                        "name": "record_date",
                        "nullable": false,
                        "primary_key": true,
                        "type": "timestamp",
                        "timezone": "Australia/Sydney"
                    },
                    {
                        "format": "yyyy-MM-dd",
                        "metadata": {},
                        "name": "extract_date",
                        "nullable": false,
                        "primary_key": true,
                        "type": "timestamp",
                        "timezone": "Australia/Sydney"
                    },
                    {
                        "entity_type": "customers",
                        "metadata": {},
                        "name": "customer_id",
                        "nullable": false,
                        "primary_key": true,
                        "type": "integer"
                    },
                    {
                        "metadata": {},
                        "name": "year_of_birth",
                        "nullable": true,
                        "type": "integer"
                    },
                    {
                        "metadata": {},
                        "name": "month_of_birth",
                        "nullable": true,
                        "type": "integer"
                    },
                    {
                        "metadata": {},
                        "name": "postcode",
                        "nullable": true,
                        "type": "string"
                    },
                    {
                        "metadata": {},
                        "name": "state",
                        "nullable": true,
                        "type": "string"
                    },
                    {
                        "format": {
                            "false": "N",
                            "true": "Y"
                        },
                        "metadata": {},
                        "name": "marketing_consent",
                        "nullable": true,
                        "type": "boolean"
                    }
                ],
                "type": "struct"
            }
        },
        {
            "name": "customer_statistics",
            "version": "1",
            "delimiter": "|",
            "filename_re_pattern": ".*(customer_statistics_v1_[0-9]{8}\\.psv)$",
            "frequency": {
                "days": 1
            },
            "from": "2015-07-01",
            "drop_threshold": {
                "rows": null,
                "percentage": 0.05
            },
            "dry_run": false,
            "header": true,
            "text_qualifier": null,
            "landing_path": "landing",
            "schema": {
                "fields": [
                    {
                        "time_key": true,
                        "format": "yyyy-MM-dd",
                        "metadata": {},
                        "name": "record_date",
                        "nullable": false,
                        "primary_key": true,
                        "type": "timestamp",
                        "timezone": "Australia/Sydney"
                    },
                    {
                        "format": "yyyy-MM-dd",
                        "metadata": {},
                        "name": "extract_date",
                        "nullable": false,
                        "primary_key": true,
                        "type": "timestamp",
                        "timezone": "Australia/Sydney"
                    },
                    {
                        "entity_type": "customers",
                        "metadata": {},
                        "name": "customer_id",
                        "nullable": false,
                        "primary_key": true,
                        "type": "integer"
                    },
                    {
                        "metadata": {},
                        "name": "risk_score",
                        "nullable": true,
                        "type": "double"
                    },
                    {
                        "metadata": {},
                        "name": "mkg_segments",
                        "nullable": true,
                        "type": {
                            "type":"array",
                            "elementType":"string",
                            "containsNull": false
                        }
                    },
                    {
                        "metadata": {},
                        "name": "avg_balance",
                        "nullable": true,
                        "type": "decimal"
                    },
                    {
                        "metadata": {},
                        "name": "num_accounts",
                        "nullable": true,
                        "type": "integer"
                    }
                ],
                "type": "struct"
            }
        }

    ],
    "tables": [
        {
            "name": "table_name",
            "version": "version",
            "augmentations": [
                {
                    "left_table_name": "left_table_name",
                    "left_table_version": "v1",
                    "right_table_name": "right_table_name",
                    "right_table_version": "v1",
                    "columns": [
                        "column_a",
                        "column_b",
                        "column_c"
                    ],
                    "join_cols": [
                        {
                            "left_table": "system_code",
                            "right_table": "key_a"
                        },
                        {
                            "left_table": "group_product_code",
                            "right_table": "key_b"
                        },
                        {
                            "left_table": "sub_product_code",
                            "right_table": "key_c"
                        }
                    ]
                }
            ],
            "sources": [
                {
                    "name": "table_name",
                    "version": "v1",
                    "mandatory": true,
                    "type": "datafeed | table"
                }
            ],
            "aggregations": [
                {
                    "column_name": "customer_age_customer_age",
                    "column_type": "long",
                    "description": "date_diff",
                    "expression": "max_by",
                    "source_columns": [
                        {
                            "column_name": "customer_age_year_of_birth",
                            "source": {
                                "name": "customers",
                                "type": "table",
                                "version": "v1"
                            }
                        },
                        {
                            "column_name": "customer_age_month_of_birth",
                            "source": {
                                "name": "customers",
                                "type": "table",
                                "version": "v1"
                            }
                        }
                    ]
                }
            ],
            "column_level_transformations": [
                {
                    "column_name": "column_added",
                    "column_type": "long",
                    "description": "adding two columns to return something else",
                    "expression": "column_a+column_b",
                    "source_columns": [
                        {
                            "column_name": "column_a",
                            "source": {
                                "name": "source_a",
                                "type": "table",
                                "version": "v1"
                            }
                        },
                        {
                            "column_name": "column_b",
                            "source": {
                                "name": "source_b",
                                "type": "table",
                                "version": "v1"
                            }
                        }
                    ]
                }
            ],
            "frequency": {
                "months": 1
            },
            "joins": [
                {
                    "name": "table_name",
                    "version": "v1"
                },
                {
                    "name": "table_name_b",
                    "version": "v2"
                }
            ],
            "from": "2015-07-01",
            "format": "parquet",
            "structure": "primitives",
            "index_query": "sql statement",
            "insert_query": "sql statement"
        }
    ]
}

Есть идеи, как разобрать этот Json?

1 Ответ

0 голосов
/ 29 мая 2018

Редактировать: обновлено, чтобы ответить на обновленный вопрос

Я не уверен, как вы анализируете сейчас, но вы можете попробовать это:

import play.api.libs.json.Reads._
import play.api.libs.json._

/** Mirrors the `frequency` object of a datafeed entry, e.g. `{"days": 1}`.
  *
  * @param days value of the `days` key
  */
case class Frequency(
  days: Int
)

/** Mirrors the `drop_threshold` object of a datafeed entry.
  *
  * @param rows       optional because the sample document carries `null` here;
  *                   `Int` is a guess, as the sample shows no non-null value
  * @param percentage value of the `percentage` key (`0.05` in the sample)
  */
case class DropThreshold(rows: Option[Int], percentage: Double)

/** Object variant of a schema field's `format` value,
  * e.g. `{"false": "N", "true": "Y"}` in the sample document.
  *
  * The parameter names are Scala keywords, hence the backticks; they match
  * the JSON keys exactly.
  *
  * @param false value of the `"false"` key
  * @param true  value of the `"true"` key
  */
case class Format(
  `false`: String,
  `true`: String
)

/** Object variant of a schema field's `type` value, e.g.
  * `{"type": "array", "elementType": "string", "containsNull": false}`.
  *
  * @param type         value of the nested `type` key (backticked: Scala keyword)
  * @param elementType  value of the `elementType` key
  * @param containsNull value of the `containsNull` key
  */
case class Type(
  `type`: String,
  elementType: String,
  containsNull: Boolean
)

/** One element of a datafeed's `schema.fields` array.
  *
  * Every member is an `Option` because the sample fields each carry a
  * different subset of keys.
  *
  * @param entity_type value of the `entity_type` key, when present
  * @param time_key    value of the `time_key` key, when present
  * @param format      `Left` when `format` is a plain string (e.g. `"yyyy-MM-dd"`),
  *                    `Right` when it is an object readable as [[Format]]
  * @param metadata    the `metadata` object kept as raw JSON
  * @param name        value of the `name` key
  * @param nullable    value of the `nullable` key
  * @param primary_key value of the `primary_key` key, when present
  * @param type        `Left` when `type` is a plain string (e.g. `"integer"`),
  *                    `Right` when it is an object readable as [[Type]]
  * @param timezone    value of the `timezone` key, when present
  */
case class Field(
  entity_type: Option[String],
  time_key: Option[Boolean],
  format: Option[Either[String, Format]],
  metadata: Option[JsObject],
  name: Option[String],
  nullable: Option[Boolean],
  primary_key: Option[Boolean],
  `type`: Option[Either[String, Type]],
  timezone: Option[String]
)

/** Mirrors the `schema` object of a datafeed entry.
  *
  * Only the `fields` array is modelled; the sibling `"type": "struct"` key
  * of the sample document has no counterpart here.
  *
  * @param fields the elements of the `fields` array
  */
case class Schema(
  fields: Seq[Field]
)

/** One element of the top-level `datafeeds` array.
  *
  * Parameter names match the JSON keys one-to-one.
  *
  * @param name                value of the `name` key
  * @param version             value of the `version` key (a string such as `"1"`)
  * @param delimiter           value of the `delimiter` key (`"|"` in the sample)
  * @param filename_re_pattern value of the `filename_re_pattern` key
  * @param frequency           the `frequency` object
  * @param from                value of the `from` key, kept as a string (e.g. `"2015-07-01"`)
  * @param drop_threshold      the `drop_threshold` object
  * @param dry_run             value of the `dry_run` key
  * @param header              value of the `header` key
  * @param text_qualifier      optional because the sample carries `null` here;
  *                            `String` is a guess, as the sample shows no non-null value
  * @param landing_path        value of the `landing_path` key
  * @param schema              the `schema` object
  */
case class Datafeed(
  name: String,
  version: String,
  delimiter: String,
  filename_re_pattern: String,
  frequency: Frequency,
  from: String,
  drop_threshold: DropThreshold,
  dry_run: Boolean,
  header: Boolean,
  text_qualifier: Option[String],
  landing_path: String,
  schema: Schema
)

/** One element of the top-level `entities` array.
  *
  * @param name               value of the `name` key
  * @param number_of_buckets  value of the `number_of_buckets` key
  * @param entity_column_name value of the `entity_column_name` key
  * @param entity_column_type value of the `entity_column_type` key (e.g. `"integer"`)
  */
case class Entity(
  name: String,
  number_of_buckets: Int,
  entity_column_name: String,
  entity_column_type: String
)


/** Root of the document.
  *
  * Only `entities` and `datafeeds` are modelled; the top-level `tables`
  * array of the sample document has no counterpart here.
  *
  * @param entities  the elements of the `entities` array
  * @param datafeeds the elements of the `datafeeds` array
  */
case class MyJson(
  entities: Seq[Entity],
  datafeeds: Seq[Datafeed]
)

/** Derives a `Reads[Either[A, B]]` from a `Reads[A]` and a `Reads[B]`.
  *
  * The value is first read as `A` and, on success, wrapped in `Left`.
  * Only when that fails is it read as `B` and wrapped in `Right`.
  * When both attempts fail, the errors of both are merged into one `JsError`.
  * A successful result keeps the path reported by the underlying reader.
  *
  * @param A reader tried first; its result becomes the `Left` side
  * @param B reader tried second; its result becomes the `Right` side
  */
implicit def eitherReads[A, B](implicit A: Reads[A], B: Reads[B]): Reads[Either[A, B]] =
  Reads[Either[A, B]] { json =>
    val leftAttempt = A.reads(json).map[Either[A, B]](Left(_))
    leftAttempt match {
      case JsError(leftErrors) =>
        // A did not match: fall back to B, keeping A's errors for the report.
        B.reads(json).map[Either[A, B]](Right(_)) match {
          case JsError(rightErrors) => JsError(JsError.merge(leftErrors, rightErrors))
          case rightOk              => rightOk
        }
      case leftOk => leftOk
    }
  }

// Implicit Reads instances, one per case class, generated by Json.reads.
// They are declared in dependency order: each instance comes after the
// instances for the types its case class contains.

// Leaf types: no dependency on any other case class above.
implicit val frequencyReads: Reads[Frequency] = Json.reads[Frequency]
implicit val dropThresholdReads: Reads[DropThreshold] = Json.reads[DropThreshold]
implicit val formatReads: Reads[Format] = Json.reads[Format]
implicit val typeReads: Reads[Type] = Json.reads[Type]
// Field holds Either[String, Format] and Either[String, Type]; the Reads for
// those come from the implicit eitherReads combined with formatReads / typeReads.
implicit val fieldReads: Reads[Field] = Json.reads[Field]
// Schema contains Seq[Field].
implicit val schemaReads: Reads[Schema] = Json.reads[Schema]
// Datafeed contains Frequency, DropThreshold and Schema.
implicit val datafeedReads: Reads[Datafeed] = Json.reads[Datafeed]
implicit val entityReads: Reads[Entity] = Json.reads[Entity]
// Root reader: MyJson contains Seq[Entity] and Seq[Datafeed].
implicit val myJsonReads: Reads[MyJson] = Json.reads[MyJson]

Reads для Either скопирован отсюда. Для проверки:

scala> val json = Json.parse("""{"entities": [{"name": "customers","number_of_buckets": 5,"entity_column_name": "customer_id","entity_column_type": "integer"},{"name": "accounts","number_of_buckets": 7,"entity_column_name": "account_id","entity_column_type": "string"},{"name": "products","number_of_buckets": 1,"entity_column_name": "product_id","entity_column_type": "integer"}],"datafeeds": [{"name": "customer_demographics","version": "1","delimiter": "|","filename_re_pattern": ".*(customer_demographics_v1_[0-9]{8}\\.psv)$","frequency": {"days": 1},"from": "2015-07-01","drop_threshold": {"rows": null,"percentage": 0.05},"dry_run": false,"header": true,"text_qualifier": null,"landing_path": "landing","schema": {"fields": [{"time_key": true,"format": "yyyy-MM-dd","metadata": {},"name": "record_date","nullable": false,"primary_key": true,"type": "timestamp","timezone": "Australia/Sydney"},{"format": "yyyy-MM-dd","metadata": {},"name": "extract_date","nullable": false,"primary_key": true,"type": "timestamp","timezone": "Australia/Sydney"},{"entity_type": "customers","metadata": {},"name": "customer_id","nullable": false,"primary_key": true,"type": "integer"},{"metadata": {},"name": "year_of_birth","nullable": true,"type": "integer"},{"metadata": {},"name": "month_of_birth","nullable": true,"type": "integer"},{"metadata": {},"name": "postcode","nullable": true,"type": "string"},{"metadata": {},"name": "state","nullable": true,"type": "string"},{"format": {"false": "N","true": "Y"},"metadata": {},"name": "marketing_consent","nullable": true,"type": "boolean"}],"type": "struct"}},{"name": "customer_statistics","version": "1","delimiter": "|","filename_re_pattern": ".*(customer_statistics_v1_[0-9]{8}\\.psv)$","frequency": {"days": 1},"from": "2015-07-01","drop_threshold": {"rows": null,"percentage": 0.05},"dry_run": false,"header": true,"text_qualifier": null,"landing_path": "landing","schema": {"fields": [{"time_key": true,"format": "yyyy-MM-dd","metadata": {},"name": 
"record_date","nullable": false,"primary_key": true,"type": "timestamp","timezone": "Australia/Sydney"},{"format": "yyyy-MM-dd","metadata": {},"name": "extract_date","nullable": false,"primary_key": true,"type": "timestamp","timezone": "Australia/Sydney"},{"entity_type": "customers","metadata": {},"name": "customer_id","nullable": false,"primary_key": true,"type": "integer"},{"metadata": {},"name": "risk_score","nullable": true,"type": "double"},{"metadata": {},"name": "mkg_segments","nullable": true,"type": {"type":"array","elementType":"string","containsNull": false}},{"metadata": {},"name": "avg_balance","nullable": true,"type": "decimal"},{"metadata": {},"name": "num_accounts","nullable": true,"type": "integer"}],"type": "struct"}}],"tables": [{"name": "table_name","version": "version","augmentations": [{"left_table_name": "left_table_name","left_table_version": "v1","right_table_name": "right_table_name","right_table_version": "v1","columns": ["column_a","column_b","column_c"],"join_cols": [{"left_table": "system_code","right_table": "key_a"},{"left_table": "group_product_code","right_table": "key_b"},{"left_table": "sub_product_code","right_table": "key_c"}]}],"sources": [{"name": "table_name","version": "v1","mandatory": true,"type": "datafeed | table"}],"aggregations": [{"column_name": "customer_age_customer_age","column_type": "long","description": "date_diff","expression": "max_by","source_columns": [{"column_name": "customer_age_year_of_birth","source": {"name": "customers","type": "table","version": "v1"}},{"column_name": "customer_age_month_of_birth","source": {"name": "customers","type": "table","version": "v1"}}]}],"column_level_transformations": [{"column_name": "column_added","column_type": "long","description": "adding two columns to return something else","expression": "column_a+column_b","source_columns": [{"column_name": "column_a","source": {"name": "source_a","type": "table","version": "v1"}},{"column_name": "column_b","source": {"name": 
"source_b","type": "table","version": "v1"}}]}],"frequency": {"months": 1},"joins": [{"name": "table_name","version": "v1"},{"name": "table_name_b","version": "v2"}],"from": "2015-07-01","format": "parquet","structure": "primitives","index_query": "sql statement","insert_query": "sql statement"}]}""")
json: play.api.libs.json.JsValue = {"entities":[{"name":"customers","number_of_buckets":5,"entity_column_name":"customer_id","entity_column_type":"integer"},{"name":"accounts","number_of_buckets":7,"entity_column_name":"account_id","entity_column_type":"string"},{"name":"products","number_of_buckets":1,"entity_column_name":"product_id","entity_column_type":"integer"}],"datafeeds":[{"name":"customer_demographics","version":"1","delimiter":"|","filename_re_pattern":".*(customer_demographics_v1_[0-9]{8}\\.psv)$","frequency":{"days":1},"from":"2015-07-01","drop_threshold":{"rows":null,"percentage":0.05},"dry_run":false,"header":true,"text_qualifier":null,"landing_path":"landing","schema":{"fields":[{"time_key":true,"format":"yyyy-MM-dd","metadata":{},"name":"record...

scala> json.validate[MyJson]
res0: play.api.libs.json.JsResult[MyJson] = JsSuccess(MyJson(List(Entity(customers,5,customer_id,integer), Entity(accounts,7,account_id,string), Entity(products,1,product_id,integer)),List(Datafeed(customer_demographics,1,|,.*(customer_demographics_v1_[0-9]{8}\.psv)$,Frequency(1),2015-07-01,DropThreshold(None,0.05),false,true,None,landing,Schema(List(Field(None,Some(true),Some(Left(yyyy-MM-dd)),Some({}),Some(record_date),Some(false),Some(true),Some(Left(timestamp)),Some(Australia/Sydney)), Field(None,None,Some(Left(yyyy-MM-dd)),Some({}),Some(extract_date),Some(false),Some(true),Some(Left(timestamp)),Some(Australia/Sydney)), Field(Some(customers),None,None,Some({}),Some(customer_id),Some(false),Some(true),Some(Left(integer)),None), Field(None,None,None,Some({}),...

Не забудьте установить для любых необязательных или обнуляемых полей тип Option.

...