从具有嵌套案例类作为字段的案例类创建Dataframe

时间:2018-04-30 14:34:08

标签: scala apache-spark spark-dataframe apache-spark-dataset

我运行以下代码:

import com.holdenkarau.spark.testing.DatasetSuiteBase
import org.apache.spark.sql.catalyst.expressions.GenericRowWithSchema
import org.apache.spark.sql.{Encoders, Row}
import org.scalatest.FlatSpec

case class Inner(i: Int)

case class Outer(in: Inner)

class MyTest extends FlatSpec with DatasetSuiteBase {
  behavior of "Engine"
  it should "work" in {
    import spark.implicits._
    val input = Seq("alice", "bob").toDF("name")
    val schema = Encoders.product[Outer].schema
    implicit val enc = Encoders.kryo[Row]
    val processed = input
      .map { row =>
        new GenericRowWithSchema(Array(Outer(Inner(row.getString(0).length))), schema): Row
      }
    processed.printSchema() //1
    processed.show //2
    val withSchema = spark.createDataFrame(processed.rdd, schema)
    withSchema.printSchema //3
    withSchema.show // throws exception
  }
}

来自1

的结果
root
 |-- value: binary (nullable = true)

来自2

的结果
+--------------------+
|               value|
+--------------------+
|[01 00 6F 72 67 2...|
|[01 00 6F 72 67 2...|
+--------------------+

来自3

的结果
root
 |-- in: struct (nullable = true)
 |    |-- i: integer (nullable = false)

4抛出异常

Outer is not a valid external type for schema of struct<i:int>

有谁知道这里有什么问题吗?它甚至可以在Spark中使用吗?

@edit 重新实现

  it should "find minimal example" in {
    import spark.implicits._
    val input = Seq("alice", "bob").toDF("name")
    val schema = Encoders.product[Outer].schema
    implicit val enc = RowEncoder(schema)
    val processed = input.map { row => Row(Outer(Inner(row.getString(0).length))) }
    processed.printSchema()
    processed.show
  }

0 个答案:

没有答案