我有一个 DataFrame,其中某些列包含多个值,这些值总是用 ^ 分隔:
phone|contact|
ERN~58XXXXXX7~^EPN~5XXXXX551~|C~MXXX~MSO~^CAxxE~~~~~~3XXX5|
我希望把它转换成下面的形式:
phone1|phone2|contact1|contact2|
ERN~5XXXXXXX7|EPN~58XXXX91551~|C~MXXXH~MSO~|CAxxE~~~~~~3XXX5|
由于各列值中分隔符的数量并不固定,如何使用循环来实现这一点?
答案 0 :(得分:0)
// Splits every '^'-delimited column of a pipe-delimited CSV into separate
// columns named <col>1, <col>2, ... Each source column is widened to the
// largest number of values found in any of its rows; shorter rows are padded
// with empty strings.
import org.apache.spark.sql.Row
import org.apache.spark.sql.types.{StringType, StructField, StructType}

val df = sqlContext.read
  .format("com.databricks.spark.csv")
  .option("header", "true")
  .option("delimiter", "|")
  .option("charset", "UTF-8")
  .load("test.txt")

val columnList = df.columns

// Splits one cell on '^'. A null cell yields no values instead of throwing a
// NullPointerException; it is later padded with empty strings like any other
// short cell.
val splitCell: String => Array[String] =
  cell => Option(cell).fold(Array.empty[String])(_.split("\\^"))

// One (column name, number of values in that cell) pair per cell.
val opMap = df.rdd.flatMap { row =>
  columnList.map(col => col -> splitCell(row.getAs[String](col)).length)
}

// Widest cell per column. reduceByKey combines on the map side, so only one
// count per column and partition is shuffled. Every source column keeps at
// least one output column, which also covers an empty DataFrame and columns
// that contain only nulls.
val colMaxSizeMap: Map[String, Int] = {
  val observed = opMap.reduceByKey((a, b) => math.max(a, b)).collect().toMap
  columnList.map(col => col -> math.max(observed.getOrElse(col, 0), 1)).toMap
}

// Expand each row: split every cell and right-pad it to its column's width.
val x = df.rdd.map { row =>
  val cells = columnList.flatMap { col =>
    splitCell(row.getAs[String](col)).padTo(colMaxSizeMap(col), "")
  }
  Row.fromSeq(cells.toSeq)
}

// Output columns are numbered from 1 (phone1, phone2, ...), matching the
// layout requested in the question.
val structFieldList = columnList.flatMap { colName =>
  (1 to colMaxSizeMap(colName)).map(i => StructField(s"$colName$i", StringType))
}
val schema = StructType(structFieldList)

// Reuse the context that produced `df` rather than a separately named session.
val da = df.sqlContext.createDataFrame(x, schema)