如何在Spark中用单个逗号替换字符串列中的多个逗号

时间:2020-06-05 20:27:55

标签: apache-spark apache-spark-sql

我有一个Spark数据框

<!-- NOTE(review): this HTML/JS snippet looks unrelated to the surrounding Spark question
     (probably pulled in from another page); the DataFrame definition that the preceding
     sentence introduces appears further down instead. TODO confirm and relocate. -->
<!-- Video element with native controls and a poster image; one MP4 source. -->
<video controls id="myVideo" poster="test.jpg">
  <source src="test.mp4" type="video/mp4" />
</video>
<script>
// Look up the player element declared above by its id.
let myVideo = document.getElementById("myVideo");
// When playback reaches the end, re-assign the poster and the source URL.
// Presumably this is meant to bring the poster image back after the video finishes — confirm.
myVideo.onended = function() {
  myVideo.poster = "test.jpg"
  myVideo.src = "test.mp4"
};
</script> 

我想从此列中删除不必要的逗号。因此,例如,第一条记录应在输出中显示为val df = Seq( (",,,,104,,,,,,,111,,,114,,,,,,,,,,,,,,,,,131,,,,,,,,,,,,,,,,,,,,,,,,,,157,,,,,162,,,,,,,169,,,,,174,,176,,,,,,,,,,,,, "), (",,,,104,,,,,,,111,,,114,,,,,,,,,,,,,,,,,131,,,,,,,,,,,,,,,,,,,,,,,,,,157,,,,,162,,,,,,,169,,,,,174,,176,,,,,,,,,,,,, "), (",,,,104,,,,,,,111,,,114,,,,,,,,,,,,,,,,,131,,,,,,,,,,,,,,,,,,,,,,,,,,157,,,,,162,,,,,,,169,,,,,174,,176,,,,,,,,,,,,, "), (",,,,104,,,,,,,111,,,114,,,,,,,,,,,,,,,,,131,,,,,,,,,,,,,,,,,,,,,,,,,,157,,,160,,162,,,,,,,,,,,,174,,176,,,,,,,,,,,,, "), (",,,,104,,,,,,,111,,,114,,,,,,,,,,,,,,,,,131,,,,,,,,,,,,,,,,,,,,,,,,,,157,,,,,162,,,,,,,169,,,,,174,,176,,,,,,,,,,,,, "), (",,,,104,,,,,,,111,,,,,,,,,,,,,,,,,,,,131,,,,,,,,,,,,,,,,,,,,,,,,,,157,,,160,,162,,,,,,,169,,,,,174,,176,,,,,,,,,,,,, "), (",,,103,104,,,,,,,111,,,114,,,,,,,121,,,,,,,,,,131,,,,,,,,,,,,,,,,,,,,,,,,,,157,,,,,162,,,,,,,169,,,,,174,,176,,,,,,,,,,,,, "), (",101,102,,104,,,,,,,,,113,114,,,,,,,,,,,,,,,,130,131,,,,,,,,,,141,142,143,,,146,,,,150,,152,,,,,157,,,,,162,,,,,,,,,,,,174,,,,,,,,,,184,,,,, "), (",,,,104,,,,,,,,,113,,,,,,,,,,,,,,,,,,131,,,,,,,,,,141,142,143,,,146,,,,150,,,,,155,,157,,,,,162,,,,,,,169,,,,,174,,176,177,178,,,,,,,,,,, "), (",,,,104,,,,,,,111,,,114,,,,,,,,,,,,,,,,,131,,,,,,,,,,,,,,,,,,,,,,,,,,157,,,,,162,,,,,,,,,,,,174,,176,,,,,,,,,,,,, "), (",,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,, "), (",,102,,104,,,,,,,,,113,114,,,,,,,,,,,,,,,,130,131,,,,,,,,,,141,142,143,,,146,,,,150,,152,,,,,157,,,,,162,,,,,,,,,,,,174,,,,,,,,,,,,,,, "), (",,,,104,,,,,,,111,112,,114,,,,,,,,,,,,,,,,,131,,,,,,,,,,,,,,,,,,,,,,,,,,157,,,,,162,,,,,,,169,,,,,174,,176,,,,,,,,,,,,, "), (",,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,, "), (",,102,103,104,,,,,,,,,113,114,,,,,,,121,,,,,,,,,130,131,,,,,,,,,,141,142,143,,,146,,,,150,,152,,,,,157,,,160,,162,,,,,,,,,,,173,174,,176,,178,,,,,,,,,,,"), 
(",,,,104,,,,,,,111,,,114,,,,,,,,,,,,,,,,,131,,,,,,,,,,,,,,,,,,,,,,,,,,157,,,,,162,,,,,,,,,,,,174,,176,,,,,,,,,,,,, "), (",,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,, "), (",,,103,104,,,,,,,111,,,114,,,,,,,121,,,,,,,,,,131,,,,,,,,,,,,,,,,,,,,,,,,,,157,,,,,162,,,,,,,169,,,,,174,,176,,,,,,,,,,,,, "), (",,,,104,,,,,,,111,,,114,,,,,,,,,,,,,,,,,131,,,,,,,,,,,,,,,,,,,,,,,,,,157,,,,,162,,,,,,,169,,,,,174,,176,,,,,,,,,,,,, "), (",,102,,104,,,,,,,,,113,114,,,,,,,,,,,,,,,,130,131,,,,,,,,,,141,142,143,,,146,,,,150,,152,,,,,157,,,,,162,,,,,,,,,,,,174,,,,,,,,,,,,,,, ") ).toDF("my_col")

处理后的字符串不应以逗号开头或结尾。

如何在Spark中执行此操作?

2 个答案:

答案 0 :(得分:1)

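首先,以“,”为分隔符拆分字符串;然后使用array_remove函数删除其中的空字符串,再用array_join把数组重新连接成字符串。由于原字符串大多以“, ”(逗号加空格)结尾,拆分后的最后一个元素是一个空格而不是空字符串,不会被array_remove删除,因此经过trim之后,结果末尾仍会留下一个“,”。要去掉它,可以定义一个udf来删除字符串最右边的这个逗号。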

scala> df.show(false)
+--------------------------------------------------------------------------------------------------------------------------------------------------------+
|my_col                                                                                                                                                  |
+--------------------------------------------------------------------------------------------------------------------------------------------------------+
|,,,,104,,,,,,,111,,,114,,,,,,,,,,,,,,,,,131,,,,,,,,,,,,,,,,,,,,,,,,,,157,,,,,162,,,,,,,169,,,,,174,,176,,,,,,,,,,,,,                                    |
|,,,,104,,,,,,,111,,,114,,,,,,,,,,,,,,,,,131,,,,,,,,,,,,,,,,,,,,,,,,,,157,,,,,162,,,,,,,169,,,,,174,,176,,,,,,,,,,,,,                                    |
|,,,,104,,,,,,,111,,,114,,,,,,,,,,,,,,,,,131,,,,,,,,,,,,,,,,,,,,,,,,,,157,,,,,162,,,,,,,169,,,,,174,,176,,,,,,,,,,,,,                                    |
|,,,,104,,,,,,,111,,,114,,,,,,,,,,,,,,,,,131,,,,,,,,,,,,,,,,,,,,,,,,,,157,,,160,,162,,,,,,,,,,,,174,,176,,,,,,,,,,,,,                                    |
|,,,,104,,,,,,,111,,,114,,,,,,,,,,,,,,,,,131,,,,,,,,,,,,,,,,,,,,,,,,,,157,,,,,162,,,,,,,169,,,,,174,,176,,,,,,,,,,,,,                                    |
|,,,,104,,,,,,,111,,,,,,,,,,,,,,,,,,,,131,,,,,,,,,,,,,,,,,,,,,,,,,,157,,,160,,162,,,,,,,169,,,,,174,,176,,,,,,,,,,,,,                                    |
|,,,103,104,,,,,,,111,,,114,,,,,,,121,,,,,,,,,,131,,,,,,,,,,,,,,,,,,,,,,,,,,157,,,,,162,,,,,,,169,,,,,174,,176,,,,,,,,,,,,,                              |
|,101,102,,104,,,,,,,,,113,114,,,,,,,,,,,,,,,,130,131,,,,,,,,,,141,142,143,,,146,,,,150,,152,,,,,157,,,,,162,,,,,,,,,,,,174,,,,,,,,,,184,,,,,            |
|,,,,104,,,,,,,,,113,,,,,,,,,,,,,,,,,,131,,,,,,,,,,141,142,143,,,146,,,,150,,,,,155,,157,,,,,162,,,,,,,169,,,,,174,,176,177,178,,,,,,,,,,,               |
|,,,,104,,,,,,,111,,,114,,,,,,,,,,,,,,,,,131,,,,,,,,,,,,,,,,,,,,,,,,,,157,,,,,162,,,,,,,,,,,,174,,176,,,,,,,,,,,,,                                       |
|,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,                                                               |
|,,102,,104,,,,,,,,,113,114,,,,,,,,,,,,,,,,130,131,,,,,,,,,,141,142,143,,,146,,,,150,,152,,,,,157,,,,,162,,,,,,,,,,,,174,,,,,,,,,,,,,,,                  |
|,,,,104,,,,,,,111,112,,114,,,,,,,,,,,,,,,,,131,,,,,,,,,,,,,,,,,,,,,,,,,,157,,,,,162,,,,,,,169,,,,,174,,176,,,,,,,,,,,,,                                 |
|,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,                                                               |
|,,102,103,104,,,,,,,,,113,114,,,,,,,121,,,,,,,,,130,131,,,,,,,,,,141,142,143,,,146,,,,150,,152,,,,,157,,,160,,162,,,,,,,,,,,173,174,,176,,178,,,,,,,,,,,|
|,,,,104,,,,,,,111,,,114,,,,,,,,,,,,,,,,,131,,,,,,,,,,,,,,,,,,,,,,,,,,157,,,,,162,,,,,,,,,,,,174,,176,,,,,,,,,,,,,                                       |
|,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,                                                               |
|,,,103,104,,,,,,,111,,,114,,,,,,,121,,,,,,,,,,131,,,,,,,,,,,,,,,,,,,,,,,,,,157,,,,,162,,,,,,,169,,,,,174,,176,,,,,,,,,,,,,                              |
|,,,,104,,,,,,,111,,,114,,,,,,,,,,,,,,,,,131,,,,,,,,,,,,,,,,,,,,,,,,,,157,,,,,162,,,,,,,169,,,,,174,,176,,,,,,,,,,,,,                                    |
|,,102,,104,,,,,,,,,113,114,,,,,,,,,,,,,,,,130,131,,,,,,,,,,141,142,143,,,146,,,,150,,152,,,,,157,,,,,162,,,,,,,,,,,,174,,,,,,,,,,,,,,,                  |
+--------------------------------------------------------------------------------------------------------------------------------------------------------+


scala> df.select(trim(array_join(array_remove(split($"my_col", ","), ""),",")) as "my_col").show(false)
+-----------------------------------------------------------------------------------+
|my_col                                                                             |
+-----------------------------------------------------------------------------------+
|104,111,114,131,157,162,169,174,176,                                               |
|104,111,114,131,157,162,169,174,176,                                               |
|104,111,114,131,157,162,169,174,176,                                               |
|104,111,114,131,157,160,162,174,176,                                               |
|104,111,114,131,157,162,169,174,176,                                               |
|104,111,131,157,160,162,169,174,176,                                               |
|103,104,111,114,121,131,157,162,169,174,176,                                       |
|101,102,104,113,114,130,131,141,142,143,146,150,152,157,162,174,184,               |
|104,113,131,141,142,143,146,150,155,157,162,169,174,176,177,178,                   |
|104,111,114,131,157,162,174,176,                                                   |
|                                                                                   |
|102,104,113,114,130,131,141,142,143,146,150,152,157,162,174,                       |
|104,111,112,114,131,157,162,169,174,176,                                           |
|                                                                                   |
|102,103,104,113,114,121,130,131,141,142,143,146,150,152,157,160,162,173,174,176,178|
|104,111,114,131,157,162,174,176,                                                   |
|                                                                                   |
|103,104,111,114,121,131,157,162,169,174,176,                                       |
|104,111,114,131,157,162,169,174,176,                                               |
|102,104,113,114,130,131,141,142,143,146,150,152,157,162,174,                       |
+-----------------------------------------------------------------------------------+


scala> val myUdf = udf{(x:String) => if(x.endsWith(",")){x.dropRight(1)} else {x}}
myUdf: org.apache.spark.sql.expressions.UserDefinedFunction = UserDefinedFunction(<function1>,StringType,Some(List(StringType)))

scala> df.select(myUdf(trim(array_join(array_remove(split($"my_col", ","), ""),","))) as "my_col").show(false)
+-----------------------------------------------------------------------------------+
|my_col                                                                             |
+-----------------------------------------------------------------------------------+
|104,111,114,131,157,162,169,174,176                                                |
|104,111,114,131,157,162,169,174,176                                                |
|104,111,114,131,157,162,169,174,176                                                |
|104,111,114,131,157,160,162,174,176                                                |
|104,111,114,131,157,162,169,174,176                                                |
|104,111,131,157,160,162,169,174,176                                                |
|103,104,111,114,121,131,157,162,169,174,176                                        |
|101,102,104,113,114,130,131,141,142,143,146,150,152,157,162,174,184                |
|104,113,131,141,142,143,146,150,155,157,162,169,174,176,177,178                    |
|104,111,114,131,157,162,174,176                                                    |
|                                                                                   |
|102,104,113,114,130,131,141,142,143,146,150,152,157,162,174                        |
|104,111,112,114,131,157,162,169,174,176                                            |
|                                                                                   |
|102,103,104,113,114,121,130,131,141,142,143,146,150,152,157,160,162,173,174,176,178|
|104,111,114,131,157,162,174,176                                                    |
|                                                                                   |
|103,104,111,114,121,131,157,162,169,174,176                                        |
|104,111,114,131,157,162,169,174,176                                                |
|102,104,113,114,130,131,141,142,143,146,150,152,157,162,174                        |
+-----------------------------------------------------------------------------------+

答案 1 :(得分:1)

您可以使用regexp_replace:

val df_cleaned = df.withColumn("cleaned", regexp_replace(trim(col("my_col")), ",+", ",")) // trim first: sample values end in ", ", which would otherwise hide the last comma from ",$"; then collapse each comma run into one comma
  .withColumn("cleaned", regexp_replace(col("cleaned"), "^,", "")) // drop the single leading comma, if any
  .withColumn("cleaned", regexp_replace(col("cleaned"), ",$", "")) // drop the single trailing comma, if any

第一行把连续的多个逗号合并为单个逗号,第二行和第三行分别删除开头和结尾的逗号。

相关问题