
时间:2019-05-03 09:56:58

标签: pyspark

我需要根据数据框中另一列(shift)的值,把同一行的 Col1–Col4 循环左移相应的位数(位移量大于列数时按列数取模,例如 shift=5 等同于左移 1 位)。请注意,我使用的是 Spark 2.1。下面第一个表是输入数据,第二个表是期望的输出:

ID  Col1 Col2 Col3 Col4 shift
1    1    2    3    4     1
2    5    6    7    8     3
3    9    10   11   12    2
4    13   14   15   16    0
5    17   18   19   20    5


ID  Col1 Col2 Col3 Col4
1    2    3     4   1
2    8    5     6   7
3    11   12    9   10
4    13   14    15  16
5    18   19    20  17

2 个答案:

答案 0 :(得分:0)


from collections import deque

def shift(row):
    """Return ``row`` with its value columns rotated left by ``row.shift``.

    The first field (the ID) is kept in place and the trailing ``shift``
    field is dropped, so the returned list lines up with ``df.columns[:-1]``.

    :param row: a Row whose first field is the ID, whose last field is the
        ``shift`` amount, and whose remaining fields are the values to rotate.
    :return: ``[id, rotated values...]`` as a plain list.
    """
    fields = list(row)
    # Rotate only the values between the leading ID and the trailing shift:
    # the expected output in the question leaves the ID column untouched.
    values = deque(fields[1:-1])
    # deque.rotate() moves items to the right for a positive step, so the
    # amount is negated to rotate left; amounts larger than the number of
    # values wrap around.
    values.rotate(-row.shift)
    return fields[:1] + list(values)


# Apply the rotation row by row and rebuild a DataFrame without the shift column.
df.rdd.map(shift).toDF(df.columns[:-1])

答案 1 :(得分:0)


from pyspark.sql import functions as F
from pyspark.sql.types import ArrayType, LongType

# Sample data taken from the input table in the question:
# an id, four value columns, and the per-row left-shift amount.
df = spark.createDataFrame(
    [
        (1, 1, 2, 3, 4, 1),
        (2, 5, 6, 7, 8, 3),
        (3, 9, 10, 11, 12, 2),
        (4, 13, 14, 15, 16, 0),
        (5, 17, 18, 19, 20, 5),
    ],
    ['Id', 'Col1', 'Col2', 'Col3', 'Col4', 'shift']
)

# Schema of df:
# |-- Id: long (nullable = true)
# |-- Col1: long (nullable = true)
# |-- Col2: long (nullable = true)
# |-- Col3: long (nullable = true)
# |-- Col4: long (nullable = true)
# |-- shift: long (nullable = true)

# column names to shift/rotate: everything between the leading Id and the trailing shift
cols = df.columns[1:-1]
#['Col1', 'Col2', 'Col3', 'Col4']

def my_shift(arr, n):
    """Rotate ``arr`` to the left by ``n`` positions.

    :param arr: sequence of values to rotate; ``None`` or an empty sequence
        is returned unchanged.
    :param n: number of positions to rotate left. Values greater than or
        equal to ``len(arr)`` wrap around, a negative value rotates right,
        and ``0`` or ``None`` leaves ``arr`` unchanged.
    :return: ``arr`` itself when there is nothing to rotate, otherwise a new
        list holding the rotated values.
    """
    # Nothing to rotate: null/empty array, or a null/zero shift. Guarding the
    # empty case also avoids a modulo-by-zero below.
    if not arr or not n:
        return arr
    # Reduce n modulo the length so large shifts wrap around; Python's %
    # maps a negative n to the equivalent left rotation.
    offset = n % len(arr)
    return list(arr[offset:]) + list(arr[:offset])

# Wrap my_shift as a Spark UDF whose declared return type is an array of longs.
shift_udf = F.udf(my_shift, ArrayType(LongType()))

# group the cols into an array and then run shift_udf(arr, n) to form 'new_arr',
# then split 'new_arr' back into one column per original column name
df_new = (df.withColumn('arr', F.array([ F.col(c) for c in cols ]))
            .withColumn('new_arr', shift_udf('arr', 'shift'))
            .select('ID', 'shift', 'arr', 'new_arr',
                    *[ F.col('new_arr')[i].alias(c) for i, c in enumerate(cols) ]))

# Contents of df_new:
#| ID|shift|             arr|         new_arr|Col1|Col2|Col3|Col4|
#|  1|    1|    [1, 2, 3, 4]|    [2, 3, 4, 1]|   2|   3|   4|   1|
#|  2|    3|    [5, 6, 7, 8]|    [8, 5, 6, 7]|   8|   5|   6|   7|
#|  3|    2| [9, 10, 11, 12]| [11, 12, 9, 10]|  11|  12|   9|  10|
#|  4|    0|[13, 14, 15, 16]|[13, 14, 15, 16]|  13|  14|  15|  16|
#|  5|    5|[17, 18, 19, 20]|[18, 19, 20, 17]|  18|  19|  20|  17|