计算第一次出现并在列中赋值

时间:2020-05-21 21:54:41

标签: dataframe pyspark apache-spark-sql

我的数据框如下所示:

(原帖此处为数据截图,未能随文本保留。)对于状态 s1 和 s2 的第一次出现,将 open_price 设为 0;其余各行的 open_price 则取 close_price 的滞后值。

    from pyspark.sql import functions as F
from pyspark.sql.window import Window
df1 = df.withColumn('id', monotonically_increasing_id())
w=Window().partitionBy('Input','Status','Num').orderBy("id")
w1=Window().orderBy("id")
w2=Window().partitionBy("sum").orderBy("id")
df2 = df1.withColumn("rowNum", F.row_number().over(w))\
        .withColumn("sum", F.sum(F.when(F.col("rowNum")==1, F.lit(1)).otherwise(F.lit(0))).over(w1))\
        .withColumn("sum", F.when((F.row_number().over(w2)==1) & (F.col("sum")==2), F.lit(1)).otherwise(F.col("sum")))\
        .withColumn("lag1", F.lag("Close_price1",2).over(w1))\
        .withColumn("lag2", F.lag("Close_price1",3).over(w1))\
        .withColumn("Opening_Imp_Stock_Stage_1_n", F.when((F.col("sum")==1)&(F.col("lag1").isNotNull()), F.col("lag1"))\
        .when((F.col("sum")!=1),F.col("lag2"))\
        .otherwise(F.lit(0)))\
        .withColumn("Open_price1", F.when(F.col("rowNum")==1, F.lit(0)).otherwise(F.col("Open_price1")))\
        .withColumn("lag1", F.lag("Close_price2",2).over(w1))\
        .withColumn("lag2", F.lag("Close_price2",3).over(w1))\
        .withColumn("Open_price2", F.when((F.col("sum")==1)&(F.col("lag1").isNotNull()), F.col("lag1"))\
        .when((F.col("sum")!=1),F.col("lag2"))\
        .otherwise(F.lit(0)))\
        .withColumn("Open_price2", F.when(F.col("rowNum")==1, F.lit(0)).otherwise(F.col("Open_price2")))\
        .withColumn("lag1", F.lag("Close_price3",2).over(w1))\
        .withColumn("lag2", F.lag("Close_price3",3).over(w1))\
        .withColumn("Open_price3", F.when((F.col("sum")==1)&(F.col("lag1").isNotNull()), F.col("lag1"))\
        .when((F.col("sum")!=1),F.col("lag2"))\
        .otherwise(F.lit(0)))\
        .withColumn("Open_price3", F.when(F.col("rowNum")==1, F.lit(0)).otherwise(F.col("Open_price3")))\
.withColumn('Stage1',(col("Close_price1") - col("Open_price1")))\
.withColumn('Stage2',(col("Close_price2") - col("Open_price2")))\
.withColumn('Stage3',(col("Close_price3") - col("Open_price3")))\
.withColumn('Stage1',when(col("Year") == 0,0).otherwise(col("Stage1")))\
.withColumn('Stage2',when(col("Year") == 0,0).otherwise(col("Stage2")))\
.withColumn('Stage3',when(col("Year") == 0,0).otherwise(col("Stage3")))\
.drop("id","lag1","lag2","rowNum")              

# NOTE(review): this fragment arrived as a truncated duplicate of the
# question's chain -- the leading `df2 = df1.withColumn` was lost in
# extraction, leaving a bare, syntactically invalid expression.
# Reconstructed below as a complete statement; col/when are qualified
# through the already-imported F module. It relies on df1, w, w1 and w2
# defined earlier in the file.
df2 = (
    df1.withColumn("rowNum", F.row_number().over(w))
    # Running count of "first occurrence" rows seen so far.
    .withColumn("sum", F.sum(F.when(F.col("rowNum") == 1, F.lit(1))
                              .otherwise(F.lit(0))).over(w1))
    # Fold the very first row of the sum==2 run back into sum==1.
    .withColumn("sum", F.when((F.row_number().over(w2) == 1) & (F.col("sum") == 2),
                              F.lit(1)).otherwise(F.col("sum")))
    # ---- price series 1 ----
    .withColumn("lag1", F.lag("Close_price1", 2).over(w1))
    .withColumn("lag2", F.lag("Close_price1", 3).over(w1))
    .withColumn("Open_price1", F.when((F.col("sum") == 1) & F.col("lag1").isNotNull(),
                                      F.col("lag1"))
                                .when(F.col("sum") != 1, F.col("lag2"))
                                .otherwise(F.lit(0)))
    .withColumn("Open_price1", F.when(F.col("rowNum") == 1, F.lit(0))
                                .otherwise(F.col("Open_price1")))
    # ---- price series 2 ----
    .withColumn("lag1", F.lag("Close_price2", 2).over(w1))
    .withColumn("lag2", F.lag("Close_price2", 3).over(w1))
    .withColumn("Open_price2", F.when((F.col("sum") == 1) & F.col("lag1").isNotNull(),
                                      F.col("lag1"))
                                .when(F.col("sum") != 1, F.col("lag2"))
                                .otherwise(F.lit(0)))
    .withColumn("Open_price2", F.when(F.col("rowNum") == 1, F.lit(0))
                                .otherwise(F.col("Open_price2")))
    # ---- price series 3 ----
    .withColumn("lag1", F.lag("Close_price3", 2).over(w1))
    .withColumn("lag2", F.lag("Close_price3", 3).over(w1))
    .withColumn("Open_price3", F.when((F.col("sum") == 1) & F.col("lag1").isNotNull(),
                                      F.col("lag1"))
                                .when(F.col("sum") != 1, F.col("lag2"))
                                .otherwise(F.lit(0)))
    .withColumn("Open_price3", F.when(F.col("rowNum") == 1, F.lit(0))
                                .otherwise(F.col("Open_price3")))
    # Stage deltas (close - open), zeroed out on Year == 0 rows.
    .withColumn('Stage1', F.col("Close_price1") - F.col("Open_price1"))
    .withColumn('Stage2', F.col("Close_price2") - F.col("Open_price2"))
    .withColumn('Stage3', F.col("Close_price3") - F.col("Open_price3"))
    .withColumn('Stage1', F.when(F.col("Year") == 0, 0).otherwise(F.col("Stage1")))
    .withColumn('Stage2', F.when(F.col("Year") == 0, 0).otherwise(F.col("Stage2")))
    .withColumn('Stage3', F.when(F.col("Year") == 0, 0).otherwise(F.col("Stage3")))
    # Drop the bookkeeping columns.
    .drop("id", "lag1", "lag2", "rowNum")
)

如何识别状态S1和S2的首次出现,然后将0分配给open_price

open_price和open_price1是输出列

1 个答案:

答案 0 :(得分:1)

使用若干窗口函数(window functions)即可实现。

尝试一下
# Display the sample input DataFrame used to demonstrate the approach.
df.show() #sample data

#+------+----+-----------+
#|status|year|close_price|
#+------+----+-----------+
#|    s1|   0|        1.2|
#|    s1|   0|        2.2|
#|    s1|   1|        3.2|
#|    s1|   1|        4.2|
#|    s2|   1|        5.2|
#|    s1|   2|        6.2|
#|    s1|   2|        7.2|
#+------+----+-----------+


from pyspark.sql import functions as F
from pyspark.sql.window import Window

# w: rows of each status in original input order; row_number() over it
# equals 1 exactly on the first occurrence of that status value.
w=Window().partitionBy("status").orderBy("mono_id")
# w1: single global window over the whole frame in input order (no
# partition) -- used for the running count and the global lags.
w1=Window().orderBy("mono_id")
# w2: partitioned by the running count "sum" computed in the chain below.
w2=Window().partitionBy("sum").orderBy("mono_id")
# Pipeline (comments cannot be interleaved in the backslash-continued
# chain, so the steps are described here):
#   1. mono_id   : monotonically increasing id to freeze the input order.
#   2. rowNum    : 1 on the first row of each status (over w).
#   3. sum       : running count of those first rows (over w1); the second
#                  withColumn folds the very first row of the sum==2 run
#                  back into sum==1.
#   4. lag1/lag2 : close_price from 2 and 3 rows earlier in global order.
#                  NOTE(review): these fixed offsets appear tuned to this
#                  sample's interleaving of s1/s2 -- confirm before reuse
#                  on other inputs.
#   5. open_price: lag1 while sum==1 (when non-null), lag2 otherwise,
#                  falling back to 0; then forced to 0 on each status's
#                  first occurrence (rowNum==1).
#   6. Restore input order and drop the bookkeeping columns.
df.withColumn("mono_id", F.monotonically_increasing_id())\
  .withColumn("rowNum", F.row_number().over(w))\
  .withColumn("sum", F.sum(F.when(F.col("rowNum")==1, F.lit(1)).otherwise(F.lit(0))).over(w1))\
  .withColumn("sum", F.when((F.row_number().over(w2)==1) & (F.col("sum")==2), F.lit(1)).otherwise(F.col("sum")))\
    .withColumn("lag1", F.lag("close_price",2).over(w1))\
     .withColumn("lag2", F.lag("close_price",3).over(w1))\
  .withColumn("open_price", F.when((F.col("sum")==1)&(F.col("lag1").isNotNull()), F.col("lag1"))\
                             .when((F.col("sum")!=1),F.col("lag2"))\
                              .otherwise(F.lit(0)))\
 .withColumn("open_price", F.when(F.col("rowNum")==1, F.lit(0)).otherwise(F.col("open_price")))\
  .orderBy("mono_id").drop("mono_id","lag1","lag2","rowNum")\
  .show()

#+------+----+-----------+---+----------+
#|status|year|close_price|sum|open_price|
#+------+----+-----------+---+----------+
#|    s1|   0|        1.2|  1|       0.0|
#|    s1|   0|        2.2|  1|       0.0|
#|    s1|   1|        3.2|  1|       1.2|
#|    s1|   1|        4.2|  1|       2.2|
#|    s2|   1|        5.2|  1|       0.0|
#|    s1|   2|        6.2|  2|       3.2|
#|    s1|   2|        7.2|  2|       4.2|
#+------+----+-----------+---+----------+
相关问题