0
from pyspark.sql import Row, functions as F
# Demo: count rows of a small DataFrame, grouping by a column only when one
# is configured.
# NOTE(review): assumes `sc` is an active SparkContext (e.g. provided by the
# pyspark shell) -- it is not defined in this snippet.

# Row factory fixing the field names used for every record below.
row = Row("UK_1", "UK_2", "Date", "Cat")

# Name of the column to group by. Set to '' (or whitespace) to skip grouping
# and count over the whole DataFrame instead.
agg = 'Cat'

# Sample records; some fields are None and some dates are malformed strings.
tdf = sc.parallelize(
    [
        row(1, 1, '12/10/2016', "A"),
        row(1, 2, None, 'A'),
        row(2, 1, '14/10/2016', 'B'),
        row(3, 3, '!~2016/2/276', 'B'),
        row(None, 1, '26/09/2016', 'A'),
        row(1, 1, '12/10/2016', "A"),
        row(1, 2, None, 'A'),
        row(2, 1, '14/10/2016', 'B'),
        row(None, None, '!~2016/2/276', 'B'),
        row(None, 1, '26/09/2016', 'A'),
    ]
).toDF()

# `groupBy` takes a variable number of columns, so the condition is expressed
# by building the column list first: one column when `agg` is non-blank, none
# otherwise. Calling `groupBy()` with no columns aggregates over the entire
# DataFrame, yielding a single `row_count` row.
group_cols = [F.col(agg)] if agg.strip() else []
tdf.groupBy(*group_cols).agg(F.count('*').alias('row_count')).show()
有沒有一種方法可以根據某些條件，決定在 DataFrame 的 groupBy 中使用或不使用某一列？（PySpark DataFrame 有條件的分組）