from pyspark.sql import Row, functions as F
# Row factory fixing the column order of the sample data.
row = Row("UK_1", "UK_2", "Date", "Cat")

# Sample records as plain tuples in (UK_1, UK_2, Date, Cat) order; they include
# None keys, None dates, malformed date strings and repeated records.
_sample_values = [
    (1, 1, '12/10/2016', "A"),
    (1, 2, None, 'A'),
    (2, 1, '14/10/2016', 'B'),
    (3, 3, '!~2016/2/276', 'B'),
    (None, 1, '26/09/2016', 'A'),
    (1, 1, '12/10/2016', "A"),
    (1, 2, None, 'A'),
    (2, 1, '14/10/2016', 'B'),
    (None, None, '!~2016/2/276', 'B'),
    (None, 1, '26/09/2016', 'A'),
]

# Wrap each tuple in a Row, distribute the rows through the ambient
# SparkContext `sc`, and convert the resulting RDD to a DataFrame.
_sample_rdd = sc.parallelize([row(*values) for values in _sample_values])
df = _sample_rdd.toDF()
# Names of the columns that are concatenated into the synthetic "pk" column.
# Change this list to build the key from a different set of columns.
pks = ["UK_1", "UK_2"]
df1 = (
    df
    # NOTE(review): `columns` is not defined in the visible snippet; it is
    # assumed to be supplied by the surrounding session -- confirm.
    .select(columns)
    # F.concat is variadic (one positional argument per column), so the list
    # is unpacked with `*` instead of being passed as a single argument.
    # NOTE(review): Spark's concat is documented to yield NULL when any input
    # is NULL; F.concat_ws(sep, *pks) skips NULLs if that is preferred.
    .withColumn("pk", F.concat(*pks))
)
df1.show()
Is there a way to pass a list of columns into `concat`? I want to reuse this code in scenarios where the set of columns varies, so I would like to pass the columns as a list.