0
origin.csv
no,key1,key2,key3,key4,key5,...
1,A1,B1,C1,D1,E1,..
2,A2,B2,C2,D2,E2,..
3,A3,B3,C3,D3,E3,..


WhatIwant.csv
1,A1,key1
1,B1,key2
1,C1,key3
...
3,A3,key1
3,B3,key2
...

I loaded the CSV with the read method (into the origin.csv DataFrame), but I am unable to convert it to the desired shape.

// Read the source CSV into a DataFrame; every column is inferred as string
// since no schema is supplied.
val df = spark.read
            .option("header", true)      // first line of the file holds the column names
            .option("charset", "euc-kr") // file is Korean EUC-KR encoded, not UTF-8
            .csv(csvFilePath)

Any idea how to do this?

1 Answer 1

1

Try this.

import org.apache.spark.sql.DataFrame
import org.apache.spark.sql.functions._

// Sample frame mirroring origin.csv from the question.
// Fixed: row 3's last value was a typo ("D2") — the question's data shows "D3".
val df = Seq(
  (1, "A1", "B1", "C1", "D1"),
  (2, "A2", "B2", "C2", "D2"),
  (3, "A3", "B3", "C3", "D3")
).toDF("no", "key1", "key2", "key3", "key4")
df.show()

/**
 * Unpivots (melts) `df`: every column NOT listed in `by` becomes one output
 * row of (value, key), while the `by` columns are kept as row identifiers.
 * Note: despite the name, this is a plain DataFrame transformation, not a
 * registered Spark UDF.
 *
 * @param df the input frame
 * @param by identifier columns to keep as-is (e.g. Seq("no"))
 * @return   a frame with columns: by..., val, key
 */
def myUDF(df: DataFrame, by: Seq[String]): DataFrame = {
  // Columns to melt, with their types. All melted values end up in a single
  // array, so they must share one type — fail early with a useful message.
  val (columns, types) = df.dtypes.filter{ case (clm, _) => !by.contains(clm)}.unzip
  require(types.distinct.size == 1,
    s"All melted columns must share one type, found: ${types.distinct.mkString(", ")}")
  // One struct per melted column: (key = column name, val = column value),
  // exploded so each input row yields one output row per melted column.
  val keys = explode(array(
    columns.map(clm => struct(lit(clm).alias("key"), col(clm).alias("val"))): _*
  ))
  val byValue = by.map(col)
  // col("...") instead of the $"..." interpolator: $ requires
  // spark.implicits._ (or sqlContext.implicits._) in scope, which is exactly
  // the "value $ is not a member of StringContext" error reported below.
  df.select(byValue :+ keys.alias("_key"): _*)
    .select(byValue ++ Seq(col("_key.val"), col("_key.key")): _*)
}

// Melt every column except "no" into (value, key) rows and print the result.
val df1 = myUDF(df, by = Seq("no"))
df1.show()
Sign up to request clarification or add additional context in comments.

4 Comments

I got an error from `Seq($"_key.val"`. The error message is "value $ is not a member of StringContext".
I tested this with spark-shell on Spark version 2.0.0. What version are you using?
I'm using org.scala-lang:scala-library:2.11.1.
Try adding import for "import sqlContext.implicits._".

Your Answer

By clicking “Post Your Answer”, you agree to our terms of service and acknowledge you have read our privacy policy.

Start asking to get answers

Find the answer to your question by asking.

Ask question

Explore related questions

See similar questions with these tags.