I am trying to check a column of a scala dataframe against a regular expression using a udf with an additional argument representing the actual regular expression.
However, putting the regular expression into a lit() statement does not seem to be allowed throwing the following error
java.lang.RuntimeException: Unsupported literal type class scala.util.matching.Regex
using the example code below. I'd expect an additional column "DMY" with Boolean entries.
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.functions._
import scala.util.matching._
def dateDMY_regex(): Regex = """^[0-3]?[0-9][-/.][0-1]?[0-9][-/.](19|20)?\d{2}$""".r
def date_match(value: String, dateEx: Regex): Boolean = {
return dateEx.unapplySeq(value).isDefined
}
val spark = SparkSession.builder().getOrCreate()
var df = spark.createDataFrame(Seq(
(0, "31/10/2018"),
(1, "01/11/2018"),
(2, "02/11/2018"))).toDF("Id", "col_1")
// to test the function
// print(date_match("31/10/2018", dateDMY_regex()))
val date_match_udf = udf(date_match _) //, lit("c")
df = df.withColumn( "DMY", date_match_udf( $"col_1", lit(dateDMY_regex()) ) )
df.show()