I'm trying to replace substrings in a DataFrame column using regexp_replace. I have to apply several regex patterns to all the records in the DataFrame column, but the strings are not being replaced as expected.
from pyspark import SparkContext, SparkConf
from pyspark import sql
from pyspark.sql.functions import regexp_replace, col
import re

conf = SparkConf().setAppName("myFirstApp").setMaster("local")
sc = SparkContext(conf=conf)
sqlContext = sql.SQLContext(sc)

df = sc.parallelize([('2345', 'ADVANCED by John'),
                     ('2398', 'ADVANCED by ADVANCE'),
                     ('2328', 'Verified by somerandomtext'),
                     ('3983', 'Double Checked by Marsha')]).toDF(['ID', "Notes"])

# Each entry is "FIND_REGEX/REPLACEMENT/": the text before the first slash is
# the alternation pattern to search for, the text after it is the replacement.
reg_patterns = ["ADVANCED|ADVANCE/ADV/", "ASSOCS|AS|ASSOCIATES/ASSOC/"]

# BUG FIX: the original loop built NotesUPD from col('Notes') on EVERY
# iteration, so the second pattern (which matches nothing in this data)
# overwrote the first pattern's result with an unchanged copy of Notes.
# Seed NotesUPD from Notes once, then keep replacing on NotesUPD so the
# substitutions accumulate across patterns.
df = df.withColumn('NotesUPD', col('Notes'))
for pattern in reg_patterns:
    # Split "FIND/REPLACE/" into its two non-slash parts.
    find_regex, replacement = re.findall(r"[^/]+", pattern)
    # NOTE(review): alternation is leftmost-first, so in "ASSOCS|AS|..."
    # the shorter "AS" wins at any position where both could match —
    # order alternatives longest-first if that matters.
    df = df.withColumn('NotesUPD',
                       regexp_replace(col('NotesUPD'), find_regex, replacement))

df.show()
Output :
+----+--------------------+--------------------+
| ID| Notes| NotesUPD|
+----+--------------------+--------------------+
|2345| ADVANCED by John| ADVANCED by John|
|2398| ADVANCED by ADVANCE| ADVANCED by ADVANCE|
|2328|Verified by somer...|Verified by somer...|
|3983|Double Checked by...|Double Checked by...|
+----+--------------------+--------------------+
Expected Output:
+----+--------------------+--------------------+
| ID| Notes| NotesUPD|
+----+--------------------+--------------------+
|2345| ADVANCED by John| ADV by John|
|2398| ADVANCED by ADVANCE| ADV by ADV |
|2328|Verified by somer...|Verified by somer...|
|3983|Double Checked by...|Double Checked by...|
+----+--------------------+--------------------+