If fuzzy matching is taking a lot of your time, then the following solution could help. It uses the rapidfuzz library, which is highly optimized, so it should run faster. There are multiple ratio options to choose from. Take a look at this GitHub page and test which ratio suits your needs best.
https://github.com/maxbachmann/RapidFuzz
from pyspark.sql.types import *
from pyspark import SparkContext, SQLContext
import pyspark.sql.functions as F
from pyspark.sql.functions import udf
import rapidfuzz
# Start a Spark context with the 'local' master and wrap it in a SQLContext,
# which is used below to build the DataFrames.
sc = SparkContext(master="local")
sqlContext = SQLContext(sparkContext=sc)
# Keywords to match against; each keyword becomes a one-column row.
keyword_given = [
    [kw]
    for kw in (
        "green pstr",
        "greenpstr",
        "wlmrt",
        "walmart",
        "walmart super",
    )
]
keywordColumns = ["keyword"]
keyword_df = sqlContext.createDataFrame(keyword_given, keywordColumns)

print("keyword_df dataframe")
keyword_df.show(truncate=False)
# Rows of (variation text, entity, ID) that the keywords are scored against.
variations = [
    ("type green pstr", "ABC", 100),
    ("type green pstr", "PQR", 200),
    ("type green pstr", "NZSD", 2999),
    ("wlmrt payment", "walmart", 200),
    ("wlmrt solutions", "walmart", 200),
    ("nppssdwlmrt", "walmart", 2000),
]
variationsColumns = ["variations", "entity", "ID"]
variations_df = sqlContext.createDataFrame(variations, variationsColumns)

print("variations_df dataframe")
variations_df.show(truncate=False)
def evalutate_helper_spark(keyw, var):
    """Return rapidfuzz's ``partial_ratio`` score for ``keyw`` against ``var``.

    Kept as a plain module-level function so it can be wrapped in a Spark UDF.
    """
    score = rapidfuzz.fuzz.partial_ratio(keyw, var)
    return score
# Wrap the scorer as a Spark UDF. The return type is declared explicitly:
# without it, udf() defaults to StringType and the scores come back as
# strings, which compare and sort lexicographically instead of numerically.
# float() guarantees the value matches DoubleType (a non-float result would
# otherwise surface as null in the column).
calculate_ratio = udf(
    lambda keyw, var: float(evalutate_helper_spark(keyw, var)),
    DoubleType(),
)

# Pair every variation with every keyword, then score each pair.
middle_df = variations_df.crossJoin(keyword_df)
middle_df = middle_df.withColumn(
    "partial_ratio",
    calculate_ratio(F.col("keyword"), F.col("variations")),
)

print("middle df show")
middle_df.show(n=100, truncate=False)
Here's the output:
keyword_df dataframe
+-------------+
|keyword |
+-------------+
|green pstr |
|greenpstr |
|wlmrt |
|walmart |
|walmart super|
+-------------+
variations_df dataframe
+---------------+-------+----+
|variations |entity |ID |
+---------------+-------+----+
|type green pstr|ABC |100 |
|type green pstr|PQR |200 |
|type green pstr|NZSD |2999|
|wlmrt payment |walmart|200 |
|wlmrt solutions|walmart|200 |
|nppssdwlmrt |walmart|2000|
+---------------+-------+----+
middle df show
+---------------+-------+----+-------------+------------------+
|variations |entity |ID |keyword |partial_ratio |
+---------------+-------+----+-------------+------------------+
|type green pstr|ABC |100 |green pstr |100.0 |
|type green pstr|ABC |100 |greenpstr |88.88888888888889 |
|type green pstr|ABC |100 |wlmrt |33.333333333333336|
|type green pstr|ABC |100 |walmart |25.0 |
|type green pstr|ABC |100 |walmart super|40.0 |
|type green pstr|PQR |200 |green pstr |100.0 |
|type green pstr|PQR |200 |greenpstr |88.88888888888889 |
|type green pstr|PQR |200 |wlmrt |33.333333333333336|
|type green pstr|PQR |200 |walmart |25.0 |
|type green pstr|PQR |200 |walmart super|40.0 |
|type green pstr|NZSD |2999|green pstr |100.0 |
|type green pstr|NZSD |2999|greenpstr |88.88888888888889 |
|type green pstr|NZSD |2999|wlmrt |33.333333333333336|
|type green pstr|NZSD |2999|walmart |25.0 |
|type green pstr|NZSD |2999|walmart super|40.0 |
|wlmrt payment |walmart|200 |green pstr |46.15384615384615 |
|wlmrt payment |walmart|200 |greenpstr |50.0 |
|wlmrt payment |walmart|200 |wlmrt |100.0 |
|wlmrt payment |walmart|200 |walmart |83.33333333333334 |
|wlmrt payment |walmart|200 |walmart super|70.0 |
|wlmrt solutions|walmart|200 |green pstr |40.0 |
|wlmrt solutions|walmart|200 |greenpstr |36.36363636363637 |
|wlmrt solutions|walmart|200 |wlmrt |100.0 |
|wlmrt solutions|walmart|200 |walmart |83.33333333333334 |
|wlmrt solutions|walmart|200 |walmart super|70.0 |
|nppssdwlmrt |walmart|2000|green pstr |42.85714285714286 |
|nppssdwlmrt |walmart|2000|greenpstr |46.15384615384615 |
|nppssdwlmrt |walmart|2000|wlmrt |100.0 |
|nppssdwlmrt |walmart|2000|walmart |83.33333333333334 |
|nppssdwlmrt |walmart|2000|walmart super|55.55555555555556 |
+---------------+-------+----+-------------+------------------+