I'm comparing codes from multiple data sets, keeping a record's existing code if it isn't already in use and assigning a new one otherwise. I need to integrate a queue or shared memory so that multiple processes can work on different shards at the same time. As it stands, the script will hand out the same "new code" multiple times.
import pyspark.sql.functions as F
import pyspark.sql.types as T
import random

used_codes = []
new_codes = []

def generate_code():
    # Produce a random four-digit candidate code.
    return random.randint(1000, 9000)

def create_codes():
    # Refill the pool with a fresh batch of ten candidate codes.
    global new_codes
    new_codes = [generate_code() for _ in range(10)]

def get_code(code):
    # Keep the incoming code if it is unused; otherwise hand out the
    # next pool code that has not been claimed yet.
    global new_codes
    global used_codes
    if code not in used_codes:
        used_codes.append(code)
        return str(code)
    if not new_codes:
        create_codes()
    created_code = new_codes.pop(0)
    while created_code in used_codes:
        if not new_codes:
            create_codes()
        created_code = new_codes.pop(0)
    used_codes.append(created_code)  # record the handout so it is never reissued
    return str(created_code)

# The declared return type is StringType, so get_code must return strings, not ints.
get_code_udf = F.udf(lambda code: get_code(code), T.StringType())
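The root of the duplication is that each Spark executor (or each local process) gets its own copy of the used_codes and new_codes globals, so one worker never sees codes claimed by another. If the shards run as separate processes on one machine, a multiprocessing.Manager can host the shared queue and "used" set the question asks for. This is a minimal sketch of that idea, not a drop-in fix; the run_shard helper and the shard data are made up for illustration:

import multiprocessing as mp
import queue
import random

def get_code(code, used, pool, lock):
    # Atomically check-and-claim against state shared by all workers.
    with lock:
        if code not in used:
            used[code] = True
            return str(code)
        while True:
            try:
                candidate = pool.get_nowait()
            except queue.Empty:
                # Pool exhausted: mint a fresh candidate on the spot.
                candidate = random.randint(1000, 9000)
            if candidate not in used:
                used[candidate] = True
                return str(candidate)

def run_shard(shard, used, pool, lock):
    # Each worker process resolves its own shard of codes.
    return [get_code(c, used, pool, lock) for c in shard]

if __name__ == "__main__":
    manager = mp.Manager()
    used = manager.dict()   # shared "already used" lookup
    pool = manager.Queue()  # shared pool of pre-generated candidate codes
    lock = manager.Lock()   # serializes check-and-claim across processes
    for _ in range(100):
        pool.put(random.randint(1000, 9000))
    shards = [[1234, 1234, 5678], [1234, 9012]]  # hypothetical shard data
    with mp.Pool(processes=2) as workers:
        results = workers.starmap(
            run_shard, [(s, used, pool, lock) for s in shards]
        )
    print(results)

Because every check-and-claim happens under one lock against one shared dict, no two processes can hand out the same code, which is exactly what the global lists cannot guarantee.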
Fas inF.udf? See How to create a Minimal, Reproducible Example.
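On a real cluster, Manager proxies won't cross executor boundaries, so another option is to drop the stateful UDF entirely and keep the bookkeeping in Spark: rank duplicate codes with a window function so the first occurrence keeps its code, then mint replacements on the driver from the codes not yet taken. A hedged sketch, where the column name code, the 1000-9000 range, and the tiny example frame are assumptions from the snippet above, and the duplicate set is assumed small enough to collect to the driver:

import random
import pyspark.sql.functions as F
from pyspark.sql import SparkSession, Window

spark = SparkSession.builder.getOrCreate()
df = spark.createDataFrame([(1234,), (1234,), (5678,), (1234,)], ["code"])

# Tag each row, then rank duplicates: the first row with a given code keeps it.
df = df.withColumn("row_id", F.monotonically_increasing_id())
w = Window.partitionBy("code").orderBy("row_id")
flagged = df.withColumn("dup_rank", F.row_number().over(w))

keepers = flagged.filter("dup_rank = 1").select("row_id", "code")
dupes = flagged.filter("dup_rank > 1").select("row_id")

# Mint replacements on the driver, skipping every code already in the data.
taken = {r["code"] for r in df.select("code").distinct().collect()}
free = [c for c in range(1000, 9001) if c not in taken]
random.shuffle(free)

dupe_rows = [r["row_id"] for r in dupes.collect()]
replacements = spark.createDataFrame(
    [(rid, free[i]) for i, rid in enumerate(dupe_rows)], ["row_id", "code"]
)

result = keepers.unionByName(replacements).drop("row_id")

Since nothing here mutates shared state, the same job can run over any number of shards or executors without reissuing a code, as long as each shard's replacement pool is drawn from a disjoint slice of the free codes.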