I have written the Python function below. It works, but a lot of it looks redundant to me, so I would like to improve it by following best coding guidelines.
Depending on the spark_flag parameter, which is an input to the function and False by default, the function builds either a gcloud command or a plain bash command to execute a Python script.
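For illustration, the two shapes of command I am after are roughly as follows (argument list abbreviated as <args>; cluster, region, and app name are filled in from the inputs):

    # spark_flag=True: submit the script as a Dataproc PySpark job
    gcloud dataproc jobs submit pyspark --cluster=<cluster> --region=<region> --id <app_name> --properties ... /usr/local/airflow/dags/batch_ingestion.py -- <args>

    # spark_flag=False: run the script directly on the host
    nohup /usr/local/airflow/dags/batch_ingestion.py -- <args>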
def build_command(table_info1, table_info2, date_folder, timestamp, spark_flag):
    try:
        run_cmd_str = " nohup /usr/local/airflow/dags/batch_ingestion.py -- "
        if table_info1[0] == 'db2':
            app_name = ("data-pipeline-" + table_info1[0] + "-" + table_info1[5] + "-"
                        + table_info1[6] + "-" + timestamp + table_info2[10])
            if spark_flag:
                cmd_str = "gcloud dataproc jobs submit pyspark --cluster={} --region={} --id {} --properties spark.submit.deployMode=cluster,spark.driver.memory=512m,spark.executor.memory=512m,spark.executor.cores=1,spark.executor.instances=1 --jars /usr/local/airflow/dags/batch_ingestion.py -- ".format(
                    table_info1[10], table_info1[11], app_name)
            else:
                cmd_str = run_cmd_str
        elif table_info1[0] == 'sql_server' or table_info1[0] == 'azure_sql':
            if '.' in table_info1[6]:
                table = table_info1[6].split('.')
                app_name = ("data-pipeline-" + table_info1[0] + "-" + table_info1[5] + "-"
                            + table[0] + "_" + table[1] + "-" + timestamp + table_info2[10])
            else:
                app_name = ("data-pipeline-" + table_info1[0] + "-" + table_info1[5] + "-"
                            + table_info1[6] + "-" + timestamp + table_info2[10])
            if spark_flag:
                cmd_str = "gcloud dataproc jobs submit pyspark --cluster={} --region={} --id {} --properties spark.submit.deployMode=cluster,spark.driver.memory=512m,spark.executor.memory=512m,spark.executor.cores=1,spark.executor.instances=1 --jars /usr/local/airflow/dags/batch_ingestion.py -- ".format(
                    table_info1[10], table_info1[11], app_name)
            else:
                cmd_str = run_cmd_str
        elif table_info1[0] == 'abc_informix' or table_info1[0] == 'def_informix':
            if table_info1[7] != '-1':
                app_name = ("data-pipeline-" + table_info1[0] + "-" + table_info1[5] + "-"
                            + table_info1[6] + "-" + table_info1[7] + "-" + timestamp + table_info2[10])
            elif table_info1[7] == '-1' and table_info1[0] == 'def_informix':
                app_name = ("data-pipeline-" + table_info1[0] + "-" + table_info1[5] + "-"
                            + table_info1[6] + "-" + timestamp + table_info2[10])
            if spark_flag:
                cmd_str = "gcloud dataproc jobs submit pyspark --cluster={} --region={} --id {} --properties spark.submit.deployMode=cluster,spark.driver.memory=512m,spark.executor.memory=512m,spark.executor.cores=1,spark.executor.instances=1 /usr/local/airflow/dags/batch_ingestion.py -- ".format(
                    table_info1[10], table_info1[11], app_name)
            else:
                cmd_str = run_cmd_str
        last_run_dated = str(table_info2[1]).split(None, 1)[0]
        cmd_string = " ".join(
            [cmd_str, table_info1[0], table_info1[5], table_info1[6], table_info1[7], table_info1[1], table_info1[2],
             table_info1[3], table_info1[8], table_info1[4], table_info2[0], last_run_dated, table_info2[2],
             date_folder, table_info2[5], table_info2[6], table_info2[7], table_info2[3], table_info2[4],
             table_info2[9]])
        return cmd_string, app_name
    except Exception as e:
        print(e)
        raise
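For reference, this is the rough direction of deduplication I had in mind; a sketch only, not a final design. The helper names are my own, the positional index meanings are assumed from the function above, and I have used the plain form of the gcloud command throughout, whereas the original passes the script via --jars in the db2 and sql_server branches. Note this sketch also produces an app_name for abc_informix when table_info1[7] == '-1', a case the original leaves unset.

GCLOUD_TEMPLATE = (
    "gcloud dataproc jobs submit pyspark --cluster={cluster} --region={region} "
    "--id {app_name} --properties spark.submit.deployMode=cluster,"
    "spark.driver.memory=512m,spark.executor.memory=512m,"
    "spark.executor.cores=1,spark.executor.instances=1 "
    "/usr/local/airflow/dags/batch_ingestion.py -- "
)
BASH_CMD = " nohup /usr/local/airflow/dags/batch_ingestion.py -- "

def make_app_name(table_info1, table_info2, timestamp):
    # Only the middle parts of the job id vary per source type.
    db_type, schema, table, extra = (table_info1[0], table_info1[5],
                                     table_info1[6], table_info1[7])
    parts = ["data-pipeline", db_type, schema]
    if db_type in ('sql_server', 'azure_sql') and '.' in table:
        parts.append(table.replace('.', '_'))  # schema-qualified table name
    else:
        parts.append(table)
    if db_type in ('abc_informix', 'def_informix') and extra != '-1':
        parts.append(extra)
    return "-".join(parts) + "-" + timestamp + table_info2[10]

def make_cmd_prefix(table_info1, app_name, spark_flag):
    # One template per execution mode instead of one per source type.
    if spark_flag:
        return GCLOUD_TEMPLATE.format(cluster=table_info1[10],
                                      region=table_info1[11],
                                      app_name=app_name)
    return BASH_CMD

With those two helpers, build_command would reduce to computing app_name once, picking the command prefix, and joining the argument list, so the three per-database branches disappear. Does this look like the right approach, or is there a better pattern?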