I would like to merge two columns of unequal length in a pandas DataFrame.
I've tried many approaches with merge, concat and join, but none of them works.
# Column names of the result table: Clone / Chain / Fragment followed by
# one column per round.
keyList = ["Clone", "Chain", "Fragment", "R0", "R1", "R2"]
# Every column starts out with its own, independent empty list.
dataDict = dict((column, []) for column in keyList)
# Example peptide lists of different lengths
plist1 = ["ABCD", "DJFZ", "DHRZ"]
plist2 = ["ABCD", "DJFZ", "DHRZ", "JGJZ"]
# Input files; the name encodes <clone>_<chain>_<fragment>_<round>.
filelist = [
    "E2_VH_Fab_R0.fasta",
    "E2_VH_scFV_R0.fasta",
    "E2_VH_Fab_R1.fasta",
    "E2_VH_scFV_R1.fasta",
    "E2_VH_Fab_R2.fasta",
]
# Fill dataDict with one row per peptide of every file in filelist, then
# join the resulting frame onto dtframe.
#
# Subsets are:
# E1 || E2 with VH || VL with Fab || scFV with R0 || R1 || R2
#
# Raises ValueError when a file name does not consist of exactly four
# '_'-separated fields before the first '.'.

# Round columns, in the order they are tested against the file name.
roundKeys = ("R0", "R1", "R2")

# Iterate over the names themselves: enumerate() would yield
# (index, name) tuples, which cannot be split like a string.
for datafile in filelist:
    # Get list with emits from class function
    # NOTE(review): the call does not receive the file name; presumably
    # clseq is bound to the current file outside this view -- confirm.
    peptidelist = clseq.processEmits()
    # Split "<clone>_<chain>_<fragment>_<round>.fasta" into its four fields.
    fileparms = datafile.split('.')[0].split('_')
    sclone, schain, sfragment, sround = fileparms
    # The round is a property of the file, so resolve it once per file
    # instead of once per peptide. First match wins (R0, then R1, then R2);
    # None means no round column receives the peptide.
    matchedRound = next((key for key in roundKeys if key in sround), None)
    # Iterate through peptide list and add the subsets into the dict
    for peptide in peptidelist:
        dataDict["Clone"].append(sclone)
        dataDict["Chain"].append(schain)
        dataDict["Fragment"].append(sfragment)
        # Set other rounds to "NaN" so all columns keep the same length.
        # NOTE(review): this is the literal string "NaN", which pandas does
        # not treat as a missing value (isna() is False for it).
        for key in roundKeys:
            dataDict[key].append(peptide if key == matchedRound else "NaN")

# DataFrame.merge returns a new frame and leaves dtframe untouched, so the
# result has to be kept.
# NOTE(review): dtframe is defined outside this view. The key columns repeat
# once per peptide on the right-hand side, so a left row is paired with every
# right row that shares its key -- confirm this is the intended join.
merged = dtframe.merge(pd.DataFrame(dataDict), on=['Clone', 'Chain', 'Fragment'], how='inner')
The problem is that I have lists of different lengths which I would like to merge into one DataFrame, padding the rest with NaN.
This:
0 E2 VH Fab r0 nan
1 E2 VH Fab r0 nan
2 E2 VH Fab r0 nan
3 E2 VH Fab r0 nan
4 E2 VH Fab r0 nan
5 E2 VH Fab r0 nan
and this:
0 E2 VH Fab nan r1
1 E2 VH Fab nan r1
2 E2 VH Fab nan r1
3 E2 VH Fab nan r1
4 E2 VH Fab nan r1
5 E2 VH Fab nan r1
6 E2 VH Fab nan r1
7 E2 VH Fab nan r1
Should result in this:
0 E2 VH Fab r0 r1
1 E2 VH Fab r0 r1
2 E2 VH Fab r0 r1
3 E2 VH Fab r0 r1
4 E2 VH Fab r0 r1
5 E2 VH Fab r0 r1
6 E2 VH Fab nan r1
7 E2 VH Fab nan r1
Beware that all of my data fields are strings.
pd.concat([df1, df2[~df2.index.isin(df1.index)]])? if-elif clauses