Not sure if it is the best solution, but it seems to work fine. The idea is to first find all city/city combinations of your matrix and then check if x should be zero or one.
import itertools
import pandas as pd
# Create your dataframe
data = {'id': ['A', 'A', 'A','B', 'B','B', 'C', 'C'],
'city': ['NaN', 'NaN', 'Paris','NaN', 'Berlin','London', 'NaN', 'Rome'],
'year': [2000, 2001, 2002, 2000, 2001,2001,2000,2001],
'x': [0,0,1,0,1,1,0,1]}
df = pd.DataFrame(data)
# Helper function to determine x value in final matrix
def check_if_one_or_zero(df_x, city_a, city_b):
a = df_x[df_x['city'] == city_a]
b = df_x[df_x['city'] == city_b]
# if one city has no entry with x==1 => x for both is always zero
if a.empty or b.empty:
return 0
a_id = a['id'].iloc[0]
b_id = b['id'].iloc[0]
# Number of entries with same id
group_size = len(df_x[df_x['id'] == a_id])
# if city A is city B and it is only city with this label => x is 1
if group_size == 1 and city_a == city_b:
return 1
# if city A and B are distinct and have same id => x is 1
if group_size > 1 and a_id == b_id and city_a != city_b:
return 1
return 0
def create_matrix(df, year):
# Create a data frame where we include every combination of two cities
all_combinations = list(itertools.product(df[df['city'] != 'NaN']['city'], df[df['city']!='NaN']['city']))
df_combinations = pd.DataFrame(all_combinations, columns=['City A','City B'])
# Create helper df for comparison which only has entries with x==1 and correct year
df_x = df[(df['year']==year) & (df['city']!='NaN') & (df['x']==1)]
# Set x for each city / city combination with the helper function
df_combinations['x'] = df_combinations.apply(lambda x: check_if_one_or_zero(df_x, x['City A'], x['City B']), axis=1)
# Use pivot to create final matrix
return df_combinations.pivot(index='City A', columns='City B', values='x')
print(create_matrix(df, 2000))
print(create_matrix(df, 2001))
print(create_matrix(df, 2002))
Output year==2000:
City B Berlin London Paris Rome
City A
Berlin 0 0 0 0
London 0 0 0 0
Paris 0 0 0 0
Rome 0 0 0 0
Output year==2001:
City B Berlin London Paris Rome
City A
Berlin 0 1 0 0
London 1 0 0 0
Paris 0 0 0 0
Rome 0 0 0 1
Output year==2002:
City B Berlin London Paris Rome
City A
Berlin 0 0 0 0
London 0 0 0 0
Paris 0 0 1 0
Rome 0 0 0 0
x==1with the same id? If so, what should the output look like?iandjcannot be the same? and if they can be the same why aren't berlin and london connected to themselves in 2001?