You need to restructure your dictionary and build Row objects so that Spark can properly infer the schema.
import datetime
from pyspark.sql import Row

data_dict = {
    '[email protected]': {
        'Date': datetime.date(2019, 10, 21),
        'idle_time': datetime.datetime(2019, 10, 21, 1, 50)
    },
    '[email protected]': {
        'Date': datetime.date(2019, 10, 21),
        'idle_time': datetime.datetime(2019, 10, 21, 1, 35)
    },
    '[email protected]': {
        'Date': datetime.date(2019, 10, 21),
        'idle_time': datetime.datetime(2019, 10, 21, 1, 55)
    }
}

# Flatten each (key, inner dict) pair into one Row, so Spark can infer
# StringType, DateType and TimestampType from the Python values.
data_as_rows = [Row(**{'user_name': k, **v}) for k, v in data_dict.items()]

# Rows built from keyword arguments sort their fields alphabetically
# (in Spark < 3.0), so select() restores the intended column order.
data_df = spark.createDataFrame(data_as_rows).select('user_name', 'Date', 'idle_time')
data_df.show(truncate=False)
>>>
+-------------------------+----------+-------------------+
|user_name |Date |idle_time |
+-------------------------+----------+-------------------+
|[email protected]|2019-10-21|2019-10-21 01:50:00|
|[email protected]|2019-10-21|2019-10-21 01:35:00|
|[email protected] |2019-10-21|2019-10-21 01:55:00|
+-------------------------+----------+-------------------+
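If you want to double-check what was inferred, printSchema() should show string, date and timestamp types (all nullable, since inference can't prove a column is never null):

data_df.printSchema()
>>>
root
 |-- user_name: string (nullable = true)
 |-- Date: date (nullable = true)
 |-- idle_time: timestamp (nullable = true)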
Note: if you already know the schema up front and don't need Spark to infer it, you can supply it directly to the createDataFrame function:
import pyspark.sql.types as T

schema = T.StructType([
    T.StructField('user_name', T.StringType(), False),
    T.StructField('Date', T.DateType(), False),
    T.StructField('idle_time', T.TimestampType(), False)
])

# With an explicit schema, plain tuples are enough -- no Row objects needed.
data_as_tuples = [(k, v['Date'], v['idle_time']) for k, v in data_dict.items()]
data_df = spark.createDataFrame(data_as_tuples, schema=schema)
data_df.show(truncate=False)
>>>
+-------------------------+----------+-------------------+
|user_name |Date |idle_time |
+-------------------------+----------+-------------------+
|[email protected]|2019-10-21|2019-10-21 01:50:00|
|[email protected]|2019-10-21|2019-10-21 01:35:00|
|[email protected] |2019-10-21|2019-10-21 01:55:00|
+-------------------------+----------+-------------------+
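As an aside, createDataFrame also accepts the schema as a DDL-formatted string instead of a StructType. A minimal sketch of the same schema (note that DDL-declared fields are nullable by default, so the nullable=False constraint from the StructType above is lost):

# Equivalent columns and types; DDL fields default to nullable.
ddl_schema = 'user_name string, Date date, idle_time timestamp'
data_df = spark.createDataFrame(data_as_tuples, schema=ddl_schema)
data_df.show(truncate=False)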