I’m new to GCP BigQuery. I would like to upload this dataset to BigQuery using a Python script as follows (I’m using this post and these GitHub examples as references):
import os
import pandas as pd
import numpy as np
import gdown
from pandas import read_csv
from google.cloud import bigquery
from dotenv import load_dotenv
from google.oauth2 import service_account
def create_schema(field_list: list, types_list: list):
    # pair each column name with its BigQuery type
    schema_list = []
    for fields, types in zip(field_list, types_list):
        schema = bigquery.SchemaField(fields, types)
        schema_list.append(schema)
    return schema_list


def auth():
    # point GOOGLE_APPLICATION_CREDENTIALS at the service-account key from .env
    os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = os.getenv(
        'PATH_CREDENTIAL_GCP')
    key_path = os.environ['GOOGLE_APPLICATION_CREDENTIALS']
    credentials = service_account.Credentials.from_service_account_file(
        key_path, scopes=["https://www.googleapis.com/auth/cloud-platform"],
    )
    client = bigquery.Client(credentials=credentials,
                             project=credentials.project_id)
    return client


def bq_load(df, dataset_id: str, table_id: str, schema):
    bq_client = auth()
    dataset_ref = bq_client.dataset(dataset_id)
    dataset_table_id = dataset_ref.table(table_id)
    job_config = bigquery.LoadJobConfig()
    job_config.write_disposition = 'WRITE_TRUNCATE'
    job_config.source_format = bigquery.SourceFormat.CSV
    job_config.autodetect = False
    job_config.schema = schema
    job_config.ignore_unknown_values = True
    job = bq_client.load_table_from_dataframe(
        df,
        dataset_table_id,
        location='US',
        job_config=job_config
    )
    return job.result()


def main():
    load_dotenv()
    working_dir = os.getcwd()
    print(working_dir)
    data_path = working_dir + '/news.csv'
    df = read_csv(data_path)
    headers = df.columns.to_numpy()
    types_df = df.dtypes.to_numpy()
    field_list = headers
    type_list = ["INTEGER", "BYTES", "BYTES", "BYTES", "INTEGER"]
    result = create_schema(field_list=field_list, types_list=type_list)
    dataset_id = "some_dataset_id"
    table_id = "some_table_id"
    bf_to_bq = bq_load(df, dataset_id, table_id, result)
    print(bf_to_bq)
    return 'Done!!'


if __name__ == "__main__":
    main()
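For context, the .env file only defines PATH_CREDENTIAL_GCP pointing at my service-account JSON key. A quick check like the sketch below (the example path in the comment is a placeholder, not my real key path) confirms the variable resolves and the file exists, so I'm fairly sure the key itself is being picked up:

# Minimal sanity check that the .env credential path resolves
import os
from dotenv import load_dotenv

load_dotenv()
key_path = os.getenv('PATH_CREDENTIAL_GCP')  # e.g. '/path/to/service-account.json'
print(key_path)
print(os.path.exists(key_path))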
When I run the code above, I get this error: google.api_core.exceptions.BadRequest: 400 Error while reading data, error message: CSV table references column position 4, but line starting at position:0 contains only 4 columns.
Traceback (most recent call last):
File "/Users/user/bigquery/table_upload_bq.py", line 110, in <module>
main()
File "/Users/user/bigquery/table_upload_bq.py", line 103, in main
bf_to_bq = bq_load(df, dataset_id, table_id, result)
File "/Users/user/bigquery/table_upload_bq.py", line 65, in bq_load
return job.result()
File "/Users/user/.pyenv/versions/3.10.7/envs/py-3.10.7/lib/python3.10/site-packages/google/cloud/bigquery/job/base.py", line 911, in result
return super(_AsyncJob, self).result(timeout=timeout, **kwargs)
File "/Users/user/.pyenv/versions/3.10.7/envs/py-3.10.7/lib/python3.10/site-packages/google/api_core/future/polling.py", line 261, in result
raise self._exception
google.api_core.exceptions.BadRequest: 400 Error while reading data, error message: CSV table references column position 4, but line starting at position:0 contains only 4 columns.
This is what the CSV looks like (it has 5 columns):

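Since the screenshot may not show up here, this is roughly how I'm checking the file locally to confirm it really has five columns (same news.csv path as in the script; I'm not pasting the output itself):

from pandas import read_csv

# Quick local check of the CSV structure (same news.csv used by the script)
df = read_csv('news.csv')
print(df.shape)             # expecting (n_rows, 5)
print(df.columns.tolist())  # the five column names
print(df.dtypes)            # dtypes pandas infers for each column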
If someone could suggest how to solve this problem, it would be awesome. Thanks!