
I’m new to GCP BigQuery. I would like to upload this dataset to BigQuery using the following Python script (I’m using this post & these GitHub examples as references):

import os
import pandas as pd
import numpy as np
import gdown
from pandas import read_csv
from google.cloud import bigquery
from dotenv import load_dotenv
from google.oauth2 import service_account


def create_schema(field_list: list, types_list: list):
    schema_list = []
    for fields, types in zip(field_list, types_list):
        schema = bigquery.SchemaField(fields, types)
        schema_list.append(schema)
    return schema_list


def auth():
    os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = os.getenv(
        'PATH_CREDENTIAL_GCP')
    key_path = os.environ['GOOGLE_APPLICATION_CREDENTIALS']
    credentials = service_account.Credentials.from_service_account_file(
        key_path, scopes=["https://www.googleapis.com/auth/cloud-platform"],
    )
    client = bigquery.Client(credentials=credentials,
                             project=credentials.project_id,)
    return client


def bq_load(df, dataset_id: str, table_id: str, schema):
    bq_client = auth()

    dataset_ref = bq_client.dataset(dataset_id)
    dataset_table_id = dataset_ref.table(table_id)

    job_config = bigquery.LoadJobConfig()
    job_config.write_disposition = 'WRITE_TRUNCATE'
    job_config.source_format = bigquery.SourceFormat.CSV
    job_config.autodetect = False

    job_config.schema = schema

    job_config.ignore_unknown_values = True
    job = bq_client.load_table_from_dataframe(
        df,
        dataset_table_id,
        location='US',
        job_config=job_config
    )
    return job.result()


def main():

    load_dotenv()
    working_dir = os.getcwd()
    print(working_dir)

    data_path = working_dir+'/news.csv'

    df = read_csv(data_path)
    headers = df.columns.to_numpy()

    types_df = df.dtypes.to_numpy()

    field_list = headers
    type_list = ["INTEGER", "BYTES", "BYTES", "BYTES", "INTEGER"]

    result = create_schema(field_list=field_list, types_list=type_list)
    dataset_id = "some_dataset_id"
    table_id = "some_table_id"

    bf_to_bq = bq_load(df, dataset_id, table_id, result)

    print(bf_to_bq)
    return 'Done!!'


if __name__ == "__main__":
    main()

When I run the code above, I get this error: google.api_core.exceptions.BadRequest: 400 Error while reading data, error message: CSV table references column position 4, but line starting at position:0 contains only 4 columns:

Traceback (most recent call last):
  File "/Users/user/bigquery/table_upload_bq.py", line 110, in <module>
    main()
  File "/Users/user/bigquery/table_upload_bq.py", line 103, in main
    bf_to_bq = bq_load(df, dataset_id, table_id, result)
  File "/Users/user/bigquery/table_upload_bq.py", line 65, in bq_load
    return job.result()
  File "/Users/user/.pyenv/versions/3.10.7/envs/py-3.10.7/lib/python3.10/site-packages/google/cloud/bigquery/job/base.py", line 911, in result
    return super(_AsyncJob, self).result(timeout=timeout, **kwargs)
  File "/Users/user/.pyenv/versions/3.10.7/envs/py-3.10.7/lib/python3.10/site-packages/google/api_core/future/polling.py", line 261, in result
    raise self._exception
google.api_core.exceptions.BadRequest: 400 Error while reading data, error message: CSV table references column position 4, but line starting at position:0 contains only 4 columns.
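
To rule out an obvious mismatch, this is the kind of quick check I can run (just a sketch, reusing the news.csv path from main()), since load_table_from_dataframe uploads the DataFrame rather than the file itself:

import pandas as pd

# Sanity check on the DataFrame that gets passed to load_table_from_dataframe
# (same news.csv path as in main()).
df = pd.read_csv('news.csv')
print(df.shape)             # expect (n_rows, 5) if the file really has 5 columns
print(df.columns.tolist())  # header names the 5 schema fields have to line up with
print(df.dtypes)            # dtypes the script maps to INTEGER / BYTES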

This is what the CSV looks like (it has 5 columns; see the dataset screenshot).
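
Since the error points at the line starting at position 0, a minimal sketch like this (assuming the same news.csv path and a comma delimiter) would count the fields in the first few raw rows:

import csv

# Count the fields in the first few raw rows of news.csv
# (comma delimiter assumed; adjust if the file uses something else).
with open('news.csv', newline='') as f:
    reader = csv.reader(f)
    for i, row in enumerate(reader):
        print(f'row {i}: {len(row)} fields')
        if i >= 2:
            break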

If someone could suggest how to solve this problem, it would be awesome. Thanks!

  • Did you check the number of columns in the CSV? Can you provide sample CSV data in your question? Commented Mar 29, 2023 at 4:16
  • Hi @0x55b1E06FF, do thread1 and thread2 help you? Commented Mar 29, 2023 at 11:53

