Python pyarrow.lib.ArrowTypeError: an integer is required (got type str)


I want to ingest new rows from a SQL Server table. The way I found to pick up only the new rows is the script below, and for a MySQL table it works perfectly. When I plugged in the pymssql library to connect to this new database and apply the same diff-and-ingest routine, I got the error below:

I'd appreciate help understanding why I can't apply this script to a table on SQL Server.

import os
import pandas as pd
import numpy as np
import mysql.connector as sql
from datetime import datetime, timedelta
from airflow.contrib.operators.mssql_to_gcs import MsSqlToGoogleCloudStorageOperator
from airflow.contrib.operators.gcs_to_bq import GoogleCloudStorageToBigQueryOperator
import pyarrow
import airflow
from gcloud import storage
from google.cloud import bigquery
from airflow import DAG
import pyodbc
import pymssql
from airflow.operators.dummy_operator import DummyOperator
from airflow.operators.python_operator import PythonOperator

def update_table():
    query_bq = """SELECT * FROM dataset.table_test"""
    query_sql = """select id, \
                    col2, \
                    col3, \
                    col4, \
                    col5, \
                    col6, \
                    col7, \
                    col8, \
                    col9, \
                    replace(replace(replace(col10,';','|'),'\n',''),'"','') as col10, \
                    replace(replace(replace(col11,';','|'),'\n',''),'"','') as col11, \
                    col12, \
                    col13, \
                    replace(replace(replace(col14,';','|'),'\n',''),'"','') as col14, \
                    replace(replace(replace(col15,';','|'),'\n',''),'"','') as col15, \
                    replace(replace(replace(col16,';','|'),'\n',''),'"','') as col16, \
                    replace(replace(replace(col17,';','|'),'\n',''),'"','') as col17, \
                    replace(replace(replace(col18,';','|'),'\n',''),'"','') as col18, \
                    col19, \
                    replace(replace(replace(col20,';','|'),'\n',''),'"','') as col20, \
                    replace(replace(replace(col21,';','|'),'\n',''),'"','') as col21, \
                    col22, \
                    col23, \
                    col24, \
                    col25, \
                    col26, \
                    replace(replace(replace(col27,';','|'),'\n',''),'"','') as col27, \
                    col28, \
                    col29 \
                    from operacoes_b2w"""
    bucket_name = 'bucket_name'
    schema_path_gcs = 'path/subpath/'
    schema_name_gcs = 'table_test.json'
    table_path_gcs = 'dir/table_test/'
    table_name_gcs = 'table_test' + '_' + datetime.now().strftime("%Y%m%d_%H%M%S") + '.csv'
    dataset_bq = 'dataset'
    table_bq = 'table_test'
    date_columns = ['col3','col13','col22']

    client = bigquery.Client()
    query_job = client.query(query_bq)
    df_bq = query_job.to_dataframe()
    ids_bq = df_bq.id.tolist()

    # SQL Server credentials and settings (via pymssql)
    db = pymssql.connect(server='ip_adress',user='username',password='***',database='bdd',port='1433')

    df_mysql = pd.read_sql(query_sql, db, parse_dates=date_columns)
    ids_mysql = df_mysql.iloc[:,0].tolist()


    # ids present in the source table but not yet in BigQuery
    ids_diff = np.setdiff1d(ids_mysql, ids_bq)
    df_diff1 = df_mysql.loc[df_mysql.id.isin(ids_diff), :]
    df_diff = df_diff1.replace({np.nan: None})
    if df_diff.shape[0] > 0:
        df_diff.to_csv(table_name_gcs)


        storage_client = storage.Client()
        bucket = storage_client.get_bucket(bucket_name)
        #blob_schema = bucket.blob(schema_path_gcs+schema_name_gcs)
        #blob_schema.download_to_filename(schema_name_gcs)
        #schema_fields = client.schema_from_json(schema_name_gcs)
        #os.remove(schema_name_gcs)


        blob_table = bucket.blob(table_path_gcs+table_name_gcs)
        blob_table.upload_from_filename(table_name_gcs)
        os.remove(table_name_gcs)


        job_config = bigquery.LoadJobConfig()
        job_config.write_disposition = 'WRITE_APPEND'
        job_config.schema = [
            bigquery.SchemaField('id','INTEGER',mode='REQUIRED'),
            bigquery.SchemaField('col2','INTEGER',mode='NULLABLE'),
            bigquery.SchemaField('col3','DATE',mode='REQUIRED'),
            bigquery.SchemaField('col4','FLOAT',mode='REQUIRED'),
            bigquery.SchemaField('col5','FLOAT',mode='NULLABLE'),
            bigquery.SchemaField('col6','FLOAT',mode='REQUIRED'),
            bigquery.SchemaField('col7','FLOAT',mode='REQUIRED'),
            bigquery.SchemaField('col8','FLOAT',mode='NULLABLE'),
            bigquery.SchemaField('col9','INTEGER',mode='NULLABLE'),
            bigquery.SchemaField('col10','STRING',mode='NULLABLE'),
            bigquery.SchemaField('col11','STRING',mode='REQUIRED'),
            bigquery.SchemaField('col12','INTEGER',mode='REQUIRED'),
            bigquery.SchemaField('col13','DATE',mode='NULLABLE'),
            bigquery.SchemaField('col14','STRING',mode='NULLABLE'),
            bigquery.SchemaField('col15','STRING',mode='NULLABLE'),
            bigquery.SchemaField('col16','STRING',mode='NULLABLE'),
            bigquery.SchemaField('col17','STRING',mode='NULLABLE'),
            bigquery.SchemaField('col18','STRING',mode='NULLABLE'),
            bigquery.SchemaField('col19','INTEGER',mode='NULLABLE'),
            bigquery.SchemaField('col20','STRING',mode='NULLABLE'),
            bigquery.SchemaField('col21','STRING',mode='NULLABLE'),
            bigquery.SchemaField('col22','DATE',mode='REQUIRED'),
            bigquery.SchemaField('col23','INTEGER',mode='REQUIRED'),
            bigquery.SchemaField('col24','INTEGER',mode='NULLABLE'),
            bigquery.SchemaField('col25','INTEGER',mode='NULLABLE'),
            bigquery.SchemaField('col26','INTEGER',mode='NULLABLE'),
            bigquery.SchemaField('col27','STRING',mode='NULLABLE'),
            bigquery.SchemaField('col28','INTEGER',mode='NULLABLE'),
            bigquery.SchemaField('col29','INTEGER',mode='NULLABLE')
            ]

        # Create and run the load job
        dataset_ref = client.dataset(dataset_bq)
        table_ref = dataset_ref.table(table_bq)
        job = client.load_table_from_dataframe(df_diff.reset_index(drop=True), table_ref, location="southamerica-east1", job_config=job_config)
        job.result()

    print(str(len(ids_diff)) + ' row(s) added.')


default_args = {
    'owner': 'bexs-data',
    'start_date': airflow.utils.dates.days_ago(0),
    'depends_on_past': False,
    # Email notifications disabled
    #'email': ['airflow@apache.org'],
    'email_on_failure': False,
    'email_on_retry': False,
    'catchup': False,
    # If the task fails, retry after waiting at least 5 minutes
    'retries': 1,
    'retry_delay': timedelta(minutes=5)
}

with DAG('dag_test' , default_args=default_args, description='Python DAG', schedule_interval='25 9 * * *') as dag:
    python_task = PythonOperator(task_id='run_dag', python_callable=update_table, dag=dag)
    python_task
The error:

Traceback (most recent call last):
  File "/usr/local/lib/python3.7/site-packages/airflow/models/taskinstance.py", line 1094, in handle_failure
    task.on_failure_callback(context)
  File "/airflow/dags/git/dag_test.py", line 144, in 
[2019-09-11 15:30:20,972] {base_task_runner.py:115} INFO - Job 184: Subtask update_bq_table Traceback (most recent call last):
[2019-09-11 15:30:20,972] {base_task_runner.py:115} INFO - Job 184: Subtask update_bq_table   File "/usr/local/bin/airflow", line 32, in <module>
[2019-09-11 15:30:20,972] {base_task_runner.py:115} INFO - Job 184: Subtask update_bq_table     args.func(args)
[2019-09-11 15:30:20,972] {base_task_runner.py:115} INFO - Job 184: Subtask update_bq_table   File "/usr/local/lib/python3.7/site-packages/airflow/utils/cli.py", line 74, in wrapper
[2019-09-11 15:30:20,972] {base_task_runner.py:115} INFO - Job 184: Subtask update_bq_table     return f(*args, **kwargs)
[2019-09-11 15:30:20,973] {base_task_runner.py:115} INFO - Job 184: Subtask update_bq_table   File "/usr/local/lib/python3.7/site-packages/airflow/bin/cli.py", line 522, in run
[2019-09-11 15:30:20,973] {base_task_runner.py:115} INFO - Job 184: Subtask update_bq_table     _run(args, dag, ti)
[2019-09-11 15:30:20,973] {base_task_runner.py:115} INFO - Job 184: Subtask update_bq_table   File "/usr/local/lib/python3.7/site-packages/airflow/bin/cli.py", line 440, in _run
[2019-09-11 15:30:20,973] {base_task_runner.py:115} INFO - Job 184: Subtask update_bq_table     pool=args.pool,
[2019-09-11 15:30:20,973] {base_task_runner.py:115} INFO - Job 184: Subtask update_bq_table   File "/usr/local/lib/python3.7/site-packages/airflow/utils/db.py", line 74, in wrapper
[2019-09-11 15:30:20,973] {base_task_runner.py:115} INFO - Job 184: Subtask update_bq_table     return func(*args, **kwargs)
[2019-09-11 15:30:20,973] {base_task_runner.py:115} INFO - Job 184: Subtask update_bq_table   File "/usr/local/lib/python3.7/site-packages/airflow/models/taskinstance.py", line 926, in _run_raw_task
[2019-09-11 15:30:20,973] {base_task_runner.py:115} INFO - Job 184: Subtask update_bq_table     result = task_copy.execute(context=context)
[2019-09-11 15:30:20,973] {base_task_runner.py:115} INFO - Job 184: Subtask update_bq_table   File "/usr/local/lib/python3.7/site-packages/airflow/operators/python_operator.py", line 113, in execute
[2019-09-11 15:30:20,973] {base_task_runner.py:115} INFO - Job 184: Subtask update_bq_table     return_value = self.execute_callable()
[2019-09-11 15:30:20,973] {base_task_runner.py:115} INFO - Job 184: Subtask update_bq_table   File "/usr/local/lib/python3.7/site-packages/airflow/operators/python_operator.py", line 118, in execute_callable
[2019-09-11 15:30:20,973] {base_task_runner.py:115} INFO - Job 184: Subtask update_bq_table     return self.python_callable(*self.op_args, **self.op_kwargs)
[2019-09-11 15:30:20,973] {base_task_runner.py:115} INFO - Job 184: Subtask update_bq_table   File "/airflow/dags/git/operacoes_operacoes_b2w.py", line 132, in update_table
[2019-09-11 15:30:20,973] {base_task_runner.py:115} INFO - Job 184: Subtask update_bq_table     job = client.load_table_from_dataframe(df_diff.reset_index(drop=True), table_ref, location="southamerica-east1", job_config=job_config)
[2019-09-11 15:30:20,973] {base_task_runner.py:115} INFO - Job 184: Subtask update_bq_table   File "/usr/local/lib/python3.7/site-packages/google/cloud/bigquery/client.py", line 1566, in load_table_from_dataframe
[2019-09-11 15:30:20,973] {base_task_runner.py:115} INFO - Job 184: Subtask update_bq_table     parquet_compression=parquet_compression,
[2019-09-11 15:30:20,973] {base_task_runner.py:115} INFO - Job 184: Subtask update_bq_table   File "/usr/local/lib/python3.7/site-packages/google/cloud/bigquery/_pandas_helpers.py", line 368, in dataframe_to_parquet
[2019-09-11 15:30:20,973] {base_task_runner.py:115} INFO - Job 184: Subtask update_bq_table     arrow_table = dataframe_to_arrow(dataframe, bq_schema)
[2019-09-11 15:30:20,973] {base_task_runner.py:115} INFO - Job 184: Subtask update_bq_table   File "/usr/local/lib/python3.7/site-packages/google/cloud/bigquery/_pandas_helpers.py", line 335, in dataframe_to_arrow
[2019-09-11 15:30:20,973] {base_task_runner.py:115} INFO - Job 184: Subtask update_bq_table     bq_to_arrow_array(get_column_or_index(dataframe, bq_field.name), bq_field)
[2019-09-11 15:30:20,973] {base_task_runner.py:115} INFO - Job 184: Subtask update_bq_table   File "/usr/local/lib/python3.7/site-packages/google/cloud/bigquery/_pandas_helpers.py", line 187, in bq_to_arrow_array
[2019-09-11 15:30:20,973] {base_task_runner.py:115} INFO - Job 184: Subtask update_bq_table     return pyarrow.array(series, type=arrow_type)
[2019-09-11 15:30:20,973] {base_task_runner.py:115} INFO - Job 184: Subtask update_bq_table   File "pyarrow/array.pxi", line 191, in pyarrow.lib.array
[2019-09-11 15:30:20,973] {base_task_runner.py:115} INFO - Job 184: Subtask update_bq_table   File "pyarrow/array.pxi", line 78, in pyarrow.lib._ndarray_to_array
[2019-09-11 15:30:20,973] {base_task_runner.py:115} INFO - Job 184: Subtask update_bq_table   File "pyarrow/error.pxi", line 95, in pyarrow.lib.check_status
[2019-09-11 15:30:20,973] {base_task_runner.py:115} INFO - Job 184: Subtask update_bq_table pyarrow.lib.ArrowTypeError: an integer is required (got type str)
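
The failing frame is pyarrow.array(series, type=arrow_type): a pandas column that still holds Python strings is being converted to a non-string Arrow type derived from the BigQuery schema. A minimal sketch that reproduces the same ArrowTypeError outside Airflow, with hypothetical values standing in for one of the DATE columns:

import pandas as pd
import pyarrow

# Hypothetical stand-in for a column the schema declares as DATE but
# that still holds raw strings in the DataFrame.
series = pd.Series(["2019-09-11", "2019-09-12"])

try:
    # The call from the last traceback frame: a DATE field maps to
    # Arrow's date32 type inside google-cloud-bigquery.
    pyarrow.array(series, type=pyarrow.date32())
except pyarrow.lib.ArrowTypeError as exc:
    # On the pyarrow release in this traceback this prints:
    #   an integer is required (got type str)
    # Newer pyarrow versions may parse the strings instead of raising.
    print(exc)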

Can you use pdb to show the value being passed to pyarrow.array? That would tell us whether this is a bug in the Google library or in pyarrow.
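
One way to follow that suggestion, sketched below: run the callable outside Airflow and break right before the conversion. The helper name and signature are taken from the traceback above (the private _pandas_helpers.bq_to_arrow_array), so adjust if your installed google-cloud-bigquery differs.

import pdb

from google.cloud.bigquery import _pandas_helpers

# Wrap the internal helper that feeds pyarrow.array (name and signature
# taken from the traceback; it is a private API and may change).
_original = _pandas_helpers.bq_to_arrow_array

def _traced(series, bq_field):
    # Inspect series.dtype, series.head() and bq_field.field_type here
    # to see which column is still arriving as str.
    pdb.set_trace()
    return _original(series, bq_field)

_pandas_helpers.bq_to_arrow_array = _traced

# ...then call update_table() and examine the failing column in pdb.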