Python 2.7 & GCP Google BigQuery:捕获文件加载错误?
全部, 我得到了一些Python2.7BiqQuery(BQ)数据加载“操作就绪”,我正在努力找到正确的方法,以与我过去使用的其他大数据DW平台类似的方式捕获文件加载错误 在BQ中,我可以从中访问错误,示例如下:bigquery\u client.load\u table\u from\u uri.errorspython2.7&;GCP Google BigQuery:捕获文件加载错误?,python,python-2.7,google-bigquery,Python,Python 2.7,Google Bigquery,全部, 我得到了一些Python2.7BiqQuery(BQ)数据加载“操作就绪”,我正在努力找到正确的方法,以与我过去使用的其他大数据DW平台类似的方式捕获文件加载错误 在BQ中,我可以从中访问错误,示例如下:bigquery\u client.load\u table\u from\u uri.errors 这很好,但我真的需要一些更好的信息,特别是错误的行号,这是我面临的主要问题 红移:stl\U加载错误\U详细信息和stl\U加载错误 在SnowflakeDB中:加载_历史记录和表(
这很好,但我真的需要一些更好的信息,特别是错误的行号,这是我面临的主要问题。作为对比,在 Redshift 中可以查询 stl_load_errors 和 stl_loaderror_detail;在 Snowflake 中可以查询 load_history 以及 table(validate(table_name, job_id=>'_last'))。总之,我需要加载我能加载的数据(将我的 max_bad_records 设置得相当高),当记录失败时,我需要知道:
google-cloud==0.29.0
google-cloud-bigquery==0.28.0
google-cloud-core==0.28.0
# load a table to bq from gcs with the schema
def load_table_from_gcs(dataset_name, table_name, schema, source, skip_leading_rows=1, source_format='CSV', max_bad_records=0, write_disposition='WRITE_EMPTY', project=None):
    """Create dataset_name.table_name and load it from the GCS URI(s) in source.

    Args:
        dataset_name: BigQuery dataset id containing the target table.
        table_name: Target table id; the table is created here before loading.
        schema: JSON schema string accepted by convert_schema().
        source: GCS URI (or list of URIs) to load from.
        skip_leading_rows: Header rows to skip (CSV loads).
        source_format: Load format, e.g. 'CSV'.
        max_bad_records: Bad records tolerated before the job fails (0 = none).
        write_disposition: 'WRITE_EMPTY' | 'WRITE_TRUNCATE' | 'WRITE_APPEND'.
        project: GCP project id; client default credentials project when None.

    Returns:
        A human-readable status message describing what was loaded.

    Raises:
        Re-raises any exception from table creation or the load job
        (after printing a diagnostic line).
    """
    try:
        # convert the schema json string to a list of SchemaField objects
        schema_list = convert_schema(schema)

        bigquery_client = bigquery.Client(project=project)
        dataset_ref = bigquery_client.dataset(dataset_name)
        table_ref = dataset_ref.table(table_name)
        # Create the table explicitly so the load job does not have to.
        bigquery_client.create_table(bigquery.Table(table_ref, schema=schema_list))

        job_config = bigquery.LoadJobConfig()
        # BUG FIX: the valid disposition value is 'CREATE_NEVER', not 'NEVER'
        # (table creation is done explicitly above).
        job_config.create_disposition = 'CREATE_NEVER'
        job_config.skip_leading_rows = skip_leading_rows
        job_config.source_format = source_format
        job_config.write_disposition = write_disposition
        if max_bad_records:
            job_config.max_bad_records = max_bad_records

        load_job = bigquery_client.load_table_from_uri(
            source, table_ref, job_config=job_config,
            job_id_prefix="bqTools_load_job")

        # result() blocks until the load completes; it raises if the job
        # failed outright (errors beyond max_bad_records).
        load_job.result()

        # NOTE(review): _job_statistics() is a private API of
        # google-cloud-bigquery 0.28 -- confirm it still exists on upgrade.
        job_statistics = load_job._job_statistics()
        print("\n ***************************** ")
        print(" job_state: " + str(load_job.state))
        print(" error_result: " + str(load_job.error_result))
        print(" job_id: " + str(load_job.job_id))
        print(" badRecords: " + str(job_statistics['badRecords']))
        print(" outputRows: " + str(job_statistics['outputRows']))
        print(" inputFiles: " + str(job_statistics['inputFiles']))
        print(" inputFileBytes: " + str(job_statistics['inputFileBytes']))
        print(" outputBytes: " + str(job_statistics['outputBytes']))
        # BUG FIX: exception is a method; without the () the bound-method
        # object itself was printed instead of the job's exception (if any).
        print(" job_exception: " + str(load_job.exception()))
        print(" ***************************** ")

        print("------ load_job.errors \n")
        # BUG FIX: load_job.errors is None when the job had no errors;
        # guard the loop so a clean load does not raise TypeError.
        for error_record in (load_job.errors or []):
            print(error_record)
        print("------ ------ ------ ------\n")

        # TODO: need to figure out how to get # records failed, and which ones they are
        # research showed "statistics.load_job" - but not sure how that works
        returnMsg = 'load_table_from_gcs {}:{} {}'.format(dataset_name, table_name, source)
        return returnMsg
    except Exception as e:
        errorStr = 'ERROR (load_table_from_gcs): ' + str(e)
        print(errorStr)
        raise
BigQuery不报告错误行号的原因是,该文件由多个工作线程并行拆分和解析。假设一个工作者负责文件的偏移量10000~20000,它将寻找10000并从那里开始解析。当解析一行失败时,它只知道该行的起始偏移量。要知道行号,需要从文件的开头开始扫描
您可以找到给定起始偏移量的直线。您需要行号的具体原因是什么?编辑主题以添加我的加载代码我不确定这是否可行,因为这与API无关,而是BQ后端作为响应返回的内容(一种可能是启动BQ跟踪程序,看看团队对此有何想法)谢谢Willian,我添加了一张罚单(69405901)如果我得到回复,我会把这个帖子贴出来。我需要原始记录和关于失败原因的信息。有了这些信息,数据提供商就可以在发送给我们之前对数据进行更改,或者我们可以更改负载。相当标准的东西,什么失败了,为什么?如果我打不到电话,没什么大不了的,这是有用的,但不是紧急的。失败的数据以及为什么是最重要的。BigQuery允许单个CSV行的大小达到10MB。在错误消息中返回10MB的行是非常低效的。更糟糕的是在100次警告中返回它。而是返回一个偏移量,以便您可以从文件中检索失败的行。为什么已在错误消息中?“无法将'r2501'解析为字段lineNum(位置0)的int?”如您的问题所述?从操作角度来看,我需要知道哪些记录失败(来自哪些文件)以及原因。我也想知道故障线路的行号,但这似乎很困难(尽管其他人正在这样做),请从操作的角度考虑这一点,您每天晚上都会收到数百个文件,提供不同的系统/表,哪些文件有错误,它们是什么(消息和故障记录)。感谢链接。看起来红移返回了一行的样本,字符(1024)。我们或许也可以这样做。我提出了一个功能要求。由于他们不要求JSON是新行分隔的,我假设他们按顺序而不是并行地处理文件,因为扫描1TB文件以找出行号会非常昂贵。
google-cloud==0.29.0
google-cloud-bigquery==0.28.0
google-cloud-core==0.28.0
# load a table to bq from gcs with the schema
def load_table_from_gcs(dataset_name, table_name, schema, source, skip_leading_rows=1, source_format='CSV', max_bad_records=0, write_disposition='WRITE_EMPTY', project=None):
    """Create dataset_name.table_name and load it from the GCS URI(s) in source.

    Args:
        dataset_name: BigQuery dataset id containing the target table.
        table_name: Target table id; the table is created here before loading.
        schema: JSON schema string accepted by convert_schema().
        source: GCS URI (or list of URIs) to load from.
        skip_leading_rows: Header rows to skip (CSV loads).
        source_format: Load format, e.g. 'CSV'.
        max_bad_records: Bad records tolerated before the job fails (0 = none).
        write_disposition: 'WRITE_EMPTY' | 'WRITE_TRUNCATE' | 'WRITE_APPEND'.
        project: GCP project id; client default credentials project when None.

    Returns:
        A human-readable status message describing what was loaded.

    Raises:
        Re-raises any exception from table creation or the load job
        (after printing a diagnostic line).
    """
    try:
        # convert the schema json string to a list of SchemaField objects
        schema_list = convert_schema(schema)

        bigquery_client = bigquery.Client(project=project)
        dataset_ref = bigquery_client.dataset(dataset_name)
        table_ref = dataset_ref.table(table_name)
        # Create the table explicitly so the load job does not have to.
        bigquery_client.create_table(bigquery.Table(table_ref, schema=schema_list))

        job_config = bigquery.LoadJobConfig()
        # BUG FIX: the valid disposition value is 'CREATE_NEVER', not 'NEVER'
        # (table creation is done explicitly above).
        job_config.create_disposition = 'CREATE_NEVER'
        job_config.skip_leading_rows = skip_leading_rows
        job_config.source_format = source_format
        job_config.write_disposition = write_disposition
        if max_bad_records:
            job_config.max_bad_records = max_bad_records

        load_job = bigquery_client.load_table_from_uri(
            source, table_ref, job_config=job_config,
            job_id_prefix="bqTools_load_job")

        # result() blocks until the load completes; it raises if the job
        # failed outright (errors beyond max_bad_records).
        load_job.result()

        # NOTE(review): _job_statistics() is a private API of
        # google-cloud-bigquery 0.28 -- confirm it still exists on upgrade.
        job_statistics = load_job._job_statistics()
        print("\n ***************************** ")
        print(" job_state: " + str(load_job.state))
        print(" error_result: " + str(load_job.error_result))
        print(" job_id: " + str(load_job.job_id))
        print(" badRecords: " + str(job_statistics['badRecords']))
        print(" outputRows: " + str(job_statistics['outputRows']))
        print(" inputFiles: " + str(job_statistics['inputFiles']))
        print(" inputFileBytes: " + str(job_statistics['inputFileBytes']))
        print(" outputBytes: " + str(job_statistics['outputBytes']))
        # BUG FIX: exception is a method; without the () the bound-method
        # object itself was printed instead of the job's exception (if any).
        print(" job_exception: " + str(load_job.exception()))
        print(" ***************************** ")

        print("------ load_job.errors \n")
        # BUG FIX: load_job.errors is None when the job had no errors;
        # guard the loop so a clean load does not raise TypeError.
        for error_record in (load_job.errors or []):
            print(error_record)
        print("------ ------ ------ ------\n")

        # TODO: need to figure out how to get # records failed, and which ones they are
        # research showed "statistics.load_job" - but not sure how that works
        returnMsg = 'load_table_from_gcs {}:{} {}'.format(dataset_name, table_name, source)
        return returnMsg
    except Exception as e:
        errorStr = 'ERROR (load_table_from_gcs): ' + str(e)
        print(errorStr)
        raise