将Google BigQuery中一个表中的XML数据转换为同一表中另一列中的JSON数据
我在Google BigQuery中有下表(这里只显示了几行): 有没有办法直接在BigQuery中使用SQL或使用Python来实现这一点——将Google BigQuery中一个表中的XML数据转换为同一表中另一列中的JSON数据？ 谢谢。为了更新BigQuery中的数据，您可以查看一下相关功能，但要考虑到它有自己的配额。在您的情况下，我将考虑从现有的表创建一个新的表，并在Python中处理XML字段，以便将其解析为JSON格式。我已经使用Google Cloud Client libraries for Python复制了我的工作流程，它可以与下面附带的代码正常工作。该代码的工作原理如下
谢谢为了更新BigQuery中的数据,您可以查看一下,但要考虑到它有自己的配额。在您的情况下,我将考虑从现有的一个表创建一个新的表,并在Python中处理XML字段,以便将其解析为JSON格式。 我已经使用Google Cloud Client libraries for Python复制了我的工作流程,它可以与下面附带的代码正常工作。该代码的工作原理如下:
- 将表格CSV文件导出到GCS存储桶
- 将CSV文件从GCS bucket下载到您的计算机
- 将列追加到名为“JSON_data”的输入数据帧
- 将XML列“data”解析为列“JSON_data”中的JSON格式
- 使用新数据创建新的BigQuery表
import xmltodict, json
from google.cloud import bigquery
from google.cloud import storage
import pandas as pd
#Define bigquery Client
# Uses application-default credentials from the environment.
client = bigquery.Client()
#Extract job
# NOTE: the original snippet used bare <PLACEHOLDER> tokens, which are not
# valid Python syntax. Replace the placeholder strings below with your values.
bucket_name = "YOUR_BUCKET_NAME"
project = "YOUR_PROJECT_ID"
dataset_id = "YOUR_DATASET_ID"
table_id = "YOUR_TABLE_ID"
location = "YOUR_TABLE_LOCATION"
def export_dataset(bucket_name, dataset_id, project, table_id):
    """Export a BigQuery table to ``bq_table.csv`` in a GCS bucket.

    Relies on the module-level ``client`` and ``location`` globals.

    :param bucket_name: GCS bucket that receives the exported CSV.
    :param dataset_id: dataset containing the source table.
    :param project: project that owns the dataset.
    :param table_id: table to export.
    """
    destination_uri = "gs://{}/{}".format(bucket_name, "bq_table.csv")
    dataset_ref = client.dataset(dataset_id, project=project)
    table_ref = dataset_ref.table(table_id)
    extract_job = client.extract_table(
        table_ref,
        destination_uri,
        # Location must match that of the source table.
        location=location,
    )  # API request
    extract_job.result()  # Waits for job to complete.
    print(
        "Exported {}:{}.{} to {}".format(project, dataset_id, table_id,
                                         destination_uri)
    )
#Execute export job
export_dataset(bucket_name, dataset_id, project, table_id)
#--------------------------------------------
#Retrieve CSV file from GCS bucket
# Object name inside the bucket and the local path it is saved to.
source_blob_name = "bq_table.csv"
destination_file_name = "bq_table.csv"
def download_blob(bucket_name, source_blob_name, destination_file_name):
    """Download a blob from the bucket to a local file.

    :param bucket_name: GCS bucket to read from.
    :param source_blob_name: name of the object inside the bucket.
    :param destination_file_name: local path the blob is written to.
    """
    storage_client = storage.Client()
    bucket = storage_client.get_bucket(bucket_name)
    blob = bucket.blob(source_blob_name)
    blob.download_to_filename(destination_file_name)
    print('Blob {} downloaded to {}.'.format(
        source_blob_name,
        destination_file_name))
#Download CSV from bucket
# Fetch the exported CSV locally so pandas can read it in the next step.
download_blob(bucket_name, source_blob_name, destination_file_name)
#--------------------------------------------
#Declare XML column name
XML_col = 'data'
#Read CSV as Pandas DF
df = pd.read_csv('bq_table.csv')
# Parse every XML document in the column and store its JSON equivalent in a
# new 'JSON_data' column. Iterating the values directly replaces the original
# range(len(...)) indexing, and assigning the list creates the column in one
# step (the original first created an empty column, then overwrote it).
df['JSON_data'] = [json.dumps(xmltodict.parse(xml_doc))
                   for xml_doc in df[XML_col].values]
#df to CSV - Generate output file
df.to_csv('new_data.csv', index=False, sep=',')
#----------------------------------------------
#Now we will create the new table with the new CSV
# GCS path of the transformed CSV. NOTE(review): not referenced below --
# the load job reads the local file; kept for reference.
csv_path = 'gs://{}/new_data.csv'.format(bucket_name)
new_table = 'new_table'
#Define schema for table
schema = [
    bigquery.SchemaField("id", "INTEGER"),
    bigquery.SchemaField("loaded_date", "DATE"),
    bigquery.SchemaField("JSON_data", "STRING"),
]
# Joined onto one comment line: in the original paste the URL wrapped, leaving
# a bare expression of undefined names that raised NameError at import time.
# https://stackoverflow.com/questions/44947369/load-the-csv-file-into-big-query-auto-detect-schema-using-python-api
def insertTable(datasetName, tableName, csvFilePath, schema=None):
    """
    Create a table in the given dataset (default project) and load the
    contents of a local CSV file into it. Any existing table with the same
    name is deleted first.

    :param datasetName: name of the dataset that will contain the table
    :param tableName: name of the table to (re)create
    :param csvFilePath: path of the local CSV file to load
    :param schema: optional list of bigquery.SchemaField for the table
    :return: returns nothing
    """
    dataset_ref = client.dataset(datasetName)
    dataset = bigquery.Dataset(dataset_ref)
    table_ref = dataset.table(tableName)
    if schema is not None:
        table = bigquery.Table(table_ref, schema)
    else:
        table = bigquery.Table(table_ref)
    # Best-effort drop of any previous version of the table; an "already
    # absent" table is fine, so the error is deliberately swallowed (but the
    # original bare `except:` is narrowed to Exception).
    try:
        client.delete_table(table)
    except Exception:
        pass  # table did not exist yet
    table = client.create_table(table)
    job_config = bigquery.LoadJobConfig()
    job_config.source_format = 'CSV'
    job_config.skip_leading_rows = 1
    job_config.autodetect = True
    # Context manager guarantees the file handle is closed (the original
    # opened the file at the top and never closed it).
    with open(csvFilePath, 'rb') as csv_file:
        job = client.load_table_from_file(
            csv_file, table_ref, job_config=job_config)
        job.result()  # Waits for the load job to complete.
# Create/replace the destination table and load the transformed CSV.
insertTable(dataset_id, new_table, 'new_data.csv', schema)
import xmltodict, json
from google.cloud import bigquery
from google.cloud import storage
import pandas as pd
#Define bigquery Client
client = bigquery.Client()
#Extract job
bucket_name = "YOUR_BUCKET_NAME"
project = "YOUR_PROJECT_ID"
dataset_id = "YOUR_DATASET_ID"
table_id = "YOUR_TABLE_ID"
location = "YOUR_TABLE_LOCATION"
def export_dataset(bucket_name, dataset_id, project, table_id):
    destination_uri = "gs://{}/{}".format(bucket_name, "bq_table.csv")
    dataset_ref = client.dataset(dataset_id, project=project)
    table_ref = dataset_ref.table(table_id)
    extract_job = client.extract_table(
        table_ref,
        destination_uri,
        # Location must match that of the source table.
        location=location,
    )  # API request
    extract_job.result()  # Waits for job to complete.
    print(
        "Exported {}:{}.{} to {}".format(project, dataset_id, table_id,
                                         destination_uri)
    )
#Execute export job
export_dataset(bucket_name, dataset_id, project, table_id)
#--------------------------------------------
#Retrieve CSV file from GCS bucket
source_blob_name = "bq_table.csv"
destination_file_name = "bq_table.csv"
def download_blob(bucket_name, source_blob_name, destination_file_name):
    """Downloads a blob from the bucket."""
    storage_client = storage.Client()
    bucket = storage_client.get_bucket(bucket_name)
    blob = bucket.blob(source_blob_name)
    blob.download_to_filename(destination_file_name)
    print('Blob {} downloaded to {}.'.format(
        source_blob_name,
        destination_file_name))
#Download CSV from bucket
download_blob(bucket_name, source_blob_name, destination_file_name)
#--------------------------------------------
#Declare XML column name
XML_col = 'data'
#Read CSV as Pandas DF
df = pd.read_csv('bq_table.csv')
#Append JSON_data column
df['JSON_data'] = ''
#Transform XML and save in Array
JSON_arr = [json.dumps(xmltodict.parse(df[XML_col].values[i])) for i in
            range(len(df[XML_col]))]
#Set transformed data to column JSON_data
df.loc[:, 'JSON_data'] = JSON_arr
#df to CSV - Generate output file
df.to_csv('new_data.csv', index=False, sep=',')
#----------------------------------------------
#Now we will create the new table with the new CSV
csv_path = 'gs://{}/new_data.csv'.format(bucket_name)
new_table = 'new_table'
#Define schema for table
schema = [
    bigquery.SchemaField("id", "INTEGER"),
    bigquery.SchemaField("loaded_date", "DATE"),
    bigquery.SchemaField("JSON_data", "STRING"),
]
# https://stackoverflow.com/questions/44947369/load-the-csv-file-into-big-query-auto-detect-schema-using-python-api
def insertTable(datasetName, tableName, csvFilePath, schema=None):
    """
    This function creates a table in the given dataset in our default project
    and inserts the data given via a csv file.
    :param datasetName: The name of the dataset in which the table is created
    :param tableName: The name of the table to be created
    :param csvFilePath: The path of the file to be inserted
    :param schema: The schema of the table to be created
    :return: returns nothing
    """
    csv_file = open(csvFilePath, 'rb')
    dataset_ref = client.dataset(datasetName)
    from google.cloud.bigquery import Dataset
    dataset = Dataset(dataset_ref)
    table_ref = dataset.table(tableName)
    if schema is not None:
        table = bigquery.Table(table_ref, schema)
    else:
        table = bigquery.Table(table_ref)
    try:
        client.delete_table(table)
    except Exception:
        pass
    table = client.create_table(table)
    from google.cloud.bigquery import LoadJobConfig
    job_config = LoadJobConfig()
    table_ref = dataset.table(tableName)
    job_config.source_format = 'CSV'
    job_config.skip_leading_rows = 1
    job_config.autodetect = True
    job = client.load_table_from_file(
        csv_file, table_ref, job_config=job_config)
    job.result()
insertTable(dataset_id, new_table, 'new_data.csv', schema)
请告诉我这对你是否有效
好的,您不能直接在SQL中使用任何函数来实现这一点。最简单的方法是编写JavaScript UDF并在其中使用XML->JSON库。请参见此处开始:谢谢@GrahamPolley。这很有用,谢谢你,Joaquim。我今天无法尝试,但我明天会做,并会让你知道。谢谢。干杯
id loaded_date data data_json
1 2019-10-25 Same data as before {"collection": {"row": {"field": [{"-name": "Item Key","-type": "text","-value": "Haircolour - Avstemming kunder - OMT"},{"-name": "Created","-type": "datetime","-value": "2019-10-25 17:35:17Z"},{"-name": "Type","-type": "text","-value": "Session Provisioning Failure"}]}}}
2 2019-10-25 Same data as before {"collection": {"row": {"field": [{"-name": "Item Key","-type": "text","-value": "Haircolour - Avstemming kunder - OMT"},{"-name": "Created","-type": "datetime","-value": "2019-10-25 17:51:32Z"},{"-name": "Type","-type": "text","-value": "Session Provisioning Failure"}]}}}
3 2019-02-23 Same data as before {"collection": {"row": {"field": [{"-name": "Item Key","-type": "text","-value": "Haircolour - Hent klienter til kø"},{"-name": "Last Generation Time","-type": "datetime","-value": "2019-02-23 11:00:36Z"},{"-name": "Priority","-type": "number","-value": "-3"}]}}}
import xmltodict, json
from google.cloud import bigquery
from google.cloud import storage
import pandas as pd
#Define bigquery Client
# Uses application-default credentials from the environment.
client = bigquery.Client()
#Extract job
# NOTE: the original snippet used bare <PLACEHOLDER> tokens, which are not
# valid Python syntax. Replace the placeholder strings below with your values.
bucket_name = "YOUR_BUCKET_NAME"
project = "YOUR_PROJECT_ID"
dataset_id = "YOUR_DATASET_ID"
table_id = "YOUR_TABLE_ID"
location = "YOUR_TABLE_LOCATION"
def export_dataset(bucket_name, dataset_id, project, table_id):
    """Export a BigQuery table to ``bq_table.csv`` in a GCS bucket.

    Relies on the module-level ``client`` and ``location`` globals.

    :param bucket_name: GCS bucket that receives the exported CSV.
    :param dataset_id: dataset containing the source table.
    :param project: project that owns the dataset.
    :param table_id: table to export.
    """
    destination_uri = "gs://{}/{}".format(bucket_name, "bq_table.csv")
    dataset_ref = client.dataset(dataset_id, project=project)
    table_ref = dataset_ref.table(table_id)
    extract_job = client.extract_table(
        table_ref,
        destination_uri,
        # Location must match that of the source table.
        location=location,
    )  # API request
    extract_job.result()  # Waits for job to complete.
    print(
        "Exported {}:{}.{} to {}".format(project, dataset_id, table_id,
                                         destination_uri)
    )
#Execute export job
export_dataset(bucket_name, dataset_id, project, table_id)
#--------------------------------------------
#Retrieve CSV file from GCS bucket
# Object name inside the bucket and the local path it is saved to.
source_blob_name = "bq_table.csv"
destination_file_name = "bq_table.csv"
def download_blob(bucket_name, source_blob_name, destination_file_name):
    """Download a blob from the bucket to a local file.

    :param bucket_name: GCS bucket to read from.
    :param source_blob_name: name of the object inside the bucket.
    :param destination_file_name: local path the blob is written to.
    """
    storage_client = storage.Client()
    bucket = storage_client.get_bucket(bucket_name)
    blob = bucket.blob(source_blob_name)
    blob.download_to_filename(destination_file_name)
    print('Blob {} downloaded to {}.'.format(
        source_blob_name,
        destination_file_name))
#Download CSV from bucket
# Fetch the exported CSV locally so pandas can read it in the next step.
download_blob(bucket_name, source_blob_name, destination_file_name)
#--------------------------------------------
#Declare XML column name
XML_col = 'data'
#Read CSV as Pandas DF
df = pd.read_csv('bq_table.csv')
# Parse every XML document in the column and store its JSON equivalent in a
# new 'JSON_data' column. Iterating the values directly replaces the original
# range(len(...)) indexing, and assigning the list creates the column in one
# step (the original first created an empty column, then overwrote it).
df['JSON_data'] = [json.dumps(xmltodict.parse(xml_doc))
                   for xml_doc in df[XML_col].values]
#df to CSV - Generate output file
df.to_csv('new_data.csv', index=False, sep=',')
#----------------------------------------------
#Now we will create the new table with the new CSV
# GCS path of the transformed CSV. NOTE(review): not referenced below --
# the load job reads the local file; kept for reference.
csv_path = 'gs://{}/new_data.csv'.format(bucket_name)
new_table = 'new_table'
#Define schema for table
schema = [
    bigquery.SchemaField("id", "INTEGER"),
    bigquery.SchemaField("loaded_date", "DATE"),
    bigquery.SchemaField("JSON_data", "STRING"),
]
# Joined onto one comment line: in the original paste the URL wrapped, leaving
# a bare expression of undefined names that raised NameError at import time.
# https://stackoverflow.com/questions/44947369/load-the-csv-file-into-big-query-auto-detect-schema-using-python-api
def insertTable(datasetName, tableName, csvFilePath, schema=None):
    """
    Create a table in the given dataset (default project) and load the
    contents of a local CSV file into it. Any existing table with the same
    name is deleted first.

    :param datasetName: name of the dataset that will contain the table
    :param tableName: name of the table to (re)create
    :param csvFilePath: path of the local CSV file to load
    :param schema: optional list of bigquery.SchemaField for the table
    :return: returns nothing
    """
    dataset_ref = client.dataset(datasetName)
    dataset = bigquery.Dataset(dataset_ref)
    table_ref = dataset.table(tableName)
    if schema is not None:
        table = bigquery.Table(table_ref, schema)
    else:
        table = bigquery.Table(table_ref)
    # Best-effort drop of any previous version of the table; an "already
    # absent" table is fine, so the error is deliberately swallowed (but the
    # original bare `except:` is narrowed to Exception).
    try:
        client.delete_table(table)
    except Exception:
        pass  # table did not exist yet
    table = client.create_table(table)
    job_config = bigquery.LoadJobConfig()
    job_config.source_format = 'CSV'
    job_config.skip_leading_rows = 1
    job_config.autodetect = True
    # Context manager guarantees the file handle is closed (the original
    # opened the file at the top and never closed it).
    with open(csvFilePath, 'rb') as csv_file:
        job = client.load_table_from_file(
            csv_file, table_ref, job_config=job_config)
        job.result()  # Waits for the load job to complete.
# Create/replace the destination table and load the transformed CSV.
insertTable(dataset_id, new_table, 'new_data.csv', schema)