将Google BigQuery中一个表中的XML数据转换为同一表中另一列中的JSON数据
我在Google BigQuery中有下表(这里只显示了几行): 有没有办法直接在BigQuery中使用SQL或使用Python来实现这一点——将Google BigQuery中一个表中的XML数据转换为同一表中另一列中的JSON数据？ 谢谢。为了更新BigQuery中的数据，您可以查看一下相关功能，但要考虑到它有自己的配额。在您的情况下，我将考虑从现有的表创建一个新的表，并在Python中处理XML字段，以便将其解析为JSON格式。我已经使用Google Cloud Client libraries for Python复制了我的工作流程，它可以与下面附带的代码正常工作。该代码的工作原理如下
谢谢为了更新BigQuery中的数据,您可以查看一下,但要考虑到它有自己的配额。在您的情况下,我将考虑从现有的一个表创建一个新的表,并在Python中处理XML字段,以便将其解析为JSON格式。 我已经使用Google Cloud Client libraries for Python复制了我的工作流程,它可以与下面附带的代码正常工作。该代码的工作原理如下:
- 将表格CSV文件导出到GCS存储桶
- 将CSV文件从GCS bucket下载到您的计算机
- 将列追加到名为“JSON_data”的输入数据帧
- 将XML列“data”解析为列“JSON_data”中的JSON格式
- 使用新数据创建新的BigQuery表
import xmltodict, json
from google.cloud import bigquery
from google.cloud import storage
import pandas as pd
#Define bigquery Client
# Uses application-default credentials from the environment.
client = bigquery.Client()
#Extract job
# NOTE: the original snippet used bare <PLACEHOLDER> tokens, which are not
# valid Python syntax. Replace the placeholder strings below with your values.
bucket_name = "YOUR_BUCKET_NAME"
project = "YOUR_PROJECT_ID"
dataset_id = "YOUR_DATASET_ID"
table_id = "YOUR_TABLE_ID"
location = "YOUR_TABLE_LOCATION"
def export_dataset(bucket_name, dataset_id, project, table_id):
    """Export a BigQuery table to ``bq_table.csv`` in a GCS bucket.

    Relies on the module-level ``client`` and ``location`` globals.

    :param bucket_name: GCS bucket that receives the exported CSV.
    :param dataset_id: dataset containing the source table.
    :param project: project that owns the dataset.
    :param table_id: table to export.
    """
    destination_uri = "gs://{}/{}".format(bucket_name, "bq_table.csv")
    dataset_ref = client.dataset(dataset_id, project=project)
    table_ref = dataset_ref.table(table_id)
    extract_job = client.extract_table(
        table_ref,
        destination_uri,
        # Location must match that of the source table.
        location=location,
    )  # API request
    extract_job.result()  # Waits for job to complete.
    print(
        "Exported {}:{}.{} to {}".format(project, dataset_id, table_id,
                                         destination_uri)
    )
#Execute export job
export_dataset(bucket_name, dataset_id, project, table_id)
#--------------------------------------------
#Retrieve CSV file from GCS bucket
# Object name inside the bucket and the local path it is saved to.
source_blob_name = "bq_table.csv"
destination_file_name = "bq_table.csv"
def download_blob(bucket_name, source_blob_name, destination_file_name):
    """Download a blob from the bucket to a local file.

    :param bucket_name: GCS bucket to read from.
    :param source_blob_name: name of the object inside the bucket.
    :param destination_file_name: local path the blob is written to.
    """
    storage_client = storage.Client()
    bucket = storage_client.get_bucket(bucket_name)
    blob = bucket.blob(source_blob_name)
    blob.download_to_filename(destination_file_name)
    print('Blob {} downloaded to {}.'.format(
        source_blob_name,
        destination_file_name))
#Download CSV from bucket
# Fetch the exported CSV locally so pandas can read it in the next step.
download_blob(bucket_name, source_blob_name, destination_file_name)
#--------------------------------------------
#Declare XML column name
XML_col = 'data'
#Read CSV as Pandas DF
df = pd.read_csv('bq_table.csv')
# Parse every XML document in the column and store its JSON equivalent in a
# new 'JSON_data' column. Iterating the values directly replaces the original
# range(len(...)) indexing, and assigning the list creates the column in one
# step (the original first created an empty column, then overwrote it).
df['JSON_data'] = [json.dumps(xmltodict.parse(xml_doc))
                   for xml_doc in df[XML_col].values]
#df to CSV - Generate output file
df.to_csv('new_data.csv', index=False, sep=',')
#----------------------------------------------
#Now we will create the new table with the new CSV
# GCS path of the transformed CSV. NOTE(review): not referenced below --
# the load job reads the local file; kept for reference.
csv_path = 'gs://{}/new_data.csv'.format(bucket_name)
new_table = 'new_table'
#Define schema for table
schema = [
    bigquery.SchemaField("id", "INTEGER"),
    bigquery.SchemaField("loaded_date", "DATE"),
    bigquery.SchemaField("JSON_data", "STRING"),
]
# Joined onto one comment line: in the original paste the URL wrapped, leaving
# a bare expression of undefined names that raised NameError at import time.
# https://stackoverflow.com/questions/44947369/load-the-csv-file-into-big-query-auto-detect-schema-using-python-api
def insertTable(datasetName, tableName, csvFilePath, schema=None):
    """
    Create a table in the given dataset (default project) and load the
    contents of a local CSV file into it. Any existing table with the same
    name is deleted first.

    :param datasetName: name of the dataset that will contain the table
    :param tableName: name of the table to (re)create
    :param csvFilePath: path of the local CSV file to load
    :param schema: optional list of bigquery.SchemaField for the table
    :return: returns nothing
    """
    dataset_ref = client.dataset(datasetName)
    dataset = bigquery.Dataset(dataset_ref)
    table_ref = dataset.table(tableName)
    if schema is not None:
        table = bigquery.Table(table_ref, schema)
    else:
        table = bigquery.Table(table_ref)
    # Best-effort drop of any previous version of the table; an "already
    # absent" table is fine, so the error is deliberately swallowed (but the
    # original bare `except:` is narrowed to Exception).
    try:
        client.delete_table(table)
    except Exception:
        pass  # table did not exist yet
    table = client.create_table(table)
    job_config = bigquery.LoadJobConfig()
    job_config.source_format = 'CSV'
    job_config.skip_leading_rows = 1
    job_config.autodetect = True
    # Context manager guarantees the file handle is closed (the original
    # opened the file at the top and never closed it).
    with open(csvFilePath, 'rb') as csv_file:
        job = client.load_table_from_file(
            csv_file, table_ref, job_config=job_config)
        job.result()  # Waits for the load job to complete.
# Create/replace the destination table and load the transformed CSV.
insertTable(dataset_id, new_table, 'new_data.csv', schema)
import xmltodict, json
from google.cloud import bigquery
from google.cloud import storage
import pandas as pd
#Define bigquery Client
client = bigquery.Client()
#Extract job
bucket_name = "YOUR_BUCKET_NAME"
project = "YOUR_PROJECT_ID"
dataset_id = "YOUR_DATASET_ID"
table_id = "YOUR_TABLE_ID"
location = "YOUR_TABLE_LOCATION"
def export_dataset(bucket_name, dataset_id, project, table_id):
    destination_uri = "gs://{}/{}".format(bucket_name, "bq_table.csv")
    dataset_ref = client.dataset(dataset_id, project=project)
    table_ref = dataset_ref.table(table_id)
    extract_job = client.extract_table(
        table_ref,
        destination_uri,
        # Location must match that of the source table.
        location=location,
    )  # API request
    extract_job.result()  # Waits for job to complete.
    print(
        "Exported {}:{}.{} to {}".format(project, dataset_id, table_id,
                                         destination_uri)
    )
#Execute export job
export_dataset(bucket_name, dataset_id, project, table_id)
#--------------------------------------------
#Retrieve CSV file from GCS bucket
source_blob_name = "bq_table.csv"
destination_file_name = "bq_table.csv"
def download_blob(bucket_name, source_blob_name, destination_file_name):
    """Downloads a blob from the bucket."""
    storage_client = storage.Client()
    bucket = storage_client.get_bucket(bucket_name)
    blob = bucket.blob(source_blob_name)
    blob.download_to_filename(destination_file_name)
    print('Blob {} downloaded to {}.'.format(
        source_blob_name,
        destination_file_name))
#Download CSV from bucket
download_blob(bucket_name, source_blob_name, destination_file_name)
#--------------------------------------------
#Declare XML column name
XML_col = 'data'
#Read CSV as Pandas DF
df = pd.read_csv('bq_table.csv')
#Append JSON_data column
df['JSON_data'] = ''
#Transform XML and save in Array
JSON_arr = [json.dumps(xmltodict.parse(df[XML_col].values[i])) for i in
            range(len(df[XML_col]))]
#Set transformed data to column JSON_data
df.loc[:, 'JSON_data'] = JSON_arr
#df to CSV - Generate output file
df.to_csv('new_data.csv', index=False, sep=',')
#----------------------------------------------
#Now we will create the new table with the new CSV
csv_path = 'gs://{}/new_data.csv'.format(bucket_name)
new_table = 'new_table'
#Define schema for table
schema = [
    bigquery.SchemaField("id", "INTEGER"),
    bigquery.SchemaField("loaded_date", "DATE"),
    bigquery.SchemaField("JSON_data", "STRING"),
]
# https://stackoverflow.com/questions/44947369/load-the-csv-file-into-big-query-auto-detect-schema-using-python-api
def insertTable(datasetName, tableName, csvFilePath, schema=None):
    """
    This function creates a table in the given dataset in our default project
    and inserts the data given via a csv file.
    :param datasetName: The name of the dataset in which the table is created
    :param tableName: The name of the table to be created
    :param csvFilePath: The path of the file to be inserted
    :param schema: The schema of the table to be created
    :return: returns nothing
    """
    csv_file = open(csvFilePath, 'rb')
    dataset_ref = client.dataset(datasetName)
    from google.cloud.bigquery import Dataset
    dataset = Dataset(dataset_ref)
    table_ref = dataset.table(tableName)
    if schema is not None:
        table = bigquery.Table(table_ref, schema)
    else:
        table = bigquery.Table(table_ref)
    try:
        client.delete_table(table)
    except Exception:
        pass
    table = client.create_table(table)
    from google.cloud.bigquery import LoadJobConfig
    job_config = LoadJobConfig()
    table_ref = dataset.table(tableName)
    job_config.source_format = 'CSV'
    job_config.skip_leading_rows = 1
    job_config.autodetect = True
    job = client.load_table_from_file(
        csv_file, table_ref, job_config=job_config)
    job.result()
insertTable(dataset_id, new_table, 'new_data.csv', schema)
请告诉我这对你是否有效
好的,您不能直接在SQL中使用任何函数来实现这一点。最简单的方法是编写JavaScript UDF并在其中使用XML->JSON库。请参见此处开始:谢谢@GrahamPolley。这很有用,谢谢你,Joaquim。我今天无法尝试,但我明天会做,并会让你知道。谢谢。干杯
id loaded_date data data_json
1 2019-10-25 Same data as before {"collection": {"row": {"field": [{"-name": "Item Key","-type": "text","-value": "Haircolour - Avstemming kunder - OMT"},{"-name": "Created","-type": "datetime","-value": "2019-10-25 17:35:17Z"},{"-name": "Type","-type": "text","-value": "Session Provisioning Failure"}]}}}
2 2019-10-25 Same data as before {"collection": {"row": {"field": [{"-name": "Item Key","-type": "text","-value": "Haircolour - Avstemming kunder - OMT"},{"-name": "Created","-type": "datetime","-value": "2019-10-25 17:51:32Z"},{"-name": "Type","-type": "text","-value": "Session Provisioning Failure"}]}}}
3 2019-02-23 Same data as before {"collection": {"row": {"field": [{"-name": "Item Key","-type": "text","-value": "Haircolour - Hent klienter til kø"},{"-name": "Last Generation Time","-type": "datetime","-value": "2019-02-23 11:00:36Z"},{"-name": "Priority","-type": "number","-value": "-3"}]}}}
import xmltodict, json
from google.cloud import bigquery
from google.cloud import storage
import pandas as pd
#Define bigquery Client
# Uses application-default credentials from the environment.
client = bigquery.Client()
#Extract job
# NOTE: the original snippet used bare <PLACEHOLDER> tokens, which are not
# valid Python syntax. Replace the placeholder strings below with your values.
bucket_name = "YOUR_BUCKET_NAME"
project = "YOUR_PROJECT_ID"
dataset_id = "YOUR_DATASET_ID"
table_id = "YOUR_TABLE_ID"
location = "YOUR_TABLE_LOCATION"
def export_dataset(bucket_name, dataset_id, project, table_id):
    """Export a BigQuery table to ``bq_table.csv`` in a GCS bucket.

    Relies on the module-level ``client`` and ``location`` globals.

    :param bucket_name: GCS bucket that receives the exported CSV.
    :param dataset_id: dataset containing the source table.
    :param project: project that owns the dataset.
    :param table_id: table to export.
    """
    destination_uri = "gs://{}/{}".format(bucket_name, "bq_table.csv")
    dataset_ref = client.dataset(dataset_id, project=project)
    table_ref = dataset_ref.table(table_id)
    extract_job = client.extract_table(
        table_ref,
        destination_uri,
        # Location must match that of the source table.
        location=location,
    )  # API request
    extract_job.result()  # Waits for job to complete.
    print(
        "Exported {}:{}.{} to {}".format(project, dataset_id, table_id,
                                         destination_uri)
    )
#Execute export job
export_dataset(bucket_name, dataset_id, project, table_id)
#--------------------------------------------
#Retrieve CSV file from GCS bucket
# Object name inside the bucket and the local path it is saved to.
source_blob_name = "bq_table.csv"
destination_file_name = "bq_table.csv"
def download_blob(bucket_name, source_blob_name, destination_file_name):
    """Download a blob from the bucket to a local file.

    :param bucket_name: GCS bucket to read from.
    :param source_blob_name: name of the object inside the bucket.
    :param destination_file_name: local path the blob is written to.
    """
    storage_client = storage.Client()
    bucket = storage_client.get_bucket(bucket_name)
    blob = bucket.blob(source_blob_name)
    blob.download_to_filename(destination_file_name)
    print('Blob {} downloaded to {}.'.format(
        source_blob_name,
        destination_file_name))
#Download CSV from bucket
# Fetch the exported CSV locally so pandas can read it in the next step.
download_blob(bucket_name, source_blob_name, destination_file_name)
#--------------------------------------------
#Declare XML column name
XML_col = 'data'
#Read CSV as Pandas DF
df = pd.read_csv('bq_table.csv')
# Parse every XML document in the column and store its JSON equivalent in a
# new 'JSON_data' column. Iterating the values directly replaces the original
# range(len(...)) indexing, and assigning the list creates the column in one
# step (the original first created an empty column, then overwrote it).
df['JSON_data'] = [json.dumps(xmltodict.parse(xml_doc))
                   for xml_doc in df[XML_col].values]
#df to CSV - Generate output file
df.to_csv('new_data.csv', index=False, sep=',')
#----------------------------------------------
#Now we will create the new table with the new CSV
# GCS path of the transformed CSV. NOTE(review): not referenced below --
# the load job reads the local file; kept for reference.
csv_path = 'gs://{}/new_data.csv'.format(bucket_name)
new_table = 'new_table'
#Define schema for table
schema = [
    bigquery.SchemaField("id", "INTEGER"),
    bigquery.SchemaField("loaded_date", "DATE"),
    bigquery.SchemaField("JSON_data", "STRING"),
]
# Joined onto one comment line: in the original paste the URL wrapped, leaving
# a bare expression of undefined names that raised NameError at import time.
# https://stackoverflow.com/questions/44947369/load-the-csv-file-into-big-query-auto-detect-schema-using-python-api
def insertTable(datasetName, tableName, csvFilePath, schema=None):
    """
    Create a table in the given dataset (default project) and load the
    contents of a local CSV file into it. Any existing table with the same
    name is deleted first.

    :param datasetName: name of the dataset that will contain the table
    :param tableName: name of the table to (re)create
    :param csvFilePath: path of the local CSV file to load
    :param schema: optional list of bigquery.SchemaField for the table
    :return: returns nothing
    """
    dataset_ref = client.dataset(datasetName)
    dataset = bigquery.Dataset(dataset_ref)
    table_ref = dataset.table(tableName)
    if schema is not None:
        table = bigquery.Table(table_ref, schema)
    else:
        table = bigquery.Table(table_ref)
    # Best-effort drop of any previous version of the table; an "already
    # absent" table is fine, so the error is deliberately swallowed (but the
    # original bare `except:` is narrowed to Exception).
    try:
        client.delete_table(table)
    except Exception:
        pass  # table did not exist yet
    table = client.create_table(table)
    job_config = bigquery.LoadJobConfig()
    job_config.source_format = 'CSV'
    job_config.skip_leading_rows = 1
    job_config.autodetect = True
    # Context manager guarantees the file handle is closed (the original
    # opened the file at the top and never closed it).
    with open(csvFilePath, 'rb') as csv_file:
        job = client.load_table_from_file(
            csv_file, table_ref, job_config=job_config)
        job.result()  # Waits for the load job to complete.
# Create/replace the destination table and load the transformed CSV.
insertTable(dataset_id, new_table, 'new_data.csv', schema)