
Convert XML data in one column of a Google BigQuery table into JSON data in another column of the same table


I have the following table in Google BigQuery (only a few rows are shown here):

id     loaded_date     data                    data_json
1      2019-10-25      Same data as before     {"collection": {"row": {"field": [{"-name": "Item Key","-type": "text","-value": "Haircolour - Avstemming kunder - OMT"},{"-name": "Created","-type": "datetime","-value": "2019-10-25 17:35:17Z"},{"-name": "Type","-type": "text","-value": "Session Provisioning Failure"}]}}}
2      2019-10-25      Same data as before     {"collection": {"row": {"field": [{"-name": "Item Key","-type": "text","-value": "Haircolour - Avstemming kunder - OMT"},{"-name": "Created","-type": "datetime","-value": "2019-10-25 17:51:32Z"},{"-name": "Type","-type": "text","-value": "Session Provisioning Failure"}]}}}
3      2019-02-23      Same data as before     {"collection": {"row": {"field": [{"-name": "Item Key","-type": "text","-value": "Haircolour - Hent klienter til kø"},{"-name": "Last Generation Time","-type": "datetime","-value": "2019-02-23 11:00:36Z"},{"-name": "Priority","-type": "number","-value": "-3"}]}}}

Is there a way to do this (populate data_json from the XML in the data column) directly in BigQuery using SQL, or alternatively with Python?


Thanks!

To update data in BigQuery you can have a look at the available update mechanisms, but take into account that they have their own quotas. In your case I would consider creating a new table from the existing one and processing the XML field in Python so that it is parsed into JSON format. I have reproduced my workflow using the Google Cloud Client Libraries for Python, and it works correctly with the code attached below. The code works as follows:

  • Export the table to a CSV file in a GCS bucket
  • Download the CSV file from the GCS bucket to your machine
  • Append a column named "JSON_data" to the input DataFrame
  • Parse the XML column "data" into JSON format in the "JSON_data" column (a short sketch of this step follows the list)
  • Create a new BigQuery table with the new data
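
As a minimal sketch of the parsing step on its own: the sample XML below is only an illustration, reconstructed from the shape of the desired data_json column, so the real values in your data column will differ. Passing attr_prefix='-' makes xmltodict emit the "-name"/"-type"/"-value" keys shown in the question (its default attribute prefix is '@'):

import json
import xmltodict

# Illustrative XML only -- reconstructed from the shape of the desired data_json
# column above; the real values in the "data" column will differ.
sample_xml = """
<collection>
  <row>
    <field name="Item Key" type="text" value="Haircolour - Avstemming kunder - OMT"/>
    <field name="Created" type="datetime" value="2019-10-25 17:35:17Z"/>
    <field name="Type" type="text" value="Session Provisioning Failure"/>
  </row>
</collection>
"""

# attr_prefix='-' reproduces the "-name"/"-type"/"-value" keys from the question;
# xmltodict's default attribute prefix is '@'.
parsed = xmltodict.parse(sample_xml, attr_prefix='-')
print(json.dumps(parsed))
# {"collection": {"row": {"field": [{"-name": "Item Key", "-type": "text", ...}, ...]}}}

The full script below applies the same parse-then-dumps pattern row by row to the exported CSV.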
To create the BigQuery table I followed the StackOverflow thread linked in the code comments below.

You have to set your own variables (bucket_name, project, dataset_id, table_id, location). Keep in mind that the GCS bucket must be in the same region as the BigQuery dataset.

import xmltodict, json
from google.cloud import bigquery
from google.cloud import storage
import pandas as pd


#Define bigquery Client
client = bigquery.Client()

#Extract job
bucket_name = "<YOUR_BUCKET_NAME>"
project = "<YOUR_PROJECT_ID>"
dataset_id = "<YOUR_DATASET_ID>"
table_id = "<YOUR_TABLE_ID>"
location = "<YOUR_TABLE_LOCATION>"


def export_dataset(bucket_name, dataset_id, project, table_id):

    destination_uri = "gs://{}/{}".format(bucket_name, "bq_table.csv")
    dataset_ref = client.dataset(dataset_id, project=project)
    table_ref = dataset_ref.table(table_id)

    extract_job = client.extract_table(
        table_ref,
        destination_uri,
        # Location must match that of the source table.
        location=location,
    )  # API request
    extract_job.result()  # Waits for job to complete.

    print(
        "Exported {}:{}.{} to {}".format(
            project, dataset_id, table_id, destination_uri
        )
    )


#Execute export job    
export_dataset(bucket_name, dataset_id, project, table_id)


#--------------------------------------------

#Retrieve CSV file from GCS bucket
source_blob_name = "bq_table.csv"
destination_file_name = "bq_table.csv"

def download_blob(bucket_name, source_blob_name, destination_file_name):
    """Downloads a blob from the bucket."""
    storage_client = storage.Client()
    bucket = storage_client.get_bucket(bucket_name)
    blob = bucket.blob(source_blob_name)

    blob.download_to_filename(destination_file_name)

    print('Blob {} downloaded to {}.'.format(
        source_blob_name,
        destination_file_name))

#Download CSV from bucket
download_blob(bucket_name, source_blob_name, destination_file_name)

#--------------------------------------------

#Declare XML column name
XML_col = 'data' 

#Read CSV as Pandas DF
df = pd.read_csv('bq_table.csv')
#Append JSON_data column
df['JSON_data'] = ''
#Transform XML and save in Array
JSON_arr = [json.dumps(xmltodict.parse(df[XML_col].values[i])) for i in range(len(df[XML_col]))]
#Set transformed data to column JSON_data
df.loc[:,'JSON_data'] = JSON_arr
#df to CSV - Generete output file
df.to_csv('new_data.csv', index=False, sep=',')

#----------------------------------------------


#Now we will create the new table with the new CSV 
csv_path='gs://{}/new_data.csv'.format(bucket_name)
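# (Note: csv_path is not used below -- new_data.csv was only written locally by
#  df.to_csv above, and insertTable() loads it from that local path.)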
new_table='new_table'


#Define schema for table
schema = [
        bigquery.SchemaField("id", "INTEGER"),
        bigquery.SchemaField("loaded_date", "DATE"),
        bigquery.SchemaField("JSON_data", "STRING"),   
    ]

# https://stackoverflow.com/questions/44947369/load-the-csv-file-into-big-query-auto-detect-schema-using-python-api
def insertTable(datasetName, tableName, csvFilePath, schema=None):
    """
    This function creates a table in given dataset in our default project
    and inserts the data given via a csv file.

    :param datasetName: The name of the dataset to be created
    :param tableName: The name of the dataset in which the table needs to be created
    :param csvFilePath: The path of the file to be inserted
    :param schema: The schema of the table to be created
    :return: returns nothing
    """

    csv_file = open(csvFilePath, 'rb')

    dataset_ref = client.dataset(datasetName)        
    from google.cloud.bigquery import Dataset
    dataset = Dataset(dataset_ref)

    table_ref = dataset.table(tableName)
    if schema is not None:
        table = bigquery.Table(table_ref,schema)
    else:
        table = bigquery.Table(table_ref)

    try:
        client.delete_table(table)
    except:
        pass

    table = client.create_table(table)

    from google.cloud.bigquery import LoadJobConfig        
    job_config = LoadJobConfig()
    table_ref = dataset.table(tableName)
    job_config.source_format = 'CSV'
    job_config.skip_leading_rows = 1
    job_config.autodetect = True
    job = client.load_table_from_file(
        csv_file, table_ref, job_config=job_config)
    job.result()

insertTable(dataset_id, new_table, 'new_data.csv', schema)

Please let me know if this works for you.



Comments:

  • "OK, so there is no function you can use directly in SQL to do this. The easiest way would be to write a JavaScript UDF and use an XML-to-JSON library inside it. See here to get started."
  • "Thanks @GrahamPolley. That's useful."
  • "Thank you, Joaquim. I can't try it today, but I will tomorrow and will let you know. Thanks. Cheers."
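For completeness, here is a minimal sketch of the JavaScript-UDF route suggested in the comments, driven from Python through the same BigQuery client. The gs://<YOUR_BUCKET_NAME>/xml2json.js library file and the xml2json() function it is assumed to expose are hypothetical: you would need to upload an XML-to-JSON JavaScript library of your choice to GCS and call its actual entry point. The project, dataset, and table placeholders mirror the ones used above.

from google.cloud import bigquery

client = bigquery.Client()

# The JS library path and its xml2json() function are hypothetical placeholders;
# upload a real XML-to-JSON JavaScript library to your bucket and adjust the call.
query = '''
CREATE TEMP FUNCTION xml_to_json(xml STRING)
RETURNS STRING
LANGUAGE js
OPTIONS (library = ['gs://<YOUR_BUCKET_NAME>/xml2json.js'])
AS """
  return JSON.stringify(xml2json(xml));
""";

SELECT
  id,
  loaded_date,
  data,
  xml_to_json(data) AS data_json
FROM `<YOUR_PROJECT_ID>.<YOUR_DATASET_ID>.<YOUR_TABLE_ID>`
'''

# Run the query and wait for it to finish, then print the converted rows.
for row in client.query(query).result():
    print(row.id, row.data_json)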