Google Cloud Platform: BigQuery showing wrong results - duplicated data from a Cloud Function?
I am a junior developer and I was put in charge of implementing the Facebook API in an existing project. However, the business team noticed that the Google Analytics results displayed on BigQuery are wrong, and they asked me to fix it. This is the architecture:

What I did:
- On BigQuery, I checked how close to or far from the Google Analytics results the data was. I found a pattern: the results I get on BigQuery are always exactly 1, 2, or 3 times the original GA values (a query to quantify this is sketched after the code below)
- I checked whether there were multiple cron jobs on Compute Engine. There is actually only one cron job, and it runs once a day
- I verified the results on Google Cloud Storage; the results on GCS are correct

Here is the code of the Cloud Function:
```python
import os
from datetime import datetime

import pandas as pd
from google.cloud import bigquery, storage

BUCKET = "xxxx"
GOOGLE_PROJECT = "xxxx"

HEADER_MAPPING = {
    "Source/Medium": "source_medium",
    "Campaign": "campaign",
    "Last Non-Direct Click Conversions": "last_non_direct_click_conversions",
    "Last Non-Direct Click Conversion Value": "last_non_direct_click_conversion_value",
    "Last Click Prio Conversions": "last_click_prio_conversions",
    "Last Click Prio Conversion Value": "last_click_prio_conversion_value",
    "Data-Driven Conversions": "dda_conversions",
    "Data-Driven Conversion Value": "dda_conversion_value",
    "% Change in Conversions from Last Non-Direct Click to Last Click Prio": "last_click_prio_vs_last_click",
    "% Change in Conversions from Last Non-Direct Click to Data-Driven": "dda_vs_last_click"
}

SPEND_HEADER_MAPPING = {
    "Source/Medium": "source_medium",
    "Campaign": "campaign",
    "Spend": "spend"
}

tables_schema = {
    "google-analytics": [
        bigquery.SchemaField("date", bigquery.enums.SqlTypeNames.DATE, mode='REQUIRED'),
        bigquery.SchemaField("week", bigquery.enums.SqlTypeNames.INT64, mode='REQUIRED'),
        bigquery.SchemaField("goal", bigquery.enums.SqlTypeNames.STRING, mode='REQUIRED'),
        bigquery.SchemaField("source", bigquery.enums.SqlTypeNames.STRING, mode='NULLABLE'),
        bigquery.SchemaField("medium", bigquery.enums.SqlTypeNames.STRING, mode='NULLABLE'),
        bigquery.SchemaField("campaign", bigquery.enums.SqlTypeNames.STRING, mode='NULLABLE'),
        bigquery.SchemaField("last_non_direct_click_conversions", bigquery.enums.SqlTypeNames.INT64, mode='NULLABLE'),
        bigquery.SchemaField("last_non_direct_click_conversion_value", bigquery.enums.SqlTypeNames.FLOAT64, mode='NULLABLE'),
        bigquery.SchemaField("last_click_prio_conversions", bigquery.enums.SqlTypeNames.INT64, mode='NULLABLE'),
        bigquery.SchemaField("last_click_prio_conversion_value", bigquery.enums.SqlTypeNames.FLOAT64, mode='NULLABLE'),
        bigquery.SchemaField("dda_conversions", bigquery.enums.SqlTypeNames.FLOAT64, mode='NULLABLE'),
        bigquery.SchemaField("dda_conversion_value", bigquery.enums.SqlTypeNames.FLOAT64, mode='NULLABLE'),
        bigquery.SchemaField("last_click_prio_vs_last_click", bigquery.enums.SqlTypeNames.FLOAT64, mode='NULLABLE'),
        bigquery.SchemaField("dda_vs_last_click", bigquery.enums.SqlTypeNames.FLOAT64, mode='NULLABLE')
    ],
    "google-analytics-spend": [
        bigquery.SchemaField("date", bigquery.enums.SqlTypeNames.DATE, mode='REQUIRED'),
        bigquery.SchemaField("week", bigquery.enums.SqlTypeNames.INT64, mode='REQUIRED'),
        bigquery.SchemaField("source", bigquery.enums.SqlTypeNames.STRING, mode='NULLABLE'),
        bigquery.SchemaField("medium", bigquery.enums.SqlTypeNames.STRING, mode='NULLABLE'),
        bigquery.SchemaField("campaign", bigquery.enums.SqlTypeNames.STRING, mode='NULLABLE'),
        bigquery.SchemaField("spend", bigquery.enums.SqlTypeNames.FLOAT64, mode='NULLABLE'),
    ]
}


def download_from_gcs(file):
    # Download the GCS object that triggered the function to /tmp.
    client = storage.Client()
    bucket = client.get_bucket(BUCKET)
    blob = bucket.get_blob(file['name'])
    file_name = os.path.basename(os.path.normpath(file['name']))
    blob.download_to_filename(f"/tmp/{file_name}")
    return file_name


def load_in_bigquery(file_object, dataset: str, table: str):
    client = bigquery.Client()
    table_id = f"{GOOGLE_PROJECT}.{dataset}.{table}"
    job_config = bigquery.LoadJobConfig(
        source_format=bigquery.SourceFormat.CSV,
        skip_leading_rows=1,
        autodetect=True,
        schema=tables_schema[table]
    )
    job = client.load_table_from_file(file_object, table_id, job_config=job_config)
    job.result()  # Wait for the job to complete.


def __order_columns(df: pd.DataFrame, spend=False) -> pd.DataFrame:
    # We want the source and medium columns at the third position for a
    # spend dataframe and at the fourth position for the other dataframes,
    # because the spend dataframe has no goal column.
    pos = 2 if spend else 3
    cols = df.columns.tolist()
    cols[pos:2] = cols[-2:]
    cols = cols[:-2]
    return df[cols]


def __common_transformation(df: pd.DataFrame, date: str, goal: str) -> pd.DataFrame:
    # For any kind of dataframe, we add date and week columns based on the
    # file name, and we split the Source/Medium column from the CSV into
    # two separate columns.
    week_of_the_year = datetime.strptime(date, '%Y-%m-%d').isocalendar()[1]
    df.insert(0, 'date', date)
    df.insert(1, 'week', week_of_the_year)
    mapping = SPEND_HEADER_MAPPING if goal == "spend" else HEADER_MAPPING
    print(df.columns.tolist())
    df = df.rename(columns=mapping)
    print(df.columns.tolist())
    print(df)
    df["source_medium"] = df["source_medium"].str.replace(' ', '')
    df[["source", "medium"]] = df["source_medium"].str.split('/', expand=True)
    df = df.drop(["source_medium"], axis=1)
    df["week"] = df["week"].astype(int, copy=False)
    return df


def __transform_spend(df: pd.DataFrame) -> pd.DataFrame:
    df["spend"] = df["spend"].astype(float, copy=False)
    df = __order_columns(df, spend=True)
    return df[df.columns[:6]]


def __transform_attribution(df: pd.DataFrame, goal: str) -> pd.DataFrame:
    df.insert(2, 'goal', goal)
    df["last_non_direct_click_conversions"] = df["last_non_direct_click_conversions"].astype(int, copy=False)
    df["last_click_prio_conversions"] = df["last_click_prio_conversions"].astype(int, copy=False)
    df["dda_conversions"] = df["dda_conversions"].astype(float, copy=False)
    return __order_columns(df)


def transform(df: pd.DataFrame, file_name) -> pd.DataFrame:
    goal, date, *_ = file_name.split('_')
    df = __common_transformation(df, date, goal)
    # We only add the goal column in the attribution dataframe
    # (the google-analytics table).
    return __transform_spend(df) if "spend" in file_name else __transform_attribution(df, goal)


def main(event, context):
    """Triggered by a change to a Cloud Storage bucket.
    Args:
        event (dict): Event payload.
        context (google.cloud.functions.Context): Metadata for the event.
    """
    file = event
    file_name = download_from_gcs(file)
    df = pd.read_csv(f"/tmp/{file_name}")
    transformed_df = transform(df, file_name)
    with open(f"/tmp/bq_{file_name}", "w") as file_object:
        file_object.write(transformed_df.to_csv(index=False))
    with open(f"/tmp/bq_{file_name}", "rb") as file_object:
        table = "google-analytics-spend" if "spend" in file_name else "google-analytics"
        load_in_bigquery(file_object, dataset='attribution', table=table)
```
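As a side note on the first check above: the 1x/2x/3x pattern can be quantified directly in BigQuery by counting rows per load date. This is a minimal sketch, not from the original post, reusing the placeholder project and the dataset/table names from the code:

```python
from google.cloud import bigquery

GOOGLE_PROJECT = "xxxx"  # same placeholder as in the Cloud Function code

client = bigquery.Client()
# If every file were loaded exactly once, the row count per (date, goal)
# pair would be stable; doubled or tripled counts indicate duplicate loads.
query = f"""
SELECT date, goal, COUNT(*) AS rows_loaded
FROM `{GOOGLE_PROJECT}.attribution.google-analytics`
GROUP BY date, goal
ORDER BY date DESC
"""
for row in client.query(query).result():
    print(row.date, row.goal, row.rows_loaded)
```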
Update
Yes, the Cloud Function is triggered by a GCS Object Finalize event. Also, the function does not automatically retry on failure.
I followed your advice and am now checking the log table on my Cloud Functions page. In the last 10 rows of log data, it looks like 3 different Cloud Function instances ran. I could not get more details when expanding each row.
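If the Logs Viewer rows cannot be expanded, the same entries can be pulled programmatically. A minimal sketch using the `google-cloud-logging` client; the function name is a placeholder:

```python
import itertools

from google.cloud import logging

client = logging.Client()
# Each Cloud Function execution carries a distinct execution_id label;
# several execution_ids appearing for the same uploaded file would confirm
# that the function ran more than once per GCS event.
log_filter = (
    'resource.type="cloud_function" '
    'resource.labels.function_name="my-function"'  # placeholder name
)
entries = client.list_entries(filter_=log_filter, order_by=logging.DESCENDING)
for entry in itertools.islice(entries, 50):
    print(entry.timestamp, (entry.labels or {}).get("execution_id"), entry.payload)
```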
I am now also going to check the BigQuery logs. I suppose the simplest solution would be to use `BigQueryAuditMetadata` and get logs about when the tables were updated?
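A quicker first check than the audit logs: BigQuery's job history already records every load job for the last few months. A minimal sketch, assuming the dataset lives in the US multi-region and using the table name from the question:

```python
from google.cloud import bigquery

client = bigquery.Client()
# One load job per file per day is expected; several LOAD jobs against the
# same table within minutes of each other would point at duplicate
# function invocations.
query = """
SELECT job_id, creation_time, state
FROM `region-us`.INFORMATION_SCHEMA.JOBS_BY_PROJECT
WHERE job_type = 'LOAD'
  AND destination_table.table_id = 'google-analytics'
ORDER BY creation_time DESC
LIMIT 20
"""
for row in client.query(query).result():
    print(row.job_id, row.creation_time, row.state)
```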
In my opinion, this is a very big topic, so it may be difficult to provide one precise solution that addresses everything. I cannot solve the problem for you, but I can share a few personal observations and suggestions.

The Cloud Function is triggered by a GCS Object Finalize event (please check whether this is correct). In that case, the event "passes through Pub/Sub" before the Cloud Function invocation is triggered. There are two things to consider here:

- Pub/Sub is based on an "at least once" delivery paradigm, so a message may be delivered more than once
- Such Cloud Function invocations are acknowledged automatically, and the developer has no control over this; Pub/Sub cannot be used to track the state of the overall process. The longer the Cloud Function execution takes (the maximum timeout is 540 seconds, or even longer), the more likely it is that Pub/Sub (internally) decides the message has not been delivered yet and should be delivered again, re-invoking the Cloud Function (see the idempotency sketches after this list)
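Given that redelivery cannot be ruled out, the usual recommendation is to make the function idempotent rather than to try to prevent duplicate invocations. One common pattern, sketched here under assumptions (the `processed_events` Firestore collection is an arbitrary name, not part of the original setup), keys off `context.event_id`, which stays the same across redeliveries of the same event:

```python
from google.api_core.exceptions import Conflict
from google.cloud import firestore

db = firestore.Client()

def main(event, context):
    # context.event_id is stable across redeliveries of the same GCS event,
    # so persisting it gives a cheap idempotency guard. 'processed_events'
    # is an arbitrary collection name chosen for this sketch.
    try:
        db.collection("processed_events").document(context.event_id).create(
            {"file": event["name"]}
        )
    except Conflict:
        # The document already exists: this event was already handled.
        print(f"Event {context.event_id} already processed, skipping.")
        return
    # ... continue with the original download / transform / load steps ...
```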
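Independently of the event-level guard, the BigQuery load itself can be made idempotent: job IDs are unique per project, so deriving the job ID from the GCS object name and generation means a second invocation for the same object cannot load the rows twice. A sketch of the question's `load_in_bigquery` adjusted this way; the helper name is hypothetical, and `GOOGLE_PROJECT` and `tables_schema` are the ones defined in the question's code:

```python
import re

from google.api_core.exceptions import Conflict
from google.cloud import bigquery

def load_in_bigquery_once(file_object, dataset: str, table: str, event: dict):
    # BigQuery refuses to run two jobs with the same job ID in one project,
    # so a duplicate invocation for the same object generation fails fast
    # instead of loading the rows a second time.
    client = bigquery.Client()
    table_id = f"{GOOGLE_PROJECT}.{dataset}.{table}"  # constants from the question's code
    job_config = bigquery.LoadJobConfig(
        source_format=bigquery.SourceFormat.CSV,
        skip_leading_rows=1,
        schema=tables_schema[table],
    )
    raw_id = f"load_{event['name']}_{event.get('generation', '')}"
    job_id = re.sub(r"[^a-zA-Z0-9_-]", "_", raw_id)  # allowed: letters, digits, _ and -
    try:
        job = client.load_table_from_file(
            file_object, table_id, job_id=job_id, job_config=job_config
        )
        job.result()
    except Conflict:
        print(f"Job {job_id} already exists; skipping duplicate load.")
```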