
Airflow BranchPythonOperator


I'm trying to run tasks in parallel, but I know that the BranchPythonOperator returns only a single branch. My question is: how can I return several task IDs when needed? Here is my DAG:

With a single file it works fine in that case. But with two or more files, it executes only one task and skips all the others. I want to run the matching tasks in parallel: if I have 4 files, I need to run the 4 corresponding tasks in parallel and skip the others.

How can I do something like this?

My code:

import datetime as dt
from airflow import DAG
import shutil
import os
from airflow.operators.bash_operator import BashOperator
from airflow.operators.python_operator import PythonOperator, BranchPythonOperator
from airflow.operators.dummy_operator import DummyOperator
from airflow.contrib.sensors.file_sensor import FileSensor
from airflow.operators.dagrun_operator import TriggerDagRunOperator

scriptAirflow = '/home/alexw/scriptAirflow/testFile/'
uploadPath='/apps/lv-manuf2020-data/80_DATA/00_Loading/'
receiptPath= '/apps/lv-manuf2020-data/80_DATA/01_Receipt/'
allReceiptFiles=os.listdir(receiptPath)
branchTask=['kpi_obj_data', 'material_mvke','material_mara','material_mbew','material_marm','material_mdma','material_marc','material_mard']

def parseFileName(file):
    splitFile = file.split('_')
    baseName= splitFile[2:]
    newBaseName='_'.join(baseName)
    formatDate= newBaseName.split('-')
    baseFileName = formatDate[0].lower()
    return baseFileName
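# Illustrative trace with a hypothetical filename (the real naming convention
# is not shown in the question): parseFileName('FMS_20200217_material_mara-v1.csv')
#   split('_')[2:] rejoined      -> 'material_mara-v1.csv'
#   split('-')[0].lower()        -> 'material_mara'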

def onlyCsvFiles():
    if os.listdir(uploadPath):
        for files in os.listdir(uploadPath):
            if (files.startswith('MEM') and files.endswith('.csv')) or (files.startswith('FMS') and files.endswith('.csv')):
                shutil.move(uploadPath+files, receiptPath)
                print(files+' moved in ' + receiptPath+files)
        for files in os.listdir(receiptPath):
            # returns on the first file seen: that single file decides the branch
            if (files.startswith('MEM') and files.endswith('.csv')) or (files.startswith('FMS') and files.endswith('.csv')):
                return "result_mv"
            else:
                return "no_file_timeout"
    else:
        # implicitly returns None here, which the BranchPythonOperator cannot follow
        print('No file in upload_00')

def result():
    if allReceiptFiles:
        mem_flag = False
        fms_flag = False
        for files in allReceiptFiles:
            if (files.startswith('MEM') and files.endswith('.csv')):
                mem_flag = True
            if (files.startswith('FMS') and files.endswith('.csv')):
                fms_flag = True
        if mem_flag and fms_flag:
            return "run_both_scripts"
        if mem_flag:
            return "run_for_mem"
        if fms_flag:
            return "run_for_fms"
    else:
        # empty receipt dir: implicitly returns None, which the
        # BranchPythonOperator cannot follow
        print('No script to launch')

def returnGoodBranch():
    checkScript=[]
    for files in os.listdir(receiptPath):
        newFiles = parseFileName(files)
        checkScript.append(newFiles)
        for scriptFiles in checkScript:
            # scriptFiles.startswith(scriptFiles) is always True, so the first
            # parsed file name is returned and all the other files are ignored
            if scriptFiles.startswith(scriptFiles):
                return scriptFiles

default_args = {
    'owner': 'testParallel',
    'start_date': dt.datetime(2020, 2, 17),
    'retries': 1,
}


dag = DAG('testParallel', default_args=default_args, description='airflow_manuf2020_v4',
          schedule_interval=None, catchup=False)


file_sensor = FileSensor(
    task_id="file_sensor",
    filepath=uploadPath,
    fs_conn_id='airflow_db',
    poke_interval=10,
    dag=dag,
)
move_csv = BranchPythonOperator(
    task_id='move_csv',
    python_callable=onlyCsvFiles,
    trigger_rule='none_failed',
    dag=dag,
)
result_mv = BranchPythonOperator(
    task_id='result_mv',
    python_callable=result,
    trigger_rule='none_failed',
    dag=dag,
)

run_Mem_Script = DummyOperator(
    task_id="run_for_mem",
    dag=dag,
)

kpi_obj_data = BashOperator(
    task_id='kpi_obj_data',
    bash_command='python3 '+scriptAirflow+'kpi_obj_data.py "{{ execution_date }}"',
    trigger_rule='one_success',
    dag=dag,
)
run_Fms_Script = BranchPythonOperator(
    task_id="run_for_fms",
    python_callable=returnGoodBranch,
    trigger_rule='all_success',
    dag=dag,
)
material_makt = BashOperator(
    task_id="material_makt",
    bash_command='python3 '+scriptAirflow+'material_makt.py "{{ execution_date }}"',
    trigger_rule='one_success',
    dag=dag,
)
material_mara = BashOperator(
    task_id="material_mara",
    bash_command='python3 '+scriptAirflow+'material_mara.py "{{ execution_date }}"',
    trigger_rule='one_success',
    dag=dag,
)
material_marc = BashOperator(
    task_id="material_marc",
    bash_command='python3 '+scriptAirflow+'material_marc.py "{{ execution_date }}"',
    trigger_rule='one_success',
    dag=dag,
)
material_mard = BashOperator(
    task_id="material_mard",
    bash_command='python3 '+scriptAirflow+'material_mard.py "{{ execution_date }}"',
    trigger_rule='one_success',
    dag=dag,
)
material_marm = BashOperator(
    task_id="material_marm",
    bash_command='python3 '+scriptAirflow+'material_marm.py "{{ execution_date }}"',
    trigger_rule='one_success',
    dag=dag,
)
material_mbew = BashOperator(
    task_id="material_mbew",
    bash_command='python3 '+scriptAirflow+'material_mbew.py "{{ execution_date }}"',
    trigger_rule='one_success',
    dag=dag,
)
material_mdma = BashOperator(
    task_id="material_mdma",
    bash_command='python3 '+scriptAirflow+'material_mdma.py "{{ execution_date }}"',
    trigger_rule='one_success',
    dag=dag,
)
material_mvke = BashOperator(
    task_id="material_mvke",
    bash_command='python3 '+scriptAirflow+'material_mvke.py "{{ execution_date }}"',
    trigger_rule='one_success',
    dag=dag,
)

run_both_scripts = DummyOperator(
    task_id="run_both_scripts",
    dag=dag,
)

no_file_timeout= BashOperator(
    task_id="no_file_timeout",
    bash_command='sleep 300',
    trigger_rule='all_done',
    dag=dag,
)
rerun_dag_no_file = TriggerDagRunOperator(
    task_id='rerun_dag_no_file',
    trigger_dag_id='testParallel',
    trigger_rule='all_success',
    dag=dag,
)
checking_file= DummyOperator(
    task_id='file_ok',
    trigger_rule='all_done',
    dag=dag,
)

rerun_dag=TriggerDagRunOperator(
    task_id='rerun_dag',
    trigger_dag_id='testParallel',
    trigger_rule='all_done',
    dag=dag,
)




move_csv.set_upstream(file_sensor)
result_mv.set_upstream(move_csv)
no_file_timeout.set_upstream(move_csv)
run_both_scripts.set_upstream(result_mv)
run_Fms_Script.set_upstream(result_mv)
run_Mem_Script.set_upstream(result_mv)
kpi_obj_data.set_upstream(run_Mem_Script)
kpi_obj_data.set_upstream(run_both_scripts)
material_makt.set_upstream(run_both_scripts)
material_mara.set_upstream(run_both_scripts)
material_marc.set_upstream(run_both_scripts)
material_mard.set_upstream(run_both_scripts)
material_marm.set_upstream(run_both_scripts)
material_mbew.set_upstream(run_both_scripts)
material_mdma.set_upstream(run_both_scripts)
material_mvke.set_upstream(run_both_scripts)
material_makt.set_upstream(run_Fms_Script)
material_mara.set_upstream(run_Fms_Script)
material_marc.set_upstream(run_Fms_Script)
material_mard.set_upstream(run_Fms_Script)
material_marm.set_upstream(run_Fms_Script)
material_mbew.set_upstream(run_Fms_Script)
material_mdma.set_upstream(run_Fms_Script)
material_mvke.set_upstream(run_Fms_Script)
checking_file.set_upstream(material_mvke)
checking_file.set_upstream(material_makt)
checking_file.set_upstream(material_mara)
checking_file.set_upstream(material_marc)
checking_file.set_upstream(material_mard)
checking_file.set_upstream(material_marm)
checking_file.set_upstream(material_mbew)
checking_file.set_upstream(material_mdma)
checking_file.set_upstream(kpi_obj_data)
rerun_dag.set_upstream(checking_file)
rerun_dag_no_file.set_upstream(no_file_timeout)
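
As an aside, the same dependencies can be declared more compactly with Airflow's list-based bitshift syntax; this sketch is equivalent to the set_upstream calls above:

material_tasks = [material_makt, material_mara, material_marc, material_mard,
                  material_marm, material_mbew, material_mdma, material_mvke]
file_sensor >> move_csv >> [result_mv, no_file_timeout]
result_mv >> [run_both_scripts, run_Fms_Script, run_Mem_Script]
[run_Mem_Script, run_both_scripts] >> kpi_obj_data
run_both_scripts >> material_tasks
run_Fms_Script >> material_tasks
material_tasks >> checking_file
kpi_obj_data >> checking_file
checking_file >> rerun_dag
no_file_timeout >> rerun_dag_no_file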
The tasks are BashOperators that call Python scripts.
If anyone has a solution, I'll take it! Thanks a lot.

The BranchPythonOperator can return a list of task IDs. For example, if you want to execute material_marm, material_mbew and material_mdma, you just need to return those task IDs from your python callable:

return ["material_marm", "material_mbew", "material_mdma"]
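
Applied to the question's code, a minimal sketch of such a callable might look like this (assuming, as the code suggests, that parseFileName maps each receipt file to a name equal to one of the BashOperator task_ids, and that branchTask lists every branchable task_id; returnGoodBranches is a hypothetical replacement for returnGoodBranch):

def returnGoodBranches():
    # Collect ALL matching task ids instead of returning on the first one,
    # so every relevant downstream task is selected and runs in parallel.
    selected = []
    for f in os.listdir(receiptPath):
        candidate = parseFileName(f)  # e.g. 'material_mara'
        if candidate in branchTask and candidate not in selected:
            selected.append(candidate)
    return selected  # list of task_ids to follow; the others are skipped

run_Fms_Script = BranchPythonOperator(
    task_id="run_for_fms",
    python_callable=returnGoodBranches,
    trigger_rule='all_success',
    dag=dag,
)

This requires Airflow 1.10.3 or later, where the branch callable may return a list instead of a single task_id.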


If you want to learn more about the BranchPythonOperator, check out my answer; I believe it will help you :)

Since 1.10.3, you can return a list of task IDs from the BranchPythonOperator's python callable. Would that work for you?
Well, I hadn't seen that; do you have a link or an example? Thanks!
"…expects a python function that returns a single task_id or a list of task_ids" - example :)
Thx a lot, it helps :)