Python 3.x ImportError: Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training


Unable to install apex for distributed and fp16 training of a BERT model. I tried cloning apex from GitHub and installing the package with pip.

I tried to install apex by cloning it from GitHub with the following command:

git clone https://github.com/NVIDIA/apex

then changed into the apex directory with cd apex and tried to install the package with the following pip command:

pip install -v --no-cache-dir --global-option="--cpp_ext" --global-option="--cuda_ext" ./
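
If the build completes, the imports that the training code depends on should succeed in a fresh interpreter. A minimal smoke test, mirroring the import statements used in the code below (with a Python-only install, made without the --cpp_ext/--cuda_ext options, these may still fail with the same ImportError):

# smoke test: these must import cleanly for distributed/fp16 training
from apex.parallel import DistributedDataParallel
from apex.optimizers import FP16_Optimizer, FusedAdam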

The full code is:

def main(server_ip,server_port,local_rank,no_cuda,fp16,train_batch_size,gradient_accumulation_steps,seed,do_train,do_eval,output_dir,task_name,data_dir,do_lower_case,bert_model,num_train_epochs,cache_dir,learning_rate,warmup_proportion,loss_scale,max_seq_length):
        if server_ip and server_port:
            # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script
            import ptvsd
            print("Waiting for debugger attach")
            ptvsd.enable_attach(address=(server_ip, server_port), redirect_output=True)
            ptvsd.wait_for_attach()

        processors = {"ner":NerProcessor}
        print(processors)

        if local_rank == -1 or no_cuda:
            device = torch.device("cuda" if torch.cuda.is_available() and not no_cuda else "cpu")
            n_gpu = torch.cuda.device_count()
        else:
            torch.cuda.set_device(local_rank)
            device = torch.device("cuda", local_rank)
            n_gpu = 1
            # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
            torch.distributed.init_process_group(backend='nccl')
        logger.info("device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}".format(
            device, n_gpu, bool(local_rank != -1), fp16))

        if gradient_accumulation_steps < 1:
            raise ValueError("Invalid gradient_accumulation_steps parameter: {}, should be >= 1".format(
                                gradient_accumulation_steps))

        train_batch_size = train_batch_size // gradient_accumulation_steps

        random.seed(seed)
        np.random.seed(seed)
        torch.manual_seed(seed)

        if not do_train and not do_eval:
            raise ValueError("At least one of `do_train` or `do_eval` must be True.")

        if os.path.exists(output_dir) and os.listdir(output_dir) and do_train:
            raise ValueError("Output directory ({}) already exists and is not empty.".format(output_dir))
        if not os.path.exists(output_dir):
            os.makedirs(output_dir)

        task_name = task_name.lower()

        if task_name not in processors:
            raise ValueError("Task not found: %s" % (task_name))

        processor = processors[task_name]()
        label_list = processor.get_labels()
        num_labels = len(label_list) + 1

        tokenizer = BertTokenizer.from_pretrained(bert_model, do_lower_case=do_lower_case)

        train_examples = None
        num_train_optimization_steps = None
        if do_train:
            train_examples = processor.get_train_examples(data_dir)
            num_train_optimization_steps = int(
                len(train_examples) / train_batch_size / gradient_accumulation_steps) * num_train_epochs
            if local_rank != -1:
                num_train_optimization_steps = num_train_optimization_steps // torch.distributed.get_world_size()

        # Prepare model
        cache_dir = cache_dir if cache_dir else os.path.join(str(PYTORCH_PRETRAINED_BERT_CACHE), 'distributed_{}'.format(local_rank))
        model = Ner.from_pretrained(bert_model,
                  cache_dir=cache_dir,
                  num_labels = num_labels)
        if fp16:
            model.half()
        # model.cuda()
        model.to(device)
        if local_rank != -1:
            try:
                from apex.parallel import DistributedDataParallel as DDP
            except ImportError:
                raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training.")

            model = DDP(model)
        elif n_gpu > 1:
            model = torch.nn.DataParallel(model)

        param_optimizer = list(model.named_parameters())
        no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
        optimizer_grouped_parameters = [
            {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
            {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
             ]
        if fp16:
            try:
                from apex.optimizers import FP16_Optimizer
                from apex.optimizers import FusedAdam
            except ImportError:
                raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training.")

            optimizer = FusedAdam(optimizer_grouped_parameters,
                                  lr=learning_rate,
                                  bias_correction=False,
                                  max_grad_norm=1.0)
            if loss_scale == 0:
                optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True)
            else:
                optimizer = FP16_Optimizer(optimizer, static_loss_scale=loss_scale)

        else:
            optimizer = BertAdam(optimizer_grouped_parameters,
                                 lr=learning_rate,
                                 warmup=warmup_proportion,
                                 t_total=num_train_optimization_steps)

        global_step = 0
        nb_tr_steps = 0
        tr_loss = 0
        label_map = {i : label for i, label in enumerate(label_list,1)}
        if do_train:
            train_features = convert_examples_to_features(
                train_examples, label_list, max_seq_length, tokenizer)
            logger.info("***** Running training *****")
            logger.info("  Num examples = %d", len(train_examples))
            logger.info("  Batch size = %d", train_batch_size)
            logger.info("  Num steps = %d", num_train_optimization_steps)
            all_input_ids = torch.tensor([f.input_ids for f in train_features], dtype=torch.long)
            all_input_mask = torch.tensor([f.input_mask for f in train_features], dtype=torch.long)
            all_segment_ids = torch.tensor([f.segment_ids for f in train_features], dtype=torch.long)
            all_label_ids = torch.tensor([f.label_id for f in train_features], dtype=torch.long)
            all_valid_ids = torch.tensor([f.valid_ids for f in train_features], dtype=torch.long)
            all_lmask_ids = torch.tensor([f.label_mask for f in train_features], dtype=torch.long)
            train_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids,all_valid_ids,all_lmask_ids)
            if local_rank == -1:
                train_sampler = RandomSampler(train_data)
            else:
                train_sampler = DistributedSampler(train_data)
            train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=train_batch_size)

            model.train()
            for _ in trange(int(num_train_epochs), desc="Epoch"):
                tr_loss = 0
                nb_tr_examples, nb_tr_steps = 0, 0
                for step, batch in enumerate(tqdm(train_dataloader, desc="Iteration")):
                    batch = tuple(t.to(device) for t in batch)
                    input_ids, input_mask, segment_ids, label_ids, valid_ids,l_mask = batch
                    loss = model(input_ids, segment_ids, input_mask, label_ids,valid_ids,l_mask)
                    if n_gpu > 1:
                        loss = loss.mean() # mean() to average on multi-gpu.
                    if gradient_accumulation_steps > 1:
                        loss = loss / gradient_accumulation_steps

                    if fp16:
                        optimizer.backward(loss)
                    else:
                        loss.backward()

                    tr_loss += loss.item()
                    nb_tr_examples += input_ids.size(0)
                    nb_tr_steps += 1
                    if (step + 1) % gradient_accumulation_steps == 0:
                        if fp16:
                            # modify learning rate with special warm up BERT uses
                            # if fp16 is False, BertAdam is used, which handles this automatically
                            lr_this_step = learning_rate * warmup_linear(global_step/num_train_optimization_steps, warmup_proportion)
                            for param_group in optimizer.param_groups:
                                param_group['lr'] = lr_this_step
                        optimizer.step()
                        optimizer.zero_grad()
                        global_step += 1
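
The ImportError in the title is the message raised by the try/except blocks above whenever the apex imports fail, i.e. whenever apex was not built and installed successfully. A workaround that sidesteps the manual clone-and-install is to build apex from source at runtime, inside the script itself; the snippet below does that in a Kaggle kernel (removing apex/.git afterwards because of Kaggle's file-count limit):
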
import os, sys, shutil
import time
import gc
from contextlib import contextmanager
from pathlib import Path
import random
import numpy as np, pandas as pd
from tqdm import tqdm, tqdm_notebook

@contextmanager
def timer(name):
    t0 = time.time()
    yield
    print(f'[{name}] done in {time.time() - t0:.0f} s')

USE_APEX = True

if USE_APEX:
    with timer('install Nvidia apex'):
        # Installing Nvidia Apex
        os.system('git clone https://github.com/NVIDIA/apex; cd apex; pip install -v --no-cache-dir' +
                  ' --global-option="--cpp_ext" --global-option="--cuda_ext" ./')
        os.system('rm -rf apex/.git')  # too many files, Kaggle fails
        from apex import amp
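
Note that this snippet ends by importing amp rather than FP16_Optimizer: newer apex releases removed FP16_Optimizer from apex.optimizers in favour of the amp API, so a fresh clone may make the question's fp16 branch fail even after a successful build. Under that assumption, the fp16 setup would be rewritten roughly as follows (a sketch only; torch.optim.Adam stands in for FusedAdam, and the opt_level/loss_scale choices are illustrative, not the original author's code):

import torch
from apex import amp

# sketch: initialize amp once, after model.to(device); opt_level "O2" roughly
# corresponds to the old model.half() + FP16_Optimizer setup (an assumption)
optimizer = torch.optim.Adam(optimizer_grouped_parameters, lr=learning_rate)
model, optimizer = amp.initialize(
    model, optimizer, opt_level="O2",
    loss_scale="dynamic" if loss_scale == 0 else loss_scale)

# in the training loop, replace optimizer.backward(loss) with:
with amp.scale_loss(loss, optimizer) as scaled_loss:
    scaled_loss.backward()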