深度学习框架使用简介

不同深度学习框架的python运行环境加载方式如下:

//加载TensorFlow运行环境

source /public/DeepLearning/TensorFlow/tensorflow-env.sh

//加载PyTorch运行环境

source /public/DeepLearning/PyTorch/pytorch-env.sh

//加载MxNet运行环境

source /public/DeepLearning/MxNet/mxnet-env.sh

下面分别介绍三个框架的单机和分布式运行示例:

  1. TensorFlow:

        //下载测试程序:
        https://github.com/horovod/horovod/blob/master/examples/tensorflow_synthetic_benchmark.py
        //单机单卡
        python3 tensorflow_synthetic_benchmark.py --model=resnet50 --batch-size=128 --num-iters=500
    
        //单机多卡
        cat single_process.sh
        #!/bin/bash
        # Per-rank launcher, started once per MPI process by mpirun.
        # The local rank read from OMPI_COMM_WORLD_LOCAL_RANK selects the NUMA
        # node and the mlx5_<rank> network device used by this process.

        lrank=$OMPI_COMM_WORLD_LOCAL_RANK

        APP="python3 tensorflow_synthetic_benchmark.py --model=resnet50 --batch-size=128 --num-iters=500"

        # Only local ranks 0-3 are handled; any other value launches nothing.
        case ${lrank} in
        0|1|2|3)
            # Every rank keeps all four devices visible.
            export HIP_VISIBLE_DEVICES=0,1,2,3
            export UCX_NET_DEVICES="mlx5_${lrank}:1"
            export UCX_IB_PCI_BW="mlx5_${lrank}:50Gbs"
            # ${APP} is intentionally unquoted so it splits into command + arguments.
            numactl --cpunodebind="${lrank}" --membind="${lrank}" ${APP}
            ;;
        esac
    
        mpirun -np 4 ./single_process.sh
    
        //多机多卡,结合调度系统使用
        #!/bin/bash
        #SBATCH -p debug
        #SBATCH -N 16
        #SBATCH -J xuan-tf
        #SBATCH -n 512
        #SBATCH --gres=加速卡:4

        # Slurm batch script for multi-node training: writes an MPI hostfile with
        # four slots per allocated node, then starts one single_process.sh per slot.
        # NOTE(review): the resource name given to --gres above must match the GRES
        # name configured on the cluster -- confirm before submitting.

        hostfile="./${SLURM_JOB_ID}"
        mpi_hostfile="./hostfile-dl-${SLURM_JOB_ID}"
        scontrol show hostnames "${SLURM_JOB_NODELIST}" > "${hostfile}"

        # Start from an empty hostfile and an explicit zero count so a re-run
        # cannot inherit stale entries.
        num_node=0
        : > "${mpi_hostfile}"
        while IFS= read -r node
        do
            [ -n "${node}" ] || continue
            echo "${node} slots=4" >> "${mpi_hostfile}"
            num_node=$((num_node + 1))
        done < "${hostfile}"
        echo "resnet50 node is " ${num_node}

        # Four processes per node. The variable name is ASCII because bash does
        # not accept other characters in identifiers.
        num_procs=$((num_node * 4))

        # The hostfile limits each node to 4 ranks; without it mpirun would place
        # ranks according to the Slurm task allocation instead.
        mpirun -np "${num_procs}" -hostfile "${mpi_hostfile}" ./single_process.sh
    
  2. PyTorch:

         //下载测试程序
         https://github.com/pytorch/examples/blob/master/imagenet/main.py
         /*usage: main.py [-h] [-a ARCH] [-j N] [--epochs N] [--start-epoch N] [-b N]
                     [--lr LR] [--momentum M] [--wd W] [-p N] [--resume PATH] [-e]
                     [--pretrained] [--world-size WORLD_SIZE] [--rank RANK]
                     [--dist-url DIST_URL] [--dist-backend DIST_BACKEND]
                     [--seed SEED] [--gpu GPU] [--multiprocessing-distributed]
                     DIR
         */
         //单机单卡
         python3 main.py \
             --batch-size=32 \
             --arch=resnet50 \
             --workers 6 \
             --epochs=1 \
             --gpu=0 \
             /imagenet/
    
         //单机多卡(一)
         python3 main.py \
             --batch-size=128 \
             --arch=resnet50 \
             --workers 24 \
             --epochs=1 \
             /imagenet/
    
         //单机多卡(二)
         cat single_process.sh
         # Per-rank launcher, started once per MPI process by mpirun.
         # $1 is the host name placed in the --dist-url rendezvous address.
         export HSA_USERPTR_FOR_PAGED_MEM=0
         export MIOPEN_USER_DB_PATH=/tmp/pytorch-miopen-2.8
         # Exported once here, so every child process below inherits it.
         export GLOO_SOCKET_IFNAME=ib0,ib1,ib2,ib3

         lrank=$OMPI_COMM_WORLD_LOCAL_RANK
         comm_rank=$OMPI_COMM_WORLD_RANK
         comm_size=$OMPI_COMM_WORLD_SIZE

         APP="python3 main.py --batch-size=32 --a=resnet50 -j 6 --epochs=1 --dist-url tcp://${1}:34567 --dist-backend gloo --world-size=${comm_size} --rank=${comm_rank} /imagenet/"

         # Only local ranks 0-3 are handled; any other value launches nothing.
         case ${lrank} in
         0|1|2|3)
             # Each rank sees only the device whose index equals its local rank.
             export HIP_VISIBLE_DEVICES="${lrank}"
             export UCX_NET_DEVICES="mlx5_${lrank}:1"
             export UCX_IB_PCI_BW="mlx5_${lrank}:50Gbs"
             # ${APP} is intentionally unquoted so it splits into command + arguments.
             numactl --cpunodebind="${lrank}" --membind="${lrank}" ${APP}
             ;;
         esac
         //单机四卡
         # single_process.sh puts its first argument into --dist-url as the
         # rendezvous host; on a single node that host is the local machine.
         dist_url=$(hostname)
         mpirun -np 4 "$(pwd)/single_process.sh" "${dist_url}"
    
         //多机多卡
         #!/bin/bash
         #SBATCH -p debug
         #SBATCH -N 2
         #SBATCH -J xuan-pytorch
         #SBATCH -n 64

         # Slurm batch script for multi-node training: writes an MPI hostfile with
         # four slots per allocated node, then starts one single_process.sh per
         # slot, passing the first node's name as the rendezvous host.
         command -v mpirun

         hostfile="./${SLURM_JOB_ID}"
         mpi_hostfile="$(pwd)/hostfile-xuan"
         scontrol show hostnames "${SLURM_JOB_NODELIST}" > "${hostfile}"

         # The first listed node becomes the host in --dist-url.
         dist_url=$(awk 'NR == 1 {print $1}' "${hostfile}")

         # Rebuild the hostfile from the unique node names and count them.
         num_node=0
         : > "${mpi_hostfile}"
         while IFS= read -r node
         do
             [ -n "${node}" ] || continue
             echo "${node} slots=4" >> "${mpi_hostfile}"
             num_node=$((num_node + 1))
         done < <(sort -u "${hostfile}")

         # Four processes per node. This is the value mpirun -np needs; the name
         # is ASCII because bash does not accept other characters in identifiers.
         np=$((num_node * 4))

         mpirun -np "${np}" --allow-run-as-root -hostfile "${mpi_hostfile}" "$(pwd)/single_process.sh" "${dist_url}"
    
  3. MxNet:

        //下载测试程序
        https://github.com/apache/incubator-mxnet/blob/v1.4.x/example/image-classification
    
        //单机单卡
        python3 train_imagenet.py \
            --benchmark 1 \
            --gpus 0 \
            --network inception-v3 \
            --batch-size 64 \
            --image-shape 3,299,299 \
            --num-epochs 10 \
            --kv-store device
    
        //单机多卡
        python3 train_imagenet.py \
            --benchmark 1 \
            --gpus 0,1,2,3 \
            --network inception-v3 \
            --batch-size 64 \
            --image-shape 3,299,299 \
            --num-epochs 10 \
            --kv-store device
    
        //多机多卡,ps-worker
        #!/bin/bash
        #SBATCH -J mxnet
        #SBATCH -p dl
        #SBATCH -N 2
        #SBATCH -n 64
        #SBATCH --gres=加速卡:4

        # Slurm batch script for parameter-server training: collects one address
        # per allocated node into mxnet-<job id> and hands that file to launch.py.
        hostfile=./$SLURM_JOB_ID
        scontrol show hostnames $SLURM_JOB_NODELIST > ${hostfile}

        # Append the second field of gethostip's output for every node and count
        # the nodes as we go.
        num_node=0
        for node in $(cat ./${hostfile}); do
            gethostip ${node} | awk '{print $2}' >>./mxnet-$SLURM_JOB_ID
            num_node=$((num_node + 1))
        done

        source /public/home/yangxuan1/mxnet-env.sh
        which python3

        # Command executed on each host by launch.py; the environment is sourced
        # again there because it runs in a separate ssh session.
        remote_cmd="source /public/home/yangxuan1/mxnet-env.sh; python3 cifar10_dist.py \
            --network resnet \
            --num-layers 110 \
            --batch-size 128 \
            --kv-store dist_device_sync"

        python3 ../tools/launch.py \
            -n ${num_node} -s 2 -H mxnet-$SLURM_JOB_ID \
            --sync-dst-dir ../example/distributed_training/ \
            --launcher ssh \
            "${remote_cmd}"
    
        /*cat hosts
        10.11.7.51
        10.11.7.53*/
    

results matching ""

    No results matching ""