0


NsightCompute教程入门

一、定义

  1. 使用pycuda 获取显卡的性能
  2. 手动绘制roofline model.
  3. nsight system 安装教程
  4. nsight system 使用教程–nsight system 分析pytorch 神经网络
  5. nsight compute 安装教程
  6. nsight compute 使用教程
  7. nsight system 与nsight compute 应用区别
  8. 遇到的问题

二、实现

https://blog.csdn.net/weixin_40653140/article/details/136238420
https://blog.csdn.net/m0_61864577/article/details/140022323
https://developer.nvidia.cn/tools-overview

  1. 使用pycuda 获取显卡的性能 test.py
import pycuda.driver as cuda
import pycuda.autoinit

def get_gpu_compute_capability_and_clock_rate():
    """Query CUDA device 0 for the values used to estimate peak throughput.

    Returns:
        A 4-tuple ``(compute_capability, clock_rate, sm_count, cores_per_sm)``:
        the value of ``Device.compute_capability()``, the ``CLOCK_RATE``
        attribute (kHz), the ``MULTIPROCESSOR_COUNT`` attribute, and the
        result of ``get_cuda_cores_per_sm`` for that compute capability.

    Raises:
        ValueError: propagated from ``get_cuda_cores_per_sm`` when the
            compute capability is not recognised.
    """
    device = cuda.Device(0)
    compute_capability = device.compute_capability()
    clock_rate = device.get_attribute(cuda.device_attribute.CLOCK_RATE)  # in kHz
    sm_count = device.get_attribute(cuda.device_attribute.MULTIPROCESSOR_COUNT)
    cores_per_sm = get_cuda_cores_per_sm(compute_capability)
    return compute_capability, clock_rate, sm_count, cores_per_sm

def get_cuda_cores_per_sm(compute_capability):
    """Return the number of FP32 CUDA cores per SM for a compute capability.

    Args:
        compute_capability: ``(major, minor)`` pair, e.g. ``(8, 6)``.

    Returns:
        The FP32 core count of one streaming multiprocessor.

    Raises:
        ValueError: if the compute capability is not in the table.
    """
    # Per-architecture FP32 core counts as published by NVIDIA (CUDA
    # programming guide / helper_cuda.h). Note 6.1 is 128 and 8.0 is 64.
    cores_by_capability = {
        (2, 0): 32, (2, 1): 48,
        (3, 0): 192, (3, 2): 192, (3, 5): 192, (3, 7): 192,
        (5, 0): 128, (5, 2): 128, (5, 3): 128,
        (6, 0): 64, (6, 1): 128, (6, 2): 128,
        (7, 0): 64, (7, 2): 64, (7, 5): 64,
        (8, 0): 64, (8, 6): 128, (8, 7): 128, (8, 9): 128,
        (9, 0): 128,
    }
    # Majors that were accepted for any minor revision; kept as a fallback so
    # an unlisted minor of these generations still resolves.
    cores_by_major = {2: 32, 3: 192, 5: 128}

    major, minor = compute_capability
    key = (major, minor)
    if key in cores_by_capability:
        return cores_by_capability[key]
    if major in cores_by_major:
        return cores_by_major[major]
    raise ValueError("Unknown compute capability")

def calculate_theoretical_flops(clock_rate, sm_count, cores_per_sm):
    """Return the theoretical peak FLOP/s from clock, SM count and cores per SM.

    Args:
        clock_rate: core clock in kHz.
        sm_count: number of streaming multiprocessors.
        cores_per_sm: cores per streaming multiprocessor.

    Returns:
        ``clock_rate_hz * sm_count * cores_per_sm * 2`` as a float.
    """
    clock_rate_hz = clock_rate * 1e3  # Convert kHz to Hz
    # 2 FLOPs per clock per core (FMA)
    flops = clock_rate_hz * sm_count * cores_per_sm * 2
    return flops

# Query the GPU once, then derive the theoretical peak from its properties.
(
    compute_capability,
    clock_rate,
    sm_count,
    cores_per_sm,
) = get_gpu_compute_capability_and_clock_rate()
theoretical_flops = calculate_theoretical_flops(
    clock_rate=clock_rate,
    sm_count=sm_count,
    cores_per_sm=cores_per_sm,
)

# Compute capability
print(f"GPU compute capability: {compute_capability}")
# Maximum clock frequency
print(f"Clock rate (kHz): {clock_rate}")
# Number of streaming multiprocessors
print(f"Number of SMs: {sm_count}")
# Cores per streaming multiprocessor
print(f"Cores per SM: {cores_per_sm}")
# Theoretical floating-point throughput
print(f"Theoretical FLOPS for float32: {theoretical_flops / 1e12} TFLOPS")

在这里插入图片描述
2. 手动绘制roofline model.
roofline_model.py
绘图字体下载:https://blog.csdn.net/qq_35165004/article/details/132738991

import sys

import torch
import torch.nn as nn
from fvcore.nn import FlopCountAnalysis, ActivationCountAnalysis
import matplotlib.pyplot as plt
import numpy as np
from matplotlib.font_manager import FontProperties
import os
import argparse

# 定义一个测试模型
class SimpleModel(nn.Module):
    """Single bias-free linear layer used as the roofline test workload."""

    def __init__(self, input_features, output_features):
        """Create the layer mapping ``input_features`` to ``output_features``."""
        super().__init__()
        self.fc1 = nn.Linear(input_features, output_features, bias=False)

    def forward(self, x):
        """Apply the linear layer to ``x`` and return the result."""
        x = self.fc1(x)
        return x

def main(args):
    """Profile an fp32 linear layer on the GPU and plot it on a roofline chart.

    The function estimates FLOPs and memory traffic with fvcore, times the
    "sgemm" CUDA kernels with the PyTorch profiler, and writes the chart to
    ``roofline_model.png``. Summary figures are printed to stdout.

    Args:
        args: parsed command-line namespace. It is accepted for the script
            entry point but is not read by this function.

    Raises:
        RuntimeError: if the profiler recorded no CUDA time for any kernel
            whose name contains "sgemm" (bandwidth would be a division by zero).
    """
    # Model and input data
    input_features = 8192
    output_features = 8192
    batch_size = 8192

    model = SimpleModel(input_features, output_features)
    input_data = torch.randn(batch_size, input_features)

    test_count = 10

    # FLOPs and memory traffic for `test_count` forward passes.
    # NOTE(review): fvcore documents that it counts one fused multiply-add as
    # one flop, while the peak below assumes 2 FLOPs per FMA - confirm the
    # two are on the same scale before trusting the utilisation percentage.
    flops = FlopCountAnalysis(model, input_data).total() * test_count
    activations = ActivationCountAnalysis(model, input_data).total() + input_data.numel()
    print("activations:", activations)

    # Parameter count
    params = sum(p.numel() for p in model.parameters())

    # Memory traffic is taken to be activations plus params at 4 bytes each
    # (both assumed to be float32).
    activation_memory_access = activations * 4
    params_memory_access = params * 4
    memory_access = (activation_memory_access + params_memory_access) * test_count

    model = model.cuda()
    input_data = input_data.float().cuda()

    # Warm-up runs so one-time initialisation is not profiled.
    for _ in range(5):
        output = model(input_data)
        torch.cuda.synchronize()

    # Time the forward passes with the PyTorch profiler.
    with torch.profiler.profile(activities=[torch.profiler.ProfilerActivity.CUDA]) as prof:
        for _ in range(test_count):
            output = model(input_data)
        # Make sure every launched kernel has finished before the profiler stops.
        torch.cuda.synchronize()

    key_averages = prof.key_averages()
    for ev in key_averages:
        print(ev)

    # Only the sgemm kernels are counted; profiler times are in microseconds.
    total_cuda_time = sum(
        event.self_cuda_time_total for event in key_averages if "sgemm" in event.key
    ) / 1e6  # convert to seconds
    if total_cuda_time <= 0:
        raise RuntimeError(
            "No CUDA time recorded for kernels containing 'sgemm'; "
            "cannot compute measured bandwidth."
        )

    # Convert FLOPs to GFLOPs
    flops_measured_gflops = flops / 1e9

    # Measured memory bandwidth
    memory_access_gb = memory_access / 1e9
    bandwidth_measured = memory_access_gb / total_cuda_time  # GB/s
    print("bandwidth_measured:", bandwidth_measured)

    # Peak compute and memory bandwidth of the GPU (hard-coded).
    # NOTE(review): 38.98 TFLOPs looks like it was derived with 128 cores/SM;
    # verify against the vendor's FP32 figure for the GPU actually used.
    peak_performance = 38.98368 * 1e3  # GFLOPs
    memory_bandwidth = 1555.0  # GB/s

    # Roofline curve: attainable performance at each operational intensity,
    # capped at the compute peak.
    Io = np.logspace(-2, 4, 100)  # GFLOPs/GB
    performance = np.minimum(peak_performance, Io * memory_bandwidth)

    # ==========================================================================
    # Draw the roofline model
    plt.figure(figsize=(10, 6))

    threshold = 0.75  # utilisation level below which a point is "poor performance"

    # Font that can render the Chinese labels
    font_path = 'simsun.ttc'  # replace with the path to your font
    font_prop = FontProperties(fname=font_path)

    ridge_point = peak_performance / memory_bandwidth

    # Bandwidth Bound
    x = Io[Io < ridge_point]
    plt.fill_between(x, np.minimum(peak_performance, x * memory_bandwidth) * threshold,
                     np.minimum(peak_performance, x * memory_bandwidth),
                     color='lightblue', alpha=0.6, label='Bandwidth Bound')

    # Compute Bound
    x2 = Io[Io >= ridge_point]
    plt.fill_between(x2, np.minimum(peak_performance, x2 * memory_bandwidth) * threshold,
                     np.minimum(peak_performance, x2 * memory_bandwidth),
                     color='green', alpha=0.6, label='Compute Bound')

    # Low-performance region
    plt.fill_between(Io, 0, np.minimum(peak_performance, Io * memory_bandwidth) * threshold,
                     color='gray', alpha=0.6, label='poor performance')

    plt.axhline(y=peak_performance, color='b', linestyle='--',
                label=f'峰值计算能力:{peak_performance / 1e3:.2f}TFLOPs')

    plt.axvline(x=peak_performance / memory_bandwidth, color='g', linestyle='--',
                label=f'{peak_performance / memory_bandwidth:.2f}GFLOPs/GB')

    plt.loglog(Io, performance, label='Roofline')

    # GFLOPs/GB (a static property of the algorithm)
    arithmetic_intensity_measured = flops_measured_gflops / memory_access_gb
    point_y = arithmetic_intensity_measured * bandwidth_measured

    plt.scatter(arithmetic_intensity_measured, point_y, c='r',
                label=f'Measured Points {point_y / 1e3:.2f} TFLOPs/sec {point_y * 100 / peak_performance:.2f}%')

    plt.xlabel('操作强度 [GFLOPs/GB]', fontproperties=font_prop)
    plt.ylabel('性能 [GFLOPs/sec]', fontproperties=font_prop)
    plt.title('Roofline 模型', fontproperties=font_prop)
    plt.legend(prop=font_prop)

    # Save the figure instead of showing it
    plt.savefig('roofline_model.png')
    plt.close()

    print(f"FLOPs: {flops} FLOPs")
    print(f"内存访问量: {memory_access} 字节")
    print(f"执行时间: {total_cuda_time:.4f} 秒")
    print(f"理论值的:{point_y * 100 / peak_performance:.2f}%")

# Script entry: build the CLI, parse it and run the roofline measurement.
parser = argparse.ArgumentParser(description='Process some integers.')
parser.add_argument(
    "--warmup_only",
    help="warmup_only",
    action="store_true",
)
args = parser.parse_args()
main(args)

python roofline_model.py

在这里插入图片描述
在这里插入图片描述
理论值24.32%,即算力只使用了24%。
如下图,29.87% 为pytorch 实现,37.80%为triton 算子实现,因此,pytorch改进方向为提升模型的算力,降低执行时间。
提升模型的flops,即改进算子的速度、冗余度。
减少内存读取大小,可以量化、改进算子计算冗余度。
在这里插入图片描述
3. nsight system 安装教程
官方文档:https://docs.nvidia.com/nsight-systems/UserGuide/index.html#profiling-windows-targets-from-the-gui
nsight system 需要安装服务器端和客户端
方式一: linux服务器端采用命令行执行,获取结果文件,window客户端读取文件
linux服务器端安装:cli 文件

apt install ./NsightSystems-linux-cli-public-2023.2.1.122-3259852.deb

在这里插入图片描述
windows 端安装:选择Windows Host下载安装

  使用 demo:
  1. linux 端生成结果文件(nsys-rep)
#示例,产生的nsight_t5_fp16下载到本地用nsight工具查看
nsys profile -o nsight_t5_fp16 -f true python temp.py
  2. windows 端加载文件
文件-->打开:加载nsys-rep文件

方式二: window 服务器执行命令,window客户端远程连接服务器,实时读取文件

  1. 以管理员的方式打开cmd,并执行指令
>>D:\softs\nsightsystem\target-windows-x64\nsys profile -t cuda -o baseline -w true D:/softs/anacondas/envs/chatgml6/python D:/cnki_1/model_predict/test1.py

在这里插入图片描述
2. windows host 客户端打开nsys-rep软件
》》>>file–>open–>baseline.nsys-rep

在这里插入图片描述
4. nsight system 使用教程–nsight system 分析pytorch 神经网络

import torch.cuda.nvtx as nvtx
nvtx.range_push("copy to device")# open an NVTX range labelled "copy to device"
nvtx.range_pop()# close the most recently opened range
也可以使用
import nvtx
@nvtx.annotate(color="blue")
def my_function():
    """Sleep for 0..4 seconds in turn, each sleep inside a red "my_loop" NVTX range."""
    # Local import: the snippet calls time.sleep but never imports time.
    import time

    for i in range(5):
        with nvtx.annotate("my_loop", color="red"):
            time.sleep(i)

my_function()
import argparse
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torchvision import datasets, transforms
from torch.optim.lr_scheduler import StepLR
# Report whether a CUDA device is usable before anything else runs.
print(torch.cuda.is_available())
import torch.cuda.nvtx as nvtx

class Net(nn.Module):
    """Small CNN: two 3x3 convolutions, max-pool, dropout and two linear layers.

    ``fc1`` expects 9216 flattened features, which is what 1x28x28 inputs
    produce after the two convolutions and the 2x2 max-pool (64 * 12 * 12).
    """

    def __init__(self):
        super().__init__()
        self.conv1 = nn.Conv2d(1, 32, 3, 1)
        self.conv2 = nn.Conv2d(32, 64, 3, 1)
        self.dropout1 = nn.Dropout(0.25)
        self.dropout2 = nn.Dropout(0.5)
        self.fc1 = nn.Linear(9216, 128)
        self.fc2 = nn.Linear(128, 10)

    def forward(self, x):
        """Return per-class log-probabilities (log-softmax over dim 1) for ``x``."""
        x = self.conv1(x)
        x = F.relu(x)
        x = self.conv2(x)
        x = F.relu(x)
        x = F.max_pool2d(x, 2)
        x = self.dropout1(x)
        x = torch.flatten(x, 1)
        x = self.fc1(x)
        x = F.relu(x)
        x = self.dropout2(x)
        x = self.fc2(x)
        output = F.log_softmax(x, dim=1)
        return output

def train(args, model, device, train_loader, optimizer, epoch):
    """Run one training epoch with each phase wrapped in an NVTX range.

    Ranges emitted per batch: "data loading" (waiting on the loader),
    "batch:<idx>" containing "copy to device", "forward pass" and
    "backward pass". Every push is matched by exactly one pop.

    Args:
        args: namespace providing ``log_interval`` and ``dry_run``.
        model: module to train; called as ``model(data)``.
        device: target passed to ``Tensor.to`` for each batch.
        train_loader: iterable of ``(data, target)`` batches; must also
            support ``len()`` and expose ``.dataset`` for the log line.
        optimizer: optimizer stepped once per batch.
        epoch: epoch number, used only in the log line.
    """
    model.train()
    nvtx.range_push("data loading")
    for batch_idx, (data, target) in enumerate(train_loader):
        nvtx.range_pop()  # data loading
        nvtx.range_push("batch:" + str(batch_idx))

        nvtx.range_push("copy to device")
        data, target = data.to(device), target.to(device)
        nvtx.range_pop()

        nvtx.range_push("forward pass")
        optimizer.zero_grad()
        output = model(data)
        loss = F.nll_loss(output, target)
        nvtx.range_pop()

        nvtx.range_push("backward pass")
        loss.backward()
        optimizer.step()
        nvtx.range_pop()

        nvtx.range_pop()  # batch

        if batch_idx % args.log_interval == 0:
            print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
                epoch, batch_idx * len(data), len(train_loader.dataset),
                100. * batch_idx / len(train_loader), loss.item()))
            if args.dry_run:
                # No range is open at this point, so leaving here stays balanced.
                break

        # Re-open the range so fetching the next batch is attributed to loading.
        nvtx.range_push("data loading")
    else:
        # Loader exhausted: close the "data loading" range that is still open.
        nvtx.range_pop()

def main():
    """Parse CLI options and train ``Net`` on MNIST with NVTX-annotated epochs.

    Picks CUDA, then MPS, then CPU depending on availability and the
    ``--no-cuda`` / ``--no-mps`` flags; downloads MNIST to ``../data`` if
    needed; trains for ``--epochs`` epochs with Adadelta and a StepLR
    schedule; and optionally saves the weights to ``mnist_cnn.pt``.
    """
    # Training settings
    parser = argparse.ArgumentParser(description='PyTorch MNIST Example')
    parser.add_argument('--batch-size', type=int, default=64, metavar='N',
                        help='input batch size for training (default: 64)')
    parser.add_argument('--test-batch-size', type=int, default=1000, metavar='N',
                        help='input batch size for testing (default: 1000)')
    parser.add_argument('--epochs', type=int, default=3, metavar='N',
                        help='number of epochs to train (default: 3)')
    parser.add_argument('--lr', type=float, default=1.0, metavar='LR',
                        help='learning rate (default: 1.0)')
    parser.add_argument('--gamma', type=float, default=0.7, metavar='M',
                        help='Learning rate step gamma (default: 0.7)')
    parser.add_argument('--no-cuda', action='store_true', default=False,
                        help='disables CUDA training')
    parser.add_argument('--no-mps', action='store_true', default=False,
                        help='disables macOS GPU training')
    parser.add_argument('--dry-run', action='store_true', default=False,
                        help='quickly check a single pass')
    parser.add_argument('--seed', type=int, default=1, metavar='S',
                        help='random seed (default: 1)')
    parser.add_argument('--log-interval', type=int, default=10, metavar='N',
                        help='how many batches to wait before logging training status')
    parser.add_argument('--save-model', action='store_true', default=False,
                        help='For Saving the current Model')
    args = parser.parse_args()
    use_cuda = not args.no_cuda and torch.cuda.is_available()
    use_mps = not args.no_mps and torch.backends.mps.is_available()

    torch.manual_seed(args.seed)

    if use_cuda:
        device = torch.device("cuda")
    elif use_mps:
        device = torch.device("mps")
    else:
        device = torch.device("cpu")

    train_kwargs = {'batch_size': args.batch_size}
    test_kwargs = {'batch_size': args.test_batch_size}
    if use_cuda:
        cuda_kwargs = {'num_workers': 1,
                       'pin_memory': True,
                       'shuffle': True}
        train_kwargs.update(cuda_kwargs)
        test_kwargs.update(cuda_kwargs)

    transform = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize((0.1307,), (0.3081,)),
    ])
    dataset1 = datasets.MNIST('../data', train=True, download=True,
                              transform=transform)
    dataset2 = datasets.MNIST('../data', train=False,
                              transform=transform)
    train_loader = torch.utils.data.DataLoader(dataset1, **train_kwargs)
    test_loader = torch.utils.data.DataLoader(dataset2, **test_kwargs)

    model = Net().to(device)
    optimizer = optim.Adadelta(model.parameters(), lr=args.lr)

    scheduler = StepLR(optimizer, step_size=1, gamma=args.gamma)
    for epoch in range(1, args.epochs + 1):
        train(args, model, device, train_loader, optimizer, epoch)
        # Evaluation on test_loader is skipped; only training is profiled.
        scheduler.step()

    if args.save_model:
        torch.save(model.state_dict(), "mnist_cnn.pt")


if __name__ == '__main__':
    main()
# Profile with:
# D:\softs\nsightsystem\target-windows-x64\nsys profile -t cuda,nvtx,cudnn -o baseline -w true D:/softs/anacondas/envs/chatgml6/python D:/cnki_1/model_predict/test1.py

客户端加载baseline.nsys-rep 文件。
5. nsight compute 安装教程

https://developer.nvidia.com/tools-overview/nsight-compute/get-started
下载linux 版本,sh nsight-compute-linux-2024.2.1.2-34372528.run
安装默认路径: /usr/local/NVIDIA-Nsight-Compute
在这里插入图片描述
window 安装:
安装windows 版本

6 nsight compute 使用教程
》》 >>ncu --list-sets

在这里插入图片描述
采用c++ 代码测试

#include <stdio.h>

// Each thread with idx < N adds its own element A[idx] into a local
// accumulator M times, then writes the accumulated value back to A[idx].
__global__ void kernel_A(double* A, int N, int M){
    double d = 0.0;
    int idx = threadIdx.x + blockIdx.x * blockDim.x;
    // printf("Kernel A\n");
    if (idx < N) {
#pragma unroll(100)
        for (int j = 0; j < M; ++j) {
            d += A[idx];
        }

        A[idx] = d;
    }
}

// Same computation as kernel_A: each thread with idx < N sums A[idx] into a
// local accumulator M times and stores the result back to A[idx].
__global__ void kernel_B(double* A, int N, int M){
    double d = 0.0;
    int idx = threadIdx.x + blockIdx.x * blockDim.x;
    if (idx < N) {
#pragma unroll(100)
        for (int j = 0; j < M; ++j) {
            d += A[idx];
        }

        A[idx] = d;
    }
}

// Writes A[idx] = 2 * B[strided_idx], reading B at a stride of 16 elements.
__global__ void kernel_C(double* A, const double* B, int N){
    int idx = threadIdx.x + blockIdx.x * blockDim.x;
    // printf("Kernel C\n");

    // Strided memory access: warp 0 accesses (0, stride, 2*stride, ...), warp 1 accesses
    // (1, stride + 1, 2*stride + 1, ...).
    const int stride = 16;
    int strided_idx = threadIdx.x * stride + blockIdx.x % stride
                      + (blockIdx.x / stride) * stride * blockDim.x;
    // strided_idx < N does not imply idx < N, so guard the write index too.
    if (strided_idx < N && idx < N) {
        A[idx] = B[strided_idx] + B[strided_idx];
    }
}

// Allocates two zero-filled device arrays and launches kernel_A, kernel_B and
// kernel_C once each so they can be profiled with Nsight Compute.
int main(){

    double* A;
    double* B;

    int N = 80 * 2048 * 100;
    size_t sz = N * sizeof(double);

    cudaMalloc((void**)&A, sz);
    cudaMalloc((void**)&B, sz);

    cudaMemset(A, 0, sz);
    cudaMemset(B, 0, sz);

    int threadsPerBlock = 64;
    int numBlocks = (N + threadsPerBlock - 1) / threadsPerBlock;

    int M = 10000;
    kernel_A<<<numBlocks, threadsPerBlock>>>(A, N, M);

    // kernel_B is launched with 48 KiB of dynamic shared memory even though it
    // declares no shared arrays - presumably to differ from kernel_A in the
    // profile (TODO confirm intent).
    cudaFuncSetAttribute(kernel_B, cudaFuncAttributeMaxDynamicSharedMemorySize, 48 * 1024);
    kernel_B<<<numBlocks, threadsPerBlock, 48 * 1024>>>(A, N, M);

    kernel_C<<<numBlocks, threadsPerBlock>>>(A, B, N);

    cudaDeviceSynchronize();

    // Release the device buffers before exiting.
    cudaFree(A);
    cudaFree(B);

    return 0;
}

将c++ 文件编译
》》>>nvcc -o kernel_abc kernel_abc.cu
采用nsight compute 进行profile
》》>>ncu --set detailed -o kernel_abc ./test/kernel_abc
在这里插入图片描述

window 客户端打开
在这里插入图片描述
python 测试

import argparse
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torchvision import datasets, transforms
from torch.optim.lr_scheduler import StepLR
# Report CUDA availability and the number of visible devices up front.
print(torch.cuda.is_available())
# print(torch.cuda.list_gpu_processes())
import torch.cuda.nvtx as nvtx
print(torch.cuda.device_count())

class Net(nn.Module):
    """Small CNN: two 3x3 convolutions, max-pool, dropout and two linear layers.

    ``fc1`` expects 9216 flattened features, which is what 1x28x28 inputs
    produce after the two convolutions and the 2x2 max-pool (64 * 12 * 12).
    """

    def __init__(self):
        super().__init__()
        self.conv1 = nn.Conv2d(1, 32, 3, 1)
        self.conv2 = nn.Conv2d(32, 64, 3, 1)
        self.dropout1 = nn.Dropout(0.25)
        self.dropout2 = nn.Dropout(0.5)
        self.fc1 = nn.Linear(9216, 128)
        self.fc2 = nn.Linear(128, 10)

    def forward(self, x):
        """Return per-class log-probabilities (log-softmax over dim 1) for ``x``."""
        x = F.relu(self.conv1(x))
        x = F.relu(self.conv2(x))
        x = F.max_pool2d(x, 2)
        x = self.dropout1(x)
        x = torch.flatten(x, 1)
        x = F.relu(self.fc1(x))
        x = self.dropout2(x)
        x = self.fc2(x)
        output = F.log_softmax(x, dim=1)
        return output

def train(args, model, device, train_loader, optimizer, epoch):
    """Run at most 12 training batches with each phase wrapped in an NVTX range.

    Ranges emitted per batch: "data loading" (waiting on the loader),
    "batch:<idx>" containing "copy to device", "forward pass" and
    "backward pass". Every push is matched by exactly one pop. The loop
    stops once ``batch_idx`` exceeds 10 to keep the profiled run short.

    Args:
        args: namespace providing ``log_interval`` and ``dry_run``.
        model: module to train; called as ``model(data)``.
        device: target passed to ``Tensor.to`` for each batch.
        train_loader: iterable of ``(data, target)`` batches; must also
            support ``len()`` and expose ``.dataset`` for the log line.
        optimizer: optimizer stepped once per batch.
        epoch: epoch number, used only in the log line.
    """
    model.train()
    nvtx.range_push("data loading")
    for batch_idx, (data, target) in enumerate(train_loader):
        nvtx.range_pop()  # data loading
        nvtx.range_push("batch:" + str(batch_idx))

        nvtx.range_push("copy to device")
        data, target = data.to(device), target.to(device)
        nvtx.range_pop()

        nvtx.range_push("forward pass")
        optimizer.zero_grad()
        output = model(data)
        loss = F.nll_loss(output, target)
        nvtx.range_pop()

        nvtx.range_push("backward pass")
        loss.backward()
        optimizer.step()
        nvtx.range_pop()

        nvtx.range_pop()  # batch

        if batch_idx % args.log_interval == 0:
            print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
                epoch, batch_idx * len(data), len(train_loader.dataset),
                100. * batch_idx / len(train_loader), loss.item()))
            if args.dry_run:
                # No range is open at this point, so leaving here stays balanced.
                break
        if batch_idx > 10:
            # Cap the number of profiled batches.
            break

        # Re-open the range so fetching the next batch is attributed to loading.
        nvtx.range_push("data loading")
    else:
        # Loader exhausted: close the "data loading" range that is still open.
        nvtx.range_pop()

def main():
    """Parse CLI options and train ``Net`` on MNIST for profiling with ncu.

    Picks CUDA, then MPS, then CPU depending on availability and the
    ``--no-cuda`` / ``--no-mps`` flags; downloads MNIST to ``../data`` if
    needed; trains for ``--epochs`` epochs (default 1) with Adadelta and a
    StepLR schedule; and optionally saves the weights to ``mnist_cnn.pt``.
    """
    # Training settings
    parser = argparse.ArgumentParser(description='PyTorch MNIST Example')
    parser.add_argument('--batch-size', type=int, default=64, metavar='N',
                        help='input batch size for training (default: 64)')
    parser.add_argument('--test-batch-size', type=int, default=1000, metavar='N',
                        help='input batch size for testing (default: 1000)')
    parser.add_argument('--epochs', type=int, default=1, metavar='N',
                        help='number of epochs to train (default: 1)')
    parser.add_argument('--lr', type=float, default=1.0, metavar='LR',
                        help='learning rate (default: 1.0)')
    parser.add_argument('--gamma', type=float, default=0.7, metavar='M',
                        help='Learning rate step gamma (default: 0.7)')
    parser.add_argument('--no-cuda', action='store_true', default=False,
                        help='disables CUDA training')
    parser.add_argument('--no-mps', action='store_true', default=False,
                        help='disables macOS GPU training')
    parser.add_argument('--dry-run', action='store_true', default=False,
                        help='quickly check a single pass')
    parser.add_argument('--seed', type=int, default=1, metavar='S',
                        help='random seed (default: 1)')
    parser.add_argument('--log-interval', type=int, default=10, metavar='N',
                        help='how many batches to wait before logging training status')
    parser.add_argument('--save-model', action='store_true', default=False,
                        help='For Saving the current Model')
    args = parser.parse_args()
    use_cuda = not args.no_cuda and torch.cuda.is_available()
    use_mps = not args.no_mps and torch.backends.mps.is_available()

    torch.manual_seed(args.seed)

    if use_cuda:
        device = torch.device("cuda")
    elif use_mps:
        device = torch.device("mps")
    else:
        device = torch.device("cpu")

    train_kwargs = {'batch_size': args.batch_size}
    test_kwargs = {'batch_size': args.test_batch_size}
    if use_cuda:
        cuda_kwargs = {'num_workers': 1,
                       'pin_memory': True,
                       'shuffle': True}
        train_kwargs.update(cuda_kwargs)
        test_kwargs.update(cuda_kwargs)

    transform = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize((0.1307,), (0.3081,)),
    ])
    dataset1 = datasets.MNIST('../data', train=True, download=True,
                              transform=transform)
    dataset2 = datasets.MNIST('../data', train=False,
                              transform=transform)
    train_loader = torch.utils.data.DataLoader(dataset1, **train_kwargs)
    test_loader = torch.utils.data.DataLoader(dataset2, **test_kwargs)

    model = Net().to(device)
    optimizer = optim.Adadelta(model.parameters(), lr=args.lr)

    scheduler = StepLR(optimizer, step_size=1, gamma=args.gamma)
    for epoch in range(1, args.epochs + 1):
        train(args, model, device, train_loader, optimizer, epoch)
        # Evaluation on test_loader is skipped; only training is profiled.
        scheduler.step()

    if args.save_model:
        torch.save(model.state_dict(), "mnist_cnn.pt")


if __name__ == '__main__':
    main()

    ncu --set roofline -o models python test/test1.py

windows 客户端加载models.ncu-rep
在这里插入图片描述

7 nsight system 与nsight compute 应用区别
nsight system 倾向于全局角度分析核函数的读写、调度、内存占用率,cpu与gpu的异步调用
nsight compute 倾向于核函数内部操作,不同模块的执行时间、吞吐量、带宽分析等。

8 遇到的问题
8.1. 加载到98%,卡住不动。原因:笔记本电脑性能低,带不动。换台式机即可。
8.2 ERROR ERR_NVGPUCTRPERM - The user does not have permission to access NVIDIA GPU Performance Counters on the target device 0. For instructions on enabling permissions and to get more information see https://developer.nvidia.com/ERR_NVGPUCTRPERM
解决:添加root 权限
docker run -it --gpus all --name llm-04 -v /jiayafei_linux/:/home/ -p38014:8014 --privileged=true pytorch/pytorch:2.1.0-cuda12.1-cudnn8-devel /bin/bash

标签: 人工智能

本文转载自: https://blog.csdn.net/weixin_40777649/article/details/140267921
版权归原作者 云帆@ 所有, 如有侵权,请联系我们删除。

“NsightCompute教程入门”的评论:

还没有评论