I. Outline
- Use PyCUDA to query the GPU's performance specs
- Manually plot a roofline model
- Nsight Systems installation guide
- Nsight Systems usage guide: analyzing a PyTorch neural network with Nsight Systems
- Nsight Compute installation guide
- Nsight Compute usage guide
- Differences between Nsight Systems and Nsight Compute
- Problems encountered
II. Implementation
References:
https://blog.csdn.net/weixin_40653140/article/details/136238420
https://blog.csdn.net/m0_61864577/article/details/140022323
https://developer.nvidia.cn/tools-overview
1. Use PyCUDA to query the GPU's performance specs (test.py)
import pycuda.driver as cuda
import pycuda.autoinit

def get_gpu_compute_capability_and_clock_rate():
    device = cuda.Device(0)
    compute_capability = device.compute_capability()
    clock_rate = device.get_attribute(cuda.device_attribute.CLOCK_RATE)  # in kHz
    sm_count = device.get_attribute(cuda.device_attribute.MULTIPROCESSOR_COUNT)
    cores_per_sm = get_cuda_cores_per_sm(compute_capability)
    return compute_capability, clock_rate, sm_count, cores_per_sm

def get_cuda_cores_per_sm(compute_capability):
    # FP32 CUDA cores per SM for each architecture
    major, minor = compute_capability
    if major == 2:                            # Fermi
        return 32
    elif major == 3:                          # Kepler
        return 192
    elif major == 5:                          # Maxwell
        return 128
    elif major == 6 and minor == 0:           # Pascal (P100)
        return 64
    elif major == 6 and minor in [1, 2]:      # Pascal (GP10x, Jetson TX2)
        return 128
    elif major == 7 and minor in [0, 2, 5]:   # Volta, Xavier, Turing
        return 64
    elif major == 8 and minor == 0:           # Ampere (A100)
        return 64
    elif major == 8 and minor == 6:           # Ampere (GA10x)
        return 128
    else:
        raise ValueError("Unknown compute capability")

def calculate_theoretical_flops(clock_rate, sm_count, cores_per_sm):
    clock_rate_hz = clock_rate * 1e3  # convert kHz to Hz
    # 2 FLOPs per clock per core (an FMA counts as a multiply plus an add)
    flops = clock_rate_hz * sm_count * cores_per_sm * 2
    return flops

compute_capability, clock_rate, sm_count, cores_per_sm = get_gpu_compute_capability_and_clock_rate()
theoretical_flops = calculate_theoretical_flops(clock_rate, sm_count, cores_per_sm)
print(f"GPU compute capability: {compute_capability}")
print(f"Clock rate (kHz): {clock_rate}")        # maximum clock frequency
print(f"Number of SMs: {sm_count}")             # streaming multiprocessor count
print(f"Cores per SM: {cores_per_sm}")          # FP32 cores per SM
print(f"Theoretical FLOPS for float32: {theoretical_flops / 1e12} TFLOPS")
2. Manually plot a roofline model (roofline_model.py)
Font download for the plot labels (only needed if you keep non-ASCII labels): https://blog.csdn.net/qq_35165004/article/details/132738991
import sys
import torch
import torch.nn as nn
from fvcore.nn import FlopCountAnalysis, ActivationCountAnalysis
import matplotlib.pyplot as plt
import numpy as np
from matplotlib.font_manager import FontProperties
import os
import argparse

# A test model: a single large linear layer (one GEMM)
class SimpleModel(nn.Module):
    def __init__(self, input_features, output_features):
        super(SimpleModel, self).__init__()
        self.fc1 = nn.Linear(input_features, output_features, bias=False)

    def forward(self, x):
        x = self.fc1(x)
        return x

def main(args):
    # Model and input data
    input_features = 8192
    output_features = 8192
    batch_size = 8192
    model = SimpleModel(input_features, output_features)
    input_data = torch.randn(batch_size, input_features)
    test_count = 10

    # Count FLOPs and activations
    flops = FlopCountAnalysis(model, input_data).total() * test_count
    activations = ActivationCountAnalysis(model, input_data).total() + input_data.numel()
    print("activations:", activations)

    # Count parameters; memory traffic is estimated as
    # (activations + params) * 4 bytes, assuming everything is float32
    params = sum(p.numel() for p in model.parameters())
    activation_memory_access = activations * 4
    params_memory_access = params * 4
    memory_access = activation_memory_access + params_memory_access
    memory_access = memory_access * test_count

    model = model.cuda()
    input_data = input_data.float().cuda()

    # Warm up before profiling
    for i in range(5):
        output = model(input_data)
    torch.cuda.synchronize()

    # Measure execution time with the PyTorch profiler
    with torch.profiler.profile(activities=[torch.profiler.ProfilerActivity.CUDA]) as prof:
        for _ in range(test_count):
            output = model(input_data)
    key_averages = prof.key_averages()
    for ev in key_averages:
        print(ev)
    # Sum the self CUDA time of the GEMM kernels (names containing "sgemm"), in seconds
    total_cuda_time = sum([event.self_cuda_time_total for event in key_averages
                           if event.key.find("sgemm") >= 0]) / 1e6

    # FLOPs in GFLOPs
    flops_measured_glops = flops / 1e9
    # Measured bandwidth
    memory_access_gb = memory_access / 1e9
    bandwidth_measured = memory_access_gb / total_cuda_time  # GB/s
    print("bandwidth_measured:", bandwidth_measured)

    # Peak performance and bandwidth of the GPU under test
    peak_performance = 38.98368 * 1e3  # GFLOPs
    memory_bandwidth = 1555.0          # GB/s

    # Data points of the roofline model
    Io = np.logspace(-2, 4, 100)  # arithmetic intensity, GFLOPs/GB
    # Attainable FLOP/s at each intensity, capped at peak_performance
    performance = np.minimum(peak_performance, Io * memory_bandwidth)

    # ======================= Plot the roofline model =======================
    plt.figure(figsize=(10, 6))
    threshold = 0.75  # utilization threshold separating "good" from "poor" performance
    # Font setup for the labels
    font_path = 'simsun.ttc'  # replace with your own font path
    font_prop = FontProperties(fname=font_path)

    # Bandwidth-bound region
    x = Io[Io < (peak_performance / memory_bandwidth)]
    plt.fill_between(x, np.minimum(peak_performance, x * memory_bandwidth) * threshold,
                     np.minimum(peak_performance, x * memory_bandwidth),
                     color='lightblue', alpha=0.6, label='Bandwidth Bound')
    # Compute-bound region
    x2 = Io[Io >= (peak_performance / memory_bandwidth)]
    plt.fill_between(x2, np.minimum(peak_performance, x2 * memory_bandwidth) * threshold,
                     np.minimum(peak_performance, x2 * memory_bandwidth),
                     color='green', alpha=0.6, label='Compute Bound')
    # Low-performance region
    plt.fill_between(Io, 0, np.minimum(peak_performance, Io * memory_bandwidth) * threshold,
                     color='gray', alpha=0.6, label='poor performance')
    plt.axhline(y=peak_performance, color='b', linestyle='--',
                label=f'Peak compute: {peak_performance / 1e3:.2f} TFLOPs')
    plt.axvline(x=peak_performance / memory_bandwidth, color='g', linestyle='--',
                label=f'{peak_performance / memory_bandwidth:.2f} GFLOPs/GB')
    plt.loglog(Io, performance, label='Roofline')

    # Arithmetic intensity is a static property of the algorithm
    arithmetic_intensity_measured = flops_measured_glops / memory_access_gb  # GFLOPs/GB
    point_y = arithmetic_intensity_measured * bandwidth_measured
    plt.scatter(arithmetic_intensity_measured, point_y, c='r',
                label=f'Measured Points {point_y / 1e3:.2f} TFLOPs/sec '
                      f'{point_y * 100 / peak_performance:.2f}%')
    plt.xlabel('Arithmetic intensity [GFLOPs/GB]', fontproperties=font_prop)
    plt.ylabel('Performance [GFLOPs/sec]', fontproperties=font_prop)
    plt.title('Roofline model', fontproperties=font_prop)
    plt.legend(prop=font_prop)

    # Save the figure without displaying it
    plt.savefig('roofline_model.png')
    plt.close()
    print(f"FLOPs: {flops} FLOPs")
    print(f"Memory traffic: {memory_access} bytes")
    print(f"Execution time: {total_cuda_time:.4f} s")
    print(f"Fraction of theoretical peak: {point_y * 100 / peak_performance:.2f}%")

parser = argparse.ArgumentParser(description='Process some integers.')
parser.add_argument("--warmup_only", action="store_true", help="warmup_only")
args = parser.parse_args()
main(args)
python roofline_model.py
The measured point comes out at 24.32% of the theoretical peak, i.e. only about a quarter of the compute capability is used.
In the figure below, 29.87% is the PyTorch implementation and 37.80% is a Triton kernel implementation, so the direction for improving the PyTorch version is to raise the achieved FLOP/s and cut execution time:
- Raise achieved FLOPs: make the kernels faster and remove redundant computation.
- Reduce memory traffic: quantize, or restructure the kernels to avoid redundant reads and writes.
A hand check of the plotted arithmetic intensity is sketched below.
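The arithmetic intensity of this 8192x8192x8192 GEMM can be verified by hand. A minimal sketch using the 2*M*N*K convention (note that fvcore counts a fused multiply-add as a single FLOP, so its total is half of this; either way the point lands far to the right of the ~25 GFLOPs/GB ridge, i.e. this GEMM is compute-bound):

M = N = K = 8192                               # batch_size x input_features -> output_features
flops_per_pass = 2 * M * N * K                 # one multiply + one add per output element
bytes_per_pass = 4 * (M * K + K * N + M * N)   # input + weight + output, float32

intensity = flops_per_pass / bytes_per_pass    # FLOPs/byte, numerically equal to GFLOPs/GB
print(f"{flops_per_pass / 1e9:.1f} GFLOPs per pass")       # ~1099.5
print(f"{bytes_per_pass / 1e9:.3f} GB per pass")           # ~0.805
print(f"arithmetic intensity: {intensity:.0f} GFLOPs/GB")  # ~1365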
3. Nsight Systems installation guide
Official docs: https://docs.nvidia.com/nsight-systems/UserGuide/index.html#profiling-windows-targets-from-the-gui
Nsight Systems has two parts to install: the target (server) side and the host (client) side.
Option 1: run the CLI on a Linux server to produce a report file, then open the file in the Windows client.
Linux target install (CLI package):
apt install ./NsightSystems-linux-cli-public-2023.2.1.122-3259852.deb
Windows install: download and install the "Windows Host" package.
Usage demo:
1. Generate the report on Linux:
# Example: download the generated nsight_t5_fp16 report locally and open it in the Nsight GUI
nsys profile -o nsight_t5_fp16 -f true python temp.py
2. Load the file on the Windows side:
File --> Open: load the .nsys-rep file
Option 2: run the command on a Windows server and attach the Windows client remotely to read the results live.
1. Open cmd as administrator and run:
D:\softs\nsightsystem\target-windows-x64\nsys profile -t cuda -o baseline -w true D:/softs/anacondas/envs/chatgml6/python D:/cnki_1/model_predict/test1.py
2. Open the report in the Windows host client:
File --> Open --> baseline.nsys-rep
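If you only need a quick text summary without the GUI, recent nsys versions can also aggregate an existing report on the command line (or pass --stats=true to nsys profile directly):
nsys stats baseline.nsys-rep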
4. Nsight Systems usage guide: analyzing a PyTorch neural network with Nsight Systems
PyTorch ships NVTX bindings for annotating the timeline:
import torch.cuda.nvtx as nvtx
nvtx.range_push("copy to device")  # open an annotated range
nvtx.range_pop()                   # close the range
Alternatively, use the standalone nvtx package:
import time
import nvtx

@nvtx.annotate(color="blue")
def my_function():
    for i in range(5):
        with nvtx.annotate("my_loop", color="red"):
            time.sleep(i)

my_function()
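Every range_push must be matched by a range_pop, otherwise the ranges in the timeline nest incorrectly. A small helper (hypothetical, not part of PyTorch) that guarantees the pairing even if the body raises:

from contextlib import contextmanager
import torch.cuda.nvtx as nvtx

@contextmanager
def nvtx_range(name):
    # ensures range_push/range_pop stay balanced
    nvtx.range_push(name)
    try:
        yield
    finally:
        nvtx.range_pop()

# usage: with nvtx_range("forward pass"): output = model(data)

The complete instrumented MNIST training script: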
import argparse
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torchvision import datasets, transforms
from torch.optim.lr_scheduler import StepLR
import torch.cuda.nvtx as nvtx

print(torch.cuda.is_available())

class Net(nn.Module):
    def __init__(self):
        super(Net, self).__init__()
        self.conv1 = nn.Conv2d(1, 32, 3, 1)
        self.conv2 = nn.Conv2d(32, 64, 3, 1)
        self.dropout1 = nn.Dropout(0.25)
        self.dropout2 = nn.Dropout(0.5)
        self.fc1 = nn.Linear(9216, 128)
        self.fc2 = nn.Linear(128, 10)

    def forward(self, x):
        x = self.conv1(x)
        x = F.relu(x)
        x = self.conv2(x)
        x = F.relu(x)
        x = F.max_pool2d(x, 2)
        x = self.dropout1(x)
        x = torch.flatten(x, 1)
        x = self.fc1(x)
        x = F.relu(x)
        x = self.dropout2(x)
        x = self.fc2(x)
        output = F.log_softmax(x, dim=1)
        return output

def train(args, model, device, train_loader, optimizer, epoch):
    model.train()
    nvtx.range_push("data loading")
    for batch_idx, (data, target) in enumerate(train_loader):
        nvtx.range_pop()  # data loading
        nvtx.range_push("batch:" + str(batch_idx))
        nvtx.range_push("copy to device")
        data, target = data.to(device), target.to(device)
        nvtx.range_pop()  # copy to device
        nvtx.range_push("forward pass")
        optimizer.zero_grad()
        output = model(data)
        loss = F.nll_loss(output, target)
        nvtx.range_pop()  # forward pass
        nvtx.range_push("backward pass")
        loss.backward()
        optimizer.step()
        nvtx.range_pop()  # backward pass
        nvtx.range_pop()  # batch
        if batch_idx % args.log_interval == 0:
            print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
                epoch, batch_idx * len(data), len(train_loader.dataset),
                100. * batch_idx / len(train_loader), loss.item()))
            if args.dry_run:
                break
        # reopen the data-loading range; it is popped at the top of the next iteration
        nvtx.range_push("data loading")

def main():
    # Training settings
    parser = argparse.ArgumentParser(description='PyTorch MNIST Example')
    parser.add_argument('--batch-size', type=int, default=64, metavar='N',
                        help='input batch size for training (default: 64)')
    parser.add_argument('--test-batch-size', type=int, default=1000, metavar='N',
                        help='input batch size for testing (default: 1000)')
    parser.add_argument('--epochs', type=int, default=3, metavar='N',
                        help='number of epochs to train (default: 3)')
    parser.add_argument('--lr', type=float, default=1.0, metavar='LR',
                        help='learning rate (default: 1.0)')
    parser.add_argument('--gamma', type=float, default=0.7, metavar='M',
                        help='Learning rate step gamma (default: 0.7)')
    parser.add_argument('--no-cuda', action='store_true', default=False,
                        help='disables CUDA training')
    parser.add_argument('--no-mps', action='store_true', default=False,
                        help='disables macOS GPU training')
    parser.add_argument('--dry-run', action='store_true', default=False,
                        help='quickly check a single pass')
    parser.add_argument('--seed', type=int, default=1, metavar='S',
                        help='random seed (default: 1)')
    parser.add_argument('--log-interval', type=int, default=10, metavar='N',
                        help='how many batches to wait before logging training status')
    parser.add_argument('--save-model', action='store_true', default=False,
                        help='For Saving the current Model')
    args = parser.parse_args()
    use_cuda = not args.no_cuda and torch.cuda.is_available()
    use_mps = not args.no_mps and torch.backends.mps.is_available()
    torch.manual_seed(args.seed)
    if use_cuda:
        device = torch.device("cuda")
    elif use_mps:
        device = torch.device("mps")
    else:
        device = torch.device("cpu")
    train_kwargs = {'batch_size': args.batch_size}
    test_kwargs = {'batch_size': args.test_batch_size}
    if use_cuda:
        cuda_kwargs = {'num_workers': 1,
                       'pin_memory': True,
                       'shuffle': True}
        train_kwargs.update(cuda_kwargs)
        test_kwargs.update(cuda_kwargs)
    transform = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize((0.1307,), (0.3081,))])
    dataset1 = datasets.MNIST('../data', train=True, download=True,
                              transform=transform)
    dataset2 = datasets.MNIST('../data', train=False,
                              transform=transform)
    train_loader = torch.utils.data.DataLoader(dataset1, **train_kwargs)
    test_loader = torch.utils.data.DataLoader(dataset2, **test_kwargs)
    model = Net().to(device)
    optimizer = optim.Adadelta(model.parameters(), lr=args.lr)
    scheduler = StepLR(optimizer, step_size=1, gamma=args.gamma)
    for epoch in range(1, args.epochs + 1):
        train(args, model, device, train_loader, optimizer, epoch)
        # test1111(model, device, test_loader)
        scheduler.step()
    if args.save_model:
        torch.save(model.state_dict(), "mnist_cnn.pt")

if __name__ == '__main__':
    main()
# D:\softs\nsightsystem\target-windows-x64\nsys profile -t cuda,nvtx,cudnn -o baseline -w true D:/softs/anacondas/envs/chatgml6/python D:/cnki_1/model_predict/test1.py
Load the baseline.nsys-rep file in the client.
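For long training runs it is often better to capture only a slice of the timeline. A sketch of one way to do this (the flag spellings are from recent nsys releases; check nsys profile --help on your version): start nsys with a cudaProfilerApi capture range and bracket the region of interest with torch.cuda.profiler:

import torch

# launch with:
#   nsys profile --capture-range=cudaProfilerApi --capture-range-end=stop -o baseline python test1.py
torch.cuda.profiler.start()   # nsys begins recording here
# ... run only the batches you care about ...
torch.cuda.profiler.stop()    # nsys stops recording here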
5. Nsight Compute installation guide
https://developer.nvidia.com/tools-overview/nsight-compute/get-started
Linux: download the Linux installer and run it:
sh nsight-compute-linux-2024.2.1.2-34372528.run
Default install path: /usr/local/NVIDIA-Nsight-Compute
Windows: download and run the Windows installer.
6. Nsight Compute usage guide
First list the available metric sets:
ncu --list-sets
Test with CUDA C++ code:
#include <stdio.h>

__global__ void kernel_A(double* A, int N, int M) {
    double d = 0.0;
    int idx = threadIdx.x + blockIdx.x * blockDim.x;
    // printf("Kernel A\n");
    if (idx < N) {
        #pragma unroll(100)
        for (int j = 0; j < M; ++j) {
            d += A[idx];
        }
        A[idx] = d;
    }
}

__global__ void kernel_B(double* A, int N, int M) {
    double d = 0.0;
    int idx = threadIdx.x + blockIdx.x * blockDim.x;
    if (idx < N) {
        #pragma unroll(100)
        for (int j = 0; j < M; ++j) {
            d += A[idx];
        }
        A[idx] = d;
    }
}

__global__ void kernel_C(double* A, const double* B, int N) {
    int idx = threadIdx.x + blockIdx.x * blockDim.x;
    // printf("Kernel C\n");
    // Strided memory access: warp 0 accesses (0, stride, 2*stride, ...), warp 1 accesses
    // (1, stride + 1, 2*stride + 1, ...).
    const int stride = 16;
    int strided_idx = threadIdx.x * stride + blockIdx.x % stride
                    + (blockIdx.x / stride) * stride * blockDim.x;
    if (strided_idx < N) {
        A[idx] = B[strided_idx] + B[strided_idx];
    }
}

int main() {
    double* A;
    double* B;
    int N = 80 * 2048 * 100;
    size_t sz = N * sizeof(double);
    cudaMalloc((void**)&A, sz);
    cudaMalloc((void**)&B, sz);
    cudaMemset(A, 0, sz);
    cudaMemset(B, 0, sz);
    int threadsPerBlock = 64;
    int numBlocks = (N + threadsPerBlock - 1) / threadsPerBlock;
    int M = 10000;
    kernel_A<<<numBlocks, threadsPerBlock>>>(A, N, M);
    // kernel_B runs the same code as kernel_A but is launched with 48 KB of
    // dynamic shared memory, which lowers its occupancy
    cudaFuncSetAttribute(kernel_B, cudaFuncAttributeMaxDynamicSharedMemorySize, 48 * 1024);
    kernel_B<<<numBlocks, threadsPerBlock, 48 * 1024>>>(A, N, M);
    kernel_C<<<numBlocks, threadsPerBlock>>>(A, B, N);
    cudaDeviceSynchronize();
    cudaFree(A);
    cudaFree(B);
    return 0;
}
Compile the CUDA file:
nvcc -o kernel_abc kernel_abc.cu
Profile it with Nsight Compute:
ncu --set detailed -o kernel_abc ./test/kernel_abc
Open the report in the Windows client. In the report, kernel_A and kernel_B run identical code, but kernel_B's 48 KB of dynamic shared memory limits how many blocks fit on an SM (lower occupancy), and kernel_C's stride-16 loads leave most of each memory transaction unused; these are exactly the differences the Nsight Compute sections make visible.
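When only one kernel matters, the collection can be restricted instead of profiling every launch (these ncu flags exist in current releases; see ncu --help on your version):
ncu -k kernel_C --launch-count 1 --set full -o kernel_c ./test/kernel_abc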
Python test:
import argparse
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torchvision import datasets, transforms
from torch.optim.lr_scheduler import StepLR
import torch.cuda.nvtx as nvtx

print(torch.cuda.is_available())
# print(torch.cuda.list_gpu_processes())
print(torch.cuda.device_count())

class Net(nn.Module):
    def __init__(self):
        super(Net, self).__init__()
        self.conv1 = nn.Conv2d(1, 32, 3, 1)
        self.conv2 = nn.Conv2d(32, 64, 3, 1)
        self.dropout1 = nn.Dropout(0.25)
        self.dropout2 = nn.Dropout(0.5)
        self.fc1 = nn.Linear(9216, 128)
        self.fc2 = nn.Linear(128, 10)

    def forward(self, x):
        x = self.conv1(x)
        x = F.relu(x)
        x = self.conv2(x)
        x = F.relu(x)
        x = F.max_pool2d(x, 2)
        x = self.dropout1(x)
        x = torch.flatten(x, 1)
        x = self.fc1(x)
        x = F.relu(x)
        x = self.dropout2(x)
        x = self.fc2(x)
        output = F.log_softmax(x, dim=1)
        return output

def train(args, model, device, train_loader, optimizer, epoch):
    model.train()
    nvtx.range_push("data loading")
    for batch_idx, (data, target) in enumerate(train_loader):
        nvtx.range_pop()  # data loading
        nvtx.range_push("batch:" + str(batch_idx))
        nvtx.range_push("copy to device")
        data, target = data.to(device), target.to(device)
        nvtx.range_pop()  # copy to device
        nvtx.range_push("forward pass")
        optimizer.zero_grad()
        output = model(data)
        loss = F.nll_loss(output, target)
        nvtx.range_pop()  # forward pass
        nvtx.range_push("backward pass")
        loss.backward()
        optimizer.step()
        nvtx.range_pop()  # backward pass
        nvtx.range_pop()  # batch
        if batch_idx % args.log_interval == 0:
            print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
                epoch, batch_idx * len(data), len(train_loader.dataset),
                100. * batch_idx / len(train_loader), loss.item()))
            if args.dry_run:
                break
        if batch_idx > 10:  # profile only the first dozen batches
            break
        # reopen the data-loading range; it is popped at the top of the next iteration
        nvtx.range_push("data loading")

def main():
    # Training settings
    parser = argparse.ArgumentParser(description='PyTorch MNIST Example')
    parser.add_argument('--batch-size', type=int, default=64, metavar='N',
                        help='input batch size for training (default: 64)')
    parser.add_argument('--test-batch-size', type=int, default=1000, metavar='N',
                        help='input batch size for testing (default: 1000)')
    parser.add_argument('--epochs', type=int, default=1, metavar='N',
                        help='number of epochs to train (default: 1)')
    parser.add_argument('--lr', type=float, default=1.0, metavar='LR',
                        help='learning rate (default: 1.0)')
    parser.add_argument('--gamma', type=float, default=0.7, metavar='M',
                        help='Learning rate step gamma (default: 0.7)')
    parser.add_argument('--no-cuda', action='store_true', default=False,
                        help='disables CUDA training')
    parser.add_argument('--no-mps', action='store_true', default=False,
                        help='disables macOS GPU training')
    parser.add_argument('--dry-run', action='store_true', default=False,
                        help='quickly check a single pass')
    parser.add_argument('--seed', type=int, default=1, metavar='S',
                        help='random seed (default: 1)')
    parser.add_argument('--log-interval', type=int, default=10, metavar='N',
                        help='how many batches to wait before logging training status')
    parser.add_argument('--save-model', action='store_true', default=False,
                        help='For Saving the current Model')
    args = parser.parse_args()
    use_cuda = not args.no_cuda and torch.cuda.is_available()
    use_mps = not args.no_mps and torch.backends.mps.is_available()
    torch.manual_seed(args.seed)
    if use_cuda:
        device = torch.device("cuda")
    elif use_mps:
        device = torch.device("mps")
    else:
        device = torch.device("cpu")
    train_kwargs = {'batch_size': args.batch_size}
    test_kwargs = {'batch_size': args.test_batch_size}
    if use_cuda:
        cuda_kwargs = {'num_workers': 1,
                       'pin_memory': True,
                       'shuffle': True}
        train_kwargs.update(cuda_kwargs)
        test_kwargs.update(cuda_kwargs)
    transform = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize((0.1307,), (0.3081,))])
    dataset1 = datasets.MNIST('../data', train=True, download=True,
                              transform=transform)
    dataset2 = datasets.MNIST('../data', train=False,
                              transform=transform)
    train_loader = torch.utils.data.DataLoader(dataset1, **train_kwargs)
    test_loader = torch.utils.data.DataLoader(dataset2, **test_kwargs)
    model = Net().to(device)
    optimizer = optim.Adadelta(model.parameters(), lr=args.lr)
    scheduler = StepLR(optimizer, step_size=1, gamma=args.gamma)
    for epoch in range(1, args.epochs + 1):
        train(args, model, device, train_loader, optimizer, epoch)
        # test1111(model, device, test_loader)
        scheduler.step()
    if args.save_model:
        torch.save(model.state_dict(), "mnist_cnn.pt")

if __name__ == '__main__':
    main()
Profile it with the roofline set:
ncu --set roofline -o models python test/test1.py
Load models.ncu-rep in the Windows client. Note that ncu replays every kernel launch it profiles, so even a short training script can take a long time; the NVTX ranges added above can narrow the collection, as sketched below.
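A sketch of NVTX-based filtering (the --nvtx and --nvtx-include flags exist in current ncu releases; the exact include-expression syntax for push/pop ranges varies by version, so check the Nsight Compute documentation):
ncu --nvtx --nvtx-include "forward pass/" --set roofline -o models_fwd python test/test1.py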
7. Differences between Nsight Systems and Nsight Compute
Nsight Systems takes the whole-program view: kernel launches and scheduling, memory transfers and occupancy over time, and the asynchronous interplay between CPU and GPU.
Nsight Compute looks inside individual kernels: execution time of the different hardware units, throughput, bandwidth analysis, and so on.
8. Problems encountered
8.1 Loading hangs at 98%. Cause: the laptop was too weak to process the report; switching to a desktop machine fixed it.
8.2 ERROR ERR_NVGPUCTRPERM - The user does not have permission to access NVIDIA GPU Performance Counters on the target device 0. For instructions on enabling permissions and to get more information see https://developer.nvidia.com/ERR_NVGPUCTRPERM
Fix: run with root privileges. In Docker, start the container privileged:
docker run -it --gpus all --name llm-04 -v /jiayafei_linux/:/home/ -p 38014:8014 --privileged=true pytorch/pytorch:2.1.0-cuda12.1-cudnn8-devel /bin/bash
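On a bare-metal Linux host the same error can be fixed by allowing non-admin access to the performance counters, which is the fix described on NVIDIA's ERR_NVGPUCTRPERM page (the .conf file name below is arbitrary; reboot or reload the nvidia module afterwards):
echo 'options nvidia NVreg_RestrictProfilingToAdminUsers=0' | sudo tee /etc/modprobe.d/nvidia-profiler.conf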