I. Outline
- Use PyCUDA to query the GPU's performance specs
- Manually plot a roofline model
- Nsight Systems installation guide
- Nsight Systems usage guide: analyzing a PyTorch neural network with Nsight Systems
- Nsight Compute installation guide
- Nsight Compute usage guide
- Differences between Nsight Systems and Nsight Compute
- Problems encountered
II. Implementation
References:
https://blog.csdn.net/weixin_40653140/article/details/136238420
https://blog.csdn.net/m0_61864577/article/details/140022323
https://developer.nvidia.cn/tools-overview
1. Use PyCUDA to query the GPU's performance specs (test.py)
import pycuda.driver as cuda
import pycuda.autoinit

def get_gpu_compute_capability_and_clock_rate():
    device = cuda.Device(0)
    compute_capability = device.compute_capability()
    clock_rate = device.get_attribute(cuda.device_attribute.CLOCK_RATE)  # in kHz
    sm_count = device.get_attribute(cuda.device_attribute.MULTIPROCESSOR_COUNT)
    cores_per_sm = get_cuda_cores_per_sm(compute_capability)
    return compute_capability, clock_rate, sm_count, cores_per_sm

def get_cuda_cores_per_sm(compute_capability):
    # FP32 CUDA cores per SM for each architecture
    major, minor = compute_capability
    if major == 2:                            # Fermi
        return 32
    elif major == 3:                          # Kepler
        return 192
    elif major == 5:                          # Maxwell
        return 128
    elif major == 6 and minor == 0:           # Pascal (P100)
        return 64
    elif major == 6 and minor in [1, 2]:      # Pascal (GP10x, Jetson TX2)
        return 128
    elif major == 7 and minor in [0, 2, 5]:   # Volta, Xavier, Turing
        return 64
    elif major == 8 and minor == 0:           # Ampere (A100)
        return 64
    elif major == 8 and minor == 6:           # Ampere (GA10x)
        return 128
    else:
        raise ValueError("Unknown compute capability")

def calculate_theoretical_flops(clock_rate, sm_count, cores_per_sm):
    clock_rate_hz = clock_rate * 1e3  # convert kHz to Hz
    # 2 FLOPs per clock per core (an FMA counts as a multiply plus an add)
    flops = clock_rate_hz * sm_count * cores_per_sm * 2
    return flops

compute_capability, clock_rate, sm_count, cores_per_sm = get_gpu_compute_capability_and_clock_rate()
theoretical_flops = calculate_theoretical_flops(clock_rate, sm_count, cores_per_sm)
print(f"GPU compute capability: {compute_capability}")
print(f"Clock rate (kHz): {clock_rate}")        # maximum clock frequency
print(f"Number of SMs: {sm_count}")             # streaming multiprocessor count
print(f"Cores per SM: {cores_per_sm}")          # FP32 cores per SM
print(f"Theoretical FLOPS for float32: {theoretical_flops / 1e12} TFLOPS")
2. Manually plot a roofline model (roofline_model.py)
Font download for the plot labels (only needed if you keep non-ASCII labels): https://blog.csdn.net/qq_35165004/article/details/132738991
import sys
import torch
import torch.nn as nn
from fvcore.nn import FlopCountAnalysis, ActivationCountAnalysis
import matplotlib.pyplot as plt
import numpy as np
from matplotlib.font_manager import FontProperties
import os
import argparse

# A test model: a single large linear layer (one GEMM)
class SimpleModel(nn.Module):
    def __init__(self, input_features, output_features):
        super(SimpleModel, self).__init__()
        self.fc1 = nn.Linear(input_features, output_features, bias=False)

    def forward(self, x):
        x = self.fc1(x)
        return x

def main(args):
    # Model and input data
    input_features = 8192
    output_features = 8192
    batch_size = 8192
    model = SimpleModel(input_features, output_features)
    input_data = torch.randn(batch_size, input_features)
    test_count = 10

    # Count FLOPs and activations
    flops = FlopCountAnalysis(model, input_data).total() * test_count
    activations = ActivationCountAnalysis(model, input_data).total() + input_data.numel()
    print("activations:", activations)

    # Count parameters; memory traffic is estimated as
    # (activations + params) * 4 bytes, assuming everything is float32
    params = sum(p.numel() for p in model.parameters())
    activation_memory_access = activations * 4
    params_memory_access = params * 4
    memory_access = activation_memory_access + params_memory_access
    memory_access = memory_access * test_count

    model = model.cuda()
    input_data = input_data.float().cuda()

    # Warm up before profiling
    for i in range(5):
        output = model(input_data)
    torch.cuda.synchronize()

    # Measure execution time with the PyTorch profiler
    with torch.profiler.profile(activities=[torch.profiler.ProfilerActivity.CUDA]) as prof:
        for _ in range(test_count):
            output = model(input_data)
    key_averages = prof.key_averages()
    for ev in key_averages:
        print(ev)
    # Sum the self CUDA time of the GEMM kernels (names containing "sgemm"), in seconds
    total_cuda_time = sum([event.self_cuda_time_total for event in key_averages
                           if event.key.find("sgemm") >= 0]) / 1e6

    # FLOPs in GFLOPs
    flops_measured_glops = flops / 1e9
    # Measured bandwidth
    memory_access_gb = memory_access / 1e9
    bandwidth_measured = memory_access_gb / total_cuda_time  # GB/s
    print("bandwidth_measured:", bandwidth_measured)

    # Peak performance and bandwidth of the GPU under test
    peak_performance = 38.98368 * 1e3  # GFLOPs
    memory_bandwidth = 1555.0          # GB/s

    # Data points of the roofline model
    Io = np.logspace(-2, 4, 100)  # arithmetic intensity, GFLOPs/GB
    # Attainable FLOP/s at each intensity, capped at peak_performance
    performance = np.minimum(peak_performance, Io * memory_bandwidth)

    # ======================= Plot the roofline model =======================
    plt.figure(figsize=(10, 6))
    threshold = 0.75  # utilization threshold separating "good" from "poor" performance
    # Font setup for the labels
    font_path = 'simsun.ttc'  # replace with your own font path
    font_prop = FontProperties(fname=font_path)

    # Bandwidth-bound region
    x = Io[Io < (peak_performance / memory_bandwidth)]
    plt.fill_between(x, np.minimum(peak_performance, x * memory_bandwidth) * threshold,
                     np.minimum(peak_performance, x * memory_bandwidth),
                     color='lightblue', alpha=0.6, label='Bandwidth Bound')
    # Compute-bound region
    x2 = Io[Io >= (peak_performance / memory_bandwidth)]
    plt.fill_between(x2, np.minimum(peak_performance, x2 * memory_bandwidth) * threshold,
                     np.minimum(peak_performance, x2 * memory_bandwidth),
                     color='green', alpha=0.6, label='Compute Bound')
    # Low-performance region
    plt.fill_between(Io, 0, np.minimum(peak_performance, Io * memory_bandwidth) * threshold,
                     color='gray', alpha=0.6, label='poor performance')
    plt.axhline(y=peak_performance, color='b', linestyle='--',
                label=f'Peak compute: {peak_performance / 1e3:.2f} TFLOPs')
    plt.axvline(x=peak_performance / memory_bandwidth, color='g', linestyle='--',
                label=f'{peak_performance / memory_bandwidth:.2f} GFLOPs/GB')
    plt.loglog(Io, performance, label='Roofline')

    # Arithmetic intensity is a static property of the algorithm
    arithmetic_intensity_measured = flops_measured_glops / memory_access_gb  # GFLOPs/GB
    point_y = arithmetic_intensity_measured * bandwidth_measured
    plt.scatter(arithmetic_intensity_measured, point_y, c='r',
                label=f'Measured Points {point_y / 1e3:.2f} TFLOPs/sec '
                      f'{point_y * 100 / peak_performance:.2f}%')
    plt.xlabel('Arithmetic intensity [GFLOPs/GB]', fontproperties=font_prop)
    plt.ylabel('Performance [GFLOPs/sec]', fontproperties=font_prop)
    plt.title('Roofline model', fontproperties=font_prop)
    plt.legend(prop=font_prop)

    # Save the figure without displaying it
    plt.savefig('roofline_model.png')
    plt.close()
    print(f"FLOPs: {flops} FLOPs")
    print(f"Memory traffic: {memory_access} bytes")
    print(f"Execution time: {total_cuda_time:.4f} s")
    print(f"Fraction of theoretical peak: {point_y * 100 / peak_performance:.2f}%")

parser = argparse.ArgumentParser(description='Process some integers.')
parser.add_argument("--warmup_only", action="store_true", help="warmup_only")
args = parser.parse_args()
main(args)
python roofline_model.py
The measured point comes out at 24.32% of the theoretical peak, i.e. only about a quarter of the compute capability is used.
In the figure below, 29.87% is the PyTorch implementation and 37.80% is a Triton kernel implementation, so the direction for improving the PyTorch version is to raise the achieved FLOP/s and cut execution time:
- Raise achieved FLOPs: make the kernels faster and remove redundant computation.
- Reduce memory traffic: quantize, or restructure the kernels to avoid redundant reads and writes.
A hand check of the plotted arithmetic intensity is sketched below.
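The arithmetic intensity of this 8192x8192x8192 GEMM can be verified by hand. A minimal sketch using the 2*M*N*K convention (note that fvcore counts a fused multiply-add as a single FLOP, so its total is half of this; either way the point lands far to the right of the ~25 GFLOPs/GB ridge, i.e. this GEMM is compute-bound):

M = N = K = 8192                               # batch_size x input_features -> output_features
flops_per_pass = 2 * M * N * K                 # one multiply + one add per output element
bytes_per_pass = 4 * (M * K + K * N + M * N)   # input + weight + output, float32

intensity = flops_per_pass / bytes_per_pass    # FLOPs/byte, numerically equal to GFLOPs/GB
print(f"{flops_per_pass / 1e9:.1f} GFLOPs per pass")       # ~1099.5
print(f"{bytes_per_pass / 1e9:.3f} GB per pass")           # ~0.805
print(f"arithmetic intensity: {intensity:.0f} GFLOPs/GB")  # ~1365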
3. Nsight Systems installation guide
Official docs: https://docs.nvidia.com/nsight-systems/UserGuide/index.html#profiling-windows-targets-from-the-gui
Nsight Systems has two parts to install: the target (server) side and the host (client) side.
Option 1: run the CLI on a Linux server to produce a report file, then open the file in the Windows client.
Linux target install (CLI package):
apt install ./NsightSystems-linux-cli-public-2023.2.1.122-3259852.deb
Windows install: download and install the "Windows Host" package.
Usage demo:
1. Generate the report on Linux:
# Example: download the generated nsight_t5_fp16 report locally and open it in the Nsight GUI
nsys profile -o nsight_t5_fp16 -f true python temp.py
2. Load the file on the Windows side:
File --> Open: load the .nsys-rep file
Option 2: run the command on a Windows server and attach the Windows client remotely to read the results live.
1. Open cmd as administrator and run:
D:\softs\nsightsystem\target-windows-x64\nsys profile -t cuda -o baseline -w true D:/softs/anacondas/envs/chatgml6/python D:/cnki_1/model_predict/test1.py
2. Open the report in the Windows host client:
File --> Open --> baseline.nsys-rep
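If you only need a quick text summary without the GUI, recent nsys versions can also aggregate an existing report on the command line (or pass --stats=true to nsys profile directly):
nsys stats baseline.nsys-rep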
4. Nsight Systems usage guide: analyzing a PyTorch neural network with Nsight Systems
PyTorch ships NVTX bindings for annotating the timeline:
import torch.cuda.nvtx as nvtx
nvtx.range_push("copy to device")  # open an annotated range
nvtx.range_pop()                   # close the range
Alternatively, use the standalone nvtx package:
import time
import nvtx

@nvtx.annotate(color="blue")
def my_function():
    for i in range(5):
        with nvtx.annotate("my_loop", color="red"):
            time.sleep(i)

my_function()
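Every range_push must be matched by a range_pop, otherwise the ranges in the timeline nest incorrectly. A small helper (hypothetical, not part of PyTorch) that guarantees the pairing even if the body raises:

from contextlib import contextmanager
import torch.cuda.nvtx as nvtx

@contextmanager
def nvtx_range(name):
    # ensures range_push/range_pop stay balanced
    nvtx.range_push(name)
    try:
        yield
    finally:
        nvtx.range_pop()

# usage: with nvtx_range("forward pass"): output = model(data)

The complete instrumented MNIST training script: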
import argparse
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torchvision import datasets, transforms
from torch.optim.lr_scheduler import StepLR
import torch.cuda.nvtx as nvtx

print(torch.cuda.is_available())

class Net(nn.Module):
    def __init__(self):
        super(Net, self).__init__()
        self.conv1 = nn.Conv2d(1, 32, 3, 1)
        self.conv2 = nn.Conv2d(32, 64, 3, 1)
        self.dropout1 = nn.Dropout(0.25)
        self.dropout2 = nn.Dropout(0.5)
        self.fc1 = nn.Linear(9216, 128)
        self.fc2 = nn.Linear(128, 10)

    def forward(self, x):
        x = self.conv1(x)
        x = F.relu(x)
        x = self.conv2(x)
        x = F.relu(x)
        x = F.max_pool2d(x, 2)
        x = self.dropout1(x)
        x = torch.flatten(x, 1)
        x = self.fc1(x)
        x = F.relu(x)
        x = self.dropout2(x)
        x = self.fc2(x)
        output = F.log_softmax(x, dim=1)
        return output

def train(args, model, device, train_loader, optimizer, epoch):
    model.train()
    nvtx.range_push("data loading")
    for batch_idx, (data, target) in enumerate(train_loader):
        nvtx.range_pop()  # data loading
        nvtx.range_push("batch:" + str(batch_idx))
        nvtx.range_push("copy to device")
        data, target = data.to(device), target.to(device)
        nvtx.range_pop()  # copy to device
        nvtx.range_push("forward pass")
        optimizer.zero_grad()
        output = model(data)
        loss = F.nll_loss(output, target)
        nvtx.range_pop()  # forward pass
        nvtx.range_push("backward pass")
        loss.backward()
        optimizer.step()
        nvtx.range_pop()  # backward pass
        nvtx.range_pop()  # batch
        if batch_idx % args.log_interval == 0:
            print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
                epoch, batch_idx * len(data), len(train_loader.dataset),
                100. * batch_idx / len(train_loader), loss.item()))
            if args.dry_run:
                break
        # reopen the data-loading range; it is popped at the top of the next iteration
        nvtx.range_push("data loading")

def main():
    # Training settings
    parser = argparse.ArgumentParser(description='PyTorch MNIST Example')
    parser.add_argument('--batch-size', type=int, default=64, metavar='N',
                        help='input batch size for training (default: 64)')
    parser.add_argument('--test-batch-size', type=int, default=1000, metavar='N',
                        help='input batch size for testing (default: 1000)')
    parser.add_argument('--epochs', type=int, default=3, metavar='N',
                        help='number of epochs to train (default: 3)')
    parser.add_argument('--lr', type=float, default=1.0, metavar='LR',
                        help='learning rate (default: 1.0)')
    parser.add_argument('--gamma', type=float, default=0.7, metavar='M',
                        help='Learning rate step gamma (default: 0.7)')
    parser.add_argument('--no-cuda', action='store_true', default=False,
                        help='disables CUDA training')
    parser.add_argument('--no-mps', action='store_true', default=False,
                        help='disables macOS GPU training')
    parser.add_argument('--dry-run', action='store_true', default=False,
                        help='quickly check a single pass')
    parser.add_argument('--seed', type=int, default=1, metavar='S',
                        help='random seed (default: 1)')
    parser.add_argument('--log-interval', type=int, default=10, metavar='N',
                        help='how many batches to wait before logging training status')
    parser.add_argument('--save-model', action='store_true', default=False,
                        help='For Saving the current Model')
    args = parser.parse_args()
    use_cuda = not args.no_cuda and torch.cuda.is_available()
    use_mps = not args.no_mps and torch.backends.mps.is_available()
    torch.manual_seed(args.seed)
    if use_cuda:
        device = torch.device("cuda")
    elif use_mps:
        device = torch.device("mps")
    else:
        device = torch.device("cpu")
    train_kwargs = {'batch_size': args.batch_size}
    test_kwargs = {'batch_size': args.test_batch_size}
    if use_cuda:
        cuda_kwargs = {'num_workers': 1,
                       'pin_memory': True,
                       'shuffle': True}
        train_kwargs.update(cuda_kwargs)
        test_kwargs.update(cuda_kwargs)
    transform = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize((0.1307,), (0.3081,))])
    dataset1 = datasets.MNIST('../data', train=True, download=True,
                              transform=transform)
    dataset2 = datasets.MNIST('../data', train=False,
                              transform=transform)
    train_loader = torch.utils.data.DataLoader(dataset1, **train_kwargs)
    test_loader = torch.utils.data.DataLoader(dataset2, **test_kwargs)
    model = Net().to(device)
    optimizer = optim.Adadelta(model.parameters(), lr=args.lr)
    scheduler = StepLR(optimizer, step_size=1, gamma=args.gamma)
    for epoch in range(1, args.epochs + 1):
        train(args, model, device, train_loader, optimizer, epoch)
        # test1111(model, device, test_loader)
        scheduler.step()
    if args.save_model:
        torch.save(model.state_dict(), "mnist_cnn.pt")

if __name__ == '__main__':
    main()
# D:\softs\nsightsystem\target-windows-x64\nsys profile -t cuda,nvtx,cudnn -o baseline -w true D:/softs/anacondas/envs/chatgml6/python D:/cnki_1/model_predict/test1.py
Load the baseline.nsys-rep file in the client.
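For long training runs it is often better to capture only a slice of the timeline. A sketch of one way to do this (the flag spellings are from recent nsys releases; check nsys profile --help on your version): start nsys with a cudaProfilerApi capture range and bracket the region of interest with torch.cuda.profiler:

import torch

# launch with:
#   nsys profile --capture-range=cudaProfilerApi --capture-range-end=stop -o baseline python test1.py
torch.cuda.profiler.start()   # nsys begins recording here
# ... run only the batches you care about ...
torch.cuda.profiler.stop()    # nsys stops recording here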
5. Nsight Compute installation guide
https://developer.nvidia.com/tools-overview/nsight-compute/get-started
Linux: download the Linux installer and run it:
sh nsight-compute-linux-2024.2.1.2-34372528.run
Default install path: /usr/local/NVIDIA-Nsight-Compute
Windows: download and run the Windows installer.
6. Nsight Compute usage guide
First list the available metric sets:
ncu --list-sets
Test with CUDA C++ code:
#include <stdio.h>

__global__ void kernel_A(double* A, int N, int M) {
    double d = 0.0;
    int idx = threadIdx.x + blockIdx.x * blockDim.x;
    // printf("Kernel A\n");
    if (idx < N) {
        #pragma unroll(100)
        for (int j = 0; j < M; ++j) {
            d += A[idx];
        }
        A[idx] = d;
    }
}

__global__ void kernel_B(double* A, int N, int M) {
    double d = 0.0;
    int idx = threadIdx.x + blockIdx.x * blockDim.x;
    if (idx < N) {
        #pragma unroll(100)
        for (int j = 0; j < M; ++j) {
            d += A[idx];
        }
        A[idx] = d;
    }
}

__global__ void kernel_C(double* A, const double* B, int N) {
    int idx = threadIdx.x + blockIdx.x * blockDim.x;
    // printf("Kernel C\n");
    // Strided memory access: warp 0 accesses (0, stride, 2*stride, ...), warp 1 accesses
    // (1, stride + 1, 2*stride + 1, ...).
    const int stride = 16;
    int strided_idx = threadIdx.x * stride + blockIdx.x % stride
                    + (blockIdx.x / stride) * stride * blockDim.x;
    if (strided_idx < N) {
        A[idx] = B[strided_idx] + B[strided_idx];
    }
}

int main() {
    double* A;
    double* B;
    int N = 80 * 2048 * 100;
    size_t sz = N * sizeof(double);
    cudaMalloc((void**)&A, sz);
    cudaMalloc((void**)&B, sz);
    cudaMemset(A, 0, sz);
    cudaMemset(B, 0, sz);
    int threadsPerBlock = 64;
    int numBlocks = (N + threadsPerBlock - 1) / threadsPerBlock;
    int M = 10000;
    kernel_A<<<numBlocks, threadsPerBlock>>>(A, N, M);
    // kernel_B runs the same code as kernel_A but is launched with 48 KB of
    // dynamic shared memory, which lowers its occupancy
    cudaFuncSetAttribute(kernel_B, cudaFuncAttributeMaxDynamicSharedMemorySize, 48 * 1024);
    kernel_B<<<numBlocks, threadsPerBlock, 48 * 1024>>>(A, N, M);
    kernel_C<<<numBlocks, threadsPerBlock>>>(A, B, N);
    cudaDeviceSynchronize();
    cudaFree(A);
    cudaFree(B);
    return 0;
}
Compile the CUDA file:
nvcc -o kernel_abc kernel_abc.cu
Profile it with Nsight Compute:
ncu --set detailed -o kernel_abc ./test/kernel_abc
Open the report in the Windows client. In the report, kernel_A and kernel_B run identical code, but kernel_B's 48 KB of dynamic shared memory limits how many blocks fit on an SM (lower occupancy), and kernel_C's stride-16 loads leave most of each memory transaction unused; these are exactly the differences the Nsight Compute sections make visible.
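When only one kernel matters, the collection can be restricted instead of profiling every launch (these ncu flags exist in current releases; see ncu --help on your version):
ncu -k kernel_C --launch-count 1 --set full -o kernel_c ./test/kernel_abc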
Python test:
import argparse
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torchvision import datasets, transforms
from torch.optim.lr_scheduler import StepLR
import torch.cuda.nvtx as nvtx

print(torch.cuda.is_available())
# print(torch.cuda.list_gpu_processes())
print(torch.cuda.device_count())

class Net(nn.Module):
    def __init__(self):
        super(Net, self).__init__()
        self.conv1 = nn.Conv2d(1, 32, 3, 1)
        self.conv2 = nn.Conv2d(32, 64, 3, 1)
        self.dropout1 = nn.Dropout(0.25)
        self.dropout2 = nn.Dropout(0.5)
        self.fc1 = nn.Linear(9216, 128)
        self.fc2 = nn.Linear(128, 10)

    def forward(self, x):
        x = self.conv1(x)
        x = F.relu(x)
        x = self.conv2(x)
        x = F.relu(x)
        x = F.max_pool2d(x, 2)
        x = self.dropout1(x)
        x = torch.flatten(x, 1)
        x = self.fc1(x)
        x = F.relu(x)
        x = self.dropout2(x)
        x = self.fc2(x)
        output = F.log_softmax(x, dim=1)
        return output

def train(args, model, device, train_loader, optimizer, epoch):
    model.train()
    nvtx.range_push("data loading")
    for batch_idx, (data, target) in enumerate(train_loader):
        nvtx.range_pop()  # data loading
        nvtx.range_push("batch:" + str(batch_idx))
        nvtx.range_push("copy to device")
        data, target = data.to(device), target.to(device)
        nvtx.range_pop()  # copy to device
        nvtx.range_push("forward pass")
        optimizer.zero_grad()
        output = model(data)
        loss = F.nll_loss(output, target)
        nvtx.range_pop()  # forward pass
        nvtx.range_push("backward pass")
        loss.backward()
        optimizer.step()
        nvtx.range_pop()  # backward pass
        nvtx.range_pop()  # batch
        if batch_idx % args.log_interval == 0:
            print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
                epoch, batch_idx * len(data), len(train_loader.dataset),
                100. * batch_idx / len(train_loader), loss.item()))
            if args.dry_run:
                break
        if batch_idx > 10:  # profile only the first dozen batches
            break
        # reopen the data-loading range; it is popped at the top of the next iteration
        nvtx.range_push("data loading")

def main():
    # Training settings
    parser = argparse.ArgumentParser(description='PyTorch MNIST Example')
    parser.add_argument('--batch-size', type=int, default=64, metavar='N',
                        help='input batch size for training (default: 64)')
    parser.add_argument('--test-batch-size', type=int, default=1000, metavar='N',
                        help='input batch size for testing (default: 1000)')
    parser.add_argument('--epochs', type=int, default=1, metavar='N',
                        help='number of epochs to train (default: 1)')
    parser.add_argument('--lr', type=float, default=1.0, metavar='LR',
                        help='learning rate (default: 1.0)')
    parser.add_argument('--gamma', type=float, default=0.7, metavar='M',
                        help='Learning rate step gamma (default: 0.7)')
    parser.add_argument('--no-cuda', action='store_true', default=False,
                        help='disables CUDA training')
    parser.add_argument('--no-mps', action='store_true', default=False,
                        help='disables macOS GPU training')
    parser.add_argument('--dry-run', action='store_true', default=False,
                        help='quickly check a single pass')
    parser.add_argument('--seed', type=int, default=1, metavar='S',
                        help='random seed (default: 1)')
    parser.add_argument('--log-interval', type=int, default=10, metavar='N',
                        help='how many batches to wait before logging training status')
    parser.add_argument('--save-model', action='store_true', default=False,
                        help='For Saving the current Model')
    args = parser.parse_args()
    use_cuda = not args.no_cuda and torch.cuda.is_available()
    use_mps = not args.no_mps and torch.backends.mps.is_available()
    torch.manual_seed(args.seed)
    if use_cuda:
        device = torch.device("cuda")
    elif use_mps:
        device = torch.device("mps")
    else:
        device = torch.device("cpu")
    train_kwargs = {'batch_size': args.batch_size}
    test_kwargs = {'batch_size': args.test_batch_size}
    if use_cuda:
        cuda_kwargs = {'num_workers': 1,
                       'pin_memory': True,
                       'shuffle': True}
        train_kwargs.update(cuda_kwargs)
        test_kwargs.update(cuda_kwargs)
    transform = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize((0.1307,), (0.3081,))])
    dataset1 = datasets.MNIST('../data', train=True, download=True,
                              transform=transform)
    dataset2 = datasets.MNIST('../data', train=False,
                              transform=transform)
    train_loader = torch.utils.data.DataLoader(dataset1, **train_kwargs)
    test_loader = torch.utils.data.DataLoader(dataset2, **test_kwargs)
    model = Net().to(device)
    optimizer = optim.Adadelta(model.parameters(), lr=args.lr)
    scheduler = StepLR(optimizer, step_size=1, gamma=args.gamma)
    for epoch in range(1, args.epochs + 1):
        train(args, model, device, train_loader, optimizer, epoch)
        # test1111(model, device, test_loader)
        scheduler.step()
    if args.save_model:
        torch.save(model.state_dict(), "mnist_cnn.pt")

if __name__ == '__main__':
    main()
Profile it with the roofline set:
ncu --set roofline -o models python test/test1.py
Load models.ncu-rep in the Windows client. Note that ncu replays every kernel launch it profiles, so even a short training script can take a long time; the NVTX ranges added above can narrow the collection, as sketched below.
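A sketch of NVTX-based filtering (the --nvtx and --nvtx-include flags exist in current ncu releases; the exact include-expression syntax for push/pop ranges varies by version, so check the Nsight Compute documentation):
ncu --nvtx --nvtx-include "forward pass/" --set roofline -o models_fwd python test/test1.py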
7. Differences between Nsight Systems and Nsight Compute
Nsight Systems takes the whole-program view: kernel launches and scheduling, memory transfers and occupancy over time, and the asynchronous interplay between CPU and GPU.
Nsight Compute looks inside individual kernels: execution time of the different hardware units, throughput, bandwidth analysis, and so on.
8. Problems encountered
8.1 Loading hangs at 98%. Cause: the laptop was too weak to process the report; switching to a desktop machine fixed it.
8.2 ERROR ERR_NVGPUCTRPERM - The user does not have permission to access NVIDIA GPU Performance Counters on the target device 0. For instructions on enabling permissions and to get more information see https://developer.nvidia.com/ERR_NVGPUCTRPERM
Fix: run with root privileges. In Docker, start the container privileged:
docker run -it --gpus all --name llm-04 -v /jiayafei_linux/:/home/ -p 38014:8014 --privileged=true pytorch/pytorch:2.1.0-cuda12.1-cudnn8-devel /bin/bash
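On a bare-metal Linux host the same error can be fixed by allowing non-admin access to the performance counters, which is the fix described on NVIDIA's ERR_NVGPUCTRPERM page (the .conf file name below is arbitrary; reboot or reload the nvidia module afterwards):
echo 'options nvidia NVreg_RestrictProfilingToAdminUsers=0' | sudo tee /etc/modprobe.d/nvidia-profiler.conf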