from transformers import LlamaTokenizer, LlamaForCausalLM, GenerationConfig

# Load LLaMA-7B in 8-bit; device_map="auto" lets Accelerate place the
# weights across the available GPUs (and CPU, if necessary).
model = LlamaForCausalLM.from_pretrained(
    "decapoda-research/llama-7b-hf",
    load_in_8bit=True,
    device_map="auto",
)

# Inspect where each parameter ended up and in which dtype.
for i, (name, para) in enumerate(model.named_parameters()):
    print(f'{i}\t{para.device}\t{para.dtype}')
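# To actually generate text with the 8-bit model, the LlamaTokenizer and
# GenerationConfig imported above come into play. A minimal sketch, assuming
# the same decapoda-research checkpoint; the prompt and sampling parameters
# below are placeholders, not tuned values.
import torch

tokenizer = LlamaTokenizer.from_pretrained("decapoda-research/llama-7b-hf")

prompt = "Explain model parallelism in one sentence."  # placeholder prompt
# Inputs go to the device that holds the first model shard (the embeddings).
inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

generation_config = GenerationConfig(
    do_sample=True,
    temperature=0.7,
    top_p=0.9,
    max_new_tokens=64,
)

with torch.no_grad():
    output_ids = model.generate(**inputs, generation_config=generation_config)
print(tokenizer.decode(output_ids[0], skip_special_tokens=True))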
import torch
import torch.nn as nn
import torch.optim as optim

class ToyModel(nn.Module):
    """A toy model split across two GPUs: net1 on cuda:0, net2 on cuda:1."""

    def __init__(self):
        super().__init__()
        self.net1 = nn.Linear(10000, 10).to('cuda:0')
        self.relu = nn.ReLU()
        self.net2 = nn.Linear(10, 5).to('cuda:1')

    def forward(self, x):
        # Run the first stage on cuda:0, then move the activation to cuda:1.
        x = self.relu(self.net1(x.to('cuda:0')))
        return self.net2(x.to('cuda:1'))

model = ToyModel()
loss_fn = nn.MSELoss()
optimizer = optim.SGD(model.parameters(), lr=0.001)

optimizer.zero_grad()
outputs = model(torch.randn(20, 10000))
# Labels must live on the same device as the model's output (cuda:1).
labels = torch.randn(20, 5).to('cuda:1')
loss_fn(outputs, labels).backward()
optimizer.step()
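# backward() retraces the forward hops in reverse, so each parameter's
# gradient is produced on that parameter's own device; no manual copies are
# needed. A quick check after the step above (a sketch, assuming the same
# two-GPU setup as ToyModel):
for name, p in model.named_parameters():
    grad_device = p.grad.device if p.grad is not None else None
    print(f'{name}: param on {p.device}, grad on {grad_device}')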
import torch
from torch import nn
from torchvision.models.resnet import ResNet, Bottleneck

# Baseline: a single-device ResNet-50 (Bottleneck blocks, [3, 4, 6, 3] layout).
model = ResNet(Bottleneck, [3, 4, 6, 3])

from torchsummary import summary
summary(model, input_size=(3, 128, 128), device='cpu')

class ModelParallelResNet50(ResNet):
    """ResNet-50 split across two GPUs: the early stages on cuda:0, the
    later stages and the classifier head on cuda:1."""

    def __init__(self, num_classes=1000):
        super().__init__(Bottleneck, [3, 4, 6, 3], num_classes=num_classes)

        self.seq1 = nn.Sequential(
            self.conv1,
            self.bn1,
            self.relu,
            self.maxpool,
            self.layer1,
            self.layer2,
        ).to('cuda:0')

        self.seq2 = nn.Sequential(
            self.layer3,
            self.layer4,
            self.avgpool,
        ).to('cuda:1')

        self.fc.to('cuda:1')

    def forward(self, x):
        # Move the intermediate activation from cuda:0 to cuda:1 between stages.
        x = self.seq2(self.seq1(x).to('cuda:1'))
        return self.fc(x.view(x.size(0), -1))

def model_size(model):
    # Total number of parameters.
    return sum(para.numel() for para in model.parameters())
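# Sanity check with the helper above: the single-device ResNet-50 and the
# model-parallel split hold the same weights, so their parameter counts
# match. A sketch (exact figures depend on the torchvision version):
baseline = ResNet(Bottleneck, [3, 4, 6, 3])
print(f'{model_size(baseline) / 1e6:.1f}M parameters')  # roughly 25.6M for ResNet-50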
num_classes = 1000

# Build random one-hot labels for a synthetic batch of 5 samples.
one_hot_indices = torch.LongTensor(5) \
                       .random_(0, num_classes) \
                       .view(5, 1)
labels = torch.zeros(5, num_classes) \
              .scatter_(1, one_hot_indices, 1)
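# With the synthetic labels in place, a training step mirrors the toy
# example: inputs enter on cuda:0, and the labels must sit on cuda:1, where
# the output lives. A minimal sketch, assuming two GPUs; the 5-image batch
# of 3x128x128 inputs is illustrative.
model = ModelParallelResNet50(num_classes=num_classes)
loss_fn = nn.MSELoss()
optimizer = optim.SGD(model.parameters(), lr=0.001)

inputs = torch.randn(5, 3, 128, 128)

optimizer.zero_grad()
outputs = model(inputs.to('cuda:0'))
loss_fn(outputs, labels.to('cuda:1')).backward()
optimizer.step()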