Skip to content

Reference for ultralytics/nn/tasks.py

Note

This file is available at https://github.com/ultralytics/ultralytics/blob/main/ultralytics/nn/tasks.py. If you spot a problem please help fix it by contributing a Pull Request 🛠️. Thank you 🙏!


ultralytics.nn.tasks.BaseModel

Bases: Module

Base class for all YOLO models in the Ultralytics family.

This class provides common functionality for YOLO models including forward pass handling, model fusion, information display, and weight loading capabilities.

Attributes:

Name Type Description
model Module

The neural network model.

save list

List of layer indices to save outputs from.

stride Tensor

Model stride values.

Methods:

Name Description
forward

Perform forward pass for training or inference.

predict

Perform inference on input tensor.

fuse

Fuse Conv2d and BatchNorm2d layers for optimization.

info

Print model information.

load

Load weights into the model.

loss

Compute loss for training.

Examples:

Create a BaseModel instance

>>> model = BaseModel()
>>> model.info()  # Display model information

forward

forward(x, *args, **kwargs)

Perform forward pass of the model for either training or inference.

If x is a dict, calculates and returns the loss for training. Otherwise, returns predictions for inference.

Parameters:

Name Type Description Default
x Tensor | dict

Input tensor for inference, or dict with image tensor and labels for training.

required
*args Any

Variable length argument list.

()
**kwargs Any

Arbitrary keyword arguments.

{}

Returns:

Type Description
Tensor

Loss if x is a dict (training), or network predictions (inference).

Source code in ultralytics/nn/tasks.py
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
def forward(self, x, *args, **kwargs):
    """
    Route the input to the loss or the prediction path depending on its type.

    A dict is treated as a labelled batch and handed to `self.loss()`; anything else is handed to
    `self.predict()`. Extra positional and keyword arguments are forwarded unchanged.

    Args:
        x (torch.Tensor | dict): Input tensor for inference, or dict with image tensor and labels for training.
        *args (Any): Variable length argument list passed through to the selected method.
        **kwargs (Any): Arbitrary keyword arguments passed through to the selected method.

    Returns:
        (torch.Tensor): Loss if x is a dict (training), or network predictions (inference).
    """
    # A dict input covers training as well as validation performed while training.
    handler = self.loss if isinstance(x, dict) else self.predict
    return handler(x, *args, **kwargs)

fuse

fuse(verbose=True)

Fuse the Conv2d() and BatchNorm2d() layers of the model into a single layer for improved computation efficiency.

Returns:

Type Description
Module

The fused model is returned.

Source code in ultralytics/nn/tasks.py
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
def fuse(self, verbose=True):
    """
    Fuse convolution and batch-normalization layers of the model in place for faster inference.

    Nothing is changed when `self.is_fused()` already reports a fused model. Otherwise every submodule of
    `self.model` is visited, supported layer types are fused, and `self.info()` is called once at the end.

    Args:
        verbose (bool): Forwarded to `self.info()` after fusing.

    Returns:
        (torch.nn.Module): The model itself (`self`), fused in place.
    """
    if self.is_fused():
        return self

    for layer in self.model.modules():
        # Conv-style layers: fold the batchnorm into the convolution.
        if isinstance(layer, (Conv, Conv2, DWConv)) and hasattr(layer, "bn"):
            if isinstance(layer, Conv2):
                layer.fuse_convs()
            layer.conv = fuse_conv_and_bn(layer.conv, layer.bn)
            del layer.bn  # the batchnorm is no longer needed
            layer.forward = layer.forward_fuse
        # Transposed convolutions use their own fusion helper.
        if isinstance(layer, ConvTranspose) and hasattr(layer, "bn"):
            layer.conv_transpose = fuse_deconv_and_bn(layer.conv_transpose, layer.bn)
            del layer.bn  # the batchnorm is no longer needed
            layer.forward = layer.forward_fuse
        if isinstance(layer, RepConv):
            layer.fuse_convs()
            layer.forward = layer.forward_fuse
        if isinstance(layer, RepVGGDW):
            layer.fuse()
            layer.forward = layer.forward_fuse
        if isinstance(layer, v10Detect):
            layer.fuse()  # remove one2many head
    self.info(verbose=verbose)
    return self

info

info(detailed=False, verbose=True, imgsz=640)

Print model information.

Parameters:

Name Type Description Default
detailed bool

If True, prints out detailed information about the model.

False
verbose bool

If True, prints out the model information.

True
imgsz int

The size of the image that the model will be trained on.

640
Source code in ultralytics/nn/tasks.py
268
269
270
271
272
273
274
275
276
277
def info(self, detailed=False, verbose=True, imgsz=640):
    """
    Report model information by delegating to `model_info()`.

    Args:
        detailed (bool): If True, prints out detailed information about the model.
        verbose (bool): If True, prints out the model information.
        imgsz (int): The size of the image that the model will be trained on.

    Returns:
        The value returned by `model_info()` for this model, passed through unchanged.
    """
    summary = model_info(self, detailed=detailed, verbose=verbose, imgsz=imgsz)
    return summary

init_criterion

init_criterion()

Initialize the loss criterion for the BaseModel.

Source code in ultralytics/nn/tasks.py
339
340
341
def init_criterion(self):
    """
    Initialize the loss criterion for the BaseModel.

    The base class has no criterion of its own; task-specific subclasses override this method.

    Raises:
        NotImplementedError: Always, because the base class does not define a criterion.
    """
    # The message names this method so the traceback points at what a subclass must override.
    raise NotImplementedError("init_criterion() needs to be implemented by task heads")

is_fused

is_fused(thresh=10)

Check if the model has fewer than a certain threshold of BatchNorm layers.

Parameters:

Name Type Description Default
thresh int

The threshold number of BatchNorm layers.

10

Returns:

Type Description
bool

True if the number of BatchNorm layers in the model is less than the threshold, False otherwise.

Source code in ultralytics/nn/tasks.py
255
256
257
258
259
260
261
262
263
264
265
266
def is_fused(self, thresh=10):
    """
    Report whether the model contains fewer normalization layers than a threshold.

    Every class exposed by `torch.nn` whose name contains "Norm" is counted as a normalization layer.

    Args:
        thresh (int, optional): The threshold number of normalization layers.

    Returns:
        (bool): True if the number of normalization layers in the model is less than the threshold,
            False otherwise.
    """
    # Collect all torch.nn classes with "Norm" in their name, e.g. BatchNorm2d.
    norm_types = tuple(cls for name, cls in vars(torch.nn).items() if "Norm" in name)
    norm_count = 0
    for layer in self.modules():
        if isinstance(layer, norm_types):
            norm_count += 1
    return norm_count < thresh

load

load(weights, verbose=True)

Load weights into the model.

Parameters:

Name Type Description Default
weights dict | Module

The pre-trained weights to be loaded.

required
verbose bool

Whether to log the transfer progress.

True
Source code in ultralytics/nn/tasks.py
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
def load(self, weights, verbose=True):
    """
    Load weights into the model.

    Entries selected by `intersect_dicts()` are loaded non-strictly. If the first convolution weight
    (`model.0.conv.weight`) was not transferred that way but exists in both the checkpoint and this model with
    the same kernel height and width, the overlapping output/input channels are copied across.

    Args:
        weights (dict | torch.nn.Module): The pre-trained weights to be loaded. A dict must hold the source
            module under the `"model"` key.
        verbose (bool, optional): Whether to log the transfer progress.
    """
    model = weights["model"] if isinstance(weights, dict) else weights  # torchvision models are not dicts
    csd = model.float().state_dict()  # checkpoint state_dict as FP32
    updated_csd = intersect_dicts(csd, self.state_dict())  # intersect
    self.load_state_dict(updated_csd, strict=False)  # load
    len_updated_csd = len(updated_csd)
    first_conv = "model.0.conv.weight"  # hard-coded to yolo models for now
    # mostly used to boost multi-channel training
    state_dict = self.state_dict()
    # The checkpoint must also contain the key; otherwise indexing `csd` would raise KeyError for
    # checkpoints with a different layer naming scheme.
    if first_conv not in updated_csd and first_conv in state_dict and first_conv in csd:
        c1, c2, h, w = state_dict[first_conv].shape
        cc1, cc2, ch, cw = csd[first_conv].shape
        if ch == h and cw == w:
            # Copy only the channel range both tensors share; the write goes through to the live weights.
            c1, c2 = min(c1, cc1), min(c2, cc2)
            state_dict[first_conv][:c1, :c2] = csd[first_conv][:c1, :c2]
            len_updated_csd += 1
    if verbose:
        LOGGER.info(f"Transferred {len_updated_csd}/{len(self.model.state_dict())} items from pretrained weights")

loss

loss(batch, preds=None)

Compute loss.

Parameters:

Name Type Description Default
batch dict

Batch to compute loss on.

required
preds Tensor | List[Tensor]

Predictions.

None
Source code in ultralytics/nn/tasks.py
325
326
327
328
329
330
331
332
333
334
335
336
337
def loss(self, batch, preds=None):
    """
    Compute the loss for a batch, creating the criterion on first use.

    Args:
        batch (dict): Batch to compute loss on. `batch["img"]` is read when predictions must be computed.
        preds (torch.Tensor | List[torch.Tensor], optional): Predictions. When None, they are obtained by
            calling `self.forward()` on the batch images.

    Returns:
        The value returned by `self.criterion(preds, batch)`.
    """
    # Build the criterion lazily; a missing attribute and an attribute set to None are treated alike.
    existing = getattr(self, "criterion", None)
    if existing is None:
        self.criterion = self.init_criterion()

    if preds is None:
        preds = self.forward(batch["img"])
    return self.criterion(preds, batch)

predict

predict(x, profile=False, visualize=False, augment=False, embed=None)

Perform a forward pass through the network.

Parameters:

Name Type Description Default
x Tensor

The input tensor to the model.

required
profile bool

Print the computation time of each layer if True.

False
visualize bool

Save the feature maps of the model if True.

False
augment bool

Augment image during prediction.

False
embed list

A list of feature vectors/embeddings to return.

None

Returns:

Type Description
Tensor

The last output of the model.

Source code in ultralytics/nn/tasks.py
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
def predict(self, x, profile=False, visualize=False, augment=False, embed=None):
    """
    Run the network on an input, optionally using augmented inference.

    Args:
        x (torch.Tensor): The input tensor to the model.
        profile (bool): Print the computation time of each layer if True.
        visualize (bool): Save the feature maps of the model if True.
        augment (bool): Augment image during prediction. When True, only `x` is forwarded to
            `self._predict_augment()`; the other options are not used.
        embed (list, optional): A list of feature vectors/embeddings to return.

    Returns:
        (torch.Tensor): The last output of the model.
    """
    return self._predict_augment(x) if augment else self._predict_once(x, profile, visualize, embed)





ultralytics.nn.tasks.DetectionModel

DetectionModel(cfg='yolo11n.yaml', ch=3, nc=None, verbose=True)

Bases: BaseModel

YOLO detection model.

This class implements the YOLO detection architecture, handling model initialization, forward pass, augmented inference, and loss computation for object detection tasks.

Attributes:

Name Type Description
yaml dict

Model configuration dictionary.

model Sequential

The neural network model.

save list

List of layer indices to save outputs from.

names dict

Class names dictionary.

inplace bool

Whether to use inplace operations.

end2end bool

Whether the model uses end-to-end detection.

stride Tensor

Model stride values.

Methods:

Name Description
_predict_augment

Perform augmented inference.

_descale_pred

De-scale predictions following augmented inference.

_clip_augmented

Clip YOLO augmented inference tails.

init_criterion

Initialize the loss criterion.

Examples:

Initialize a detection model

>>> model = DetectionModel("yolo11n.yaml", ch=3, nc=80)
>>> results = model.predict(image_tensor)

Parameters:

Name Type Description Default
cfg str | dict

Model configuration file path or dictionary.

'yolo11n.yaml'
ch int

Number of input channels.

3
nc int

Number of classes.

None
verbose bool

Whether to display model information.

True
Source code in ultralytics/nn/tasks.py
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
def __init__(self, cfg="yolo11n.yaml", ch=3, nc=None, verbose=True):
    """
    Initialize the YOLO detection model with the given config and parameters.

    The layer stack is built from the configuration, the strides are derived from one dummy forward pass when
    the last layer is a `Detect` instance, and weights are initialized afterwards.

    Args:
        cfg (str | dict): Model configuration file path or dictionary. A dict is stored as `self.yaml` without
            copying and is updated in place (`channels`, `nc` when overridden, and the first backbone entry
            when it is the deprecated `Silence` module).
        ch (int): Number of input channels.
        nc (int, optional): Number of classes. A falsy value keeps the value from the configuration.
        verbose (bool): Whether to display model information.
    """
    super().__init__()
    self.yaml = cfg if isinstance(cfg, dict) else yaml_model_load(cfg)  # cfg dict
    # Configs whose first backbone layer is the deprecated `Silence` module are rewritten to use nn.Identity.
    if self.yaml["backbone"][0][2] == "Silence":
        LOGGER.warning(
            "YOLOv9 `Silence` module is deprecated in favor of torch.nn.Identity. "
            "Please delete local *.pt file and re-download the latest model checkpoint."
        )
        self.yaml["backbone"][0][2] = "nn.Identity"

    # Define model
    self.yaml["channels"] = ch  # save channels
    if nc and nc != self.yaml["nc"]:
        LOGGER.info(f"Overriding model.yaml nc={self.yaml['nc']} with nc={nc}")
        self.yaml["nc"] = nc  # override YAML value
    # parse_model receives a deep copy, so self.yaml itself is not handed to the parser.
    self.model, self.save = parse_model(deepcopy(self.yaml), ch=ch, verbose=verbose)  # model, savelist
    self.names = {i: f"{i}" for i in range(self.yaml["nc"])}  # default names dict
    self.inplace = self.yaml.get("inplace", True)
    self.end2end = getattr(self.model[-1], "end2end", False)

    # Build strides
    m = self.model[-1]  # Detect()
    if isinstance(m, Detect):  # includes all Detect subclasses like Segment, Pose, OBB, YOLOEDetect, YOLOESegment
        s = 256  # 2x min stride
        m.inplace = self.inplace

        def _forward(x):
            """Perform a forward pass through the model, handling different Detect subclass types accordingly."""
            if self.end2end:
                return self.forward(x)["one2many"]
            return self.forward(x)[0] if isinstance(m, (Segment, YOLOESegment, Pose, OBB)) else self.forward(x)

        self.model.eval()  # Avoid changing batch statistics until training begins
        m.training = True  # Setting it to True to properly return strides
        # Run a zero image of size s x s through the model; each stride is s divided by the size of the
        # second-to-last dimension of the corresponding output.
        m.stride = torch.tensor([s / x.shape[-2] for x in _forward(torch.zeros(1, ch, s, s))])  # forward
        self.stride = m.stride
        self.model.train()  # Set model back to training(default) mode
        m.bias_init()  # only run once
    else:
        self.stride = torch.Tensor([32])  # default stride for i.e. RTDETR

    # Init weights, biases
    initialize_weights(self)
    if verbose:
        self.info()
        LOGGER.info("")

init_criterion

init_criterion()

Initialize the loss criterion for the DetectionModel.

Source code in ultralytics/nn/tasks.py
496
497
498
def init_criterion(self):
    """
    Initialize the loss criterion for the DetectionModel.

    Returns:
        An `E2EDetectLoss` when the model's `end2end` attribute is truthy, otherwise a `v8DetectionLoss`.
    """
    # A missing `end2end` attribute is treated as False.
    if getattr(self, "end2end", False):
        return E2EDetectLoss(self)
    return v8DetectionLoss(self)





ultralytics.nn.tasks.OBBModel

OBBModel(cfg='yolo11n-obb.yaml', ch=3, nc=None, verbose=True)

Bases: DetectionModel

YOLO Oriented Bounding Box (OBB) model.

This class extends DetectionModel to handle oriented bounding box detection tasks, providing specialized loss computation for rotated object detection.

Methods:

Name Description
init_criterion

Initialize the loss criterion for OBB detection.

Examples:

Initialize an OBB model

>>> model = OBBModel("yolo11n-obb.yaml", ch=3, nc=80)
>>> results = model.predict(image_tensor)

Parameters:

Name Type Description Default
cfg str | dict

Model configuration file path or dictionary.

'yolo11n-obb.yaml'
ch int

Number of input channels.

3
nc int

Number of classes.

None
verbose bool

Whether to display model information.

True
Source code in ultralytics/nn/tasks.py
518
519
520
521
522
523
524
525
526
527
528
def __init__(self, cfg="yolo11n-obb.yaml", ch=3, nc=None, verbose=True):
    """
    Initialize YOLO OBB model with given config and parameters.

    All arguments are handed to the parent constructor unchanged.

    Args:
        cfg (str | dict): Model configuration file path or dictionary.
        ch (int): Number of input channels.
        nc (int, optional): Number of classes.
        verbose (bool): Whether to display model information.
    """
    parent_kwargs = {"cfg": cfg, "ch": ch, "nc": nc, "verbose": verbose}
    super().__init__(**parent_kwargs)

init_criterion

init_criterion()

Initialize the loss criterion for the model.

Source code in ultralytics/nn/tasks.py
530
531
532
def init_criterion(self):
    """
    Initialize the loss criterion for the OBB model.

    Returns:
        A `v8OBBLoss` instance constructed from this model.
    """
    criterion = v8OBBLoss(self)
    return criterion





ultralytics.nn.tasks.SegmentationModel

SegmentationModel(cfg='yolo11n-seg.yaml', ch=3, nc=None, verbose=True)

Bases: DetectionModel

YOLO segmentation model.

This class extends DetectionModel to handle instance segmentation tasks, providing specialized loss computation for pixel-level object detection and segmentation.

Methods:

Name Description
init_criterion

Initialize the loss criterion for segmentation.

Examples:

Initialize a segmentation model

>>> model = SegmentationModel("yolo11n-seg.yaml", ch=3, nc=80)
>>> results = model.predict(image_tensor)

Parameters:

Name Type Description Default
cfg str | dict

Model configuration file path or dictionary.

'yolo11n-seg.yaml'
ch int

Number of input channels.

3
nc int

Number of classes.

None
verbose bool

Whether to display model information.

True
Source code in ultralytics/nn/tasks.py
552
553
554
555
556
557
558
559
560
561
562
def __init__(self, cfg="yolo11n-seg.yaml", ch=3, nc=None, verbose=True):
    """
    Initialize Ultralytics YOLO segmentation model with given config and parameters.

    All arguments are handed to the parent constructor unchanged.

    Args:
        cfg (str | dict): Model configuration file path or dictionary.
        ch (int): Number of input channels.
        nc (int, optional): Number of classes.
        verbose (bool): Whether to display model information.
    """
    parent_kwargs = {"cfg": cfg, "ch": ch, "nc": nc, "verbose": verbose}
    super().__init__(**parent_kwargs)

init_criterion

init_criterion()

Initialize the loss criterion for the SegmentationModel.

Source code in ultralytics/nn/tasks.py
564
565
566
def init_criterion(self):
    """
    Initialize the loss criterion for the SegmentationModel.

    Returns:
        A `v8SegmentationLoss` instance constructed from this model.
    """
    criterion = v8SegmentationLoss(self)
    return criterion





ultralytics.nn.tasks.PoseModel

PoseModel(
    cfg="yolo11n-pose.yaml",
    ch=3,
    nc=None,
    data_kpt_shape=(None, None),
    verbose=True,
)

Bases: DetectionModel

YOLO pose model.

This class extends DetectionModel to handle human pose estimation tasks, providing specialized loss computation for keypoint detection and pose estimation.

Attributes:

Name Type Description
kpt_shape tuple

Shape of keypoints data (num_keypoints, num_dimensions).

Methods:

Name Description
init_criterion

Initialize the loss criterion for pose estimation.

Examples:

Initialize a pose model

>>> model = PoseModel("yolo11n-pose.yaml", ch=3, nc=1, data_kpt_shape=(17, 3))
>>> results = model.predict(image_tensor)

Parameters:

Name Type Description Default
cfg str | dict

Model configuration file path or dictionary.

'yolo11n-pose.yaml'
ch int

Number of input channels.

3
nc int

Number of classes.

None
data_kpt_shape tuple

Shape of keypoints data.

(None, None)
verbose bool

Whether to display model information.

True
Source code in ultralytics/nn/tasks.py
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
def __init__(self, cfg="yolo11n-pose.yaml", ch=3, nc=None, data_kpt_shape=(None, None), verbose=True):
    """
    Initialize Ultralytics YOLO Pose model.

    The configuration is resolved to a dict first so that its `kpt_shape` entry can be overridden by the
    dataset's keypoint shape before the parent constructor builds the model.

    Args:
        cfg (str | dict): Model configuration file path or dictionary. A dict is updated in place when the
            keypoint shape is overridden.
        ch (int): Number of input channels.
        nc (int, optional): Number of classes.
        data_kpt_shape (tuple): Shape of keypoints data. Ignored when none of its entries is truthy.
        verbose (bool): Whether to display model information.
    """
    # Anything that is not already a dict is loaded as a model YAML.
    cfg = cfg if isinstance(cfg, dict) else yaml_model_load(cfg)
    if any(data_kpt_shape):
        if list(data_kpt_shape) != list(cfg["kpt_shape"]):
            LOGGER.info(f"Overriding model.yaml kpt_shape={cfg['kpt_shape']} with kpt_shape={data_kpt_shape}")
            cfg["kpt_shape"] = data_kpt_shape
    super().__init__(cfg=cfg, ch=ch, nc=nc, verbose=verbose)

init_criterion

init_criterion()

Initialize the loss criterion for the PoseModel.

Source code in ultralytics/nn/tasks.py
607
608
609
def init_criterion(self):
    """
    Initialize the loss criterion for the PoseModel.

    Returns:
        A `v8PoseLoss` instance constructed from this model.
    """
    criterion = v8PoseLoss(self)
    return criterion





ultralytics.nn.tasks.ClassificationModel

ClassificationModel(cfg='yolo11n-cls.yaml', ch=3, nc=None, verbose=True)

Bases: BaseModel

YOLO classification model.

This class implements the YOLO classification architecture for image classification tasks, providing model initialization, configuration, and output reshaping capabilities.

Attributes:

Name Type Description
yaml dict

Model configuration dictionary.

model Sequential

The neural network model.

stride Tensor

Model stride values.

names dict

Class names dictionary.

Methods:

Name Description
_from_yaml

Set model configurations and define architecture.

reshape_outputs

Update model to specified class count.

init_criterion

Initialize the loss criterion.

Examples:

Initialize a classification model

>>> model = ClassificationModel("yolo11n-cls.yaml", ch=3, nc=1000)
>>> results = model.predict(image_tensor)

Parameters:

Name Type Description Default
cfg str | dict

Model configuration file path or dictionary.

'yolo11n-cls.yaml'
ch int

Number of input channels.

3
nc int

Number of classes.

None
verbose bool

Whether to display model information.

True
Source code in ultralytics/nn/tasks.py
637
638
639
640
641
642
643
644
645
646
647
648
def __init__(self, cfg="yolo11n-cls.yaml", ch=3, nc=None, verbose=True):
    """
    Initialize ClassificationModel with YAML, channels, number of classes, verbose flag.

    The parent constructor is called without arguments; all model construction is delegated to
    `self._from_yaml()`.

    Args:
        cfg (str | dict): Model configuration file path or dictionary.
        ch (int): Number of input channels.
        nc (int, optional): Number of classes.
        verbose (bool): Whether to display model information.
    """
    super().__init__()
    # Arguments are forwarded positionally, in this order, to the YAML-based builder.
    self._from_yaml(cfg, ch, nc, verbose)

init_criterion

init_criterion()

Initialize the loss criterion for the ClassificationModel.

Source code in ultralytics/nn/tasks.py
703
704
705
def init_criterion(self):
    """
    Initialize the loss criterion for the ClassificationModel.

    Returns:
        A `v8ClassificationLoss` instance; the model itself is not passed to it.
    """
    criterion = v8ClassificationLoss()
    return criterion

reshape_outputs staticmethod

reshape_outputs(model, nc)

Update a TorchVision classification model to class count 'nc' if required.

Parameters:

Name Type Description Default
model Module

Model to update.

required
nc int

New number of classes.

required
Source code in ultralytics/nn/tasks.py
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
@staticmethod
def reshape_outputs(model, nc):
    """
    Update the output layer of a classification model to `nc` classes if required.

    The last child module is inspected on `model.model` when that attribute exists, otherwise on `model`
    itself. Nothing is changed when the output layer already produces `nc` outputs or when the last child is
    of an unsupported type.

    Args:
        model (torch.nn.Module): Model to update. It is modified in place.
        nc (int): New number of classes.
    """
    # The module that owns the last child; a replacement layer must be registered on this same module.
    parent = model.model if hasattr(model, "model") else model
    name, m = list(parent.named_children())[-1]  # last module
    if isinstance(m, Classify):  # YOLO Classify() head
        if m.linear.out_features != nc:
            m.linear = torch.nn.Linear(m.linear.in_features, nc)
    elif isinstance(m, torch.nn.Linear):  # ResNet, EfficientNet
        if m.out_features != nc:
            # Registering on `parent` (not `model`) replaces the real head when the layers live in
            # `model.model`, instead of adding an unused attribute to the wrapper.
            setattr(parent, name, torch.nn.Linear(m.in_features, nc))
    elif isinstance(m, torch.nn.Sequential):
        types = [type(x) for x in m]
        if torch.nn.Linear in types:
            i = len(types) - 1 - types[::-1].index(torch.nn.Linear)  # last torch.nn.Linear index
            if m[i].out_features != nc:
                m[i] = torch.nn.Linear(m[i].in_features, nc)
        elif torch.nn.Conv2d in types:
            i = len(types) - 1 - types[::-1].index(torch.nn.Conv2d)  # last torch.nn.Conv2d index
            if m[i].out_channels != nc:
                # Only kernel size, stride and the presence of a bias are carried over to the new layer.
                m[i] = torch.nn.Conv2d(
                    m[i].in_channels, nc, m[i].kernel_size, m[i].stride, bias=m[i].bias is not None
                )





ultralytics.nn.tasks.RTDETRDetectionModel

RTDETRDetectionModel(cfg='rtdetr-l.yaml', ch=3, nc=None, verbose=True)

Bases: DetectionModel

RTDETR (Real-Time DEtection TRansformer) Detection Model class.

This class is responsible for constructing the RTDETR architecture, defining loss functions, and facilitating both the training and inference processes. RTDETR is an object detection model that extends from the DetectionModel base class.

Attributes:

Name Type Description
nc int

Number of classes for detection.

criterion RTDETRDetectionLoss

Loss function for training.

Methods:

Name Description
init_criterion

Initialize the loss criterion.

loss

Compute loss for training.

predict

Perform forward pass through the model.

Examples:

Initialize an RTDETR model

>>> model = RTDETRDetectionModel("rtdetr-l.yaml", ch=3, nc=80)
>>> results = model.predict(image_tensor)

Parameters:

Name Type Description Default
cfg str | dict

Configuration file name or path.

'rtdetr-l.yaml'
ch int

Number of input channels.

3
nc int

Number of classes.

None
verbose bool

Print additional information during initialization.

True
Source code in ultralytics/nn/tasks.py
732
733
734
735
736
737
738
739
740
741
742
def __init__(self, cfg="rtdetr-l.yaml", ch=3, nc=None, verbose=True):
    """
    Initialize the RTDETRDetectionModel.

    All arguments are handed to the parent constructor unchanged.

    Args:
        cfg (str | dict): Configuration file name or path.
        ch (int): Number of input channels.
        nc (int, optional): Number of classes.
        verbose (bool): Print additional information during initialization.
    """
    parent_kwargs = {"cfg": cfg, "ch": ch, "nc": nc, "verbose": verbose}
    super().__init__(**parent_kwargs)

init_criterion

init_criterion()

Initialize the loss criterion for the RTDETRDetectionModel.

Source code in ultralytics/nn/tasks.py
744
745
746
747
748
def init_criterion(self):
    """
    Initialize the loss criterion for the RTDETRDetectionModel.

    Returns:
        An `RTDETRDetectionLoss` built with this model's `nc` and `use_vfl=True`.
    """
    # Imported inside the method rather than at module level.
    from ultralytics.models.utils.loss import RTDETRDetectionLoss

    criterion = RTDETRDetectionLoss(nc=self.nc, use_vfl=True)
    return criterion

loss

loss(batch, preds=None)

Compute the loss for the given batch of data.

Parameters:

Name Type Description Default
batch dict

Dictionary containing image and label data.

required
preds Tensor

Precomputed model predictions.

None

Returns:

Name Type Description
loss_sum Tensor

Total loss value.

loss_items Tensor

Main three losses in a tensor.

Source code in ultralytics/nn/tasks.py
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
def loss(self, batch, preds=None):
    """
    Compute the loss for the given batch of data.

    Args:
        batch (dict): Dictionary containing image and label data. The keys `img`, `batch_idx`, `cls` and
            `bboxes` are read.
        preds (tuple, optional): Precomputed model predictions. When None, `self.predict()` is called with the
            images and the prepared targets. In training mode it is unpacked directly into five items;
            otherwise its second element is unpacked.

    Returns:
        loss_sum (torch.Tensor): Total loss value.
        loss_items (torch.Tensor): Main three losses in a tensor, ordered `loss_giou`, `loss_class`,
            `loss_bbox`.
    """
    # Rebuild the criterion when the attribute is missing OR set to None, consistent with BaseModel.loss().
    # A plain hasattr() check would leave a None criterion in place and fail when it is called below.
    if getattr(self, "criterion", None) is None:
        self.criterion = self.init_criterion()

    img = batch["img"]
    # NOTE: preprocess gt_bbox and gt_labels to list.
    bs = len(img)
    batch_idx = batch["batch_idx"]
    gt_groups = [(batch_idx == i).sum().item() for i in range(bs)]  # number of targets per image
    targets = {
        "cls": batch["cls"].to(img.device, dtype=torch.long).view(-1),
        "bboxes": batch["bboxes"].to(device=img.device),
        "batch_idx": batch_idx.to(img.device, dtype=torch.long).view(-1),
        "gt_groups": gt_groups,
    }

    preds = self.predict(img, batch=targets) if preds is None else preds
    dec_bboxes, dec_scores, enc_bboxes, enc_scores, dn_meta = preds if self.training else preds[1]
    if dn_meta is None:
        dn_bboxes, dn_scores = None, None
    else:
        # Separate the denoising part from the regular decoder outputs along dim 2.
        dn_bboxes, dec_bboxes = torch.split(dec_bboxes, dn_meta["dn_num_split"], dim=2)
        dn_scores, dec_scores = torch.split(dec_scores, dn_meta["dn_num_split"], dim=2)

    # Prepend the encoder outputs as an extra leading entry of the decoder outputs.
    dec_bboxes = torch.cat([enc_bboxes.unsqueeze(0), dec_bboxes])  # (7, bs, 300, 4)
    dec_scores = torch.cat([enc_scores.unsqueeze(0), dec_scores])

    loss = self.criterion(
        (dec_bboxes, dec_scores), targets, dn_bboxes=dn_bboxes, dn_scores=dn_scores, dn_meta=dn_meta
    )
    # NOTE: There are like 12 losses in RTDETR, backward with all losses but only show the main three losses.
    return sum(loss.values()), torch.as_tensor(
        [loss[k].detach() for k in ["loss_giou", "loss_class", "loss_bbox"]], device=img.device
    )

predict

predict(
    x, profile=False, visualize=False, batch=None, augment=False, embed=None
)

Perform a forward pass through the model.

Parameters:

Name Type Description Default
x Tensor

The input tensor.

required
profile bool

If True, profile the computation time for each layer.

False
visualize bool

If True, save feature maps for visualization.

False
batch dict

Ground truth data for evaluation.

None
augment bool

If True, perform data augmentation during inference.

False
embed list

A list of feature vectors/embeddings to return.

None

Returns:

Type Description
Tensor

Model's output tensor.

Source code in ultralytics/nn/tasks.py
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
def predict(self, x, profile=False, visualize=False, batch=None, augment=False, embed=None):
    """
    Run the layers of the model in order and finish with the head, which also receives `batch`.

    Args:
        x (torch.Tensor): The input tensor.
        profile (bool): If True, profile the computation time for each layer.
        visualize (bool): If True, save feature maps for visualization. The value is also passed on as the
            save directory.
        batch (dict, optional): Ground truth data for evaluation; passed to the head as second argument.
        augment (bool): If True, perform data augmentation during inference. Not read by this method.
        embed (list, optional): A list of feature vectors/embeddings to return.

    Returns:
        (torch.Tensor): Model's output tensor, or a tuple of per-sample embeddings when the layer with the
            highest index in `embed` is reached before the head.
    """
    outputs, timings, pooled = [], [], []
    embed = {-1} if embed is None else frozenset(embed)
    last_embed_idx = max(embed)
    for layer in self.model[:-1]:  # every layer except the head
        source = layer.f
        if source != -1:  # input does not come from the previous layer
            if isinstance(source, int):
                x = outputs[source]
            else:
                x = [x if j == -1 else outputs[j] for j in source]
        if profile:
            self._profile_one_layer(layer, x, timings)
        x = layer(x)
        # Keep only the outputs that later layers need.
        outputs.append(x if layer.i in self.save else None)
        if visualize:
            feature_visualization(x, layer.type, layer.i, save_dir=visualize)
        if layer.i in embed:
            flat = torch.nn.functional.adaptive_avg_pool2d(x, (1, 1)).squeeze(-1).squeeze(-1)
            pooled.append(flat)
            if layer.i == last_embed_idx:
                # All requested embeddings are collected; stop before running the remaining layers.
                return torch.unbind(torch.cat(pooled, 1), dim=0)
    head = self.model[-1]
    head_inputs = [outputs[j] for j in head.f]
    return head(head_inputs, batch)





ultralytics.nn.tasks.WorldModel

WorldModel(cfg='yolov8s-world.yaml', ch=3, nc=None, verbose=True)

Bases: DetectionModel

YOLOv8 World Model.

This class implements the YOLOv8 World model for open-vocabulary object detection, supporting text-based class specification and CLIP model integration for zero-shot detection capabilities.

Attributes:

Name Type Description
txt_feats Tensor

Text feature embeddings for classes.

clip_model Module

CLIP model for text encoding.

Methods:

Name Description
set_classes

Set classes for offline inference.

get_text_pe

Get text positional embeddings.

predict

Perform forward pass with text features.

loss

Compute loss with text features.

Examples:

Initialize a world model

>>> model = WorldModel("yolov8s-world.yaml", ch=3, nc=80)
>>> model.set_classes(["person", "car", "bicycle"])
>>> results = model.predict(image_tensor)

Parameters:

Name Type Description Default
cfg str | dict

Model configuration file path or dictionary.

'yolov8s-world.yaml'
ch int

Number of input channels.

3
nc int

Number of classes.

None
verbose bool

Whether to display model information.

True
Source code in ultralytics/nn/tasks.py
857
858
859
860
861
862
863
864
865
866
867
868
869
def __init__(self, cfg="yolov8s-world.yaml", ch=3, nc=None, verbose=True):
    """
    Build a YOLOv8 World model from a configuration.

    Args:
        cfg (str | dict): Model configuration file path or dictionary.
        ch (int): Number of input channels.
        nc (int, optional): Number of classes. When falsy, 80 is used to size the placeholder text features.
        verbose (bool): Whether to display model information.
    """
    # Both attributes are assigned before the parent constructor is invoked.
    num_classes = nc if nc else 80
    self.clip_model = None  # no CLIP text model attached yet
    self.txt_feats = torch.randn(1, num_classes, 512)  # random placeholder text features
    super().__init__(cfg=cfg, ch=ch, nc=nc, verbose=verbose)

get_text_pe

get_text_pe(text, batch=80, cache_clip_model=True)

Compute text embeddings for the given class names with the CLIP text model, so that they can be set on the model in advance and inference can run offline without the CLIP model.

Parameters:

Name Type Description Default
text List[str]

List of class names.

required
batch int

Batch size for processing text tokens.

80
cache_clip_model bool

Whether to cache the CLIP model.

True

Returns:

Type Description
Tensor

Text positional embeddings.

Source code in ultralytics/nn/tasks.py
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
@smart_inference_mode()
def get_text_pe(self, text, batch=80, cache_clip_model=True):
    """
    Encode class names into text feature embeddings with a CLIP text model.

    Args:
        text (List[str]): List of class names.
        batch (int): Number of text tokens encoded per chunk.
        cache_clip_model (bool): Whether to keep the text model on `self.clip_model` and reuse it.

    Returns:
        (torch.Tensor): Text embeddings reshaped to (-1, len(text), feature_dim).
    """
    from ultralytics.nn.text_model import build_text_model

    device = next(self.model.parameters()).device
    if cache_clip_model:
        if not getattr(self, "clip_model", None):
            # Older models may lack the clip_model attribute, so build it on demand
            self.clip_model = build_text_model("clip:ViT-B/32", device=device)
        encoder = self.clip_model
    else:
        # Uncached path: a fresh text model is built for this call only
        encoder = build_text_model("clip:ViT-B/32", device=device)

    tokens = encoder.tokenize(text)
    chunks = [encoder.encode_text(chunk).detach() for chunk in tokens.split(batch)]
    feats = chunks[0] if len(chunks) == 1 else torch.cat(chunks, dim=0)
    return feats.reshape(-1, len(text), feats.shape[-1])

loss

loss(batch, preds=None)

Compute loss.

Parameters:

Name Type Description Default
batch dict

Batch to compute loss on.

required
preds Tensor | List[Tensor]

Predictions.

None
Source code in ultralytics/nn/tasks.py
953
954
955
956
957
958
959
960
961
962
963
964
965
966
def loss(self, batch, preds=None):
    """
    Compute the loss for a batch.

    Args:
        batch (dict): Batch to compute loss on. Its "img" and "txt_feats" entries are read when `preds` is None.
        preds (torch.Tensor | List[torch.Tensor], optional): Precomputed predictions; a forward pass is run when
            omitted.

    Returns:
        The value returned by `self.criterion(preds, batch)`.
    """
    if not hasattr(self, "criterion"):
        # The criterion is created lazily on first use
        self.criterion = self.init_criterion()

    if preds is None:
        images = batch["img"]
        text_features = batch["txt_feats"]
        preds = self.forward(images, txt_feats=text_features)
    return self.criterion(preds, batch)

predict

predict(
    x, profile=False, visualize=False, txt_feats=None, augment=False, embed=None
)

Perform a forward pass through the model.

Parameters:

Name Type Description Default
x Tensor

The input tensor.

required
profile bool

If True, profile the computation time for each layer.

False
visualize bool

If True, save feature maps for visualization.

False
txt_feats Tensor

The text features, use it if it's given.

None
augment bool

If True, perform data augmentation during inference.

False
embed list

A list of feature vectors/embeddings to return.

None

Returns:

Type Description
Tensor

Model's output tensor.

Source code in ultralytics/nn/tasks.py
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
def predict(self, x, profile=False, visualize=False, txt_feats=None, augment=False, embed=None):
    """
    Perform a forward pass through the model.

    Text features are threaded through the layers: `C2fAttn` layers receive the current text features,
    `ImagePoolingAttn` layers replace the current text features with their output, and the `WorldDetect` head
    receives a clone of the text features taken before the layer loop starts.

    Args:
        x (torch.Tensor): The input tensor.
        profile (bool): If True, profile the computation time for each layer.
        visualize (bool): If True, save feature maps for visualization.
        txt_feats (torch.Tensor, optional): The text features, use it if it's given; otherwise `self.txt_feats`.
        augment (bool): If True, perform data augmentation during inference. Not read inside this method body.
        embed (list, optional): Layer indices (`m.i`) whose outputs are pooled and returned as embeddings.

    Returns:
        (torch.Tensor): Model's output tensor. When the layer with index `max(embed)` is reached, the pooled
            embeddings are concatenated on dim 1 and returned unbound along dim 0 instead.
    """
    # Text features follow the input tensor's device and dtype
    txt_feats = (self.txt_feats if txt_feats is None else txt_feats).to(device=x.device, dtype=x.dtype)
    if len(txt_feats) != len(x) or self.model[-1].export:
        # Expand the leading dimension to the batch size of x
        txt_feats = txt_feats.expand(x.shape[0], -1, -1)
    ori_txt_feats = txt_feats.clone()  # snapshot passed to the WorldDetect head
    y, dt, embeddings = [], [], []  # outputs
    embed = frozenset(embed) if embed is not None else {-1}
    max_idx = max(embed)
    for m in self.model:  # iterates over every layer, including the head
        if m.f != -1:  # if not from previous layer
            x = y[m.f] if isinstance(m.f, int) else [x if j == -1 else y[j] for j in m.f]  # from earlier layers
        if profile:
            self._profile_one_layer(m, x, dt)
        if isinstance(m, C2fAttn):
            x = m(x, txt_feats)
        elif isinstance(m, WorldDetect):
            x = m(x, ori_txt_feats)
        elif isinstance(m, ImagePoolingAttn):
            txt_feats = m(x, txt_feats)  # updates the text features; x is left unchanged
        else:
            x = m(x)  # run

        y.append(x if m.i in self.save else None)  # save output
        if visualize:
            feature_visualization(x, m.type, m.i, save_dir=visualize)
        if m.i in embed:
            embeddings.append(torch.nn.functional.adaptive_avg_pool2d(x, (1, 1)).squeeze(-1).squeeze(-1))  # flatten
            if m.i == max_idx:
                # Early exit once the last requested layer has been collected
                return torch.unbind(torch.cat(embeddings, 1), dim=0)
    return x

set_classes

set_classes(text, batch=80, cache_clip_model=True)

Set classes in advance so that model could do offline-inference without clip model.

Parameters:

Name Type Description Default
text List[str]

List of class names.

required
batch int

Batch size for processing text tokens.

80
cache_clip_model bool

Whether to cache the CLIP model.

True
Source code in ultralytics/nn/tasks.py
871
872
873
874
875
876
877
878
879
880
881
def set_classes(self, text, batch=80, cache_clip_model=True):
    """
    Store text embeddings for the given class names and update the head's class count.

    The embeddings come from `self.get_text_pe` and are saved on `self.txt_feats`.

    Args:
        text (List[str]): List of class names.
        batch (int): Batch size for processing text tokens.
        cache_clip_model (bool): Whether to cache the CLIP model.
    """
    embeddings = self.get_text_pe(text, batch=batch, cache_clip_model=cache_clip_model)
    self.txt_feats = embeddings
    head = self.model[-1]
    head.nc = len(text)  # one class per provided name





ultralytics.nn.tasks.YOLOEModel

YOLOEModel(cfg='yoloe-v8s.yaml', ch=3, nc=None, verbose=True)

Bases: DetectionModel

YOLOE detection model.

This class implements the YOLOE architecture for efficient object detection with text and visual prompts, supporting both prompt-based and prompt-free inference modes.

Attributes:

Name Type Description
pe Tensor

Prompt embeddings for classes.

clip_model Module

CLIP model for text encoding.

Methods:

Name Description
get_text_pe

Get text positional embeddings.

get_visual_pe

Get visual embeddings.

set_vocab

Set vocabulary for prompt-free model.

get_vocab

Get fused vocabulary layer.

set_classes

Set classes for offline inference.

get_cls_pe

Get class positional embeddings.

predict

Perform forward pass with prompts.

loss

Compute loss with prompts.

Examples:

Initialize a YOLOE model

>>> model = YOLOEModel("yoloe-v8s.yaml", ch=3, nc=80)
>>> results = model.predict(image_tensor, tpe=text_embeddings)

Parameters:

Name Type Description Default
cfg str | dict

Model configuration file path or dictionary.

'yoloe-v8s.yaml'
ch int

Number of input channels.

3
nc int

Number of classes.

None
verbose bool

Whether to display model information.

True
Source code in ultralytics/nn/tasks.py
 997
 998
 999
1000
1001
1002
1003
1004
1005
1006
1007
def __init__(self, cfg="yoloe-v8s.yaml", ch=3, nc=None, verbose=True):
    """
    Build a YOLOE model by forwarding all arguments to the parent constructor.

    Args:
        cfg (str | dict): Model configuration file path or dictionary.
        ch (int): Number of input channels.
        nc (int, optional): Number of classes.
        verbose (bool): Whether to display model information.
    """
    parent_kwargs = {"cfg": cfg, "ch": ch, "nc": nc, "verbose": verbose}
    super().__init__(**parent_kwargs)

get_cls_pe

get_cls_pe(tpe, vpe)

Get class positional embeddings.

Parameters:

Name Type Description Default
tpe Tensor

Text positional embeddings.

required
vpe Tensor

Visual positional embeddings.

required

Returns:

Type Description
Tensor

Class positional embeddings.

Source code in ultralytics/nn/tasks.py
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
def get_cls_pe(self, tpe, vpe):
    """
    Combine text and visual prompt embeddings into class embeddings.

    Whichever of `tpe` and `vpe` is provided is concatenated along dim 1, text first. When both are None, the
    stored `self.pe` is used, falling back to a zero tensor of shape (1, 80, 512) if that attribute is absent.

    Args:
        tpe (torch.Tensor, optional): Text positional embeddings, must be 3-dimensional.
        vpe (torch.Tensor, optional): Visual positional embeddings, must be 3-dimensional.

    Returns:
        (torch.Tensor): Class positional embeddings.
    """
    parts = []
    for prompt in (tpe, vpe):
        if prompt is None:
            continue
        assert prompt.ndim == 3
        parts.append(prompt)
    if len(parts) == 0:
        # No prompt supplied: use the stored embeddings or the zero placeholder
        parts = [getattr(self, "pe", torch.zeros(1, 80, 512))]
    return torch.cat(parts, dim=1)

get_text_pe

get_text_pe(text, batch=80, cache_clip_model=False, without_reprta=False)

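Compute text embeddings for the given class names with the MobileCLIP text model, so that they can be set on the model in advance and inference can run offline without the text model.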

Parameters:

Name Type Description Default
text List[str]

List of class names.

required
batch int

Batch size for processing text tokens.

80
cache_clip_model bool

Whether to cache the CLIP model.

False
without_reprta bool

If True, return the raw text embeddings without passing them through the detection head's reprta text module (`head.get_tpe`).

False

Returns:

Type Description
Tensor

Text positional embeddings.

Source code in ultralytics/nn/tasks.py
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
@smart_inference_mode()
def get_text_pe(self, text, batch=80, cache_clip_model=False, without_reprta=False):
    """
    Encode class names into text embeddings with a MobileCLIP text model.

    Unless `without_reprta` is set, the raw embeddings are additionally passed through the detection head's
    `get_tpe` (the auxiliary text head) before being returned.

    Args:
        text (List[str]): List of class names.
        batch (int): Number of text tokens encoded per chunk.
        cache_clip_model (bool): Whether to keep the text model on `self.clip_model` and reuse it.
        without_reprta (bool): If True, return the raw text embeddings and skip the head's `get_tpe` step.

    Returns:
        (torch.Tensor): Text positional embeddings.
    """
    from ultralytics.nn.text_model import build_text_model

    device = next(self.model.parameters()).device
    if cache_clip_model:
        if not getattr(self, "clip_model", None):
            # Older models may lack the clip_model attribute, so build it on demand
            self.clip_model = build_text_model("mobileclip:blt", device=device)
        encoder = self.clip_model
    else:
        # Uncached path: a fresh text model is built for this call only
        encoder = build_text_model("mobileclip:blt", device=device)

    tokens = encoder.tokenize(text)
    chunks = [encoder.encode_text(chunk).detach() for chunk in tokens.split(batch)]
    feats = chunks[0] if len(chunks) == 1 else torch.cat(chunks, dim=0)
    feats = feats.reshape(-1, len(text), feats.shape[-1])

    if not without_reprta:
        assert not self.training
        head = self.model[-1]
        assert isinstance(head, YOLOEDetect)
        feats = head.get_tpe(feats)  # run auxiliary text head
    return feats

get_visual_pe

get_visual_pe(img, visual)

Get visual embeddings.

Parameters:

Name Type Description Default
img Tensor

Input image tensor.

required
visual Tensor

Visual features.

required

Returns:

Type Description
Tensor

Visual positional embeddings.

Source code in ultralytics/nn/tasks.py
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
@smart_inference_mode()
def get_visual_pe(self, img, visual):
    """
    Obtain visual prompt embeddings by calling the model with `return_vpe=True`.

    Args:
        img (torch.Tensor): Input image tensor.
        visual (torch.Tensor): Visual features, forwarded to the model as `vpe`.

    Returns:
        (torch.Tensor): Visual positional embeddings.
    """
    call_kwargs = {"vpe": visual, "return_vpe": True}
    return self(img, **call_kwargs)

get_vocab

get_vocab(names)

Get fused vocabulary layer from the model.

Parameters:

Name Type Description Default
names list

List of class names.

required

Returns:

Type Description
ModuleList

List of vocabulary modules.

Source code in ultralytics/nn/tasks.py
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
def get_vocab(self, names):
    """
    Fuse text embeddings for `names` into the head and collect the resulting vocabulary layers.

    The model must not be in training mode, and its last layer must be a `YOLOEDetect` head that has not been
    fused yet. This sets the model's classes and fuses the head as a side effect.

    Args:
        names (list): List of class names.

    Returns:
        (nn.ModuleList): The last module of each classification branch in `head.cv3`.
    """
    assert not self.training
    head = self.model[-1]
    assert isinstance(head, YOLOEDetect)
    assert not head.is_fused

    self.set_classes(names, self.get_text_pe(names))
    model_device = next(self.model.parameters()).device
    head.fuse(self.pe.to(model_device))  # fuse prompt embeddings to classify head

    last_layers = []
    for branch in head.cv3:
        assert isinstance(branch, nn.Sequential)
        last_layers.append(branch[-1])
    return nn.ModuleList(last_layers)

loss

loss(batch, preds=None)

Compute loss.

Parameters:

Name Type Description Default
batch dict

Batch to compute loss on.

required
preds Tensor | List[Tensor]

Predictions.

None
Source code in ultralytics/nn/tasks.py
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
def loss(self, batch, preds=None):
    """
    Compute the loss for a batch.

    On first use the criterion is created and cached on `self.criterion`: `TVPDetectLoss` when the batch carries
    a non-None "visuals" entry, otherwise the result of `self.init_criterion()`.

    Args:
        batch (dict): Batch to compute loss on.
        preds (torch.Tensor | List[torch.Tensor], optional): Precomputed predictions; a forward pass with the
            batch's "txt_feats" and "visuals" entries is run when omitted.

    Returns:
        The value returned by `self.criterion(preds, batch)`.
    """
    if not hasattr(self, "criterion"):
        from ultralytics.utils.loss import TVPDetectLoss

        # TODO: the criterion type is decided once, from the first batch seen
        if batch.get("visuals", None) is not None:
            self.criterion = TVPDetectLoss(self)
        else:
            self.criterion = self.init_criterion()

    if preds is None:
        images = batch["img"]
        preds = self.forward(images, tpe=batch.get("txt_feats", None), vpe=batch.get("visuals", None))
    return self.criterion(preds, batch)

predict

predict(
    x,
    profile=False,
    visualize=False,
    tpe=None,
    augment=False,
    embed=None,
    vpe=None,
    return_vpe=False,
)

Perform a forward pass through the model.

Parameters:

Name Type Description Default
x Tensor

The input tensor.

required
profile bool

If True, profile the computation time for each layer.

False
visualize bool

If True, save feature maps for visualization.

False
tpe Tensor

Text positional embeddings.

None
augment bool

If True, perform data augmentation during inference.

False
embed list

A list of feature vectors/embeddings to return.

None
vpe Tensor

Visual positional embeddings.

None
return_vpe bool

If True, return visual positional embeddings.

False

Returns:

Type Description
Tensor

Model's output tensor.

Source code in ultralytics/nn/tasks.py
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
def predict(
    self, x, profile=False, visualize=False, tpe=None, augment=False, embed=None, vpe=None, return_vpe=False
):
    """
    Perform a forward pass through the model.

    When a `YOLOEDetect` layer is reached, the visual prompt (if given) is converted with `m.get_vpe`, the text
    prompt is converted with `m.get_tpe`, and both are merged by `self.get_cls_pe` into the class embeddings passed
    to the head.

    Args:
        x (torch.Tensor): The input tensor.
        profile (bool): If True, profile the computation time for each layer.
        visualize (bool): If True, save feature maps for visualization.
        tpe (torch.Tensor, optional): Text positional embeddings.
        augment (bool): If True, perform data augmentation during inference. Not read inside this method body.
        embed (list, optional): Layer indices (`m.i`) whose outputs are pooled and returned as embeddings.
        vpe (torch.Tensor, optional): Visual positional embeddings.
        return_vpe (bool): If True, return visual positional embeddings; requires `vpe` and non-training mode.

    Returns:
        (torch.Tensor): Model's output tensor. With `return_vpe=True` the visual embeddings computed at the
            `YOLOEDetect` layer are returned instead; when the layer with index `max(embed)` is reached, the pooled
            embeddings are concatenated on dim 1 and returned unbound along dim 0.
    """
    y, dt, embeddings = [], [], []  # outputs
    b = x.shape[0]  # batch size, captured before x is overwritten by layer outputs
    embed = frozenset(embed) if embed is not None else {-1}
    max_idx = max(embed)
    for m in self.model:  # iterates over every layer, including the head
        if m.f != -1:  # if not from previous layer
            x = y[m.f] if isinstance(m.f, int) else [x if j == -1 else y[j] for j in m.f]  # from earlier layers
        if profile:
            self._profile_one_layer(m, x, dt)
        if isinstance(m, YOLOEDetect):
            vpe = m.get_vpe(x, vpe) if vpe is not None else None
            if return_vpe:
                # Early exit: the caller only wants the visual prompt embeddings
                assert vpe is not None
                assert not self.training
                return vpe
            cls_pe = self.get_cls_pe(m.get_tpe(tpe), vpe).to(device=x[0].device, dtype=x[0].dtype)
            if cls_pe.shape[0] != b or m.export:
                # Expand the leading dimension of the class embeddings to the batch size
                cls_pe = cls_pe.expand(b, -1, -1)
            x = m(x, cls_pe)
        else:
            x = m(x)  # run

        y.append(x if m.i in self.save else None)  # save output
        if visualize:
            feature_visualization(x, m.type, m.i, save_dir=visualize)
        if m.i in embed:
            embeddings.append(torch.nn.functional.adaptive_avg_pool2d(x, (1, 1)).squeeze(-1).squeeze(-1))  # flatten
            if m.i == max_idx:
                # Early exit once the last requested layer has been collected
                return torch.unbind(torch.cat(embeddings, 1), dim=0)
    return x

set_classes

set_classes(names, embeddings)

Set classes in advance so that model could do offline-inference without clip model.

Parameters:

Name Type Description Default
names List[str]

List of class names.

required
embeddings Tensor

Embeddings tensor.

required
Source code in ultralytics/nn/tasks.py
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
def set_classes(self, names, embeddings):
    """
    Store prompt embeddings and class names on the model for later inference.

    Not supported once the head carries an `lrpc` attribute (prompt-free model).

    Args:
        names (List[str]): List of class names.
        embeddings (torch.Tensor): Embeddings tensor, must be 3-dimensional.
    """
    head = self.model[-1]
    assert not hasattr(head, "lrpc"), (
        "Prompt-free model does not support setting classes. Please try with Text/Visual prompt models."
    )
    assert embeddings.ndim == 3

    self.pe = embeddings
    head.nc = len(names)  # one class per provided name
    self.names = check_class_names(names)

set_vocab

set_vocab(vocab, names)

Set vocabulary for the prompt-free model.

Parameters:

Name Type Description Default
vocab ModuleList

List of vocabulary items.

required
names List[str]

List of class names.

required
Source code in ultralytics/nn/tasks.py
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
def set_vocab(self, vocab, names):
    """
    Set vocabulary for the prompt-free model.

    Builds one `LRPCHead` per detection branch from the given vocabulary layers and the last layers of the head's
    `cv3` / `cv2` branches, stores them on the head as `lrpc`, then removes those last layers from the branches.
    Also updates the head's class count and the model's class names.

    Args:
        vocab (nn.ModuleList): List of vocabulary items.
        names (List[str]): List of class names.
    """
    assert not self.training
    head = self.model[-1]
    assert isinstance(head, YOLOEDetect)

    # Cache anchors for head
    device = next(self.parameters()).device
    self(torch.empty(1, 3, self.args["imgsz"], self.args["imgsz"]).to(device))  # warmup

    # re-parameterization for prompt-free model
    # NOTE(review): the branch at index 2 is built with enabled=False; the reason is not visible here
    self.model[-1].lrpc = nn.ModuleList(
        LRPCHead(cls, pf[-1], loc[-1], enabled=i != 2)
        for i, (cls, pf, loc) in enumerate(zip(vocab, head.cv3, head.cv2))
    )
    # The last layers were handed to the LRPC heads above, so drop them from the original branches
    for loc_head, cls_head in zip(head.cv2, head.cv3):
        assert isinstance(loc_head, nn.Sequential)
        assert isinstance(cls_head, nn.Sequential)
        del loc_head[-1]
        del cls_head[-1]
    self.model[-1].nc = len(names)
    self.names = check_class_names(names)





ultralytics.nn.tasks.YOLOESegModel

YOLOESegModel(cfg='yoloe-v8s-seg.yaml', ch=3, nc=None, verbose=True)

Bases: YOLOEModel, SegmentationModel

YOLOE segmentation model.

This class extends YOLOEModel to handle instance segmentation tasks with text and visual prompts, providing specialized loss computation for pixel-level object detection and segmentation.

Methods:

Name Description
loss

Compute loss with prompts for segmentation.

Examples:

Initialize a YOLOE segmentation model

>>> model = YOLOESegModel("yoloe-v8s-seg.yaml", ch=3, nc=80)
>>> results = model.predict(image_tensor, tpe=text_embeddings)

Parameters:

Name Type Description Default
cfg str | dict

Model configuration file path or dictionary.

'yoloe-v8s-seg.yaml'
ch int

Number of input channels.

3
nc int

Number of classes.

None
verbose bool

Whether to display model information.

True
Source code in ultralytics/nn/tasks.py
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
def __init__(self, cfg="yoloe-v8s-seg.yaml", ch=3, nc=None, verbose=True):
    """
    Build a YOLOE segmentation model by forwarding all arguments to the parent constructor.

    Args:
        cfg (str | dict): Model configuration file path or dictionary.
        ch (int): Number of input channels.
        nc (int, optional): Number of classes.
        verbose (bool): Whether to display model information.
    """
    parent_kwargs = {"cfg": cfg, "ch": ch, "nc": nc, "verbose": verbose}
    super().__init__(**parent_kwargs)

loss

loss(batch, preds=None)

Compute loss.

Parameters:

Name Type Description Default
batch dict

Batch to compute loss on.

required
preds Tensor | List[Tensor]

Predictions.

None
Source code in ultralytics/nn/tasks.py
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
def loss(self, batch, preds=None):
    """
    Compute the loss for a batch.

    On first use the criterion is created and cached on `self.criterion`: `TVPSegmentLoss` when the batch carries
    a non-None "visuals" entry, otherwise the result of `self.init_criterion()`.

    Args:
        batch (dict): Batch to compute loss on.
        preds (torch.Tensor | List[torch.Tensor], optional): Precomputed predictions; a forward pass with the
            batch's "txt_feats" and "visuals" entries is run when omitted.

    Returns:
        The value returned by `self.criterion(preds, batch)`.
    """
    if not hasattr(self, "criterion"):
        from ultralytics.utils.loss import TVPSegmentLoss

        # TODO: the criterion type is decided once, from the first batch seen
        if batch.get("visuals", None) is not None:
            self.criterion = TVPSegmentLoss(self)
        else:
            self.criterion = self.init_criterion()

    if preds is None:
        images = batch["img"]
        preds = self.forward(images, tpe=batch.get("txt_feats", None), vpe=batch.get("visuals", None))
    return self.criterion(preds, batch)





ultralytics.nn.tasks.Ensemble

Ensemble()

Bases: ModuleList

Ensemble of models.

This class allows combining multiple YOLO models into an ensemble for improved performance through model averaging or other ensemble techniques.

Methods:

Name Description
forward

Generate predictions from all models in the ensemble.

Examples:

Create an ensemble of models

>>> ensemble = Ensemble()
>>> ensemble.append(model1)
>>> ensemble.append(model2)
>>> results = ensemble(image_tensor)
Source code in ultralytics/nn/tasks.py
1286
1287
1288
def __init__(self):
    """
    Create an empty ensemble.

    Only the parent constructor is invoked; no models are added here.
    """
    super().__init__()

forward

forward(x, augment=False, profile=False, visualize=False)

Run every model in the ensemble on the input and concatenate their predictions (NMS ensemble).

Parameters:

Name Type Description Default
x Tensor

Input tensor.

required
augment bool

Whether to augment the input.

False
profile bool

Whether to profile the model.

False
visualize bool

Whether to visualize the features.

False

Returns:

Name Type Description
y Tensor

Concatenated predictions from all models.

train_out None

Always None for ensemble inference.

Source code in ultralytics/nn/tasks.py
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
def forward(self, x, augment=False, profile=False, visualize=False):
    """
    Run every model in the ensemble and concatenate their predictions.

    Each member is called as `member(x, augment, profile, visualize)` and only the first element of its output
    is kept. The kept outputs are joined along dim 2 (the "nms ensemble" strategy).

    Args:
        x (torch.Tensor): Input tensor.
        augment (bool): Whether to augment the input.
        profile (bool): Whether to profile the model.
        visualize (bool): Whether to visualize the features.

    Returns:
        y (torch.Tensor): Concatenated predictions from all models.
        train_out (None): Always None for ensemble inference.
    """
    member_preds = []
    for member in self:
        member_out = member(x, augment, profile, visualize)
        member_preds.append(member_out[0])  # keep only the inference output
    merged = torch.cat(member_preds, 2)
    return merged, None





ultralytics.nn.tasks.SafeClass

SafeClass(*args, **kwargs)

A placeholder class to replace unknown classes during unpickling.

Source code in ultralytics/nn/tasks.py
1366
1367
1368
def __init__(self, *args, **kwargs):
    """Accept any positional and keyword arguments and discard them; no state is set."""
    return None

__call__

__call__(*args, **kwargs)

Run SafeClass instance, ignoring all arguments.

Source code in ultralytics/nn/tasks.py
1370
1371
1372
def __call__(self, *args, **kwargs):
    """Accept any positional and keyword arguments, do nothing, and return None."""
    return None





ultralytics.nn.tasks.SafeUnpickler

Bases: Unpickler

Custom Unpickler that replaces unknown classes with SafeClass.

find_class

find_class(module, name)

Attempt to find a class, returning SafeClass if not among safe modules.

Parameters:

Name Type Description Default
module str

Module name.

required
name str

Class name.

required

Returns:

Type Description
type

Found class or SafeClass.

Source code in ultralytics/nn/tasks.py
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
def find_class(self, module, name):
    """
    Resolve a pickled global, substituting SafeClass for anything outside the allow-list.

    The module name is compared by exact string match, so submodules such as "torch.nn" are not covered by the
    "torch" entry.

    Args:
        module (str): Module name.
        name (str): Class name.

    Returns:
        (type): Found class or SafeClass.
    """
    # NOTE(review): "builtins" also exposes callables such as eval and exec — confirm this entry is intended
    allowed_modules = (
        "torch",
        "collections",
        "collections.abc",
        "builtins",
        "math",
        "numpy",
        # Add other modules considered safe
    )
    if module not in allowed_modules:
        return SafeClass
    return super().find_class(module, name)





ultralytics.nn.tasks.temporary_modules

temporary_modules(modules=None, attributes=None)

Context manager for temporarily adding or modifying modules in Python's module cache (sys.modules).

This function can be used to change the module paths during runtime. It's useful when refactoring code, where you've moved a module from one location to another, but you still want to support the old import paths for backwards compatibility.

Parameters:

Name Type Description Default
modules dict

A dictionary mapping old module paths to new module paths.

None
attributes dict

A dictionary mapping old module attributes to new module attributes.

None

Examples:

>>> with temporary_modules({"old.module": "new.module"}, {"old.module.attribute": "new.module.attribute"}):
...     import old.module  # this will now import new.module
...     from old.module import attribute  # this will now import new.module.attribute
Note

The changes are only in effect inside the context manager and are undone once the context manager exits. Be aware that directly manipulating sys.modules can lead to unpredictable results, especially in larger applications or libraries. Use this function with caution.

Source code in ultralytics/nn/tasks.py
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
@contextlib.contextmanager
def temporary_modules(modules=None, attributes=None):
    """
    Context manager for temporarily adding or modifying modules in Python's module cache (`sys.modules`).

    This function can be used to change the module paths during runtime. It's useful when refactoring code,
    where you've moved a module from one location to another, but you still want to support the old import
    paths for backwards compatibility.

    On exit, every change made here is reverted: aliased `sys.modules` entries are removed (or reset to the module
    that was registered under that name before), and patched attributes are removed (or reset to their previous
    value). Only entries that were actually changed are touched, even if setup fails part-way through.

    Args:
        modules (dict, optional): A dictionary mapping old module paths to new module paths.
        attributes (dict, optional): A dictionary mapping old module attributes to new module attributes.

    Examples:
        >>> with temporary_modules({"old.module": "new.module"}, {"old.module.attribute": "new.module.attribute"}):
        ...     import old.module  # this will now import new.module
        ...     from old.module import attribute  # this will now import new.module.attribute

    Note:
        The changes are only in effect inside the context manager and are undone once the context manager exits.
        Be aware that directly manipulating `sys.modules` can lead to unpredictable results, especially in larger
        applications or libraries. Use this function with caution.
    """
    if modules is None:
        modules = {}
    if attributes is None:
        attributes = {}
    import sys
    from importlib import import_module

    missing = object()  # sentinel meaning "the name did not exist before we patched it"
    patched_attrs = []  # (module object, attribute name, previous value or `missing`)
    patched_modules = {}  # old module path -> previous sys.modules entry or `missing`

    try:
        # Set attributes in sys.modules under their old name
        for old, new in attributes.items():
            old_module, old_attr = old.rsplit(".", 1)
            new_module, new_attr = new.rsplit(".", 1)
            target = import_module(old_module)
            replacement = getattr(import_module(new_module), new_attr)
            # Snapshot before patching so the exit path can put things back exactly
            patched_attrs.append((target, old_attr, vars(target).get(old_attr, missing)))
            setattr(target, old_attr, replacement)

        # Set modules in sys.modules under their old name
        for old, new in modules.items():
            replacement = import_module(new)
            patched_modules[old] = sys.modules.get(old, missing)
            sys.modules[old] = replacement

        yield
    finally:
        # Undo the module aliases, restoring any module that was registered under the same name before
        for old, previous in patched_modules.items():
            if previous is missing:
                sys.modules.pop(old, None)
            else:
                sys.modules[old] = previous

        # Undo the attribute patches in reverse order of application
        for target, attr, previous in reversed(patched_attrs):
            if previous is missing:
                vars(target).pop(attr, None)
            else:
                setattr(target, attr, previous)





ultralytics.nn.tasks.torch_safe_load

torch_safe_load(weight, safe_only=False)

Attempt to load a PyTorch model with the torch.load() function. If a ModuleNotFoundError is raised, it catches the error, logs a warning message, and attempts to install the missing module via the check_requirements() function. After installation, the function again attempts to load the model using torch.load().

Parameters:

Name Type Description Default
weight str

The file path of the PyTorch model.

required
safe_only bool

If True, replace unknown classes with SafeClass during loading.

False

Returns:

Name Type Description
ckpt dict

The loaded model checkpoint.

file str

The loaded filename.

Examples:

>>> from ultralytics.nn.tasks import torch_safe_load
>>> ckpt, file = torch_safe_load("path/to/best.pt", safe_only=True)
Source code in ultralytics/nn/tasks.py
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
def torch_safe_load(weight, safe_only=False):
    """
    Load a PyTorch checkpoint with torch.load(), aliasing legacy module paths while unpickling.

    The load runs inside temporary_modules() so that checkpoints pickled against older module or attribute paths
    still resolve. If a ModuleNotFoundError is raised, a warning is logged, the missing module is installed via
    check_requirements(), and the load is attempted once more with the same aliases and the same `safe_only` setting.

    Args:
        weight (str): The file path of the PyTorch model.
        safe_only (bool): If True, replace unknown classes with SafeClass during loading.

    Returns:
        ckpt (dict): The loaded model checkpoint.
        file (str): The loaded filename.

    Raises:
        TypeError: If the missing module is named 'models', which is reported as a YOLOv5 checkpoint.

    Examples:
        >>> from ultralytics.nn.tasks import torch_safe_load
        >>> ckpt, file = torch_safe_load("path/to/best.pt", safe_only=True)
    """
    from ultralytics.utils.downloads import attempt_download_asset

    check_suffix(file=weight, suffix=".pt")
    file = attempt_download_asset(weight)  # search online if missing locally

    def _load():
        """Load `file` once with the legacy module/attribute aliases active, honoring `safe_only`."""
        with temporary_modules(
            modules={
                "ultralytics.yolo.utils": "ultralytics.utils",
                "ultralytics.yolo.v8": "ultralytics.models.yolo",
                "ultralytics.yolo.data": "ultralytics.data",
            },
            attributes={
                "ultralytics.nn.modules.block.Silence": "torch.nn.Identity",  # YOLOv9e
                "ultralytics.nn.tasks.YOLOv10DetectionModel": "ultralytics.nn.tasks.DetectionModel",  # YOLOv10
                "ultralytics.utils.loss.v10DetectLoss": "ultralytics.utils.loss.E2EDetectLoss",  # YOLOv10
            },
        ):
            if safe_only:
                # Load via custom pickle module
                safe_pickle = types.ModuleType("safe_pickle")
                safe_pickle.Unpickler = SafeUnpickler
                safe_pickle.load = lambda file_obj: SafeUnpickler(file_obj).load()
                with open(file, "rb") as f:
                    return torch.load(f, pickle_module=safe_pickle)
            return torch.load(file, map_location="cpu")

    try:
        ckpt = _load()
    except ModuleNotFoundError as e:  # e.name is missing module name
        if e.name == "models":
            raise TypeError(
                emojis(
                    f"ERROR ❌️ {weight} appears to be an Ultralytics YOLOv5 model originally trained "
                    f"with https://github.com/ultralytics/yolov5.\nThis model is NOT forwards compatible with "
                    f"YOLOv8 at https://github.com/ultralytics/ultralytics."
                    f"\nRecommend fixes are to train a new model using the latest 'ultralytics' package or to "
                    f"run a command with an official Ultralytics model, i.e. 'yolo predict model=yolo11n.pt'"
                )
            ) from e
        LOGGER.warning(
            f"{weight} appears to require '{e.name}', which is not in Ultralytics requirements."
            f"\nAutoInstall will run now for '{e.name}' but this feature will be removed in the future."
            f"\nRecommend fixes are to train a new model using the latest 'ultralytics' package or to "
            f"run a command with an official Ultralytics model, i.e. 'yolo predict model=yolo11n.pt'"
        )
        check_requirements(e.name)  # install missing module
        # Retry through the same loader so the legacy aliases and the safe_only unpickler still apply
        ckpt = _load()

    if not isinstance(ckpt, dict):
        # File is likely a YOLO instance saved with i.e. torch.save(model, "saved_model.pt")
        LOGGER.warning(
            f"The file '{weight}' appears to be improperly saved or formatted. "
            f"For optimal results, use model.save('filename.pt') to correctly save YOLO models."
        )
        ckpt = {"model": ckpt.model}

    return ckpt, file





ultralytics.nn.tasks.attempt_load_weights

attempt_load_weights(weights, device=None, inplace=True, fuse=False)

Load an ensemble of models weights=[a,b,c] or a single model weights=[a] or weights=a.

Parameters:

Name Type Description Default
weights str | List[str]

Model weights path(s).

required
device device

Device to load model to.

None
inplace bool

Whether to do inplace operations.

True
fuse bool

Whether to fuse model.

False

Returns:

Type Description
Module

Loaded model.

Source code in ultralytics/nn/tasks.py
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
def attempt_load_weights(weights, device=None, inplace=True, fuse=False):
    """
    Build a model, or an Ensemble of models, from one or more checkpoint files.

    Accepts weights=[a, b, c], weights=[a] or weights=a. When exactly one model is loaded it is returned directly;
    otherwise the Ensemble is returned with `names`, `nc`, `yaml` and `stride` copied from its members.

    Args:
        weights (str | List[str]): Model weights path(s).
        device (torch.device, optional): Device to load model to.
        inplace (bool): Whether to do inplace operations.
        fuse (bool): Whether to fuse model.

    Returns:
        (torch.nn.Module): Loaded model.
    """
    weight_list = weights if isinstance(weights, list) else [weights]
    ensemble = Ensemble()

    for weight_path in weight_list:
        ckpt, weight_path = torch_safe_load(weight_path)  # checkpoint dict and resolved file path

        # Checkpoint training args layered over the defaults, or None when the checkpoint has none
        if "train_args" in ckpt:
            args = {**DEFAULT_CFG_DICT, **ckpt["train_args"]}
        else:
            args = None

        model = ckpt.get("ema") or ckpt["model"]  # prefer EMA weights when present
        model = model.to(device).float()  # FP32 model

        # Model compatibility updates (args must be attached before the task is guessed)
        model.args = args
        model.pt_path = weight_path
        model.task = guess_model_task(model)
        if not hasattr(model, "stride"):
            model.stride = torch.tensor([32.0])

        # Store the model in eval mode, fused first when requested and supported
        should_fuse = fuse and hasattr(model, "fuse")
        ensemble.append(model.fuse().eval() if should_fuse else model.eval())

    # Module updates
    for module in ensemble.modules():
        if hasattr(module, "inplace"):
            module.inplace = inplace
            continue
        if isinstance(module, torch.nn.Upsample) and not hasattr(module, "recompute_scale_factor"):
            module.recompute_scale_factor = None  # torch 1.11.0 compatibility

    # A single model is returned as-is
    if len(ensemble) == 1:
        return ensemble[-1]

    # Otherwise finish configuring and return the ensemble
    LOGGER.info(f"Ensemble created with {weights}\n")
    first = ensemble[0]
    for attr in ("names", "nc", "yaml"):
        setattr(ensemble, attr, getattr(first, attr))
    max_strides = torch.tensor([m.stride.max() for m in ensemble])
    ensemble.stride = ensemble[int(torch.argmax(max_strides))].stride  # stride of the member with the largest stride
    assert all(ensemble[0].nc == m.nc for m in ensemble), f"Models differ in class counts {[m.nc for m in ensemble]}"
    return ensemble





ultralytics.nn.tasks.attempt_load_one_weight

attempt_load_one_weight(weight, device=None, inplace=True, fuse=False)

Load the weights of a single model.

Parameters:

Name Type Description Default
weight str

Model weight path.

required
device device

Device to load model to.

None
inplace bool

Whether to do inplace operations.

True
fuse bool

Whether to fuse model.

False

Returns:

Name Type Description
model Module

Loaded model.

ckpt dict

Model checkpoint dictionary.

Source code in ultralytics/nn/tasks.py
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
def attempt_load_one_weight(weight, device=None, inplace=True, fuse=False):
    """
    Load one checkpoint file and return its model together with the raw checkpoint.

    Args:
        weight (str): Model weight path.
        device (torch.device, optional): Device to load model to.
        inplace (bool): Whether to do inplace operations.
        fuse (bool): Whether to fuse model.

    Returns:
        model (torch.nn.Module): Loaded model.
        ckpt (dict): Model checkpoint dictionary.
    """
    ckpt, weight = torch_safe_load(weight)  # checkpoint dict and resolved file path

    # Defaults first, then the checkpoint's training args so the latter take precedence
    train_args = ckpt.get("train_args", {})
    args = {**DEFAULT_CFG_DICT, **train_args}

    model = ckpt.get("ema") or ckpt["model"]  # prefer EMA weights when present
    model = model.to(device).float()  # FP32 model

    # Model compatibility updates; only keys listed in DEFAULT_CFG_KEYS are kept on the model
    model.args = {key: value for key, value in args.items() if key in DEFAULT_CFG_KEYS}
    model.pt_path = weight
    model.task = guess_model_task(model)
    if not hasattr(model, "stride"):
        model.stride = torch.tensor([32.0])

    # Switch to eval mode, fusing first when requested and supported
    if fuse and hasattr(model, "fuse"):
        model = model.fuse().eval()
    else:
        model = model.eval()

    # Module updates
    for module in model.modules():
        if hasattr(module, "inplace"):
            module.inplace = inplace
            continue
        if isinstance(module, torch.nn.Upsample) and not hasattr(module, "recompute_scale_factor"):
            module.recompute_scale_factor = None  # torch 1.11.0 compatibility

    return model, ckpt





ultralytics.nn.tasks.parse_model

parse_model(d, ch, verbose=True)

Parse a YOLO model.yaml dictionary into a PyTorch model.

Parameters:

Name Type Description Default
d dict

Model dictionary.

required
ch int

Input channels.

required
verbose bool

Whether to print model details.

True

Returns:

Name Type Description
model Sequential

PyTorch model.

save list

Sorted list of output layers.

Source code in ultralytics/nn/tasks.py
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
def parse_model(d, ch, verbose=True):
    """
    Parse a YOLO model.yaml dictionary into a PyTorch model.

    Reads the 'backbone' and 'head' layer lists from `d`, along with the optional 'nc', 'activation', 'scales',
    'scale', 'depth_multiple', 'width_multiple' and 'kpt_shape' entries, and instantiates one module per layer row.

    Side effects visible in this function: when 'activation' is set, `Conv.default_act` is reassigned; for
    Detect/YOLOEDetect/Segment/YOLOESegment/Pose/OBB heads the class attribute `legacy` is set on the head class.

    Args:
        d (dict): Model dictionary.
        ch (int): Input channels.
        verbose (bool): Whether to print model details.

    Returns:
        model (torch.nn.Sequential): PyTorch model.
        save (list): Sorted list of output layers.
    """
    import ast

    # NOTE: string layer args are resolved against locals() further down, so the local variable names in this
    # function are part of its behavior; do not rename them or add new ones casually.

    # Args
    legacy = True  # backward compatibility for v3/v5/v8/v9 models
    max_channels = float("inf")
    nc, act, scales = (d.get(x) for x in ("nc", "activation", "scales"))
    depth, width, kpt_shape = (d.get(x, 1.0) for x in ("depth_multiple", "width_multiple", "kpt_shape"))
    # Always bind `scale` so the C3k2/A2C2f branches below cannot hit an unbound local when `scales` is absent
    scale = d.get("scale")
    if scales:
        if not scale:
            scale = tuple(scales.keys())[0]
            LOGGER.warning(f"no model scale passed. Assuming scale='{scale}'.")
        depth, width, max_channels = scales[scale]

    if act:
        # NOTE(security): eval() executes the YAML 'activation' string as Python; only parse trusted model YAMLs
        Conv.default_act = eval(act)  # redefine default activation, i.e. Conv.default_act = torch.nn.SiLU()
        if verbose:
            LOGGER.info(f"{colorstr('activation:')} {act}")  # print

    if verbose:
        LOGGER.info(f"\n{'':>3}{'from':>20}{'n':>3}{'params':>10}  {'module':<45}{'arguments':<30}")
    ch = [ch]
    layers, save, c2 = [], [], ch[-1]  # layers, savelist, ch out
    # Modules whose args are rewritten to [c1, c2, *rest] with c2 scaled by the width multiplier
    base_modules = frozenset(
        {
            Classify,
            Conv,
            ConvTranspose,
            GhostConv,
            Bottleneck,
            GhostBottleneck,
            SPP,
            SPPF,
            C2fPSA,
            C2PSA,
            DWConv,
            Focus,
            BottleneckCSP,
            C1,
            C2,
            C2f,
            C3k2,
            RepNCSPELAN4,
            ELAN1,
            ADown,
            AConv,
            SPPELAN,
            C2fAttn,
            C3,
            C3TR,
            C3Ghost,
            torch.nn.ConvTranspose2d,
            DWConvTranspose2d,
            C3x,
            RepC3,
            PSA,
            SCDown,
            C2fCIB,
            A2C2f,
        }
    )
    repeat_modules = frozenset(  # modules with 'repeat' arguments
        {
            BottleneckCSP,
            C1,
            C2,
            C2f,
            C3k2,
            C2fAttn,
            C3,
            C3TR,
            C3Ghost,
            C3x,
            RepC3,
            C2fPSA,
            C2fCIB,
            C2PSA,
            A2C2f,
        }
    )
    for i, (f, n, m, args) in enumerate(d["backbone"] + d["head"]):  # from, number, module, args
        # Resolve the module name: 'nn.X' -> torch.nn.X, 'torchvision.ops.X' -> torchvision.ops.X, else a global
        m = (
            getattr(torch.nn, m[3:])
            if "nn." in m
            else getattr(__import__("torchvision").ops, m[16:])
            if "torchvision.ops." in m
            else globals()[m]
        )  # get module
        # String args become the local of the same name if one exists, otherwise a parsed Python literal;
        # strings that are neither are left unchanged (ValueError suppressed)
        for j, a in enumerate(args):
            if isinstance(a, str):
                with contextlib.suppress(ValueError):
                    args[j] = locals()[a] if a in locals() else ast.literal_eval(a)
        n = n_ = max(round(n * depth), 1) if n > 1 else n  # depth gain
        if m in base_modules:
            c1, c2 = ch[f], args[0]
            if c2 != nc:  # if c2 not equal to number of classes (i.e. for Classify() output)
                c2 = make_divisible(min(c2, max_channels) * width, 8)
            if m is C2fAttn:  # set 1) embed channels and 2) num heads
                args[1] = make_divisible(min(args[1], max_channels // 2) * width, 8)
                args[2] = int(max(round(min(args[2], max_channels // 2 // 32)) * width, 1) if args[2] > 1 else args[2])

            args = [c1, c2, *args[1:]]
            if m in repeat_modules:
                args.insert(2, n)  # number of repeats
                n = 1
            if m is C3k2:  # for M/L/X sizes
                legacy = False
                # Guard on truthiness: `scale` may be None/"" when no scales table exists, and "" in "mlx" is True
                if scale and scale in "mlx":
                    args[3] = True
            if m is A2C2f:
                legacy = False
                if scale and scale in "lx":  # for L/X sizes
                    args.extend((True, 1.2))
            if m is C2fCIB:
                legacy = False
        elif m is AIFI:
            args = [ch[f], *args]
        elif m in frozenset({HGStem, HGBlock}):
            c1, cm, c2 = ch[f], args[0], args[1]
            args = [c1, cm, c2, *args[2:]]
            if m is HGBlock:
                args.insert(4, n)  # number of repeats
                n = 1
        elif m is ResNetLayer:
            c2 = args[1] if args[3] else args[1] * 4
        elif m is torch.nn.BatchNorm2d:
            args = [ch[f]]
        elif m is Concat:
            c2 = sum(ch[x] for x in f)
        elif m in frozenset(
            {Detect, WorldDetect, YOLOEDetect, Segment, YOLOESegment, Pose, OBB, ImagePoolingAttn, v10Detect}
        ):
            args.append([ch[x] for x in f])  # per-input channel counts go last
            if m is Segment or m is YOLOESegment:
                args[2] = make_divisible(min(args[2], max_channels) * width, 8)
            if m in {Detect, YOLOEDetect, Segment, YOLOESegment, Pose, OBB}:
                m.legacy = legacy  # set on the class itself, not on an instance
        elif m is RTDETRDecoder:  # special case, channels arg must be passed in index 1
            args.insert(1, [ch[x] for x in f])
        elif m is CBLinear:
            c2 = args[0]
            c1 = ch[f]
            args = [c1, c2, *args[1:]]
        elif m is CBFuse:
            c2 = ch[f[-1]]
        elif m in frozenset({TorchVision, Index}):
            c2 = args[0]
            c1 = ch[f]
            args = [*args[1:]]
        else:
            c2 = ch[f]  # output channels equal the input layer's channels

        m_ = torch.nn.Sequential(*(m(*args) for _ in range(n))) if n > 1 else m(*args)  # module
        t = str(m)[8:-2].replace("__main__.", "")  # module type
        m_.np = sum(x.numel() for x in m_.parameters())  # number params
        m_.i, m_.f, m_.type = i, f, t  # attach index, 'from' index, type
        if verbose:
            LOGGER.info(f"{i:>3}{str(f):>20}{n_:>3}{m_.np:10.0f}  {t:<45}{str(args):<30}")  # print
        save.extend(x % i for x in ([f] if isinstance(f, int) else f) if x != -1)  # append to savelist
        layers.append(m_)
        if i == 0:
            ch = []  # drop the input-channel entry so ch[k] is the output width of layer k
        ch.append(c2)
    return torch.nn.Sequential(*layers), sorted(save)





ultralytics.nn.tasks.yaml_model_load

yaml_model_load(path)

Load a YOLO model configuration dictionary from a YAML file.

Parameters:

Name Type Description Default
path str | Path

Path to the YAML file.

required

Returns:

Type Description
dict

Model dictionary.

Source code in ultralytics/nn/tasks.py
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
def yaml_model_load(path):
    """
    Read a YOLO model YAML file into a dictionary and record its scale and file path.

    Legacy P6 names such as 'yolov8n6' are renamed to the '-p6' form first. The scale letter is then stripped from
    the path to try a shared YAML (i.e. yolov8x.yaml -> yolov8.yaml) before falling back to the given path.

    Args:
        path (str | Path): Path to the YAML file.

    Returns:
        (dict): Model dictionary.
    """
    path = Path(path)

    # Old-style P6 stems for YOLOv5/YOLOv8, e.g. 'yolov5n6', 'yolov8x6'
    legacy_p6_stems = {f"yolov{d}{x}6" for x in "nsmlx" for d in (5, 8)}
    if path.stem in legacy_p6_stems:
        new_stem = re.sub(r"(\d+)([nslmx])6(.+)?$", r"\1\2-p6\3", path.stem)
        LOGGER.warning(f"Ultralytics YOLO P6 models now use -p6 suffix. Renaming {path.stem} to {new_stem}.")
        path = path.with_name(new_stem + path.suffix)

    # Drop the scale letter to find the shared config; fall back to the exact path if that lookup returns nothing
    unified_path = re.sub(r"(\d+)([nslmx])(.+)?$", r"\1\3", str(path))  # i.e. yolov8x.yaml -> yolov8.yaml
    yaml_file = check_yaml(unified_path, hard=False)
    if not yaml_file:
        yaml_file = check_yaml(path)

    model_dict = YAML.load(yaml_file)
    model_dict["scale"] = guess_model_scale(path)
    model_dict["yaml_file"] = str(path)
    return model_dict





ultralytics.nn.tasks.guess_model_scale

guess_model_scale(model_path)

Extract the size character n, s, m, l, or x of the model's scale from the model path.

Parameters:

Name Type Description Default
model_path str | Path

The path to the YOLO model's YAML file.

required

Returns:

Type Description
str

The size character of the model's scale (n, s, m, l, or x).

Source code in ultralytics/nn/tasks.py
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
def guess_model_scale(model_path):
    """
    Return the scale letter (n, s, m, l or x) found in a YOLO model file name.

    Only the file stem is inspected; the letter must directly follow the version digits of a 'yolo...' name.

    Args:
        model_path (str | Path): The path to the YOLO model's YAML file.

    Returns:
        (str): The size character of the model's scale (n, s, m, l, or x), or an empty string if none is found.
    """
    stem = Path(model_path).stem
    found = re.search(r"yolo(e-)?[v]?\d+([nslmx])", stem)
    if found is None:
        return ""
    return found.group(2)





ultralytics.nn.tasks.guess_model_task

guess_model_task(model)

Guess the task of a PyTorch model from its architecture or configuration.

Parameters:

Name Type Description Default
model Module | dict

PyTorch model or model configuration in YAML format.

required

Returns:

Type Description
str

Task of the model ('detect', 'segment', 'classify', 'pose', 'obb').

Source code in ultralytics/nn/tasks.py
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
def guess_model_task(model):
    """
    Guess the task of a PyTorch model from its architecture or configuration.

    Heuristics are tried in order and the first one that yields a task wins: the YAML dict's last head module,
    the `args["task"]` / `yaml` attributes found on the model or on up to two nested `.model` attributes, the head
    module types inside the network, and finally hints in a file name or path. If nothing matches, a warning is
    logged and 'detect' is assumed.

    Args:
        model (torch.nn.Module | dict): PyTorch model or model configuration in YAML format. A str or Path to a
            model file is also accepted.

    Returns:
        (str): Task of the model ('detect', 'segment', 'classify', 'pose', 'obb').
    """
    from operator import attrgetter

    def cfg2task(cfg):
        """Guess from YAML dictionary; returns None when the output module name is not recognised."""
        m = cfg["head"][-1][-2].lower()  # output module name
        if m in {"classify", "classifier", "cls", "fc"}:
            return "classify"
        if "detect" in m:
            return "detect"
        if "segment" in m:
            return "segment"
        if m == "pose":
            return "pose"
        if m == "obb":
            return "obb"
        return None

    # Guess from model cfg
    if isinstance(model, dict):
        with contextlib.suppress(Exception):
            task = cfg2task(model)
            if task:  # an unrecognised head falls through to the default instead of returning None
                return task
    # Guess from PyTorch model
    if isinstance(model, torch.nn.Module):  # PyTorch model
        nesting = ("", "model.", "model.model.")  # look on the model itself and up to two nested .model levels
        for prefix in nesting:
            with contextlib.suppress(Exception):
                task = attrgetter(f"{prefix}args")(model)["task"]
                if task:
                    return task
        for prefix in nesting:
            with contextlib.suppress(Exception):
                task = cfg2task(attrgetter(f"{prefix}yaml")(model))
                if task:
                    return task
        for m in model.modules():
            if isinstance(m, (Segment, YOLOESegment)):
                return "segment"
            elif isinstance(m, Classify):
                return "classify"
            elif isinstance(m, Pose):
                return "pose"
            elif isinstance(m, OBB):
                return "obb"
            elif isinstance(m, (Detect, WorldDetect, YOLOEDetect, v10Detect)):
                return "detect"

    # Guess from model filename
    if isinstance(model, (str, Path)):
        model = Path(model)
        if "-seg" in model.stem or "segment" in model.parts:
            return "segment"
        elif "-cls" in model.stem or "classify" in model.parts:
            return "classify"
        elif "-pose" in model.stem or "pose" in model.parts:
            return "pose"
        elif "-obb" in model.stem or "obb" in model.parts:
            return "obb"
        elif "detect" in model.parts:
            return "detect"

    # Unable to determine task from model
    LOGGER.warning(
        "Unable to automatically guess model task, assuming 'task=detect'. "
        "Explicitly define task for your model, i.e. 'task=detect', 'segment', 'classify','pose' or 'obb'."
    )
    return "detect"  # assume detect





📅 Created 1 year ago ✏️ Updated 2 months ago