
Reference for ultralytics/models/yolo/model.py

Note

This file is available at https://github.com/ultralytics/ultralytics/blob/main/ultralytics/models/yolo/model.py. If you spot a problem, please help fix it by contributing a Pull Request 🛠️. Thank you 🙏!


ultralytics.models.yolo.model.YOLO

YOLO(
    model: Union[str, Path] = "yolo11n.pt",
    task: Optional[str] = None,
    verbose: bool = False,
)

Bases: Model

YOLO (You Only Look Once) object detection model.

This class provides a unified interface for YOLO models, automatically switching to specialized model types (YOLOWorld or YOLOE) based on the model filename. It supports various computer vision tasks including object detection, segmentation, classification, pose estimation, and oriented bounding box detection.
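The dispatch happens at construction time based on the filename, so the returned object may be an instance of a specialized subclass. A minimal sketch (the weights file is illustrative):

>>> from ultralytics import YOLO
>>> model = YOLO("yolov8s-world.pt")  # "-world" in the filename dispatches to YOLOWorld
>>> type(model).__name__
'YOLOWorld'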

Attributes:

| Name | Description |
| --- | --- |
| model | The loaded YOLO model instance. |
| task | The task type (detect, segment, classify, pose, obb). |
| overrides | Configuration overrides for the model. |

Methods:

| Name | Description |
| --- | --- |
| task_map | Map tasks to their corresponding model, trainer, validator, and predictor classes. |

Examples:

Load a pretrained YOLO11n detection model

>>> model = YOLO("yolo11n.pt")

Load a pretrained YOLO11n segmentation model

>>> model = YOLO("yolo11n-seg.pt")

Initialize from a YAML configuration

>>> model = YOLO("yolo11n.yaml")

This constructor initializes a YOLO model, automatically switching to specialized model types (YOLOWorld or YOLOE) based on the model filename.

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| model | str \| Path | Model name or path to model file, i.e. 'yolo11n.pt', 'yolo11n.yaml'. | 'yolo11n.pt' |
| task | str, optional | YOLO task specification, i.e. 'detect', 'segment', 'classify', 'pose', 'obb'. Defaults to auto-detection based on model. | None |
| verbose | bool | Display model info on load. | False |

Examples:

>>> from ultralytics import YOLO
>>> model = YOLO("yolo11n.pt")  # load a pretrained YOLOv11n detection model
>>> model = YOLO("yolo11n-seg.pt")  # load a pretrained YOLO11n segmentation model
Source code in ultralytics/models/yolo/model.py
def __init__(self, model: Union[str, Path] = "yolo11n.pt", task: Optional[str] = None, verbose: bool = False):
    """
    Initialize a YOLO model.

    This constructor initializes a YOLO model, automatically switching to specialized model types
    (YOLOWorld or YOLOE) based on the model filename.

    Args:
        model (str | Path): Model name or path to model file, i.e. 'yolo11n.pt', 'yolo11n.yaml'.
        task (str, optional): YOLO task specification, i.e. 'detect', 'segment', 'classify', 'pose', 'obb'.
            Defaults to auto-detection based on model.
        verbose (bool): Display model info on load.

    Examples:
        >>> from ultralytics import YOLO
        >>> model = YOLO("yolo11n.pt")  # load a pretrained YOLOv11n detection model
        >>> model = YOLO("yolo11n-seg.pt")  # load a pretrained YOLO11n segmentation model
    """
    path = Path(model if isinstance(model, (str, Path)) else "")
    if "-world" in path.stem and path.suffix in {".pt", ".yaml", ".yml"}:  # if YOLOWorld PyTorch model
        new_instance = YOLOWorld(path, verbose=verbose)
        self.__class__ = type(new_instance)
        self.__dict__ = new_instance.__dict__
    elif "yoloe" in path.stem and path.suffix in {".pt", ".yaml", ".yml"}:  # if YOLOE PyTorch model
        new_instance = YOLOE(path, task=task, verbose=verbose)
        self.__class__ = type(new_instance)
        self.__dict__ = new_instance.__dict__
    else:
        # Continue with default YOLO initialization
        super().__init__(model=model, task=task, verbose=verbose)
        if hasattr(self.model, "model") and "RTDETR" in self.model.model[-1]._get_name():  # if RTDETR head
            from ultralytics import RTDETR

            new_instance = RTDETR(self)
            self.__class__ = type(new_instance)
            self.__dict__ = new_instance.__dict__

task_map property

task_map: Dict[str, Dict[str, Any]]

Map head to model, trainer, validator, and predictor classes.





ultralytics.models.yolo.model.YOLOWorld

YOLOWorld(model: Union[str, Path] = 'yolov8s-world.pt', verbose: bool = False)

Bases: Model

YOLO-World object detection model.

YOLO-World is an open-vocabulary object detection model that can detect objects based on text descriptions without requiring training on specific classes. It extends the YOLO architecture to support real-time open-vocabulary detection.

Attributes:

| Name | Description |
| --- | --- |
| model | The loaded YOLO-World model instance. |
| task | Always set to 'detect' for object detection. |
| overrides | Configuration overrides for the model. |

Methods:

| Name | Description |
| --- | --- |
| task_map | Map tasks to their corresponding model, trainer, validator, and predictor classes. |
| set_classes | Set the model's class names for detection. |

Examples:

Load a YOLOv8-World model

>>> model = YOLOWorld("yolov8s-world.pt")

Set custom classes for detection

>>> model.set_classes(["person", "car", "bicycle"])

Loads a YOLOv8-World model for object detection. If no custom class names are provided, it assigns default COCO class names.

Parameters:

Name Type Description Default
model str | Path

Path to the pre-trained model file. Supports .pt and .yaml formats.

'yolov8s-world.pt'
verbose bool

If True, prints additional information during initialization.

False
Source code in ultralytics/models/yolo/model.py
def __init__(self, model: Union[str, Path] = "yolov8s-world.pt", verbose: bool = False) -> None:
    """
    Initialize YOLOv8-World model with a pre-trained model file.

    Loads a YOLOv8-World model for object detection. If no custom class names are provided, it assigns default
    COCO class names.

    Args:
        model (str | Path): Path to the pre-trained model file. Supports *.pt and *.yaml formats.
        verbose (bool): If True, prints additional information during initialization.
    """
    super().__init__(model=model, task="detect", verbose=verbose)

    # Assign default COCO class names when there are no custom names
    if not hasattr(self.model, "names"):
        self.model.names = YAML.load(ROOT / "cfg/datasets/coco8.yaml").get("names")

task_map property

task_map: Dict[str, Dict[str, Any]]

Map head to model, validator, and predictor classes.

set_classes

set_classes(classes: List[str]) -> None

Set the model's class names for detection.

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| classes | List[str] | A list of categories, i.e. ["person"]. | required |
Source code in ultralytics/models/yolo/model.py
def set_classes(self, classes: List[str]) -> None:
    """
    Set the model's class names for detection.

    Args:
        classes (List[str]): A list of categories i.e. ["person"].
    """
    self.model.set_classes(classes)
    # Remove background if it's given
    background = " "
    if background in classes:
        classes.remove(background)
    self.model.names = classes

    # Reset method class names
    if self.predictor:
        self.predictor.model.names = classes
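
As a usage sketch, classes set here persist for subsequent predictions; the image path below is a placeholder:

>>> model = YOLOWorld("yolov8s-world.pt")
>>> model.set_classes(["person", "bicycle"])
>>> results = model.predict("image.jpg")  # detections are restricted to the custom vocabulary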





ultralytics.models.yolo.model.YOLOE

YOLOE(
    model: Union[str, Path] = "yoloe-11s-seg.pt",
    task: Optional[str] = None,
    verbose: bool = False,
)

Bases: Model

YOLOE object detection and segmentation model.

YOLOE is an enhanced YOLO model that supports both object detection and instance segmentation tasks with improved performance and additional features like visual and text positional embeddings.

Attributes:

| Name | Description |
| --- | --- |
| model | The loaded YOLOE model instance. |
| task | The task type (detect or segment). |
| overrides | Configuration overrides for the model. |

Methods:

| Name | Description |
| --- | --- |
| task_map | Map tasks to their corresponding model, trainer, validator, and predictor classes. |
| get_text_pe | Get text positional embeddings for the given texts. |
| get_visual_pe | Get visual positional embeddings for the given image and visual features. |
| set_vocab | Set vocabulary and class names for the YOLOE model. |
| get_vocab | Get vocabulary for the given class names. |
| set_classes | Set the model's class names and embeddings for detection. |
| val | Validate the model using text or visual prompts. |
| predict | Run prediction on images, videos, directories, streams, etc. |

Examples:

Load a YOLOE detection model

>>> model = YOLOE("yoloe-11s-seg.pt")

Set vocabulary and class names

>>> model.set_vocab(["person", "car", "dog"], ["person", "car", "dog"])

Predict with visual prompts

>>> prompts = {"bboxes": [[10, 20, 100, 200]], "cls": ["person"]}
>>> results = model.predict("image.jpg", visual_prompts=prompts)

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| model | str \| Path | Path to the pre-trained model file. Supports .pt and .yaml formats. | 'yoloe-11s-seg.pt' |
| task | str, optional | Task type for the model. Auto-detected if None. | None |
| verbose | bool | If True, prints additional information during initialization. | False |
Source code in ultralytics/models/yolo/model.py
def __init__(
    self, model: Union[str, Path] = "yoloe-11s-seg.pt", task: Optional[str] = None, verbose: bool = False
) -> None:
    """
    Initialize YOLOE model with a pre-trained model file.

    Args:
        model (str | Path): Path to the pre-trained model file. Supports *.pt and *.yaml formats.
        task (str, optional): Task type for the model. Auto-detected if None.
        verbose (bool): If True, prints additional information during initialization.
    """
    super().__init__(model=model, task=task, verbose=verbose)

task_map property

task_map: Dict[str, Dict[str, Any]]

Map head to model, validator, and predictor classes.

get_text_pe

get_text_pe(texts)

Get text positional embeddings for the given texts.

Source code in ultralytics/models/yolo/model.py
def get_text_pe(self, texts):
    """Get text positional embeddings for the given texts."""
    assert isinstance(self.model, YOLOEModel)
    return self.model.get_text_pe(texts)
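
A brief usage sketch (the weights file and class names are illustrative); the returned embeddings can be passed to set_classes:

>>> model = YOLOE("yoloe-11s-seg.pt")
>>> pe = model.get_text_pe(["person", "bus"])  # one embedding per class name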

get_visual_pe

get_visual_pe(img, visual)

Get visual positional embeddings for the given image and visual features.

This method extracts positional embeddings from visual features based on the input image. It requires that the model is an instance of YOLOEModel.

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| img | Tensor | Input image tensor. | required |
| visual | Tensor | Visual features extracted from the image. | required |

Returns:

| Type | Description |
| --- | --- |
| Tensor | Visual positional embeddings. |

Examples:

>>> model = YOLOE("yoloe-11s-seg.pt")
>>> img = torch.rand(1, 3, 640, 640)
>>> visual_features = torch.rand(1, 1, 80, 80)
>>> pe = model.get_visual_pe(img, visual_features)
Source code in ultralytics/models/yolo/model.py
def get_visual_pe(self, img, visual):
    """
    Get visual positional embeddings for the given image and visual features.

    This method extracts positional embeddings from visual features based on the input image. It requires
    that the model is an instance of YOLOEModel.

    Args:
        img (torch.Tensor): Input image tensor.
        visual (torch.Tensor): Visual features extracted from the image.

    Returns:
        (torch.Tensor): Visual positional embeddings.

    Examples:
        >>> model = YOLOE("yoloe-11s-seg.pt")
        >>> img = torch.rand(1, 3, 640, 640)
        >>> visual_features = torch.rand(1, 1, 80, 80)
        >>> pe = model.get_visual_pe(img, visual_features)
    """
    assert isinstance(self.model, YOLOEModel)
    return self.model.get_visual_pe(img, visual)

get_vocab

get_vocab(names)

Get vocabulary for the given class names.

Source code in ultralytics/models/yolo/model.py
def get_vocab(self, names):
    """Get vocabulary for the given class names."""
    assert isinstance(self.model, YOLOEModel)
    return self.model.get_vocab(names)

predict

predict(
    source=None,
    stream: bool = False,
    visual_prompts: Dict[str, List] = {},
    refer_image=None,
    predictor=None,
    **kwargs
)

Run prediction on images, videos, directories, streams, etc.

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| source | str \| int \| PIL.Image \| np.ndarray, optional | Source for prediction. Accepts image paths, directory paths, URL/YouTube streams, PIL images, numpy arrays, or webcam indices. | None |
| stream | bool | Whether to stream the prediction results. If True, results are yielded as a generator as they are computed. | False |
| visual_prompts | Dict[str, List] | Dictionary containing visual prompts for the model. Must include 'bboxes' and 'cls' keys when non-empty. | {} |
| refer_image | str \| PIL.Image \| np.ndarray, optional | Reference image for visual prompts. | None |
| predictor | callable, optional | Custom predictor function. If None, a predictor is automatically loaded based on the task. | None |
| **kwargs | Any | Additional keyword arguments passed to the predictor. | {} |

Returns:

| Type | Description |
| --- | --- |
| List \| generator | List of Results objects, or a generator of Results objects if stream=True. |

Examples:

>>> model = YOLOE("yoloe-11s-seg.pt")
>>> results = model.predict("path/to/image.jpg")
>>> # With visual prompts
>>> prompts = {"bboxes": [[10, 20, 100, 200]], "cls": ["person"]}
>>> results = model.predict("path/to/image.jpg", visual_prompts=prompts)
Source code in ultralytics/models/yolo/model.py
def predict(
    self,
    source=None,
    stream: bool = False,
    visual_prompts: Dict[str, List] = {},
    refer_image=None,
    predictor=None,
    **kwargs,
):
    """
    Run prediction on images, videos, directories, streams, etc.

    Args:
        source (str | int | PIL.Image | np.ndarray, optional): Source for prediction. Accepts image paths,
            directory paths, URL/YouTube streams, PIL images, numpy arrays, or webcam indices.
        stream (bool): Whether to stream the prediction results. If True, results are yielded as a
            generator as they are computed.
        visual_prompts (Dict[str, List]): Dictionary containing visual prompts for the model. Must include
            'bboxes' and 'cls' keys when non-empty.
        refer_image (str | PIL.Image | np.ndarray, optional): Reference image for visual prompts.
        predictor (callable, optional): Custom predictor function. If None, a predictor is automatically
            loaded based on the task.
        **kwargs (Any): Additional keyword arguments passed to the predictor.

    Returns:
        (List | generator): List of Results objects or generator of Results objects if stream=True.

    Examples:
        >>> model = YOLOE("yoloe-11s-seg.pt")
        >>> results = model.predict("path/to/image.jpg")
        >>> # With visual prompts
        >>> prompts = {"bboxes": [[10, 20, 100, 200]], "cls": ["person"]}
        >>> results = model.predict("path/to/image.jpg", visual_prompts=prompts)
    """
    if len(visual_prompts):
        assert "bboxes" in visual_prompts and "cls" in visual_prompts, (
            f"Expected 'bboxes' and 'cls' in visual prompts, but got {visual_prompts.keys()}"
        )
        assert len(visual_prompts["bboxes"]) == len(visual_prompts["cls"]), (
            f"Expected equal number of bounding boxes and classes, but got {len(visual_prompts['bboxes'])} and "
            f"{len(visual_prompts['cls'])} respectively"
        )
        if not isinstance(self.predictor, yolo.yoloe.YOLOEVPDetectPredictor):
            self.predictor = (predictor or yolo.yoloe.YOLOEVPDetectPredictor)(
                overrides={
                    "task": self.model.task,
                    "mode": "predict",
                    "save": False,
                    "verbose": refer_image is None,
                    "batch": 1,
                },
                _callbacks=self.callbacks,
            )

        num_cls = (
            max(len(set(c)) for c in visual_prompts["cls"])
            if isinstance(source, list) and refer_image is None  # means multiple images
            else len(set(visual_prompts["cls"]))
        )
        self.model.model[-1].nc = num_cls
        self.model.names = [f"object{i}" for i in range(num_cls)]
        self.predictor.set_prompts(visual_prompts.copy())
        self.predictor.setup_model(model=self.model)

        if refer_image is None and source is not None:
            dataset = load_inference_source(source)
            if dataset.mode in {"video", "stream"}:
                # NOTE: set the first frame as refer image for videos/streams inference
                refer_image = next(iter(dataset))[1][0]
        if refer_image is not None:
            vpe = self.predictor.get_vpe(refer_image)
            self.model.set_classes(self.model.names, vpe)
            self.task = "segment" if isinstance(self.predictor, yolo.segment.SegmentationPredictor) else "detect"
            self.predictor = None  # reset predictor
    elif isinstance(self.predictor, yolo.yoloe.YOLOEVPDetectPredictor):
        self.predictor = None  # reset predictor if no visual prompts

    return super().predict(source, stream, **kwargs)
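
When predicting on images other than the one the prompts describe, a reference image can be supplied; a hedged sketch (paths are placeholders):

>>> model = YOLOE("yoloe-11s-seg.pt")
>>> prompts = {"bboxes": [[10, 20, 100, 200]], "cls": ["person"]}  # boxes refer to the reference image
>>> results = model.predict("target.jpg", visual_prompts=prompts, refer_image="refer.jpg")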

set_classes

set_classes(classes: List[str], embeddings) -> None

Set the model's class names and embeddings for detection.

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| classes | List[str] | A list of categories, i.e. ["person"]. | required |
| embeddings | Tensor | Embeddings corresponding to the classes. | required |
Source code in ultralytics/models/yolo/model.py
def set_classes(self, classes: List[str], embeddings) -> None:
    """
    Set the model's class names and embeddings for detection.

    Args:
        classes (List[str]): A list of categories i.e. ["person"].
        embeddings (torch.Tensor): Embeddings corresponding to the classes.
    """
    assert isinstance(self.model, YOLOEModel)
    self.model.set_classes(classes, embeddings)
    # Verify no background class is present
    assert " " not in classes
    self.model.names = classes

    # Reset method class names
    if self.predictor:
        self.predictor.model.names = classes
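
In the typical text-prompt workflow the embeddings come from get_text_pe; a minimal sketch (weights file and class names are illustrative):

>>> model = YOLOE("yoloe-11s-seg.pt")
>>> names = ["person", "bus"]
>>> model.set_classes(names, model.get_text_pe(names))
>>> results = model.predict("image.jpg")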

set_vocab

set_vocab(vocab: List[str], names: List[str]) -> None

Set vocabulary and class names for the YOLOE model.

This method configures the vocabulary and class names used by the model for text processing and classification tasks. The model must be an instance of YOLOEModel.

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| vocab | List[str] | Vocabulary list containing tokens or words used by the model for text processing. | required |
| names | List[str] | List of class names that the model can detect or classify. | required |

Raises:

| Type | Description |
| --- | --- |
| AssertionError | If the model is not an instance of YOLOEModel. |

Examples:

>>> model = YOLOE("yoloe-11s-seg.pt")
>>> model.set_vocab(["person", "car", "dog"], ["person", "car", "dog"])
Source code in ultralytics/models/yolo/model.py
def set_vocab(self, vocab: List[str], names: List[str]) -> None:
    """
    Set vocabulary and class names for the YOLOE model.

    This method configures the vocabulary and class names used by the model for text processing and
    classification tasks. The model must be an instance of YOLOEModel.

    Args:
        vocab (List[str]): Vocabulary list containing tokens or words used by the model for text processing.
        names (List[str]): List of class names that the model can detect or classify.

    Raises:
        AssertionError: If the model is not an instance of YOLOEModel.

    Examples:
        >>> model = YOLOE("yoloe-11s-seg.pt")
        >>> model.set_vocab(["person", "car", "dog"], ["person", "car", "dog"])
    """
    assert isinstance(self.model, YOLOEModel)
    self.model.set_vocab(vocab, names=names)

val

val(
    validator=None,
    load_vp: bool = False,
    refer_data: Optional[str] = None,
    **kwargs
)

Validate the model using text or visual prompts.

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| validator | callable, optional | A callable validator function. If None, a default validator is loaded. | None |
| load_vp | bool | Whether to load visual prompts. If False, text prompts are used. | False |
| refer_data | str, optional | Path to the reference data for visual prompts. | None |
| **kwargs | Any | Additional keyword arguments to override default settings. | {} |

Returns:

| Type | Description |
| --- | --- |
| dict | Validation statistics containing metrics computed during validation. |

Source code in ultralytics/models/yolo/model.py
def val(
    self,
    validator=None,
    load_vp: bool = False,
    refer_data: Optional[str] = None,
    **kwargs,
):
    """
    Validate the model using text or visual prompts.

    Args:
        validator (callable, optional): A callable validator function. If None, a default validator is loaded.
        load_vp (bool): Whether to load visual prompts. If False, text prompts are used.
        refer_data (str, optional): Path to the reference data for visual prompts.
        **kwargs (Any): Additional keyword arguments to override default settings.

    Returns:
        (dict): Validation statistics containing metrics computed during validation.
    """
    custom = {"rect": not load_vp}  # method defaults
    args = {**self.overrides, **custom, **kwargs, "mode": "val"}  # highest priority args on the right

    validator = (validator or self._smart_load("validator"))(args=args, _callbacks=self.callbacks)
    validator(model=self.model, load_vp=load_vp, refer_data=refer_data)
    self.metrics = validator.metrics
    return validator.metrics
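
A minimal usage sketch, assuming an Ultralytics-format dataset YAML such as coco128-seg.yaml:

>>> model = YOLOE("yoloe-11s-seg.pt")
>>> metrics = model.val(data="coco128-seg.yaml")  # text prompts by default (load_vp=False)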




