YOLOv5 v5.0 release compatibility update for YOLOv3

2021-05-30 18:55:56 +02:00
parent 47ac6833ca
commit 4d0c2e6eee
38 changed files with 1192 additions and 528 deletions
@@ -13,8 +13,8 @@ from PIL import Image
 from torch.cuda import amp

 from utils.datasets import letterbox
-from utils.general import non_max_suppression, make_divisible, scale_coords, increment_path, xyxy2xywh
-from utils.plots import color_list, plot_one_box
+from utils.general import non_max_suppression, make_divisible, scale_coords, increment_path, xyxy2xywh, save_one_box
+from utils.plots import colors, plot_one_box
 from utils.torch_utils import time_synchronized


@@ -215,32 +215,34 @@ class NMS(nn.Module):
    conf = 0.25  # confidence threshold
    iou = 0.45  # IoU threshold
    classes = None  # (optional list) filter by class
+    max_det = 1000  # maximum number of detections per image

    def __init__(self):
        super(NMS, self).__init__()

    def forward(self, x):
-        return non_max_suppression(x[0], conf_thres=self.conf, iou_thres=self.iou, classes=self.classes)
+        return non_max_suppression(x[0], self.conf, iou_thres=self.iou, classes=self.classes, max_det=self.max_det)


-class autoShape(nn.Module):
+class AutoShape(nn.Module):
    # input-robust model wrapper for passing cv2/np/PIL/torch inputs. Includes preprocessing, inference and NMS
    conf = 0.25  # NMS confidence threshold
    iou = 0.45  # NMS IoU threshold
    classes = None  # (optional list) filter by class
+    max_det = 1000  # maximum number of detections per image

    def __init__(self, model):
-        super(autoShape, self).__init__()
+        super(AutoShape, self).__init__()
        self.model = model.eval()

    def autoshape(self):
-        print('autoShape already enabled, skipping... ')  # model already converted to model.autoshape()
+        print('AutoShape already enabled, skipping... ')  # model already converted to model.autoshape()
        return self

    @torch.no_grad()
    def forward(self, imgs, size=640, augment=False, profile=False):
        # Inference from various sources. For height=640, width=1280, RGB images example inputs are:
-        #   filename:   imgs = 'data/samples/zidane.jpg'
+        #   filename:   imgs = 'data/images/zidane.jpg'
        #   URI:             = 'https://github.com/ultralytics/yolov5/releases/download/v1.0/zidane.jpg'
        #   OpenCV:          = cv2.imread('image.jpg')[:,:,::-1]  # HWC BGR to RGB x(640,1280,3)
        #   PIL:             = Image.open('image.jpg')  # HWC x(640,1280,3)
@@ -271,7 +273,7 @@ class autoShape(nn.Module):
            shape0.append(s)  # image shape
            g = (size / max(s))  # gain
            shape1.append([y * g for y in s])
-            imgs[i] = im  # update
+            imgs[i] = im if im.data.contiguous else np.ascontiguousarray(im)  # update
        shape1 = [make_divisible(x, int(self.stride.max())) for x in np.stack(shape1, 0).max(0)]  # inference shape
        x = [letterbox(im, new_shape=shape1, auto=False)[0] for im in imgs]  # pad
        x = np.stack(x, 0) if n > 1 else x[0][None]  # stack
@@ -285,7 +287,7 @@ class autoShape(nn.Module):
            t.append(time_synchronized())

            # Post-process
-            y = non_max_suppression(y, conf_thres=self.conf, iou_thres=self.iou, classes=self.classes)  # NMS
+            y = non_max_suppression(y, self.conf, iou_thres=self.iou, classes=self.classes, max_det=self.max_det)  # NMS
            for i in range(n):
                scale_coords(shape1, y[i][:, :4], shape0[i])

@@ -311,29 +313,32 @@ class Detections:
        self.t = tuple((times[i + 1] - times[i]) * 1000 / self.n for i in range(3))  # timestamps (ms)
        self.s = shape  # inference BCHW shape

-    def display(self, pprint=False, show=False, save=False, render=False, save_dir=''):
-        colors = color_list()
-        for i, (img, pred) in enumerate(zip(self.imgs, self.pred)):
-            str = f'image {i + 1}/{len(self.pred)}: {img.shape[0]}x{img.shape[1]} '
+    def display(self, pprint=False, show=False, save=False, crop=False, render=False, save_dir=Path('')):
+        for i, (im, pred) in enumerate(zip(self.imgs, self.pred)):
+            str = f'image {i + 1}/{len(self.pred)}: {im.shape[0]}x{im.shape[1]} '
            if pred is not None:
                for c in pred[:, -1].unique():
                    n = (pred[:, -1] == c).sum()  # detections per class
                    str += f"{n} {self.names[int(c)]}{'s' * (n > 1)}, "  # add to string
-                if show or save or render:
+                if show or save or render or crop:
                    for *box, conf, cls in pred:  # xyxy, confidence, class
                        label = f'{self.names[int(cls)]} {conf:.2f}'
-                        plot_one_box(box, img, label=label, color=colors[int(cls) % 10])
-            img = Image.fromarray(img.astype(np.uint8)) if isinstance(img, np.ndarray) else img  # from np
+                        if crop:
+                            save_one_box(box, im, file=save_dir / 'crops' / self.names[int(cls)] / self.files[i])
+                        else:  # all others
+                            plot_one_box(box, im, label=label, color=colors(cls))
+
+            im = Image.fromarray(im.astype(np.uint8)) if isinstance(im, np.ndarray) else im  # from np
            if pprint:
                print(str.rstrip(', '))
            if show:
-                img.show(self.files[i])  # show
+                im.show(self.files[i])  # show
            if save:
                f = self.files[i]
-                img.save(Path(save_dir) / f)  # save
+                im.save(save_dir / f)  # save
                print(f"{'Saved' * (i == 0)} {f}", end=',' if i < self.n - 1 else f' to {save_dir}\n')
            if render:
-                self.imgs[i] = np.asarray(img)
+                self.imgs[i] = np.asarray(im)

    def print(self):
        self.display(pprint=True)  # print results
@@ -343,10 +348,14 @@ class Detections:
        self.display(show=True)  # show results

    def save(self, save_dir='runs/hub/exp'):
-        save_dir = increment_path(save_dir, exist_ok=save_dir != 'runs/hub/exp')  # increment save_dir
-        Path(save_dir).mkdir(parents=True, exist_ok=True)
+        save_dir = increment_path(save_dir, exist_ok=save_dir != 'runs/hub/exp', mkdir=True)  # increment save_dir
        self.display(save=True, save_dir=save_dir)  # save results

+    def crop(self, save_dir='runs/hub/exp'):
+        save_dir = increment_path(save_dir, exist_ok=save_dir != 'runs/hub/exp', mkdir=True)  # increment save_dir
+        self.display(crop=True, save_dir=save_dir)  # crop results
+        print(f'Saved results to {save_dir}\n')
+
    def render(self):
        self.display(render=True)  # render results
        return self.imgs
@@ -110,25 +110,27 @@ class Ensemble(nn.ModuleList):
        return y, None  # inference, train output


-def attempt_load(weights, map_location=None):
+def attempt_load(weights, map_location=None, inplace=True):
+    from models.yolo import Detect, Model
+
    # Loads an ensemble of models weights=[a,b,c] or a single model weights=[a] or weights=a
    model = Ensemble()
    for w in weights if isinstance(weights, list) else [weights]:
-        attempt_download(w)
-        ckpt = torch.load(w, map_location=map_location)  # load
+        ckpt = torch.load(attempt_download(w), map_location=map_location)  # load
        model.append(ckpt['ema' if ckpt.get('ema') else 'model'].float().fuse().eval())  # FP32 model

    # Compatibility updates
    for m in model.modules():
-        if type(m) in [nn.Hardswish, nn.LeakyReLU, nn.ReLU, nn.ReLU6, nn.SiLU]:
-            m.inplace = True  # pytorch 1.7.0 compatibility
+        if type(m) in [nn.Hardswish, nn.LeakyReLU, nn.ReLU, nn.ReLU6, nn.SiLU, Detect, Model]:
+            m.inplace = inplace  # pytorch 1.7.0 compatibility
        elif type(m) is Conv:
            m._non_persistent_buffers_set = set()  # pytorch 1.6.0 compatibility

    if len(model) == 1:
        return model[-1]  # return model
    else:
-        print('Ensemble created with %s\n' % weights)
-        for k in ['names', 'stride']:
+        print(f'Ensemble created with {weights}\n')
+        for k in ['names']:
            setattr(model, k, getattr(model[-1], k))
+        model.stride = model[torch.argmax(torch.tensor([m.stride.max() for m in model])).int()].stride  # max stride
        return model  # return ensemble
@@ -1,34 +1,43 @@
-"""Exports a YOLOv3 *.pt model to ONNX and TorchScript formats
+"""Exports a YOLOv3 *.pt model to TorchScript, ONNX, CoreML formats

 Usage:
-    $ export PYTHONPATH="$PWD" && python models/export.py --weights ./weights/yolov3.pt --img 640 --batch 1
+    $ python path/to/models/export.py --weights yolov3.pt --img 640 --batch 1
 """

 import argparse
 import sys
 import time
+from pathlib import Path

-sys.path.append('./')  # to run '$ python *.py' files in subdirectories
+sys.path.append(Path(__file__).parent.parent.absolute().__str__())  # to run '$ python *.py' files in subdirectories

 import torch
 import torch.nn as nn
+from torch.utils.mobile_optimizer import optimize_for_mobile

 import models
 from models.experimental import attempt_load
 from utils.activations import Hardswish, SiLU
-from utils.general import set_logging, check_img_size
+from utils.general import colorstr, check_img_size, check_requirements, file_size, set_logging
 from utils.torch_utils import select_device

 if __name__ == '__main__':
    parser = argparse.ArgumentParser()
-    parser.add_argument('--weights', type=str, default='./yolov3.pt', help='weights path')  # from yolov3/models/
+    parser.add_argument('--weights', type=str, default='./yolov3.pt', help='weights path')
    parser.add_argument('--img-size', nargs='+', type=int, default=[640, 640], help='image size')  # height, width
    parser.add_argument('--batch-size', type=int, default=1, help='batch size')
-    parser.add_argument('--dynamic', action='store_true', help='dynamic ONNX axes')
-    parser.add_argument('--grid', action='store_true', help='export Detect() layer grid')
    parser.add_argument('--device', default='cpu', help='cuda device, i.e. 0 or 0,1,2,3 or cpu')
+    parser.add_argument('--include', nargs='+', default=['torchscript', 'onnx', 'coreml'], help='include formats')
+    parser.add_argument('--half', action='store_true', help='FP16 half-precision export')
+    parser.add_argument('--inplace', action='store_true', help='set YOLOv3 Detect() inplace=True')
+    parser.add_argument('--train', action='store_true', help='model.train() mode')
+    parser.add_argument('--optimize', action='store_true', help='optimize TorchScript for mobile')  # TorchScript-only
+    parser.add_argument('--dynamic', action='store_true', help='dynamic ONNX axes')  # ONNX-only
+    parser.add_argument('--simplify', action='store_true', help='simplify ONNX model')  # ONNX-only
+    parser.add_argument('--opset-version', type=int, default=12, help='ONNX opset version')  # ONNX-only
    opt = parser.parse_args()
    opt.img_size *= 2 if len(opt.img_size) == 1 else 1  # expand
+    opt.include = [x.lower() for x in opt.include]
    print(opt)
    set_logging()
    t = time.time()
@@ -41,11 +50,16 @@ if __name__ == '__main__':
    # Checks
    gs = int(max(model.stride))  # grid size (max stride)
    opt.img_size = [check_img_size(x, gs) for x in opt.img_size]  # verify img_size are gs-multiples
+    assert not (opt.device.lower() == 'cpu' and opt.half), '--half only compatible with GPU export, i.e. use --device 0'

    # Input
    img = torch.zeros(opt.batch_size, 3, *opt.img_size).to(device)  # image size(1,3,320,192) iDetection

    # Update model
+    if opt.half:
+        img, model = img.half(), model.half()  # to FP16
+    if opt.train:
+        model.train()  # training mode (no grid construction in Detect layer)
    for k, m in model.named_modules():
        m._non_persistent_buffers_set = set()  # pytorch 1.6.0 compatibility
        if isinstance(m, models.common.Conv):  # assign export-friendly activations
@@ -53,52 +67,79 @@ if __name__ == '__main__':
                m.act = Hardswish()
            elif isinstance(m.act, nn.SiLU):
                m.act = SiLU()
-        # elif isinstance(m, models.yolo.Detect):
-        #     m.forward = m.forward_export  # assign forward (optional)
-    model.model[-1].export = not opt.grid  # set Detect() layer grid export
-    y = model(img)  # dry run
+        elif isinstance(m, models.yolo.Detect):
+            m.inplace = opt.inplace
+            m.onnx_dynamic = opt.dynamic
+            # m.forward = m.forward_export  # assign forward (optional)

-    # TorchScript export
-    try:
-        print('\nStarting TorchScript export with torch %s...' % torch.__version__)
-        f = opt.weights.replace('.pt', '.torchscript.pt')  # filename
-        ts = torch.jit.trace(model, img, strict=False)
-        ts.save(f)
-        print('TorchScript export success, saved as %s' % f)
-    except Exception as e:
-        print('TorchScript export failure: %s' % e)
+    for _ in range(2):
+        y = model(img)  # dry runs
+    print(f"\n{colorstr('PyTorch:')} starting from {opt.weights} ({file_size(opt.weights):.1f} MB)")

-    # ONNX export
-    try:
-        import onnx
+    # TorchScript export -----------------------------------------------------------------------------------------------
+    if 'torchscript' in opt.include or 'coreml' in opt.include:
+        prefix = colorstr('TorchScript:')
+        try:
+            print(f'\n{prefix} starting export with torch {torch.__version__}...')
+            f = opt.weights.replace('.pt', '.torchscript.pt')  # filename
+            ts = torch.jit.trace(model, img, strict=False)
+            (optimize_for_mobile(ts) if opt.optimize else ts).save(f)
+            print(f'{prefix} export success, saved as {f} ({file_size(f):.1f} MB)')
+        except Exception as e:
+            print(f'{prefix} export failure: {e}')

-        print('\nStarting ONNX export with onnx %s...' % onnx.__version__)
-        f = opt.weights.replace('.pt', '.onnx')  # filename
-        torch.onnx.export(model, img, f, verbose=False, opset_version=12, input_names=['images'],
-                          output_names=['classes', 'boxes'] if y is None else ['output'],
-                          dynamic_axes={'images': {0: 'batch', 2: 'height', 3: 'width'},  # size(1,3,640,640)
-                                        'output': {0: 'batch', 2: 'y', 3: 'x'}} if opt.dynamic else None)
+    # ONNX export ------------------------------------------------------------------------------------------------------
+    if 'onnx' in opt.include:
+        prefix = colorstr('ONNX:')
+        try:
+            import onnx

-        # Checks
-        onnx_model = onnx.load(f)  # load onnx model
-        onnx.checker.check_model(onnx_model)  # check onnx model
-        # print(onnx.helper.printable_graph(onnx_model.graph))  # print a human readable model
-        print('ONNX export success, saved as %s' % f)
-    except Exception as e:
-        print('ONNX export failure: %s' % e)
+            print(f'{prefix} starting export with onnx {onnx.__version__}...')
+            f = opt.weights.replace('.pt', '.onnx')  # filename
+            torch.onnx.export(model, img, f, verbose=False, opset_version=opt.opset_version, input_names=['images'],
+                              training=torch.onnx.TrainingMode.TRAINING if opt.train else torch.onnx.TrainingMode.EVAL,
+                              do_constant_folding=not opt.train,
+                              dynamic_axes={'images': {0: 'batch', 2: 'height', 3: 'width'},  # size(1,3,640,640)
+                                            'output': {0: 'batch', 2: 'y', 3: 'x'}} if opt.dynamic else None)

-    # CoreML export
-    try:
-        import coremltools as ct
+            # Checks
+            model_onnx = onnx.load(f)  # load onnx model
+            onnx.checker.check_model(model_onnx)  # check onnx model
+            # print(onnx.helper.printable_graph(model_onnx.graph))  # print

-        print('\nStarting CoreML export with coremltools %s...' % ct.__version__)
-        # convert model from torchscript and apply pixel scaling as per detect.py
-        model = ct.convert(ts, inputs=[ct.ImageType(name='image', shape=img.shape, scale=1 / 255.0, bias=[0, 0, 0])])
-        f = opt.weights.replace('.pt', '.mlmodel')  # filename
-        model.save(f)
-        print('CoreML export success, saved as %s' % f)
-    except Exception as e:
-        print('CoreML export failure: %s' % e)
+            # Simplify
+            if opt.simplify:
+                try:
+                    check_requirements(['onnx-simplifier'])
+                    import onnxsim
+
+                    print(f'{prefix} simplifying with onnx-simplifier {onnxsim.__version__}...')
+                    model_onnx, check = onnxsim.simplify(
+                        model_onnx,
+                        dynamic_input_shape=opt.dynamic,
+                        input_shapes={'images': list(img.shape)} if opt.dynamic else None)
+                    assert check, 'assert check failed'
+                    onnx.save(model_onnx, f)
+                except Exception as e:
+                    print(f'{prefix} simplifier failure: {e}')
+            print(f'{prefix} export success, saved as {f} ({file_size(f):.1f} MB)')
+        except Exception as e:
+            print(f'{prefix} export failure: {e}')
+
+    # CoreML export ----------------------------------------------------------------------------------------------------
+    if 'coreml' in opt.include:
+        prefix = colorstr('CoreML:')
+        try:
+            import coremltools as ct
+
+            print(f'{prefix} starting export with coremltools {ct.__version__}...')
+            assert opt.train, 'CoreML exports should be placed in model.train() mode with `python export.py --train`'
+            model = ct.convert(ts, inputs=[ct.ImageType('image', shape=img.shape, scale=1 / 255.0, bias=[0, 0, 0])])
+            f = opt.weights.replace('.pt', '.mlmodel')  # filename
+            model.save(f)
+            print(f'{prefix} export success, saved as {f} ({file_size(f):.1f} MB)')
+        except Exception as e:
+            print(f'{prefix} export failure: {e}')

    # Finish
-    print('\nExport complete (%.2fs). Visualize with https://github.com/lutzroeder/netron.' % (time.time() - t))
+    print(f'\nExport complete ({time.time() - t:.2f}s). Visualize with https://github.com/lutzroeder/netron.')
@@ -1,11 +1,16 @@
-# YOLOv3 YOLO-specific modules
+"""YOLOv3-specific modules
+
+Usage:
+    $ python path/to/models/yolo.py --cfg yolov3.yaml
+"""

 import argparse
 import logging
 import sys
 from copy import deepcopy
+from pathlib import Path

-sys.path.append('./')  # to run '$ python *.py' files in subdirectories
+sys.path.append(Path(__file__).parent.parent.absolute().__str__())  # to run '$ python *.py' files in subdirectories
 logger = logging.getLogger(__name__)

 from models.common import *
@@ -23,9 +28,9 @@ except ImportError:

 class Detect(nn.Module):
    stride = None  # strides computed during build
-    export = False  # onnx export
+    onnx_dynamic = False  # ONNX export parameter

-    def __init__(self, nc=80, anchors=(), ch=()):  # detection layer
+    def __init__(self, nc=80, anchors=(), ch=(), inplace=True):  # detection layer
        super(Detect, self).__init__()
        self.nc = nc  # number of classes
        self.no = nc + 5  # number of outputs per anchor
@@ -36,23 +41,28 @@ class Detect(nn.Module):
        self.register_buffer('anchors', a)  # shape(nl,na,2)
        self.register_buffer('anchor_grid', a.clone().view(self.nl, 1, -1, 1, 1, 2))  # shape(nl,1,na,1,1,2)
        self.m = nn.ModuleList(nn.Conv2d(x, self.no * self.na, 1) for x in ch)  # output conv
+        self.inplace = inplace  # use in-place ops (e.g. slice assignment)

    def forward(self, x):
        # x = x.copy()  # for profiling
        z = []  # inference output
-        self.training |= self.export
        for i in range(self.nl):
            x[i] = self.m[i](x[i])  # conv
            bs, _, ny, nx = x[i].shape  # x(bs,255,20,20) to x(bs,3,20,20,85)
            x[i] = x[i].view(bs, self.na, self.no, ny, nx).permute(0, 1, 3, 4, 2).contiguous()

            if not self.training:  # inference
-                if self.grid[i].shape[2:4] != x[i].shape[2:4]:
+                if self.grid[i].shape[2:4] != x[i].shape[2:4] or self.onnx_dynamic:
                    self.grid[i] = self._make_grid(nx, ny).to(x[i].device)

                y = x[i].sigmoid()
-                y[..., 0:2] = (y[..., 0:2] * 2. - 0.5 + self.grid[i]) * self.stride[i]  # xy
-                y[..., 2:4] = (y[..., 2:4] * 2) ** 2 * self.anchor_grid[i]  # wh
+                if self.inplace:
+                    y[..., 0:2] = (y[..., 0:2] * 2. - 0.5 + self.grid[i]) * self.stride[i]  # xy
+                    y[..., 2:4] = (y[..., 2:4] * 2) ** 2 * self.anchor_grid[i]  # wh
+                else:  # for YOLOv5 on AWS Inferentia https://github.com/ultralytics/yolov5/pull/2953
+                    xy = (y[..., 0:2] * 2. - 0.5 + self.grid[i]) * self.stride[i]  # xy
+                    wh = (y[..., 2:4] * 2) ** 2 * self.anchor_grid[i].view(1, self.na, 1, 1, 2)  # wh
+                    y = torch.cat((xy, wh, y[..., 4:]), -1)
                z.append(y.view(bs, -1, self.no))

        return x if self.training else (torch.cat(z, 1), x)
@@ -72,7 +82,7 @@ class Model(nn.Module):
            import yaml  # for torch hub
            self.yaml_file = Path(cfg).name
            with open(cfg) as f:
-                self.yaml = yaml.load(f, Loader=yaml.SafeLoader)  # model dict
+                self.yaml = yaml.safe_load(f)  # model dict

        # Define model
        ch = self.yaml['ch'] = self.yaml.get('ch', ch)  # input channels
@@ -84,18 +94,20 @@ class Model(nn.Module):
            self.yaml['anchors'] = round(anchors)  # override yaml value
        self.model, self.save = parse_model(deepcopy(self.yaml), ch=[ch])  # model, savelist
        self.names = [str(i) for i in range(self.yaml['nc'])]  # default names
-        # print([x.shape for x in self.forward(torch.zeros(1, ch, 64, 64))])
+        self.inplace = self.yaml.get('inplace', True)
+        # logger.info([x.shape for x in self.forward(torch.zeros(1, ch, 64, 64))])

        # Build strides, anchors
        m = self.model[-1]  # Detect()
        if isinstance(m, Detect):
            s = 256  # 2x min stride
+            m.inplace = self.inplace
            m.stride = torch.tensor([s / x.shape[-2] for x in self.forward(torch.zeros(1, ch, s, s))])  # forward
            m.anchors /= m.stride.view(-1, 1, 1)
            check_anchor_order(m)
            self.stride = m.stride
            self._initialize_biases()  # only run once
-            # print('Strides: %s' % m.stride.tolist())
+            # logger.info('Strides: %s' % m.stride.tolist())

        # Init weights, biases
        initialize_weights(self)
@@ -104,24 +116,23 @@ class Model(nn.Module):

    def forward(self, x, augment=False, profile=False):
        if augment:
-            img_size = x.shape[-2:]  # height, width
-            s = [1, 0.83, 0.67]  # scales
-            f = [None, 3, None]  # flips (2-ud, 3-lr)
-            y = []  # outputs
-            for si, fi in zip(s, f):
-                xi = scale_img(x.flip(fi) if fi else x, si, gs=int(self.stride.max()))
-                yi = self.forward_once(xi)[0]  # forward
-                # cv2.imwrite(f'img_{si}.jpg', 255 * xi[0].cpu().numpy().transpose((1, 2, 0))[:, :, ::-1])  # save
-                yi[..., :4] /= si  # de-scale
-                if fi == 2:
-                    yi[..., 1] = img_size[0] - yi[..., 1]  # de-flip ud
-                elif fi == 3:
-                    yi[..., 0] = img_size[1] - yi[..., 0]  # de-flip lr
-                y.append(yi)
-            return torch.cat(y, 1), None  # augmented inference, train
+            return self.forward_augment(x)  # augmented inference, None
        else:
            return self.forward_once(x, profile)  # single-scale inference, train

+    def forward_augment(self, x):
+        img_size = x.shape[-2:]  # height, width
+        s = [1, 0.83, 0.67]  # scales
+        f = [None, 3, None]  # flips (2-ud, 3-lr)
+        y = []  # outputs
+        for si, fi in zip(s, f):
+            xi = scale_img(x.flip(fi) if fi else x, si, gs=int(self.stride.max()))
+            yi = self.forward_once(xi)[0]  # forward
+            # cv2.imwrite(f'img_{si}.jpg', 255 * xi[0].cpu().numpy().transpose((1, 2, 0))[:, :, ::-1])  # save
+            yi = self._descale_pred(yi, fi, si, img_size)
+            y.append(yi)
+        return torch.cat(y, 1), None  # augmented inference, train
+
    def forward_once(self, x, profile=False):
        y, dt = [], []  # outputs
        for m in self.model:
@@ -134,15 +145,34 @@ class Model(nn.Module):
                for _ in range(10):
                    _ = m(x)
                dt.append((time_synchronized() - t) * 100)
-                print('%10.1f%10.0f%10.1fms %-40s' % (o, m.np, dt[-1], m.type))
+                if m == self.model[0]:
+                    logger.info(f"{'time (ms)':>10s} {'GFLOPS':>10s} {'params':>10s}  {'module'}")
+                logger.info(f'{dt[-1]:10.2f} {o:10.2f} {m.np:10.0f}  {m.type}')

            x = m(x)  # run
            y.append(x if m.i in self.save else None)  # save output

        if profile:
-            print('%.1fms total' % sum(dt))
+            logger.info('%.1fms total' % sum(dt))
        return x

+    def _descale_pred(self, p, flips, scale, img_size):
+        # de-scale predictions following augmented inference (inverse operation)
+        if self.inplace:
+            p[..., :4] /= scale  # de-scale
+            if flips == 2:
+                p[..., 1] = img_size[0] - p[..., 1]  # de-flip ud
+            elif flips == 3:
+                p[..., 0] = img_size[1] - p[..., 0]  # de-flip lr
+        else:
+            x, y, wh = p[..., 0:1] / scale, p[..., 1:2] / scale, p[..., 2:4] / scale  # de-scale
+            if flips == 2:
+                y = img_size[0] - y  # de-flip ud
+            elif flips == 3:
+                x = img_size[1] - x  # de-flip lr
+            p = torch.cat((x, y, wh, p[..., 4:]), -1)
+        return p
+
    def _initialize_biases(self, cf=None):  # initialize biases into Detect(), cf is class frequency
        # https://arxiv.org/abs/1708.02002 section 3.3
        # cf = torch.bincount(torch.tensor(np.concatenate(dataset.labels, 0)[:, 0]).long(), minlength=nc) + 1.
@@ -157,15 +187,16 @@ class Model(nn.Module):
        m = self.model[-1]  # Detect() module
        for mi in m.m:  # from
            b = mi.bias.detach().view(m.na, -1).T  # conv.bias(255) to (3,85)
-            print(('%6g Conv2d.bias:' + '%10.3g' * 6) % (mi.weight.shape[1], *b[:5].mean(1).tolist(), b[5:].mean()))
+            logger.info(
+                ('%6g Conv2d.bias:' + '%10.3g' * 6) % (mi.weight.shape[1], *b[:5].mean(1).tolist(), b[5:].mean()))

    # def _print_weights(self):
    #     for m in self.model.modules():
    #         if type(m) is Bottleneck:
-    #             print('%10.3g' % (m.w.detach().sigmoid() * 2))  # shortcut weights
+    #             logger.info('%10.3g' % (m.w.detach().sigmoid() * 2))  # shortcut weights

    def fuse(self):  # fuse model Conv2d() + BatchNorm2d() layers
-        print('Fusing layers... ')
+        logger.info('Fusing layers... ')
        for m in self.model.modules():
            if type(m) is Conv and hasattr(m, 'bn'):
                m.conv = fuse_conv_and_bn(m.conv, m.bn)  # update conv
@@ -177,20 +208,20 @@ class Model(nn.Module):
    def nms(self, mode=True):  # add or remove NMS module
        present = type(self.model[-1]) is NMS  # last layer is NMS
        if mode and not present:
-            print('Adding NMS... ')
+            logger.info('Adding NMS... ')
            m = NMS()  # module
            m.f = -1  # from
            m.i = self.model[-1].i + 1  # index
            self.model.add_module(name='%s' % m.i, module=m)  # add
            self.eval()
        elif not mode and present:
-            print('Removing NMS... ')
+            logger.info('Removing NMS... ')
            self.model = self.model[:-1]  # remove
        return self

-    def autoshape(self):  # add autoShape module
-        print('Adding autoShape... ')
-        m = autoShape(self)  # wrap model
+    def autoshape(self):  # add AutoShape module
+        logger.info('Adding AutoShape... ')
+        m = AutoShape(self)  # wrap model
        copy_attr(m, self, include=('yaml', 'nc', 'hyp', 'names', 'stride'), exclude=())  # copy attributes
        return m

@@ -266,12 +297,12 @@ if __name__ == '__main__':
    model.train()

    # Profile
-    # img = torch.rand(8 if torch.cuda.is_available() else 1, 3, 640, 640).to(device)
+    # img = torch.rand(8 if torch.cuda.is_available() else 1, 3, 320, 320).to(device)
    # y = model(img, profile=True)

-    # Tensorboard
+    # Tensorboard (not working https://github.com/ultralytics/yolov5/issues/2898)
    # from torch.utils.tensorboard import SummaryWriter
-    # tb_writer = SummaryWriter()
-    # print("Run 'tensorboard --logdir=models/runs' to view tensorboard at http://localhost:6006/")
-    # tb_writer.add_graph(model.model, img)  # add model to tensorboard
+    # tb_writer = SummaryWriter('.')
+    # logger.info("Run 'tensorboard --logdir=models' to view tensorboard at http://localhost:6006/")
+    # tb_writer.add_graph(torch.jit.trace(model, img, strict=False), [])  # add model graph
    # tb_writer.add_image('test', img[0], dataformats='CWH')  # add model to tensorboard