Commit 8c5ddb5

Author: yixu.cui
Message: change some layout and open save ckp
Parent: 50f303f

1 file changed: train_aux.py (+103, -86 lines)
@@ -121,60 +121,60 @@ def train(hyp, opt, device, tb_writer=None):
        elif hasattr(v, 'weight') and isinstance(v.weight, nn.Parameter):
            pg1.append(v.weight)  # apply decay
        if hasattr(v, 'im'):
-           if hasattr(v.im, 'implicit'):
+           if hasattr(v.im, 'implicit'):
                pg0.append(v.im.implicit)
            else:
                for iv in v.im:
                    pg0.append(iv.implicit)
        if hasattr(v, 'imc'):
-           if hasattr(v.imc, 'implicit'):
+           if hasattr(v.imc, 'implicit'):
                pg0.append(v.imc.implicit)
            else:
                for iv in v.imc:
                    pg0.append(iv.implicit)
        if hasattr(v, 'imb'):
-           if hasattr(v.imb, 'implicit'):
+           if hasattr(v.imb, 'implicit'):
                pg0.append(v.imb.implicit)
            else:
                for iv in v.imb:
                    pg0.append(iv.implicit)
        if hasattr(v, 'imo'):
-           if hasattr(v.imo, 'implicit'):
+           if hasattr(v.imo, 'implicit'):
                pg0.append(v.imo.implicit)
            else:
                for iv in v.imo:
                    pg0.append(iv.implicit)
        if hasattr(v, 'ia'):
-           if hasattr(v.ia, 'implicit'):
+           if hasattr(v.ia, 'implicit'):
                pg0.append(v.ia.implicit)
            else:
                for iv in v.ia:
                    pg0.append(iv.implicit)
        if hasattr(v, 'attn'):
-           if hasattr(v.attn, 'logit_scale'):
+           if hasattr(v.attn, 'logit_scale'):
                pg0.append(v.attn.logit_scale)
-           if hasattr(v.attn, 'q_bias'):
+           if hasattr(v.attn, 'q_bias'):
                pg0.append(v.attn.q_bias)
-           if hasattr(v.attn, 'v_bias'):
+           if hasattr(v.attn, 'v_bias'):
                pg0.append(v.attn.v_bias)
-           if hasattr(v.attn, 'relative_position_bias_table'):
+           if hasattr(v.attn, 'relative_position_bias_table'):
                pg0.append(v.attn.relative_position_bias_table)
        if hasattr(v, 'rbr_dense'):
-           if hasattr(v.rbr_dense, 'weight_rbr_origin'):
+           if hasattr(v.rbr_dense, 'weight_rbr_origin'):
                pg0.append(v.rbr_dense.weight_rbr_origin)
-           if hasattr(v.rbr_dense, 'weight_rbr_avg_conv'):
+           if hasattr(v.rbr_dense, 'weight_rbr_avg_conv'):
                pg0.append(v.rbr_dense.weight_rbr_avg_conv)
-           if hasattr(v.rbr_dense, 'weight_rbr_pfir_conv'):
+           if hasattr(v.rbr_dense, 'weight_rbr_pfir_conv'):
                pg0.append(v.rbr_dense.weight_rbr_pfir_conv)
-           if hasattr(v.rbr_dense, 'weight_rbr_1x1_kxk_idconv1'):
+           if hasattr(v.rbr_dense, 'weight_rbr_1x1_kxk_idconv1'):
                pg0.append(v.rbr_dense.weight_rbr_1x1_kxk_idconv1)
-           if hasattr(v.rbr_dense, 'weight_rbr_1x1_kxk_conv2'):
+           if hasattr(v.rbr_dense, 'weight_rbr_1x1_kxk_conv2'):
                pg0.append(v.rbr_dense.weight_rbr_1x1_kxk_conv2)
-           if hasattr(v.rbr_dense, 'weight_rbr_gconv_dw'):
+           if hasattr(v.rbr_dense, 'weight_rbr_gconv_dw'):
                pg0.append(v.rbr_dense.weight_rbr_gconv_dw)
-           if hasattr(v.rbr_dense, 'weight_rbr_gconv_pw'):
+           if hasattr(v.rbr_dense, 'weight_rbr_gconv_pw'):
                pg0.append(v.rbr_dense.weight_rbr_gconv_pw)
-           if hasattr(v.rbr_dense, 'vector'):
+           if hasattr(v.rbr_dense, 'vector'):
                pg0.append(v.rbr_dense.vector)

    if opt.adam:
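Note that every -/+ pair in this hunk is character-identical once leading and trailing whitespace is stripped, so it appears to be a layout-only cleanup, consistent with the commit message. For context, the hunk sits inside the optimizer setup, which sorts parameters into groups: pg0 for norm weights and implicit/attention parameters (no weight decay), pg1 for conv/linear weights (decay applied), pg2 for biases. A minimal self-contained sketch of that grouping idea, not the repo's code:

import torch
import torch.nn as nn

def build_param_groups(model: nn.Module):
    pg0, pg1, pg2 = [], [], []  # no-decay weights, decayed weights, biases
    for v in model.modules():
        if hasattr(v, 'bias') and isinstance(v.bias, nn.Parameter):
            pg2.append(v.bias)
        if isinstance(v, nn.BatchNorm2d):
            pg0.append(v.weight)  # norm weights: no decay
        elif hasattr(v, 'weight') and isinstance(v.weight, nn.Parameter):
            pg1.append(v.weight)  # conv/linear weights: apply decay
    return pg0, pg1, pg2

model = nn.Sequential(nn.Conv2d(3, 8, 3, bias=True), nn.BatchNorm2d(8))
pg0, pg1, pg2 = build_param_groups(model)
optimizer = torch.optim.SGD(pg0, lr=0.01, momentum=0.937, nesterov=True)
optimizer.add_param_group({'params': pg1, 'weight_decay': 0.0005})  # decay only here
optimizer.add_param_group({'params': pg2})  # biases, no decay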
@@ -265,6 +265,9 @@ def train(hyp, opt, device, tb_writer=None):
        if plots:
            #plot_labels(labels, names, save_dir, loggers)
            if tb_writer:
+               # [cui] bins='auto' raised "TypeError: no loop matching the specified signature and casting was found for ufunc greater"
+               # the cause is np.greater(); fixed by downgrading numpy from 1.24.2 to 1.23.0 with pip
+               # tb_writer.add_histogram('classes', c, 0, bins='auto', max_bins=20)
                tb_writer.add_histogram('classes', c, 0)

        # Anchors
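As a hedged alternative to pinning numpy (a sketch, not this commit's fix): add_histogram forwards non-string bins to np.histogram, so passing explicit bin edges sidesteps the auto-binning path where np.greater was failing. The log directory and the random class labels below are illustrative stand-ins:

import numpy as np
from torch.utils.tensorboard import SummaryWriter

writer = SummaryWriter('runs/exp')             # hypothetical log dir
c = np.random.randint(0, 80, size=10000)       # stand-in for the class labels
edges = np.arange(int(c.max()) + 2) - 0.5      # one bin per integer class id
writer.add_histogram('classes', c, 0, bins=edges)  # explicit edges avoid bins='auto'
writer.close()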
@@ -409,18 +412,20 @@ def train(hyp, opt, device, tb_writer=None):
            final_epoch = epoch + 1 == epochs
            if not opt.notest or final_epoch:  # Calculate mAP
                wandb_logger.current_epoch = epoch + 1
-               results, maps, times = test.test(data_dict,
-                                                batch_size=batch_size * 2,
-                                                imgsz=imgsz_test,
-                                                model=ema.ema,
-                                                single_cls=opt.single_cls,
-                                                dataloader=testloader,
-                                                save_dir=save_dir,
-                                                verbose=nc < 50 and final_epoch,
-                                                plots=plots and final_epoch,
-                                                wandb_logger=wandb_logger,
-                                                compute_loss=compute_loss,
-                                                is_coco=is_coco)
+               results, maps, times = test.test(
+                   data_dict,
+                   batch_size=batch_size * 2,
+                   imgsz=imgsz_test,
+                   model=ema.ema,
+                   single_cls=opt.single_cls,
+                   dataloader=testloader,
+                   save_dir=save_dir,
+                   verbose=nc < 50 and final_epoch,
+                   plots=plots and final_epoch,
+                   wandb_logger=wandb_logger,
+                   compute_loss=compute_loss,
+                   is_coco=is_coco,
+               )

            # Write
            with open(results_file, 'a') as f:
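The call evaluates model=ema.ema, i.e. an exponential moving average of the weights rather than the raw model, which typically gives more stable mAP. A minimal sketch of that EMA pattern, assuming the usual update rule (simplified; the real ModelEMA in this codebase also ramps the decay):

import copy
import torch
import torch.nn as nn

class ModelEMA:
    """Keep an exponential moving average of model weights for evaluation."""
    def __init__(self, model: nn.Module, decay: float = 0.9999):
        self.ema = copy.deepcopy(model).eval()  # averaged copy, used at test time
        self.decay = decay
        for p in self.ema.parameters():
            p.requires_grad_(False)

    @torch.no_grad()
    def update(self, model: nn.Module):
        msd = model.state_dict()
        for k, v in self.ema.state_dict().items():
            if v.dtype.is_floating_point:
                v.mul_(self.decay).add_(msd[k].detach(), alpha=1 - self.decay)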
@@ -432,7 +437,7 @@ def train(hyp, opt, device, tb_writer=None):
            tags = ['train/box_loss', 'train/obj_loss', 'train/cls_loss',  # train loss
                    'metrics/precision', 'metrics/recall', 'metrics/mAP_0.5', 'metrics/mAP_0.5:0.95',
                    'val/box_loss', 'val/obj_loss', 'val/cls_loss',  # val loss
-                   'x/lr0', 'x/lr1', 'x/lr2']  # params
+                   'x/lr0', 'x/lr1', 'x/lr2',]  # params
            for x, tag in zip(list(mloss[:-1]) + list(results) + lr, tags):
                if tb_writer:
                    tb_writer.add_scalar(tag, x, epoch)  # tensorboard
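A self-contained sketch of the zip-and-log pattern above, with dummy values and a hypothetical log directory:

from torch.utils.tensorboard import SummaryWriter

tags = ['train/box_loss', 'metrics/mAP_0.5', 'x/lr0']
values = [0.051, 0.42, 0.0032]      # dummy numbers for illustration
writer = SummaryWriter('runs/exp')  # hypothetical log dir
for x, tag in zip(values, tags):
    writer.add_scalar(tag, x, 0)    # one scalar per tag per epoch
writer.close()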
@@ -447,27 +452,32 @@ def train(hyp, opt, device, tb_writer=None):

            # Save model
            if (not opt.nosave) or (final_epoch and not opt.evolve):  # if save
-               ckpt = {'epoch': epoch,
-                       'best_fitness': best_fitness,
-                       'training_results': results_file.read_text(),
-                       'model': deepcopy(model.module if is_parallel(model) else model).half(),
-                       'ema': deepcopy(ema.ema).half(),
-                       'updates': ema.updates,
-                       'optimizer': optimizer.state_dict(),
-                       'wandb_id': wandb_logger.wandb_run.id if wandb_logger.wandb else None}
+               ckpt = {
+                   'epoch': epoch,
+                   'best_fitness': best_fitness,
+                   'training_results': results_file.read_text(),
+                   'model': deepcopy(model.module if is_parallel(model) else model).half(),
+                   'ema': deepcopy(ema.ema).half(),
+                   'updates': ema.updates,
+                   'optimizer': optimizer.state_dict(),
+                   'wandb_id': wandb_logger.wandb_run.id if wandb_logger.wandb else None,
+               }

                # Save last, best and delete
+               print(f"'current epoch': {epoch}, ckpt epoch: {ckpt['epoch']}")
                torch.save(ckpt, last)
+               print(f"saved ckpt epoch: {ckpt['epoch']}")
                if best_fitness == fi:  # best is judged by 0.1*mAP@0.5 + 0.9*mAP@0.5:0.95
                    torch.save(ckpt, best)
                if (best_fitness == fi) and (epoch >= 200):
                    torch.save(ckpt, wdir / 'best_{:03d}.pt'.format(epoch))
-               # if epoch == 0:
-               #     torch.save(ckpt, wdir / 'epoch_{:03d}.pt'.format(epoch))
-               # elif ((epoch+1) % 25) == 0:
-               #     torch.save(ckpt, wdir / 'epoch_{:03d}.pt'.format(epoch))
-               # elif epoch >= (epochs-5):
-               #     torch.save(ckpt, wdir / 'epoch_{:03d}.pt'.format(epoch))
+               if epoch == 0:
+                   torch.save(ckpt, wdir / 'epoch_{:03d}.pt'.format(epoch))
+               elif ((epoch+1) % 30) == 0:
+                   torch.save(ckpt, wdir / 'epoch_{:03d}.pt'.format(epoch))
+               # elif epoch >= (epochs-3):
+               elif epoch >= (epochs - opt.save_tail_epochs):
+                   torch.save(ckpt, wdir / 'epoch_{:03d}.pt'.format(epoch))
                if wandb_logger.wandb:
                    if ((epoch + 1) % opt.save_period == 0 and not final_epoch) and opt.save_period != -1:
                        wandb_logger.log_model(
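Two hedged sketches tied to this hunk, both illustrative rewrites rather than the repo's code. The fitness the translated comment describes matches the usual YOLOv5/v7 weighting over [P, R, mAP@0.5, mAP@0.5:0.95]; the save schedule mirrors the newly enabled branch, with save_tail_epochs standing in for the new flag:

import numpy as np

def fitness(x: np.ndarray) -> float:
    """Weighted sum over [precision, recall, mAP@0.5, mAP@0.5:0.95]."""
    w = np.array([0.0, 0.0, 0.1, 0.9])  # 0.1*mAP@0.5 + 0.9*mAP@0.5:0.95
    return float((x[:4] * w).sum())

def should_save_numbered_ckpt(epoch: int, epochs: int, save_tail_epochs: int) -> bool:
    """epoch_###.pt is written at epoch 0, every 30th epoch, and in the tail."""
    return (epoch == 0
            or (epoch + 1) % 30 == 0
            or epoch >= epochs - save_tail_epochs)  # default 0 disables the tail

# e.g. epochs=300, save_tail_epochs=5 saves at 0, 29, 59, ..., 299 and 295-299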
@@ -488,18 +498,20 @@ def train(hyp, opt, device, tb_writer=None):
        logger.info('%g epochs completed in %.3f hours.\n' % (epoch - start_epoch + 1, (time.time() - t0) / 3600))
        if opt.data.endswith('coco.yaml') and nc == 80:  # if COCO
            for m in (last, best) if best.exists() else (last):  # speed, mAP tests
-               results, _, _ = test.test(opt.data,
-                                         batch_size=batch_size * 2,
-                                         imgsz=imgsz_test,
-                                         conf_thres=0.001,
-                                         iou_thres=0.7,
-                                         model=attempt_load(m, device).half(),
-                                         single_cls=opt.single_cls,
-                                         dataloader=testloader,
-                                         save_dir=save_dir,
-                                         save_json=True,
-                                         plots=False,
-                                         is_coco=is_coco)
+               results, _, _ = test.test(
+                   opt.data,
+                   batch_size=batch_size * 2,
+                   imgsz=imgsz_test,
+                   conf_thres=0.001,
+                   iou_thres=0.7,
+                   model=attempt_load(m, device).half(),
+                   single_cls=opt.single_cls,
+                   dataloader=testloader,
+                   save_dir=save_dir,
+                   save_json=True,
+                   plots=False,
+                   is_coco=is_coco,
+               )

        # Strip optimizers
        final = best if best.exists() else last  # final model
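The final COCO evaluation loads each checkpoint and casts it to FP16 via attempt_load(m, device).half(). A minimal sketch of that half-precision evaluation pattern, assuming a CUDA device (FP16 inference is generally unsupported or slow on CPU); the toy conv stands in for the loaded checkpoint:

import torch
import torch.nn as nn

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = nn.Conv2d(3, 8, 3).to(device).eval()  # stand-in for attempt_load(m, device)
x = torch.zeros(1, 3, 64, 64, device=device)
if device.type == 'cuda':
    model.half()  # weights to FP16
    x = x.half()  # inputs must match the weight dtype
with torch.no_grad():
    y = model(x)
print(y.dtype)    # torch.float16 on GPU, torch.float32 on CPU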
@@ -557,6 +569,7 @@ def train(hyp, opt, device, tb_writer=None):
    parser.add_argument('--bbox_interval', type=int, default=-1, help='Set bounding-box image logging interval for W&B')
    parser.add_argument('--save_period', type=int, default=-1, help='Log model after every "save_period" epoch')
    parser.add_argument('--artifact_alias', type=str, default="latest", help='version of dataset artifact to be used')
+   parser.add_argument('--save-tail-epochs', type=int, default=0, help='save a checkpoint for each of the final N epochs')
    opt = parser.parse_args()

    # Set DDP variables
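A short sketch of how the new flag flows into the tail-save condition (the flag name and default come from this diff; everything else is illustrative). Note that argparse maps the dashed option name '--save-tail-epochs' to the attribute opt.save_tail_epochs, which is what the save branch reads:

import argparse

parser = argparse.ArgumentParser()
parser.add_argument('--epochs', type=int, default=300)
parser.add_argument('--save-tail-epochs', type=int, default=0,
                    help='save a checkpoint for each of the final N epochs')
opt = parser.parse_args(['--epochs', '300', '--save-tail-epochs', '5'])
assert opt.save_tail_epochs == 5  # dashes become underscores
for epoch in range(opt.epochs):
    if epoch >= opt.epochs - opt.save_tail_epochs:  # the last 5 epochs here
        pass  # torch.save(ckpt, wdir / 'epoch_{:03d}.pt'.format(epoch)) would run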
@@ -613,34 +626,36 @@ def train(hyp, opt, device, tb_writer=None):
    # Evolve hyperparameters (optional)
    else:
        # Hyperparameter evolution metadata (mutation scale 0-1, lower_limit, upper_limit)
-       meta = {'lr0': (1, 1e-5, 1e-1),  # initial learning rate (SGD=1E-2, Adam=1E-3)
-               'lrf': (1, 0.01, 1.0),  # final OneCycleLR learning rate (lr0 * lrf)
-               'momentum': (0.3, 0.6, 0.98),  # SGD momentum/Adam beta1
-               'weight_decay': (1, 0.0, 0.001),  # optimizer weight decay
-               'warmup_epochs': (1, 0.0, 5.0),  # warmup epochs (fractions ok)
-               'warmup_momentum': (1, 0.0, 0.95),  # warmup initial momentum
-               'warmup_bias_lr': (1, 0.0, 0.2),  # warmup initial bias lr
-               'box': (1, 0.02, 0.2),  # box loss gain
-               'cls': (1, 0.2, 4.0),  # cls loss gain
-               'cls_pw': (1, 0.5, 2.0),  # cls BCELoss positive_weight
-               'obj': (1, 0.2, 4.0),  # obj loss gain (scale with pixels)
-               'obj_pw': (1, 0.5, 2.0),  # obj BCELoss positive_weight
-               'iou_t': (0, 0.1, 0.7),  # IoU training threshold
-               'anchor_t': (1, 2.0, 8.0),  # anchor-multiple threshold
-               'anchors': (2, 2.0, 10.0),  # anchors per output grid (0 to ignore)
-               'fl_gamma': (0, 0.0, 2.0),  # focal loss gamma (efficientDet default gamma=1.5)
-               'hsv_h': (1, 0.0, 0.1),  # image HSV-Hue augmentation (fraction)
-               'hsv_s': (1, 0.0, 0.9),  # image HSV-Saturation augmentation (fraction)
-               'hsv_v': (1, 0.0, 0.9),  # image HSV-Value augmentation (fraction)
-               'degrees': (1, 0.0, 45.0),  # image rotation (+/- deg)
-               'translate': (1, 0.0, 0.9),  # image translation (+/- fraction)
-               'scale': (1, 0.0, 0.9),  # image scale (+/- gain)
-               'shear': (1, 0.0, 10.0),  # image shear (+/- deg)
-               'perspective': (0, 0.0, 0.001),  # image perspective (+/- fraction), range 0-0.001
-               'flipud': (1, 0.0, 1.0),  # image flip up-down (probability)
-               'fliplr': (0, 0.0, 1.0),  # image flip left-right (probability)
-               'mosaic': (1, 0.0, 1.0),  # image mosaic (probability)
-               'mixup': (1, 0.0, 1.0)}  # image mixup (probability)
+       meta = {
+           'lr0': (1, 1e-5, 1e-1),  # initial learning rate (SGD=1E-2, Adam=1E-3)
+           'lrf': (1, 0.01, 1.0),  # final OneCycleLR learning rate (lr0 * lrf)
+           'momentum': (0.3, 0.6, 0.98),  # SGD momentum/Adam beta1
+           'weight_decay': (1, 0.0, 0.001),  # optimizer weight decay
+           'warmup_epochs': (1, 0.0, 5.0),  # warmup epochs (fractions ok)
+           'warmup_momentum': (1, 0.0, 0.95),  # warmup initial momentum
+           'warmup_bias_lr': (1, 0.0, 0.2),  # warmup initial bias lr
+           'box': (1, 0.02, 0.2),  # box loss gain
+           'cls': (1, 0.2, 4.0),  # cls loss gain
+           'cls_pw': (1, 0.5, 2.0),  # cls BCELoss positive_weight
+           'obj': (1, 0.2, 4.0),  # obj loss gain (scale with pixels)
+           'obj_pw': (1, 0.5, 2.0),  # obj BCELoss positive_weight
+           'iou_t': (0, 0.1, 0.7),  # IoU training threshold
+           'anchor_t': (1, 2.0, 8.0),  # anchor-multiple threshold
+           'anchors': (2, 2.0, 10.0),  # anchors per output grid (0 to ignore)
+           'fl_gamma': (0, 0.0, 2.0),  # focal loss gamma (efficientDet default gamma=1.5)
+           'hsv_h': (1, 0.0, 0.1),  # image HSV-Hue augmentation (fraction)
+           'hsv_s': (1, 0.0, 0.9),  # image HSV-Saturation augmentation (fraction)
+           'hsv_v': (1, 0.0, 0.9),  # image HSV-Value augmentation (fraction)
+           'degrees': (1, 0.0, 45.0),  # image rotation (+/- deg)
+           'translate': (1, 0.0, 0.9),  # image translation (+/- fraction)
+           'scale': (1, 0.0, 0.9),  # image scale (+/- gain)
+           'shear': (1, 0.0, 10.0),  # image shear (+/- deg)
+           'perspective': (0, 0.0, 0.001),  # image perspective (+/- fraction), range 0-0.001
+           'flipud': (1, 0.0, 1.0),  # image flip up-down (probability)
+           'fliplr': (0, 0.0, 1.0),  # image flip left-right (probability)
+           'mosaic': (1, 0.0, 1.0),  # image mosaic (probability)
+           'mixup': (1, 0.0, 1.0),  # image mixup (probability)
+       }

        assert opt.local_rank == -1, 'DDP mode not implemented for --evolve'
        opt.notest, opt.nosave = True, True  # only test/save final epoch
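For reference, each meta entry is (mutation gain, lower limit, upper limit): the gain scales how strongly a hyperparameter is perturbed (0 freezes it), and the limits clip the result. A hedged sketch of an evolve-style mutation step in that spirit, not a line-for-line copy of the repo's loop:

import numpy as np

def mutate(hyp: dict, meta: dict, mp: float = 0.8, s: float = 0.2) -> dict:
    """Perturb hyp with gain-scaled Gaussian noise, then clip to meta limits."""
    keys = list(hyp.keys())
    g = np.array([meta[k][0] for k in keys])  # column 0: mutation gain
    npr = np.random
    v = np.ones(len(g))
    while all(v == 1):  # retry until at least one value actually changes
        v = (g * (npr.random(len(g)) < mp) * npr.randn(len(g)) * npr.random() * s + 1).clip(0.3, 3.0)
    new = {k: float(hyp[k] * v[i]) for i, k in enumerate(keys)}
    for k in keys:  # columns 1-2: lower/upper limits
        new[k] = min(max(new[k], meta[k][1]), meta[k][2])
    return new

hyp = {'lr0': 0.01, 'momentum': 0.937}
meta = {'lr0': (1, 1e-5, 1e-1), 'momentum': (0.3, 0.6, 0.98)}
print(mutate(hyp, meta))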
@@ -689,5 +704,7 @@ def train(hyp, opt, device, tb_writer=None):

        # Plot results
        plot_evolution(yaml_file)
-       print(f'Hyperparameter evolution complete. Best results saved as: {yaml_file}\n'
-             f'Command to train a new model with these hyperparameters: $ python train.py --hyp {yaml_file}')
+       print(
+           f'Hyperparameter evolution complete. Best results saved as: {yaml_file}\n'
+           f'Command to train a new model with these hyperparameters: $ python train.py --hyp {yaml_file}'
+       )
