#!/usr/bin/env python3 # Copyright (c) 2018 NVIDIA Corporation. All rights reserved. # This work is licensed under a Creative Commons Attribution-NonCommercial-ShareAlike 4.0 International License. # https://creativecommons.org/licenses/by-nc-sa/4.0/legalcode from __future__ import print_function ###################################################### """ HOW TO TRAIN DOPE This is the DOPE training code. It is provided as a convenience for researchers, but it is otherwise unsupported. Please refer to `python3 train.py --help` for specific details about the training code. If you download the FAT dataset (https://research.nvidia.com/publication/2018-06_Falling-Things) you can train a YCB object DOPE detector as follows: ``` python3 train.py --data path/to/FAT --object soup --checkpoints soup --gpuids 0 1 2 3 4 5 6 7 ``` This will create a folder called `train_soup` where the weights will be saved after each epoch. It will use the 8 gpus using pytorch data parallel. """ import sys import argparse import random import numpy as np import torch import torch.nn as nn import torch.nn.parallel import torch.optim as optim from torch.optim.lr_scheduler import CosineAnnealingLR import torch.utils.data import torchvision.transforms as transforms from torch.utils.data.distributed import DistributedSampler from torch.autograd import Variable import torch.utils.data as data import torchvision.models as models import torch.distributed as dist import datetime import json import glob import os from PIL import Image from PIL import ImageDraw from PIL import ImageEnhance from math import acos from math import sqrt from math import pi from os.path import exists import cv2 import colorsys import inspect from distutils.util import strtobool import configparser import logging sys.path.append("/home/azureuser/dope/scripts/train2") #from dope.utils import make_grid from train2.utils_dope import make_grid import warnings import traceback import platform print(platform.python_implementation()) 
print(platform.python_version())
print(platform.uname())

import mlflow

# https://github.com/Azure/medical-imaging/blob/main/3d-brain-tumor-segmentation/src/train-brats21.py
# Avoid flooding of debug messages in logs
logging.basicConfig(level=logging.WARNING)
for _noisy_logger in ("requests", "azureml", "azure", "azure.core", "azure.mlflow"):
    logging.getLogger(_noisy_logger).setLevel(logging.WARNING)
# # mlflow.autolog(silent=True)
mlflow.autolog()

script_path = os.getcwd()
print('training script path: ', script_path)

warnings.filterwarnings("ignore")


##################################################
# NEURAL NETWORK MODEL
##################################################


class DopeNetwork(nn.Module):
    """Six-stage cascaded CNN (the DOPE architecture): a VGG19-based feature
    backbone followed by alternating belief-map and affinity-field heads.
    Stage k > 1 consumes the backbone features concatenated with stage k-1's
    belief and affinity outputs."""

    def __init__(
            self,
            pretrained=False,
            numBeliefMap=9,
            numAffinity=16,
            stop_at_stage=6  # number of stages to process (if less than total number of stages)
    ):
        super(DopeNetwork, self).__init__()

        self.stop_at_stage = stop_at_stage

        if pretrained is False:
            print("Training network without imagenet weights.")
        else:
            print("Training network pretrained on imagenet.")

        # First 24 slots come from the VGG19 feature extractor.
        vgg_full = models.vgg19(pretrained=pretrained).features
        self.vgg = nn.Sequential()
        for layer_idx in range(24):
            self.vgg.add_module(str(layer_idx), vgg_full[layer_idx])

        # Reduction head (512 -> 256 -> 128). NOTE: add_module("23", ...)
        # overwrites the VGG layer registered above under the same name —
        # this mirrors the original construction exactly.
        self.vgg.add_module("23", nn.Conv2d(512, 256, kernel_size=3, stride=1, padding=1))
        self.vgg.add_module("24", nn.ReLU(inplace=True))
        self.vgg.add_module("25", nn.Conv2d(256, 128, kernel_size=3, stride=1, padding=1))
        self.vgg.add_module("26", nn.ReLU(inplace=True))

        # Stages 2..6 see the 128 backbone channels plus the previous stage's
        # belief and affinity outputs.
        cascade_channels = 128 + numBeliefMap + numAffinity

        # _2 are the belief map stages. All belief heads are created before
        # the affinity heads so parameter-initialization order (and therefore
        # RNG consumption and state_dict key order) matches the original.
        self.m1_2 = DopeNetwork.create_stage(128, numBeliefMap, True)
        for stage in range(2, 7):
            setattr(self, "m{}_2".format(stage),
                    DopeNetwork.create_stage(cascade_channels, numBeliefMap, False))

        # _1 are the affinity map stages
        self.m1_1 = DopeNetwork.create_stage(128, numAffinity, True)
        for stage in range(2, 7):
            setattr(self, "m{}_1".format(stage),
                    DopeNetwork.create_stage(cascade_channels, numAffinity, False))

    def forward(self, x):
        '''Runs inference on the neural network'''
        feature_map = self.vgg(x)

        belief_maps = []
        affinity_maps = []
        stage_input = feature_map
        for stage in range(1, 7):
            # Belief head is evaluated before the affinity head, as before.
            belief_maps.append(getattr(self, "m{}_2".format(stage))(stage_input))
            affinity_maps.append(getattr(self, "m{}_1".format(stage))(stage_input))
            if self.stop_at_stage == stage or stage == 6:
                break
            stage_input = torch.cat([belief_maps[-1], affinity_maps[-1], feature_map], 1)

        # Lists of per-stage outputs, up to stop_at_stage (max 6).
        return belief_maps, affinity_maps

    @staticmethod
    def create_stage(in_channels, out_channels, first=False):
        '''Create the neural network layers for a single stage.'''
        model = nn.Sequential()
        mid_channels = 128
        if first:
            padding, kernel, count, final_channels = 1, 3, 6, 512
        else:
            padding, kernel, count, final_channels = 3, 7, 10, mid_channels

        # First convolution. Module names are stringified indices so the
        # state_dict keys match checkpoints produced by the original code.
        model.add_module("0", nn.Conv2d(in_channels, mid_channels,
                                        kernel_size=kernel, stride=1, padding=padding))

        idx = 1
        # Middle ReLU/convolution pairs: 2 pairs when first (count=6),
        # 4 pairs otherwise (count=10) — same indices as the original while-loop.
        for _ in range((count - 2) // 2):
            model.add_module(str(idx), nn.ReLU(inplace=True))
            idx += 1
            model.add_module(str(idx), nn.Conv2d(mid_channels, mid_channels,
                                                 kernel_size=kernel, stride=1, padding=padding))
            idx += 1

        # Penultimate convolution (1x1 up to final_channels)
        model.add_module(str(idx), nn.ReLU(inplace=True))
        idx += 1
        model.add_module(str(idx), nn.Conv2d(mid_channels, final_channels, kernel_size=1, stride=1))
        idx += 1

        # Last convolution (1x1 down to the map channel count)
        model.add_module(str(idx), nn.ReLU(inplace=True))
        idx += 1
        model.add_module(str(idx), nn.Conv2d(final_channels, out_channels, kernel_size=1, stride=1))

        return model


##################################################
# UTILS CODE FOR LOADING THE DATA
##################################################

def default_loader(path):
    """Open an image file and force it to 3-channel RGB."""
    return Image.open(path).convert('RGB')
""" with open(path) as data_file: data = json.load(data_file) pointsBelief = [] centroids = [] translations = [] rotations = [] points = [] for i_line in range(len(data['objects'])): info = data['objects'][i_line] if not objectofinterest is None and \ not objectofinterest in info['class'].lower(): continue # 3d bbox with belief maps points3d = [] pointdata = info['projected_cuboid'] for p in pointdata: points3d.append((p[0], p[1])) if len(points3d) == 8: # NDDS format: 8 points in 'projected_cuboid', 1 point in 'projected_cuboid_centroid' pcenter = info['projected_cuboid_centroid'] points3d.append((pcenter[0], pcenter[1])) elif len(points3d) == 9: # nvisii format: 9 points in 'projected_cuboid', no 'projected_cuboid_centroid' key pcenter = points3d[-1] else: raise RuntimeError(f'projected_cuboid has to have 8 or 9 points while reading "{path}"') pointsBelief.append(points3d) points.append(points3d + [(pcenter[0], pcenter[1])]) # NOTE: Adding the centroid again is probably a bug. centroids.append((pcenter[0], pcenter[1])) # load translations location = info['location'] translations.append([location[0], location[1], location[2]]) # quaternion rot = info["quaternion_xyzw"] rotations.append(rot) return { "pointsBelief": pointsBelief, "rotations": rotations, "translations": translations, "centroids": centroids, "points": points, "keypoints_2d": [], } def loadimages(root): """ Find all the images in the path and folders, return them in imgs. 
""" imgs = [] def add_json_files(path,): for imgpath in glob.glob(path+"/*.png"): if exists(imgpath) and exists(imgpath.replace('png',"json")): imgs.append((imgpath,imgpath.replace(path,"").replace("/",""), imgpath.replace('png',"json"))) for imgpath in glob.glob(path+"/*.jpg"): if exists(imgpath) and exists(imgpath.replace('jpg',"json")): imgs.append((imgpath,imgpath.replace(path,"").replace("/",""), imgpath.replace('jpg',"json"))) def explore(path): if not os.path.isdir(path): return folders = [os.path.join(path, o) for o in os.listdir(path) if os.path.isdir(os.path.join(path,o))] if len(folders)>0: for path_entry in folders: explore(path_entry) add_json_files(path) explore(root) return imgs class MultipleVertexJson(data.Dataset): """ Dataloader for the data generated by NDDS (https://github.com/NVIDIA/Dataset_Synthesizer). This is the same data as the data used in FAT. """ def __init__(self, root,transform=None, nb_vertex = 8, keep_orientation = True, normal = None, test=False, target_transform = None, loader = default_loader, objectofinterest = "", img_size = 400, save = False, noise = 2, data_size = None, sigma = 16, random_translation = (25.0,25.0), random_rotation = 15.0, ): ################### self.objectofinterest = objectofinterest self.img_size = img_size self.loader = loader self.transform = transform self.target_transform = target_transform self.root = root self.imgs = [] self.test = test self.normal = normal self.keep_orientation = keep_orientation self.save = save self.noise = noise self.data_size = data_size self.sigma = sigma self.random_translation = random_translation self.random_rotation = random_rotation def load_data(path): '''Recursively load the data. This is useful to load all of the FAT dataset.''' imgs = loadimages(path) # Check all the folders in path for name in os.listdir(str(path)): imgs += loadimages(path +"/"+name) return imgs self.imgs = load_data(root) # Shuffle the data, this is useful when we want to use a subset. 
        # Shuffling makes data_size-limited subsets representative of the whole set.
        np.random.shuffle(self.imgs)

    def __len__(self):
        # When limiting the number of data
        if not self.data_size is None:
            return int(self.data_size)
        return len(self.imgs)

    def __getitem__(self, index):
        """
        Depending on how the data loader is configured,
        this will return the debug info with the cuboid drawn on it,
        this happens when self.save is set to true.
        Otherwise, during training this function returns the
        belief maps and affinity fields and image as tensors.
        """
        path, name, txt = self.imgs[index]
        img = self.loader(path)

        img_size = img.size
        # NOTE(review): the real image size is immediately overridden with a
        # hard-coded 400x400 working size — confirm this matches the dataset.
        img_size = (400, 400)

        loader = loadjson

        data = loader(txt, self.objectofinterest)

        pointsBelief = data['pointsBelief']
        objects_centroid = data['centroids']
        points_all = data['points']
        points_keypoints = data['keypoints_2d']
        translations = torch.from_numpy(np.array(
            data['translations'])).float()
        rotations = torch.from_numpy(np.array(
            data['rotations'])).float()

        if len(points_all) == 0:
            points_all = torch.zeros(1, 10, 2).double()

        # self.save == true assumes there is only
        # one object instance in the scene.
        if translations.size()[0] > 1:
            translations = translations[0].unsqueeze(0)
            rotations = rotations[0].unsqueeze(0)

        # If there are no objects, still need to return similar shape array
        if len(translations) == 0:
            translations = torch.zeros(1, 3).float()
            rotations = torch.zeros(1, 4).float()

        # Camera intrinsics: the settings file lives next to the image.
        path_cam = path.replace(name, '_camera_settings.json')
        with open(path_cam) as data_file:
            data = json.load(data_file)
        # Assumes one camera
        cam = data['camera_settings'][0]['intrinsic_settings']

        matrix_camera = np.zeros((3, 3))
        matrix_camera[0, 0] = cam['fx']
        matrix_camera[1, 1] = cam['fy']
        matrix_camera[0, 2] = cam['cx']
        matrix_camera[1, 2] = cam['cy']
        matrix_camera[2, 2] = 1

        # Load the cuboid sizes
        path_set = path.replace(name, '_object_settings.json')
        with open(path_set) as data_file:
            data = json.load(data_file)

        cuboid = torch.zeros(1)

        if self.objectofinterest is None:
            cuboid = np.array(data['exported_objects'][0]['cuboid_dimensions'])
        else:
            for info in data["exported_objects"]:
                if self.objectofinterest in info['class']:
                    cuboid = np.array(info['cuboid_dimensions'])

        img_original = img.copy()

        def Reproject(points, tm, rm):
            """
            Reprojection of points when rotating the image
            """
            # Lift 2D points to homogeneous coords, rotate then translate.
            proj_cuboid = np.array(points)

            rmat = np.identity(3)
            rmat[0:2] = rm
            tmat = np.identity(3)
            tmat[0:2] = tm

            new_cuboid = np.matmul(
                rmat, np.vstack((proj_cuboid.T, np.ones(len(points)))))
            new_cuboid = np.matmul(tmat, new_cuboid)
            new_cuboid = new_cuboid[0:2].T

            return new_cuboid

        # Random image manipulation, rotation and translation with zero padding
        # These create a bug, thank you to
        # https://tanelp.github.io/posts/a-bug-that-plagues-thousands-of-open-source-ml-projects/
        # dx = round(np.random.normal(0, 2) * float(self.random_translation[0]))
        # dy = round(np.random.normal(0, 2) * float(self.random_translation[1]))
        # angle = round(np.random.normal(0, 1) * float(self.random_rotation))
        # torch RNG is used instead of numpy so DataLoader workers don't all
        # draw the same augmentation (the bug described in the link above).
        dx = round(float(torch.normal(torch.tensor(0.0), torch.tensor(2.0)) * float(self.random_translation[0])))
        dy = round(float(torch.normal(torch.tensor(0.0), torch.tensor(2.0)) * float(self.random_translation[1])))
        angle = round(float(torch.normal(torch.tensor(0.0), torch.tensor(1.0)) * float(self.random_rotation)))

        tm = np.float32([[1, 0, dx], [0, 1, dy]])
        rm = cv2.getRotationMatrix2D(
            (img.size[0]/2, img.size[1]/2), angle, 1)

        # Apply the same warp to the annotation points as to the pixels.
        for i_objects in range(len(pointsBelief)):
            points = pointsBelief[i_objects]
            new_cuboid = Reproject(points, tm, rm)
            pointsBelief[i_objects] = new_cuboid.tolist()
            objects_centroid[i_objects] = tuple(new_cuboid.tolist()[-1])
            pointsBelief[i_objects] = list(map(tuple, pointsBelief[i_objects]))

        for i_objects in range(len(points_keypoints)):
            points = points_keypoints[i_objects]
            new_cuboid = Reproject(points, tm, rm)
            points_keypoints[i_objects] = new_cuboid.tolist()
            points_keypoints[i_objects] = list(map(tuple, points_keypoints[i_objects]))

        image_r = cv2.warpAffine(np.array(img), rm, img.size)
        result = cv2.warpAffine(image_r, tm, img.size)
        img = Image.fromarray(result)

        # Note: All point coordinates are in the image space, e.g., pixel value.

        # This is used when we do saving --- helpful for debugging
        if self.save or self.test:
            # Use the save to debug the data
            if self.test:
                draw = ImageDraw.Draw(img_original)
            else:
                draw = ImageDraw.Draw(img)

            # PIL drawing functions, here for sharing draw
            def DrawKeypoints(points):
                for key in points:
                    DrawDot(key, (12, 115, 170), 7)

            def DrawLine(point1, point2, lineColor, lineWidth):
                if not point1 is None and not point2 is None:
                    draw.line([point1, point2], fill=lineColor, width=lineWidth)

            def DrawDot(point, pointColor, pointRadius):
                if not point is None:
                    xy = [point[0]-pointRadius, point[1]-pointRadius,
                          point[0]+pointRadius, point[1]+pointRadius]
                    draw.ellipse(xy, fill=pointColor, outline=pointColor)

            def DrawCube(points, which_color=0, color=None):
                '''Draw cube with a thick solid line across the front top edge.'''
                lineWidthForDrawing = 2
                lineColor1 = (255, 215, 0)  # yellow-ish
                lineColor2 = (12, 115, 170)  # blue-ish
                lineColor3 = (45, 195, 35)  # green-ish

                if which_color == 3:
                    lineColor = lineColor3
                else:
                    lineColor = lineColor1

                if not color is None:
                    lineColor = color

                # draw front
                DrawLine(points[0], points[1], lineColor, 8)  # lineWidthForDrawing)
                DrawLine(points[1], points[2], lineColor, lineWidthForDrawing)
                DrawLine(points[3], points[2], lineColor, lineWidthForDrawing)
                DrawLine(points[3], points[0], lineColor, lineWidthForDrawing)

                # draw back
                DrawLine(points[4], points[5], lineColor, lineWidthForDrawing)
                DrawLine(points[6], points[5], lineColor, lineWidthForDrawing)
                DrawLine(points[6], points[7], lineColor, lineWidthForDrawing)
                DrawLine(points[4], points[7], lineColor, lineWidthForDrawing)

                # draw sides
                DrawLine(points[0], points[4], lineColor, lineWidthForDrawing)
                DrawLine(points[7], points[3], lineColor, lineWidthForDrawing)
                DrawLine(points[5], points[1], lineColor, lineWidthForDrawing)
                DrawLine(points[2], points[6], lineColor, lineWidthForDrawing)

                # draw dots
                DrawDot(points[0], pointColor=(255, 255, 255), pointRadius=3)
                DrawDot(points[1], pointColor=(0, 0, 0), pointRadius=3)

            # Draw all the found objects.
            for points_belief_objects in pointsBelief:
                DrawCube(points_belief_objects)
            for keypoint in points_keypoints:
                DrawKeypoints(keypoint)

            img = self.transform(img)

            return {
                "img": img,
                "translations": translations,
                "rot_quaternions": rotations,
                'pointsBelief': np.array(points_all[0]),
                'matrix_camera': matrix_camera,
                'img_original': np.array(img_original),
                'cuboid': cuboid,
                'file_name': name,
            }

        # Create the belief map
        beliefsImg = CreateBeliefMap(
            img,
            pointsBelief=pointsBelief,
            nbpoints=9,
            sigma=self.sigma)

        # Create the image maps for belief
        transform = transforms.Compose([transforms.Resize(min(img_size))])
        totensor = transforms.Compose([transforms.ToTensor()])

        for j in range(len(beliefsImg)):
            beliefsImg[j] = self.target_transform(beliefsImg[j])
            # beliefsImg[j].save('{}.png'.format(j))
            beliefsImg[j] = totensor(beliefsImg[j])

        # Keep only one channel per keypoint map (the maps are grayscale).
        beliefs = torch.zeros((len(beliefsImg), beliefsImg[0].size(1), beliefsImg[0].size(2)))
        for j in range(len(beliefsImg)):
            beliefs[j] = beliefsImg[j][0]

        # Create affinity maps
        scale = 8
        if min(img.size) / 8.0 != min(img_size)/8.0:
            # print (scale)
            scale = min(img.size)/(min(img_size)/8.0)

        affinities = GenerateMapAffinity(img, 8, pointsBelief, objects_centroid, scale)
        img = self.transform(img)

        # Transform the images for training input
        w_crop = np.random.randint(0, img.size[0] - img_size[0] + 1)
        h_crop = np.random.randint(0, img.size[1] - img_size[1] + 1)

        transform = transforms.Compose([transforms.Resize(min(img_size))])
        totensor = transforms.Compose([transforms.ToTensor()])

        if not self.normal is None:
            normalize = transforms.Compose([
                transforms.Normalize((self.normal[0], self.normal[0], self.normal[0]),
                                     (self.normal[1], self.normal[1], self.normal[1])),
                AddNoise(self.noise)])
        else:
            normalize = transforms.Compose([AddNoise(0.0001)])

        img = crop(img, h_crop, w_crop, img_size[1], img_size[0])
        img = totensor(img)
        img = normalize(img)

        # Crop offsets in target-map (1/8) resolution.
        w_crop = int(w_crop/8)
        h_crop = int(h_crop/8)

        affinities = affinities[:, h_crop:h_crop+int(img_size[1]/8), w_crop:w_crop+int(img_size[0]/8)]
        beliefs = beliefs[:, h_crop:h_crop+int(img_size[1]/8), w_crop:w_crop+int(img_size[0]/8)]

        # Pad 49-wide/49-tall affinity maps up to 50 with zeros so batch
        # shapes line up — presumably an off-by-one from the 1/8 scaling;
        # TODO(review): confirm.
        if affinities.size()[1] == 49 and not self.test:
            affinities = torch.cat([affinities, torch.zeros(16, 1, 50)], dim=1)

        if affinities.size()[2] == 49 and not self.test:
            affinities = torch.cat([affinities, torch.zeros(16, 50, 1)], dim=2)

        return {
            'img': img,
            "affinities": affinities,
            'beliefs': beliefs,
        }


"""
Some simple vector math functions to find the angle
between two points, used by affinity fields.
"""

def length(v):
    # Euclidean norm of a 2-vector.
    return sqrt(v[0]**2+v[1]**2)

def dot_product(v, w):
    return v[0]*w[0]+v[1]*w[1]

def normalize(v):
    # NOTE(review): this uses the L1 norm (ord=1), not the Euclidean norm,
    # so the result is not a unit vector in the usual sense — confirm intended.
    norm = np.linalg.norm(v, ord=1)
    if norm == 0:
        norm = np.finfo(v.dtype).eps
    return v/norm

def determinant(v, w):
    # z-component of the 2D cross product.
    return v[0]*w[1]-v[1]*w[0]

def inner_angle(v, w):
    cosx = dot_product(v, w)/(length(v)*length(w))
    rad = acos(cosx)  # in radians
    return rad*180/pi  # returns degrees

def py_ang(A, B=(1, 0)):
    # Signed angle of A relative to B, returned in [0, 360) degrees.
    inner = inner_angle(A, B)
    det = determinant(A, B)
    if det < 0:  # this is a property of the det. If the det < 0 then B is clockwise of A
        return inner
    else:  # if the det > 0 then A is immediately clockwise of B
        return 360-inner
img_affinity = Image.new(img.mode, (int(img.size[0]/scale),int(img.size[1]/scale)), "black") # Create the empty tensors totensor = transforms.Compose([transforms.ToTensor()]) affinities = [] for i_points in range(nb_vertex): affinities.append(torch.zeros(2,int(img.size[1]/scale),int(img.size[0]/scale))) for i_pointsImage in range(len(pointsInterest)): pointsImage = pointsInterest[i_pointsImage] center = objects_centroid[i_pointsImage] for i_points in range(nb_vertex): point = pointsImage[i_points] affinity_pair, img_affinity = getAfinityCenter(int(img.size[0]/scale), int(img.size[1]/scale), tuple((np.array(pointsImage[i_points])/scale).tolist()), tuple((np.array(center)/scale).tolist()), img_affinity = img_affinity, radius=1) affinities[i_points] = (affinities[i_points] + affinity_pair)/2 # Normalizing v = affinities[i_points].numpy() xvec = v[0] yvec = v[1] norms = np.sqrt(xvec * xvec + yvec * yvec) nonzero = norms > 0 xvec[nonzero]/=norms[nonzero] yvec[nonzero]/=norms[nonzero] affinities[i_points] = torch.from_numpy(np.concatenate([[xvec],[yvec]])) affinities = torch.cat(affinities,0) return affinities def getAfinityCenter(width, height, point, center, radius=7, img_affinity=None): """ Function to create the affinity maps, e.g., vector maps pointing toward the object center. 
Args: width: image wight height: image height point: (x,y) center: (x,y) radius: pixel radius img_affinity: tensor to add to return: return a tensor """ tensor = torch.zeros(2,height,width).float() # Create the canvas for the afinity output imgAffinity = Image.new("RGB", (width,height), "black") totensor = transforms.Compose([transforms.ToTensor()]) draw = ImageDraw.Draw(imgAffinity) r1 = radius p = point draw.ellipse((p[0]-r1,p[1]-r1,p[0]+r1,p[1]+r1),(255,255,255)) del draw # Compute the array to add the afinity array = (np.array(imgAffinity)/255)[:,:,0] angle_vector = np.array(center) - np.array(point) angle_vector = normalize(angle_vector) affinity = np.concatenate([[array*angle_vector[0]],[array*angle_vector[1]]]) # print (tensor) if not img_affinity is None: # Find the angle vector # print (angle_vector) if length(angle_vector) >0: angle=py_ang(angle_vector) else: angle = 0 # print(angle) c = np.array(colorsys.hsv_to_rgb(angle/360,1,1)) * 255 draw = ImageDraw.Draw(img_affinity) draw.ellipse((p[0]-r1,p[1]-r1,p[0]+r1,p[1]+r1),fill=(int(c[0]),int(c[1]),int(c[2]))) del draw re = torch.from_numpy(affinity).float() + tensor return re, img_affinity def CreateBeliefMap(img,pointsBelief,nbpoints,sigma=16): """ Args: img: image pointsBelief: list of points in the form of [nb object, nb points, 2 (x,y)] nbpoints: (int) number of points, DOPE uses 8 points here sigma: (int) size of the belief map point return: return an array of PIL black and white images representing the belief maps """ beliefsImg = [] sigma = int(sigma) for numb_point in range(nbpoints): array = np.zeros(img.size) out = np.zeros(img.size) for point in pointsBelief: p = point[numb_point] w = int(sigma*2) if p[0]-w>=0 and p[0]+w=0 and p[1]+w 1 self_is_main_node = world_rank == 0 print("world size is: ", world_size) print("global rank is {} and local_rank is {}".format(world_rank, local_rank)) else: raise NotImplementedError( f"distributed_backend={distributed_backend} is not implemented yet." 
) if self_is_main_node: with open (opt.checkpoints+'/header.txt','w') as file: file.write(str(opt)+"\n") with open (opt.checkpoints+'/header.txt','w') as file: file.write(str(opt)) file.write("seed: "+ str(opt.manualseed)+'\n') with open (opt.checkpoints+'/test_metric.csv','w') as file: file.write("epoch, passed,total \n") dist_url = "env://" # default # dist_url = "auto" is_distributed = world_size > 1 if is_distributed: batch_size = opt.batch_size // world_size batch_size = max(batch_size, 1) else: batch_size = opt.batch_size print("is_distributed is {} and batch_size is {}".format(is_distributed, batch_size)) env_dict = { key: os.environ[key] for key in ("MASTER_ADDR", "MASTER_PORT","LOCAL_RANK", "RANK", "WORLD_SIZE") } print("os.getpid() is {} and initializing process group with {}".format(os.getpid(), env_dict)) logger.info("os.getpid() is {} and initializing process group with {}".format(os.getpid(), env_dict)) # DISTRIBUTED: this is required to initialize the pytorch backend dist.init_process_group( backend=distributed_backend, init_method=dist_url, timeout=datetime.timedelta(seconds=2000), #default is 30 min world_size=world_size, rank=world_rank # so should it be RANK or WORLD_RANK ) torch.cuda.set_device(local_rank) device = torch.device("cuda", local_rank) if torch.cuda.is_available() else 'cpu' print("device is {}".format(device)) # this will make all .cuda() calls work properly # synchronize all the threads to reach this point before moving on dist.barrier() # is this necessary? 
# DISTRIBUTED: in distributed mode, you want to report parameters # only from main process (rank==0) to avoid conflict # if self_is_main_node: # MLFLOW: report relevant parameters using mlflow logged_params = { "instance_per_node": world_size, "cuda_available": torch.cuda.is_available(), "distributed": multinode_available, "distributed_backend": distributed_backend, # data loading params "batch_size": batch_size, "num_epochs": opt.num_epochs, "num_workers": opt.num_workers, "cpu_count": os.cpu_count(), "prefetch_factor": opt.prefetch_factor, "persistent_workers": opt.persistent_workers, "pin_memory": opt.pin_memory, "non_blocking": opt.non_blocking, # "multiprocessing_sharing_strategy": opt.multiprocessing_sharing_strategy, # training params "model_arch": opt.model_arch, "model_arch_pretrained": opt.model_arch_pretrained, "optimizer.learning_rate": opt.learning_rate, # profiling params "enable_profiling": opt.enable_profiling, } if torch.cuda.is_available(): # add some gpu properties logged_params["cuda_device_count"] = torch.cuda.device_count() cuda_device_properties = torch.cuda.get_device_properties(device) logged_params["cuda_device_name"] = cuda_device_properties.name logged_params["cuda_device_major"] = cuda_device_properties.major logged_params["cuda_device_minor"] = cuda_device_properties.minor logged_params[ "cuda_device_memory" ] = cuda_device_properties.total_memory logged_params[ "cuda_device_processor_count" ] = cuda_device_properties.multi_processor_count print("MLflow version:", mlflow.__version__) print("Tracking URI:", mlflow.get_tracking_uri()) print("Artifact URI:", mlflow.get_artifact_uri()) tags = {"team": "ARD", "dataset": "FAT120K", "model": "pose"} exp_name = "dope6d" if self_is_main_node: mlflow.set_experiment(experiment_name=exp_name) mlflow.set_tags(tags) mlflow.log_params(logged_params) # save if not opt.save: contrast = 0.2 brightness = 0.2 noise = 0.1 normal_imgs = [0.59,0.25] transform = transforms.Compose([ 
AddRandomContrast(contrast), AddRandomBrightness(brightness), transforms.Resize(opt.imagesize), ]) else: contrast = 0.00001 brightness = 0.00001 noise = 0.00001 normal_imgs = None transform = transforms.Compose([ transforms.Resize(opt.imagesize), transforms.ToTensor()]) print ("load data") #load the dataset using the loader in utils_pose optional_data_loading_kwargs = {} if opt.num_workers > 0: # NOTE: this option _ONLY_ applies if num_workers > 0 # or else DataLoader will except optional_data_loading_kwargs[ "prefetch_factor" ] = opt.prefetch_factor optional_data_loading_kwargs[ "persistent_workers" ] = opt.persistent_workers trainingdata = None if not opt.data == "": train_dataset = MultipleVertexJson( root = opt.data, objectofinterest=opt.object, keep_orientation = True, noise = opt.noise, sigma = opt.sigma, data_size = opt.datasize, save = opt.save, transform = transform, normal = normal_imgs, target_transform = transforms.Compose([ transforms.Resize(opt.imagesize//8), ]), ) # If you set shuffle=True for both, the data will be shuffled at both the process and batch level, # which may improve the randomness of the batches seen by each process, # but it may also increase the communication overhead. 
num_replicas = world_size train_sampler = DistributedSampler(dataset=train_dataset, num_replicas=num_replicas, rank=world_rank, shuffle=True) if is_distributed else None print('train data size: ', len(train_dataset)) trainingdata = torch.utils.data.DataLoader(train_dataset, batch_size = batch_size, sampler=train_sampler, shuffle = (train_sampler is None), num_workers = opt.num_workers, pin_memory = True, **optional_data_loading_kwargs ) print('training data len: ', len(trainingdata.dataset)) if opt.save: for i in range(2): images = iter(trainingdata).next() if normal_imgs is None: normal_imgs = [0,1] save_image(images['img'],'{}/train_{}.png'.format( opt.checkpoints,str(i).zfill(5)),mean=normal_imgs[0],std=normal_imgs[1]) print (i) print ('Checkpoints are saved in {}'.format(opt.checkpoints)) quit() testingdata = None if not opt.datatest == "": testingdata = torch.utils.data.DataLoader( MultipleVertexJson( root = opt.datatest, objectofinterest=opt.object, keep_orientation = True, noise = opt.noise, sigma = opt.sigma, data_size = opt.datasize, save = opt.save, transform = transform, normal = normal_imgs, target_transform = transforms.Compose([ transforms.Resize(opt.imagesize//8), ]), ), batch_size = batch_size, shuffle = True, num_workers = opt.num_workers, pin_memory = True) print("batch size is: ", batch_size) if not trainingdata is None: print('training data: {} batches'.format(len(trainingdata))) if not testingdata is None: print ("testing data: {} batches".format(len(testingdata))) print('load models') print("torch.cuda.device_count(): ", torch.cuda.device_count()) print('type opt.gpuids: {}'.format(type(opt.gpuids))) print("gpuids are: {}".format(opt.gpuids)) #net = DopeNetwork(pretrained=opt.model_arch_pretrained).cuda() net = DopeNetwork(pretrained=opt.model_arch_pretrained).cuda(local_rank) # net = torch.nn.DataParallel(net,device_ids=opt.gpuids).cuda() net = torch.nn.parallel.DistributedDataParallel(net, device_ids=[local_rank], output_device=local_rank) # 
# do we need output_device=local_rank?

if opt.net != '':
    # what is this doing mona — loads a previously saved checkpoint.
    net.load_state_dict(torch.load(opt.net))

parameters = filter(lambda p: p.requires_grad, net.parameters())  # Is this even necessary?
optimizer = optim.Adam(parameters, lr=opt.learning_rate)
# scheduler = CosineAnnealingLR(optimizer, T_max=opt.num_epochs)  # , eta_min=1e-5  # not using lr scheduler for now
# optimizer = optim.SGD(parameters, lr=opt.learning_rate, momentum=opt.momentum, nesterov=True)

if self_is_main_node:
    with open(opt.checkpoints + '/loss_train.csv', 'w') as file:
        file.write('epoch,batchid,loss\n')
    with open(opt.checkpoints + '/loss_test.csv', 'w') as file:
        file.write('epoch,batchid,loss\n')

nb_update_network = 0


def _runnetwork(epoch, loader, train=True):
    """Run one epoch over `loader`; optimizes the network when train=True.

    Loss is the sum over all stages of the MSE between predicted and target
    belief maps plus the MSE between predicted and target affinity fields.
    FIX: deprecated Variable(...) wrappers removed (no-op since torch 0.4)
    and `.data.item()` replaced with `.item()` — same values, supported API.
    """
    global nb_update_network
    # net
    if train:
        net.train()
    else:
        net.eval()

    for batch_idx, targets in enumerate(loader):

        data = targets['img'].cuda()
        output_belief, output_affinities = net(data)

        if train:
            optimizer.zero_grad()
        target_belief = targets['beliefs'].cuda()
        target_affinity = targets['affinities'].cuda()

        loss = None

        # Belief maps loss
        for l in output_belief:  # output, each belief map layers.
            if loss is None:
                loss = ((l - target_belief) * (l - target_belief)).mean()
            else:
                loss_tmp = ((l - target_belief) * (l - target_belief)).mean()
                loss += loss_tmp

        # Affinities loss
        for l in output_affinities:  # output, each belief map layers.
            loss_tmp = ((l - target_affinity) * (l - target_affinity)).mean()
            loss += loss_tmp

        if train:
            loss.backward()
            optimizer.step()
            nb_update_network += 1

        if train:
            namefile = '/loss_train.csv'
        else:
            namefile = '/loss_test.csv'

        if self_is_main_node:
            with open(opt.checkpoints + namefile, 'a') as file:
                s = '{}, {},{:.15f}\n'.format(
                    epoch, batch_idx, loss.item())
                file.write(s)

        if train:
            if batch_idx % opt.loginterval == 0:
                print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.15f}'.format(
                    epoch, batch_idx * len(data), len(loader.dataset),
                    100. * batch_idx / len(loader), loss.item()))
                print("epoch is: {} and train loss is {}".format(epoch, loss))
                if self_is_main_node:
                    mlflow.log_metric("train_loss", loss.item())
        else:
            if batch_idx % opt.loginterval == 0:
                print('Test Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.15f}'.format(
                    epoch, batch_idx * len(data), len(loader.dataset),
                    100. * batch_idx / len(loader), loss.item()))
                print("epoch is: {} and test loss is {}".format(epoch, loss))
                if self_is_main_node:
                    mlflow.log_metric("test_loss", loss.item())

        # break
        if not opt.nbupdates is None and nb_update_network > int(opt.nbupdates):
            if self_is_main_node and opt.checkpoints is not None:
                torch.save(net.state_dict(),
                           '{}/net_{}.pth'.format(opt.checkpoints, opt.namefile))
            break

    dist.barrier()


for epoch in range(1, opt.num_epochs + 1):

    if not trainingdata is None:
        # FIX: without set_epoch() DistributedSampler uses the same shuffle
        # ordering every epoch (see PyTorch DistributedSampler docs).
        if train_sampler is not None:
            train_sampler.set_epoch(epoch)
        _runnetwork(epoch, trainingdata)
        # scheduler.step()  # not using lr scheduler for now

    if not opt.datatest == "":
        _runnetwork(epoch, testingdata, train=False)
        if opt.data == "":
            break  # lets get out of this if we are only testing

    if self_is_main_node and opt.checkpoints is not None:
        try:
            torch.save(net.state_dict(),
                       '{}/net_{}_{}_{}.pth'.format(opt.checkpoints, opt.object, opt.namefile, epoch))
        except Exception:
            # FIX: the original bare `except: pass` swallowed everything
            # including KeyboardInterrupt; keep checkpointing best-effort
            # but surface the failure.
            traceback.print_exc()

    if not opt.nbupdates is None and nb_update_network > int(opt.nbupdates):
        break

print("end:", datetime.datetime.now().time())

if self_is_main_node:
    # FIX: the run uuid was printed before checking mlflow.active_run(),
    # which crashes with AttributeError when no run is active.
    active = mlflow.active_run()
    if active:
        print(active.info.run_uuid)
        mlflow.end_run()


def cleanup():
    """Tear down the torch.distributed process group."""
    dist.destroy_process_group()


cleanup()