"""Generate caffe solver/net prototxts for a sweep of hyperparameter jobs and
submit one cluster job (condor or slurm) per setting.

Each job gets a fresh `clusterno` (persisted in `.lastclusterno`), its own
solver/train/test/deploy prototxt under ../condor/, a row in hyperparams.csv,
and a scheduler submit file.  Run with --nosubmit to drop into a debugger
before submission.
"""
import sys
import csv
sys.path.insert(0, "../kitti_codes/")
import socket
import time
import re
import argparse
import numpy as np
from subprocess import call
from IPython.core.debugger import Tracer

debug_here = Tracer()

# Pick the HPC scheduler and matching pycaffe build from the hostname.
machine_name = socket.gethostname()
if (re.search('vision', machine_name) or re.search('eldar', machine_name)
        or re.search('adriana', machine_name) or re.search('jaechul', machine_name)):
    hpcsystem = 'condor'
    sys.path.insert(0, "/scratch/vision/dineshj/caffe2/distribute_CPU/python")
else:
    hpcsystem = 'slurm'
    sys.path.insert(0, "/work/01932/dineshj/caffe2/python")

import caffe
from caffe import layers as L
from caffe import params as P
import layer_stack as LS

if re.search('maverick', machine_name):
    clustername = 'maverick'
elif re.search('stampede', machine_name):
    clustername = 'stampede'
else:
    clustername = 'condor'

# Resume the global job counter from disk.
# Fix: use `with` so the file handle is closed (was a leaked open()).
with open('.lastclusterno') as _f:
    lastclusterno = int(_f.read())
print("Last clusterno: %d" % lastclusterno)
clusterno = lastclusterno

# Optional resume/finetune hooks, normally edited by hand before a run.
resume_flag = False
snapshot = ''
finetune_flag = False
weights = ''


def main():
    """Create prototxts + submit files for `args.numjobs` jobs and submit them.

    Reads the module-global `args` (set in the __main__ guard) and advances the
    module-global `clusterno` by one per job, persisting it to .lastclusterno.
    Raises on unhandled configs and on failed slurm submission.
    """
    global clusterno

    base_solver = LS.CaffeSolver(debug=args.debug)

    # Per-config hyperparameter grids.  NOTE(review): the 'cls' config sets up
    # its grid here but the per-job loop below only implements net generation
    # for drlim/ved_drlim, so 'cls' currently raises "not handled yet".
    if args.config == 'cls':
        num_cls = 397
        cls_batchsize = 64
        cls_loss_weights = 10 * np.ones(args.numjobs)
        base_solver.sp["max_iter"] = str(30000)
        base_solver.sp["snapshot"] = str(10000)
        base_solver.sp["stepsize"] = str(5000)
        base_solver.sp["gamma"] = str(0.5)
        base_solver.sp["weight_decay"] = str(0.0005)
        runtime = 720  # slurm wall-clock budget, minutes
        learning_rates = np.logspace(-4, -4, args.numjobs)
    elif args.config == 'drlim' or args.config == "ved_drlim":
        num_cls = 397
        nonDiscrete_flag = False
        dynamicCrop_flag = False
        pair_batchsize = 64
        cls_batchsize = 64
        drlim_loss_weights = np.zeros(args.numjobs)
        # trans loss is only active for the ved_drlim variant.
        trans_loss_weights = np.ones(args.numjobs) * (1 if args.config == 'ved_drlim' else 0)
        drlim_loss_margins = 100 * np.ones(args.numjobs)
        trans_loss_margins = 100 * np.ones(args.numjobs)
        cls_loss_weights = 10 * np.ones(args.numjobs)
        base_solver.sp["max_iter"] = str(70000)
        base_solver.sp["snapshot"] = str(10000)
        base_solver.sp["stepsize"] = str(5000)
        base_solver.sp["gamma"] = str(0.5)
        base_solver.sp["weight_decay"] = str(0.005)
        runtime = 720
        learning_rates = np.logspace(-4.5, -2.5, args.numjobs)

    # One CSV row per submitted job records the full hyperparameter signature.
    csvfile = open('hyperparams.csv', 'w')
    fieldnames = [
        "jobno", "config", "base_lr", "nonDiscrete_flag", "dynamicCrop_flag",
        "pair_batchsize", "cls_batchsize", "cls_loss_weight",
        "drlim_loss_weight", "trans_loss_weight", "drlim_loss_margin",
        "trans_loss_margin", "weight_decay", "max_iter", "snapshot",
        "stepsize", "gamma", "type", "momentum", "momentum2", "random_seed",
    ]
    csvwriter = csv.DictWriter(csvfile, fieldnames=fieldnames)
    csvwriter.writeheader()

    for jobno in range(args.numjobs):
        if jobno > 0:
            # Space out submissions so the scheduler isn't hammered.
            print("Delay...")
            time.sleep(5)
        clusterno = clusterno + 1
        print("Creating files for job clusterno %d" % clusterno)

        solver_file = "../condor/" + str(clusterno) + '_solver.prototxt'
        trainnet_file = "../condor/" + str(clusterno) + '_trainnet.prototxt'
        testnet_file = "../condor/" + str(clusterno) + '_testnet.prototxt'
        deploynet_file = "../condor/" + str(clusterno) + '_deploynet.prototxt'

        if args.config == 'drlim' or args.config == 'ved_drlim':
            # NOTE(review): alias, not a copy — every job mutates the same
            # solver object; works because each iteration overwrites all keys.
            solver = base_solver
            solver.sp["random_seed"] = str(clusterno)
            solver.sp["train_net"] = '"' + trainnet_file + '"'
            solver.sp["test_net"] = '"' + testnet_file + '"'
            solver.sp["snapshot_prefix"] = '"' + "../caffe_snapshots/" + str(clusterno) + "_snap" + '"'
            solver.sp["base_lr"] = str(learning_rates[jobno])

            def net_phase(phase):
                """Build the NetSpec for one phase: 'train', 'test' or 'deploy'.

                Closure over the current job's hyperparameters (jobno, flags,
                batch sizes, loss weights).
                """
                assert phase == 'train' or phase == 'test' or phase == 'deploy'
                net = caffe.NetSpec()

                ## Data layers
                ########################
                if phase == 'train':
                    if not dynamicCrop_flag:
                        net["pair_data"] = L.Data(
                            source="../kitti_codes/KITTI_files/trn_trans_pairs_full-clust6_3-nbd10/",
                            transform_param=dict(mean_value=[104, 117, 123, 104, 117, 123]),
                            batch_size=pair_batchsize, ntop=1, backend=P.Data.LMDB,
                            name="pair_kitti_data_trn")
                        net["trans_labelvec"] = L.HDF5Data(
                            source="../kitti_codes/trn_full_trans_pairs-clust6_3-nbd10_labels.txt",
                            batch_size=pair_batchsize, ntop=1,
                            name="pair_kitti_label_trn")
                    else:
                        net["pair_data"] = L.Data(
                            source="../kitti_codes/KITTI/trn_trans_pairs_370x1226_clust6_3-nbd7/",
                            transform_param=dict(mirror=False, crop_size=227,
                                                 mean_value=[104, 117, 123, 104, 117, 123]),
                            batch_size=pair_batchsize, ntop=1, backend=P.Data.LMDB,
                            name="pair_kitti_data_trn")
                        net["trans_labelvec"] = L.HDF5Data(
                            source="../kitti_codes/trn_370x1226_trans_pairs-clust6_3-nbd7_labels.txt",
                            batch_size=pair_batchsize, ntop=1,
                            name="pair_kitti_label_trn")
                elif phase == 'test':
                    if not dynamicCrop_flag:
                        # NOTE(review): this reads the *training* pair LMDB for
                        # the test phase — confirm this is intentional.
                        net["pair_data"] = L.Data(
                            source="../kitti_codes/KITTI_files/trn_trans_pairs_full-clust6_3-nbd10/",
                            transform_param=dict(mean_value=[104, 117, 123, 104, 117, 123]),
                            batch_size=pair_batchsize, ntop=1, backend=P.Data.LMDB,
                            name="pair_kitti_data_tst")
                        net["trans_labelvec"] = L.HDF5Data(
                            source="../kitti_codes/trn_full_trans_pairs-clust6_3-nbd10_labels.txt",
                            batch_size=pair_batchsize, ntop=1,
                            name="pair_kitti_label_tst")
                    else:
                        net["pair_data"] = L.Data(
                            source="../kitti_codes/KITTI/tst_trans_pairs_370x1226_clust6_3-nbd7/",
                            transform_param=dict(mirror=False, crop_size=227,
                                                 mean_value=[104, 117, 123, 104, 117, 123]),
                            batch_size=pair_batchsize, ntop=1, backend=P.Data.LMDB,
                            name="pair_kitti_data_tst")
                        net["trans_labelvec"] = L.HDF5Data(
                            source="../kitti_codes/tst_370x1226_trans_pairs-clust6_3-nbd7_labels.txt",
                            batch_size=pair_batchsize, ntop=1,
                            name="pair_kitti_label_tst")

                if phase == 'train' or phase == 'test':  # no deploy
                    # Split the stacked pair image (6 channels) into the two frames.
                    net["a_data"], net["b_data"] = L.Slice(
                        net["pair_data"], name="slice_data",
                        slice_param=dict(slice_dim=1, slice_point=[3]), ntop=2)
                    # Split the label vector into similarity + 3 transform labels
                    # + continuous motion labels.
                    (net["sim_label"], net["trans_label1"], net["trans_label2"],
                     net["trans_label3"], net["trans_mot_labels"]) = L.Slice(
                        net["trans_labelvec"], name="slice_pair_label",
                        slice_param=dict(slice_point=[1, 2, 3, 4]), ntop=5)
                    # Silence keeps unconsumed label blobs from erroring out.
                    net["silent"] = L.Silence(
                        net["sim_label"], net["trans_label1"], net["trans_label2"],
                        net["trans_label3"], net["trans_mot_labels"], ntop=0)

                    ## Siamese network (weight-shared via param_prefix)
                    ########################
                    net = LS.generate_conv1_to_bn6(
                        net, blob_prefix="a_", layer_prefix="a_", param_prefix="shared_",
                        bottom_blob="a_data", top_blob="a_top", num_dropouts=0,
                        learn_all=True, in_place_pool5=False)
                    net = LS.generate_conv1_to_bn6(
                        net, blob_prefix="b_", layer_prefix="b_", param_prefix="shared_",
                        bottom_blob="b_data", top_blob="b_top", num_dropouts=0,
                        learn_all=True, in_place_pool5=False)

                    ## drlim loss
                    ########################
                    net = LS.generate_contrastive_loss(
                        net, blob_prefix="drlim_", layer_prefix="drlim_", param_prefix="",
                        bottom_blob=["a_top", "b_top", "sim_label"],
                        loss_weight=drlim_loss_weights[jobno],
                        loss_margin=drlim_loss_margins[jobno], learn_all=True)

                    ## equivariance loss (ved_drlim only)
                    ########################
                    if args.config == 'ved_drlim':
                        if not nonDiscrete_flag:
                            # One equivariant map + contrastive loss per discrete transform.
                            num_transforms = 3
                            for i in range(num_transforms):
                                net = LS.generate_equivariant_map(
                                    net, blob_prefix="trans" + str(i + 1) + "_",
                                    layer_prefix="trans" + str(i + 1) + "_",
                                    bottom_blob="b_top", top_blob="c_top" + str(i + 1),
                                    bottleneck_size=128,
                                    nonDiscrete_flag=nonDiscrete_flag, learn_all=True)
                                net = LS.generate_contrastive_loss(
                                    net, blob_prefix="trans_", blob_suffix=str(i + 1),
                                    layer_prefix="trans_", layer_suffix=str(i + 1),
                                    bottom_blob=["a_top", "c_top" + str(i + 1),
                                                 "trans_label" + str(i + 1)],
                                    loss_weight=trans_loss_weights[jobno],
                                    loss_margin=trans_loss_margins[jobno], learn_all=True)
                        else:
                            # Single map conditioned on continuous motion labels.
                            net = LS.generate_equivariant_map(
                                net, blob_prefix="trans_", layer_prefix="trans_",
                                bottom_blob="b_top", top_blob="c_top", bottleneck_size=128,
                                nonDiscrete_flag=nonDiscrete_flag,
                                motion_blob="trans_mot_labels", learn_all=True)
                            net["trans_loss"] = L.EuclideanLoss(
                                net["a_top"], net["c_top"],
                                loss_weight=trans_loss_weights[jobno])

                ## classification pipeline (for either monitoring, or training)
                if phase == 'train':
                    net["cls_data"], net["cls_label"] = L.Data(
                        source="../kitti_codes/SUN/pulkit_lmdbs/sun_imSz256_ntpc5_run1_train-lmdb",
                        transform_param=dict(mirror=True, crop_size=227,
                                             mean_value=[104, 117, 123]),
                        batch_size=cls_batchsize, ntop=2, backend=P.Data.LMDB,
                        name="cls_sun_trn")
                elif phase == 'test':
                    net["cls_data"], net["cls_label"] = L.Data(
                        source="../kitti_codes/SUN/pulkit_lmdbs/sun_imSz256_ntpc50_run1_test-lmdb/",
                        transform_param=dict(mirror=False, crop_size=227,
                                             mean_value=[104, 117, 123]),
                        batch_size=cls_batchsize, ntop=2, backend=P.Data.LMDB,
                        name="cls_sun_tst")
                elif phase == 'deploy':
                    net["cls_data"] = L.Input(
                        input_param=dict(shape=dict(dim=[1, 3, 227, 227])),
                        name="cls_sun_deploy")

                # Trivially satisfied given the assert above, but kept in case
                # more phases are added later.
                if phase == 'train' or phase == 'test' or phase == 'deploy':
                    net = LS.generate_conv1_to_bn6(
                        net, blob_prefix="cls_", layer_prefix="", param_prefix="shared_",
                        bottom_blob="cls_data", top_blob="cls_bn6",
                        num_dropouts=1 if args.num_dropouts >= 2 else 0,
                        learn_all=True)

                if phase == 'train' or phase == 'test':
                    # NOTE(review): `cls_loss_level` is never defined anywhere in
                    # this file; it is only evaluated when --finetune is passed
                    # (args.pretrain False) and would raise NameError then.
                    # Classifier head off bn6.
                    net = LS.generate_classifier(
                        net, blob_prefix="L6_", layer_prefix="L6_", param_prefix="",
                        bottom_blob=["cls_bn6", "cls_label"], learn_all=True,
                        propagate_down=True if (not args.pretrain and cls_loss_level == 'L6') else False,
                        num_cls=num_cls, loss_weight=cls_loss_weights[jobno],
                        loss_name="cls_L6_loss", acc_name="cls_L6_acc",
                        num_dropouts=1 if args.num_dropouts >= 1 else 0)
                    # Classifier head off bn3 (blob produced by generate_conv1_to_bn6).
                    net = LS.generate_classifier(
                        net, blob_prefix="L3_", layer_prefix="L3_", param_prefix="",
                        bottom_blob=["cls_bn3", "cls_label"], learn_all=True,
                        propagate_down=True if (not args.pretrain and cls_loss_level == 'L3') else False,
                        num_cls=num_cls, loss_weight=cls_loss_weights[jobno],
                        loss_name="cls_L3_loss", acc_name="cls_L3_acc")
                return net
        else:
            # Fix: was `% config` — an undefined name that raised NameError
            # instead of the intended message.
            raise Exception('config %s not handled yet' % args.config)

        print("Solver: %s" % solver_file)
        solver.write(solver_file)

        train_net = net_phase('train')
        test_net = net_phase('test')
        deploy_net = net_phase('deploy')
        print("Train net: %s" % trainnet_file)
        with open(trainnet_file, 'w') as f:
            f.write(str(train_net.to_proto()))
        print("Test net: %s" % testnet_file)
        with open(testnet_file, 'w') as f:
            f.write(str(test_net.to_proto()))
        print("Deploy net: %s" % deploynet_file)
        with open(deploynet_file, 'w') as f:
            f.write(str(deploy_net.to_proto()))

        if args.debug:
            # Sanity-check that caffe can actually instantiate the nets.
            try:
                caffe.Net(trainnet_file, caffe.TRAIN)
                caffe.Net(testnet_file, caffe.TEST)
                caffe.Net(deploynet_file, caffe.TEST)
            except Exception:  # fix: was a bare except
                raise Exception("network file raises error")

        # store job details to csv
        job_signature = {}
        job_signature["jobno"] = clusterno
        job_signature["config"] = args.config
        job_signature["base_lr"] = solver.sp["base_lr"]
        job_signature["nonDiscrete_flag"] = nonDiscrete_flag
        job_signature["dynamicCrop_flag"] = dynamicCrop_flag
        job_signature["pair_batchsize"] = pair_batchsize
        job_signature["cls_batchsize"] = cls_batchsize
        job_signature["cls_loss_weight"] = cls_loss_weights[jobno]
        job_signature["drlim_loss_weight"] = drlim_loss_weights[jobno]
        job_signature["trans_loss_weight"] = trans_loss_weights[jobno]
        job_signature["drlim_loss_margin"] = drlim_loss_margins[jobno]
        job_signature["trans_loss_margin"] = trans_loss_margins[jobno]
        job_signature["weight_decay"] = solver.sp["weight_decay"]
        job_signature["max_iter"] = solver.sp["max_iter"]
        job_signature["snapshot"] = solver.sp["snapshot"]
        job_signature["stepsize"] = solver.sp["stepsize"]
        job_signature["gamma"] = solver.sp["gamma"]
        job_signature["momentum"] = solver.sp["momentum"]
        job_signature["momentum2"] = solver.sp["momentum2"]
        job_signature["type"] = solver.sp["type"]
        job_signature["random_seed"] = solver.sp["random_seed"]
        csvwriter.writerow(job_signature)
        print("Job signature stored to %s" % 'hyperparams.csv')

        # submit job
        if hpcsystem == "condor":
            condor_submitFile = '../condor/%d.condor_submit' % clusterno
            print("Condor submit file: %s" % condor_submitFile)
            with open(condor_submitFile, 'w') as fh:  # fix: `file` shadowed builtin
                fh.write('+Group="GRAD"\n')
                fh.write('+Project="AI_ROBOTICS"\n')
                fh.write('+ProjectDescription=""\n')
                fh.write('+GPUJOB=true\n')
                fh.write('Requirements=TARGET.GPUSlot\n')
                fh.write('Environment=LD_LIBRARY_PATH=/scratch/vision/dineshj/caffe_vision_extra//leveldb/:/scratch/vision/dineshj/caffe_vision_extra//snappy/install/lib/:/scratch/vision/dineshj/caffe_vision_extra//OpenBLAS/build//lib/:/scratch/vision/dineshj/caffe_vision_extra//glog-0.3.3/install/lib/:/scratch/vision/dineshj/caffe_vision_extra//gflags/build/lib/:/scratch/vision/dineshj/caffe_vision_extra//lmdb/:/scratch/vision/dineshj/caffe_vision_extra//protobuf/install//lib/:/lusr/opt/boost-1.54/lib/:/opt/cuda-7.0/lib64/:/opt/cuda-7.0/nvvm/libdevice/:/usr/:/scratch/vision/dineshj/caffe_vision_extra/hdf5-1.8.15-patch1/install/lib/:/usr/lib/x86_64-linux-gnu/:/scratch/vision/dineshj/caffe_vis/build/lib/:/lib/x86_64-linux-gnu/:/v/filer4b/software/matlab-r2015b/bin/glnxa64/:/v/filer4b/software/matlab-r2015b/runtime/glnxa64/:/vision/vision_users/dineshj/local_installs/lib/;\n')
                fh.write('Universe = vanilla\n')
                fh.write('Getenv = True\n')
                fh.write('Log = ../condor/%d.log\n' % clusterno)
                fh.write('Output = ../condor/%d.out\n' % clusterno)
                fh.write('Error = ../condor/%d.err\n' % clusterno)
                fh.write('Notification = Complete\n')
                fh.write('Executable=../caffe2/tools/caffe\n')
                fh.write('Arguments = train -gpu 0')
                fh.write(' -solver %s' % solver_file)
                if resume_flag:
                    fh.write(' -snapshot %s' % snapshot)
                if (not resume_flag) and finetune_flag:
                    fh.write(' -weights %s' % weights)
                fh.write('\nQueue 1')
            if not args.submit:
                # --nosubmit drops into a debugger here; submission still runs
                # afterwards unless aborted interactively.
                debug_here()
            retcode = call("condor_submit %s 2> /dev/null" % condor_submitFile, shell=True)
        elif hpcsystem == "slurm":
            slurm_submitFile = '../condor/%d.slurm_submit' % clusterno
            # `runtime` is set in the per-config branches; default if absent.
            try:
                runtime
            except NameError:  # fix: was a bare except
                runtime = 20
            print("Slurm submit file: %s" % slurm_submitFile)
            with open(slurm_submitFile, 'w') as fh:  # fix: `file` shadowed builtin
                fh.write("#!/bin/bash\n")
                fh.write("#SBATCH -J tyrion\n")
                fh.write("#SBATCH -o ../condor/%d.err\n" % clusterno)
                fh.write('#SBATCH -p gpu\n')
                fh.write('#SBATCH -n 1\n')
                # Visual-Recognition || CS381V-Visual-Recogn || Fine-Tuning-CNNs
                fh.write('#SBATCH -A Visual-Recognition\n')
                fh.write('#SBATCH -t 00:%02d:00\n' % runtime)
                if clustername == "stampede":
                    fh.write('time ../caffe2_build_stampede/tools/caffe train -gpu 0')
                elif clustername == "maverick":
                    fh.write('time ../caffe2/tools/caffe train -gpu 0')
                else:
                    raise Exception('clustername %s not handled for slurm submission' % clustername)
                fh.write(' -solver %s' % solver_file)
                if resume_flag:
                    fh.write(' -snapshot %s' % snapshot)
                if (not resume_flag) and finetune_flag:
                    fh.write(' -weights %s' % weights)
                # Chain an evaluation pass after training finishes.
                fh.write('\npython nn_eval.py --max_jobs 25 --max_test_images 1000 -k 1 -m %s'
                         % solver.sp["snapshot_prefix"].strip('"'))
            if not args.submit:
                debug_here()
            retcode = call("sbatch " + slurm_submitFile, shell=True)
            if retcode != 0:
                # Fix: was Python-2-only `print>>sys.stderr, ...`;
                # sys.stderr.write is valid on both 2 and 3.
                sys.stderr.write("Child was terminated by signal %s\n" % retcode)
                raise Exception('could not submit job')

        # Persist the counter after each job so a crash doesn't reuse numbers.
        with open('.lastclusterno', 'w') as f:
            f.write('%d' % clusterno)

    csvfile.close()


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument('-n', '--numjobs', type=int, default=1,
                        help="number of jobs to submit")
    submit_parser = parser.add_mutually_exclusive_group(required=False)
    submit_parser.add_argument('--submit', dest='submit', action='store_true')
    submit_parser.add_argument('--nosubmit', dest='submit', action='store_false')
    parser.set_defaults(submit=True)
    debug_parser = parser.add_mutually_exclusive_group(required=False)
    debug_parser.add_argument('--debug', dest='debug', action='store_true')
    debug_parser.add_argument('--nodebug', dest='debug', action='store_false')
    parser.set_defaults(debug=False)
    parser.add_argument('--config', type=str, default="drlim",
                        help="cls | drlim | ved_drlim")
    pretrain_parser = parser.add_mutually_exclusive_group(required=False)
    pretrain_parser.add_argument('--pretrain', dest='pretrain', action='store_true')
    pretrain_parser.add_argument('--finetune', dest='pretrain', action='store_false')
    parser.set_defaults(pretrain=True)
    parser.add_argument('--num_dropouts', type=int, default=1, help="")
    # `args` is read as a module-level global inside main().
    args = parser.parse_args()
    caffe.set_mode_gpu()
    main()