Model Optimization: Merging BatchNorm into Convolution
1. Why merge the BN layer:
The BN (batch normalization) layer is a technique widely used in deep learning to speed up training. It is usually placed after a convolution (conv) layer or a fully connected (FC) layer, where it normalizes the data and accelerates convergence. But while BN helps during training, at inference time these extra layers slow down the overall computation and occupy additional memory or GPU memory. It would therefore be ideal to fold the BN layer into the adjacent conv or FC layer, which is exactly the work this article describes.
2. The math behind the BN merge:
A BN layer typically sits immediately after a conv (or FC) layer, although in some networks it appears before the conv (or FC) layer instead. We consider the two cases separately.
2.1 The case of BN after the conv layer
The principle of the merge was illustrated in the original post with two figures, summarized here.

[Figure: how a BN layer processes its input]

The first figure showed the result of pushing an input X through the BN layer's operations.
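In place of the first figure, the BN computation at inference time can be written out. For an input x, with running mean \mu and variance \sigma^2 (Caffe's BatchNorm layer stores these as accumulated sums together with a sample-count scale factor) and learned scale \gamma and shift \beta (held by the companion Scale layer), the output is:

$$y = \gamma \cdot \frac{x - \mu}{\sqrt{\sigma^2 + \epsilon}} + \beta$$

The division by the stored scale factor to recover \mu and \sigma^2 is exactly the bn_mean / num_bn_samples[0] step that appears in the code of section 3.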
[Figure: folding BN into the preceding conv layer]

The second figure showed, in its first part, the result of the combined conv and BN computation, and in its second part, how the conv layer's weights w and bias b change once the BN layer is merged into the convolution.
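In place of the second figure, the same collapse can be written out. If the convolution computes Wx + b and is followed by the BN/Scale computation above, define a per-output-channel factor \alpha and merged parameters:

$$\alpha = \frac{\gamma}{\sqrt{\sigma^2 + \epsilon}}, \qquad W' = \alpha \cdot W, \qquad b' = \alpha \cdot (b - \mu) + \beta$$

Then W'x + b' = BN(Wx + b) for every input, so a single conv layer with weights W' and bias b' reproduces the conv + BN + Scale output exactly. This \alpha is the same quantity computed in merge_conv_and_bn in section 3.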
Note: for the merge to work, the top and bottom blob names across the conv, BatchNorm, and Scale layers must match, i.e., the BatchNorm and Scale layers must operate in place on the conv output, as in the sketch below.
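For illustration (the layer names here are made up), a mergeable conv + BatchNorm + Scale stack in a prototxt looks roughly like this, with BatchNorm and Scale reading and writing the same blob that the conv produces:

layer {
  name: "conv1"
  type: "Convolution"
  bottom: "data"
  top: "conv1"
  # convolution_param omitted
}
layer {
  name: "conv1/bn"
  type: "BatchNorm"
  bottom: "conv1"
  top: "conv1"    # same name as bottom: runs in place
}
layer {
  name: "conv1/scale"
  type: "Scale"
  bottom: "conv1"
  top: "conv1"    # same name as bottom: runs in place
  scale_param { bias_term: true }
}

The merge script below relies on this layout when it traces blob names to pair each BatchNorm with its producing convolution and its following Scale.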
2.2 The case of BN before the conv layer
In this case an FC layer can still absorb the BN in much the same way as in 2.1. When BN comes before a conv layer, however, the merge is generally impossible, because the conv layer may use padding; see the sketch below.
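A quick sketch of the obstruction, writing the BN transform as an affine map BN(x) = \alpha x + \delta: wherever the kernel window lies entirely inside the feature map,

$$\mathrm{conv}(\alpha x + \delta) = \alpha \cdot \mathrm{conv}(x) + \delta \sum_{k} W_k$$

so the shift \delta could be folded into the conv bias. At the borders, however, the window also covers padded zeros, which enter the convolution as 0 rather than as \alpha \cdot 0 + \delta = \delta. The required bias correction then depends on how many padded positions each output pixel sees, so it cannot be expressed as a single per-channel bias, and the merge fails.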
3. Code for merging conv and BN
3.1 Caffe version (this script was found on the web; it will be removed on request if it infringes). Note that it targets Python 2 and Caffe's Python interface.
#!/usr/bin/env python
import _init_paths
import numpy as np
import sys
import os
import os.path as osp
import google.protobuf as pb
from argparse import ArgumentParser
import caffe


def load_and_fill_biases(src_model, src_weights, dst_model, dst_weights):
    with open(src_model) as f:
        model = caffe.proto.caffe_pb2.NetParameter()
        pb.text_format.Merge(f.read(), model)

    for i, layer in enumerate(model.layer):
        if layer.type == 'Convolution':  # or layer.type == 'Scale':
            # Add bias layer if needed
            if layer.convolution_param.bias_term == False:
                layer.convolution_param.bias_term = True
                layer.convolution_param.bias_filler.type = 'constant'
                layer.convolution_param.bias_filler.value = 0.0

    with open(dst_model, 'w') as f:
        f.write(pb.text_format.MessageToString(model))

    caffe.set_mode_cpu()
    net_src = caffe.Net(src_model, src_weights, caffe.TEST)
    net_dst = caffe.Net(dst_model, caffe.TEST)
    for key in net_src.params.keys():
        for i in range(len(net_src.params[key])):
            net_dst.params[key][i].data[:] = net_src.params[key][i].data[:]

    if dst_weights is not None:
        # Store params
        pass

    return net_dst


def merge_conv_and_bn(net, i_conv, i_bn, i_scale):
    # This is based on Kyeheyon's work
    assert(i_conv != None)
    assert(i_bn != None)

    def copy_double(data):
        return np.array(data, copy=True, dtype=np.double)

    key_conv = net._layer_names[i_conv]
    key_bn = net._layer_names[i_bn]
    key_scale = net._layer_names[i_scale] if i_scale else None

    # Copy
    bn_mean = copy_double(net.params[key_bn][0].data)
    bn_variance = copy_double(net.params[key_bn][1].data)
    num_bn_samples = copy_double(net.params[key_bn][2].data)

    # ...and invalidate the BN layer
    net.params[key_bn][0].data[:] = 0
    net.params[key_bn][1].data[:] = 1
    net.params[key_bn][2].data[:] = 1
    if num_bn_samples[0] == 0:
        num_bn_samples[0] = 1

    if net.params.has_key(key_scale):
        print 'Combine {:s} + {:s} + {:s}'.format(key_conv, key_bn, key_scale)
        scale_weight = copy_double(net.params[key_scale][0].data)
        scale_bias = copy_double(net.params[key_scale][1].data)
        net.params[key_scale][0].data[:] = 1
        net.params[key_scale][1].data[:] = 0
    else:
        print 'Combine {:s} + {:s}'.format(key_conv, key_bn)
        scale_weight = 1
        scale_bias = 0

    weight = copy_double(net.params[key_conv][0].data)
    bias = copy_double(net.params[key_conv][1].data)
    alpha = scale_weight / np.sqrt(bn_variance / num_bn_samples[0] + np.finfo(np.double).eps)
    net.params[key_conv][1].data[:] = bias * alpha + (scale_bias - (bn_mean / num_bn_samples[0]) * alpha)
    for i in range(len(alpha)):
        net.params[key_conv][0].data[i] = weight[i] * alpha[i]


def merge_batchnorms_in_net(net):
    # for each BN
    for i, layer in enumerate(net.layers):
        if layer.type != 'BatchNorm':
            continue

        l_name = net._layer_names[i]

        l_bottom = net.bottom_names[l_name]
        assert(len(l_bottom) == 1)
        l_bottom = l_bottom[0]
        l_top = net.top_names[l_name]
        assert(len(l_top) == 1)
        l_top = l_top[0]

        can_be_absorbed = True

        # Search all (bottom) layers
        for j in xrange(i - 1, -1, -1):
            tops_of_j = net.top_names[net._layer_names[j]]
            if l_bottom in tops_of_j:
                if net.layers[j].type not in ['Convolution', 'InnerProduct']:
                    can_be_absorbed = False
                else:
                    # There must be only one layer
                    conv_ind = j
                    break

        if not can_be_absorbed:
            continue

        # Find the following Scale
        scale_ind = None
        for j in xrange(i + 1, len(net.layers)):
            bottoms_of_j = net.bottom_names[net._layer_names[j]]
            if l_top in bottoms_of_j:
                if scale_ind:
                    # Followed by two or more layers
                    scale_ind = None
                    break

                if net.layers[j].type in ['Scale']:
                    scale_ind = j

                    top_of_j = net.top_names[net._layer_names[j]][0]
                    if top_of_j == bottoms_of_j[0]:
                        # On-the-fly => can be merged
                        break
                else:
                    # Followed by a layer which is not 'Scale'
                    scale_ind = None
                    break

        merge_conv_and_bn(net, conv_ind, i, scale_ind)

    return net


def process_model(net, src_model, dst_model, func_loop, func_finally):
    with open(src_model) as f:
        model = caffe.proto.caffe_pb2.NetParameter()
        pb.text_format.Merge(f.read(), model)

    for i, layer in enumerate(model.layer):
        map(lambda x: x(layer, net, model, i), func_loop)

    map(lambda x: x(net, model), func_finally)

    with open(dst_model, 'w') as f:
        f.write(pb.text_format.MessageToString(model))


# Functions to remove (redundant) BN and Scale layers
to_delete_empty = []


def pick_empty_layers(layer, net, model, i):
    if layer.type not in ['BatchNorm', 'Scale']:
        return

    bottom = layer.bottom[0]
    top = layer.top[0]

    if (bottom != top):
        # Not supported yet
        return

    if layer.type == 'BatchNorm':
        zero_mean = np.all(net.params[layer.name][0].data == 0)
        one_var = np.all(net.params[layer.name][1].data == 1)
        # length_is_1 = (net.params['conv1_1/bn'][2].data == 1) or (net.params[layer.name][2].data == 0)
        length_is_1 = (net.params[layer.name][2].data == 1)

        if zero_mean and one_var and length_is_1:
            print 'Delete layer: {}'.format(layer.name)
            to_delete_empty.append(layer)

    if layer.type == 'Scale':
        no_scaling = np.all(net.params[layer.name][0].data == 1)
        zero_bias = np.all(net.params[layer.name][1].data == 0)

        if no_scaling and zero_bias:
            print 'Delete layer: {}'.format(layer.name)
            to_delete_empty.append(layer)


def remove_empty_layers(net, model):
    map(model.layer.remove, to_delete_empty)


# A function to add 'engine: CAFFE' param into 1x1 convolutions
def set_engine_caffe(layer, net, model, i):
    if layer.type == 'Convolution':
        if layer.convolution_param.kernel_size == 1 \
                or (layer.convolution_param.kernel_h == layer.convolution_param.kernel_w == 1):
            layer.convolution_param.engine = dict(layer.convolution_param.Engine.items())['CAFFE']


def main(args):
    # Set default output file names
    if args.output_model is None:
        file_name = osp.splitext(args.model)[0]
        args.output_model = file_name + '_inference.prototxt'
    if args.output_weights is None:
        file_name = osp.splitext(args.weights)[0]
        args.output_weights = file_name + '_inference.caffemodel'

    net = load_and_fill_biases(args.model, args.weights, args.model + '.temp.pt', None)

    net = merge_batchnorms_in_net(net)

    process_model(net, args.model + '.temp.pt', args.output_model,
                  [pick_empty_layers, set_engine_caffe],
                  [remove_empty_layers])

    # Store params
    net.save(args.output_weights)


if __name__ == '__main__':
    parser = ArgumentParser(
        description="Generate Batch Normalized model for inference")
    parser.add_argument('model', help="The net definition prototxt")
    parser.add_argument('weights', help="The weights caffemodel")
    parser.add_argument('--output_model')
    parser.add_argument('--output_weights')
    args = parser.parse_args()
    main(args)
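Assuming the script above is saved as gen_merged_model.py (the filename is arbitrary), it takes the network definition and trained weights as positional arguments and, by default, writes *_inference.prototxt and *_inference.caffemodel next to them:

python gen_merged_model.py deploy.prototxt weights.caffemodel

Independently of Caffe, the folding arithmetic from section 2.1 can be sanity-checked in a few lines of NumPy. This is a minimal sketch, not part of the original script: it uses a 1x1 convolution (which reduces to a per-position matrix multiply) and randomly generated statistics, and all names are illustrative:

import numpy as np

rng = np.random.RandomState(0)
c_in, c_out, n = 4, 3, 10

# A 1x1 "convolution" is just a matrix multiply per spatial position.
W = rng.randn(c_out, c_in)       # conv weights
b = rng.randn(c_out)             # conv bias
mean = rng.randn(c_out)          # BN running mean (mu)
var = rng.rand(c_out) + 0.1      # BN running variance (sigma^2)
gamma = rng.randn(c_out)         # Scale layer weight
beta = rng.randn(c_out)          # Scale layer bias
eps = 1e-5

x = rng.randn(n, c_in)           # a batch of input vectors

# Reference path: conv, then BN, then Scale.
y_ref = (x.dot(W.T) + b - mean) / np.sqrt(var + eps) * gamma + beta

# Folded path: a single conv with rescaled weights and shifted bias.
alpha = gamma / np.sqrt(var + eps)      # per-output-channel factor
W_merged = W * alpha[:, None]           # W' = alpha * W
b_merged = alpha * (b - mean) + beta    # b' = alpha * (b - mean) + beta
y_merged = x.dot(W_merged.T) + b_merged

print(np.allclose(y_ref, y_merged))     # True: the two paths agree

The two outputs agree to floating-point precision, confirming the W' and b' formulas from section 2.1.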