5858#define CHECK_CL_ERROR (err, str ) do {if (err != CL_SUCCESS) {LOG_ERROR << str << " failed: " << err; return false ; } } while (0 )
5959#define LOG_CL_ERROR (err, str ) if (err != CL_SUCCESS) LOG_ERROR << str << " failed: " << err
6060
61+ #define WITH_PROFILING 0
62+
6163namespace libfreenect2
6264{
6365
@@ -167,8 +169,7 @@ class OpenCLDepthPacketProcessorImpl: public WithPerfLogging
167169 size_t buf_packet_size;
168170
169171 cl::Buffer buf_lut11to16;
170- cl::Buffer buf_p0_sin_table;
171- cl::Buffer buf_p0_cos_table;
172+ cl::Buffer buf_p0_table;
172173 cl::Buffer buf_x_table;
173174 cl::Buffer buf_z_table;
174175 cl::Buffer buf_packet;
@@ -201,6 +202,11 @@ class OpenCLDepthPacketProcessorImpl: public WithPerfLogging
201202 bool programInitialized;
202203 std::string sourceCode;
203204
205+ #if WITH_PROFILING
206+ std::vector<double > timings;
207+ int count;
208+ #endif
209+
204210 OpenCLDepthPacketProcessorImpl (const int deviceId = -1 )
205211 : image_size(512 * 424 )
206212 , lut_size(2048 )
@@ -266,12 +272,9 @@ class OpenCLDepthPacketProcessorImpl: public WithPerfLogging
266272 oss << " -D AB_MULTIPLIER_PER_FRQ2=" << params.ab_multiplier_per_frq [2 ] << " f" ;
267273 oss << " -D AB_OUTPUT_MULTIPLIER=" << params.ab_output_multiplier << " f" ;
268274
269- oss << " -D PHASE_IN_RAD0_SIN=" << std::sin (-params.phase_in_rad [0 ]) << " f" ;
270- oss << " -D PHASE_IN_RAD0_COS=" << std::cos (params.phase_in_rad [0 ]) << " f" ;
271- oss << " -D PHASE_IN_RAD1_SIN=" << std::sin (-params.phase_in_rad [1 ]) << " f" ;
272- oss << " -D PHASE_IN_RAD1_COS=" << std::cos (params.phase_in_rad [1 ]) << " f" ;
273- oss << " -D PHASE_IN_RAD2_SIN=" << std::sin (-params.phase_in_rad [2 ]) << " f" ;
274- oss << " -D PHASE_IN_RAD2_COS=" << std::cos (params.phase_in_rad [2 ]) << " f" ;
275+ oss << " -D PHASE_IN_RAD0=" << params.phase_in_rad [0 ] << " f" ;
276+ oss << " -D PHASE_IN_RAD1=" << params.phase_in_rad [1 ] << " f" ;
277+ oss << " -D PHASE_IN_RAD2=" << params.phase_in_rad [2 ] << " f" ;
275278
276279 oss << " -D JOINT_BILATERAL_AB_THRESHOLD=" << params.joint_bilateral_ab_threshold << " f" ;
277280 oss << " -D JOINT_BILATERAL_MAX_EDGE=" << params.joint_bilateral_max_edge << " f" ;
@@ -430,7 +433,12 @@ class OpenCLDepthPacketProcessorImpl: public WithPerfLogging
430433 bool initBuffers ()
431434 {
432435 cl_int err = CL_SUCCESS;
436+ #if WITH_PROFILING
437+ count = 0 ;
438+ queue = cl::CommandQueue (context, device, CL_QUEUE_PROFILING_ENABLE, &err);
439+ #else
433440 queue = cl::CommandQueue (context, device, 0 , &err);
441+ #endif
434442 CHECK_CL_ERROR (err, " cl::CommandQueue" );
435443
436444 // Read only
@@ -442,9 +450,7 @@ class OpenCLDepthPacketProcessorImpl: public WithPerfLogging
442450
443451 buf_lut11to16 = cl::Buffer (context, CL_MEM_READ_ONLY, buf_lut11to16_size, NULL , &err);
444452 CHECK_CL_ERROR (err, " cl::Buffer" );
445- buf_p0_sin_table = cl::Buffer (context, CL_MEM_READ_ONLY, buf_p0_table_size, NULL , &err);
446- CHECK_CL_ERROR (err, " cl::Buffer" );
447- buf_p0_cos_table = cl::Buffer (context, CL_MEM_READ_ONLY, buf_p0_table_size, NULL , &err);
453+ buf_p0_table = cl::Buffer (context, CL_MEM_READ_ONLY, buf_p0_table_size, NULL , &err);
448454 CHECK_CL_ERROR (err, " cl::Buffer" );
449455 buf_x_table = cl::Buffer (context, CL_MEM_READ_ONLY, buf_x_table_size, NULL , &err);
450456 CHECK_CL_ERROR (err, " cl::Buffer" );
@@ -471,7 +477,7 @@ class OpenCLDepthPacketProcessorImpl: public WithPerfLogging
471477 CHECK_CL_ERROR (err, " cl::Buffer" );
472478 buf_n = cl::Buffer (context, CL_MEM_READ_WRITE, buf_n_size, NULL , &err);
473479 CHECK_CL_ERROR (err, " cl::Buffer" );
474- buf_ir = cl::Buffer (context, CL_MEM_READ_WRITE , buf_ir_size, NULL , &err);
480+ buf_ir = cl::Buffer (context, CL_MEM_WRITE_ONLY , buf_ir_size, NULL , &err);
475481 CHECK_CL_ERROR (err, " cl::Buffer" );
476482 buf_a_filtered = cl::Buffer (context, CL_MEM_READ_WRITE, buf_a_filtered_size, NULL , &err);
477483 CHECK_CL_ERROR (err, " cl::Buffer" );
@@ -507,19 +513,17 @@ class OpenCLDepthPacketProcessorImpl: public WithPerfLogging
507513 CHECK_CL_ERROR (err, " setArg" );
508514 err = kernel_processPixelStage1.setArg (1 , buf_z_table);
509515 CHECK_CL_ERROR (err, " setArg" );
510- err = kernel_processPixelStage1.setArg (2 , buf_p0_sin_table );
516+ err = kernel_processPixelStage1.setArg (2 , buf_p0_table );
511517 CHECK_CL_ERROR (err, " setArg" );
512- err = kernel_processPixelStage1.setArg (3 , buf_p0_cos_table );
518+ err = kernel_processPixelStage1.setArg (3 , buf_packet );
513519 CHECK_CL_ERROR (err, " setArg" );
514- err = kernel_processPixelStage1.setArg (4 , buf_packet );
520+ err = kernel_processPixelStage1.setArg (4 , buf_a );
515521 CHECK_CL_ERROR (err, " setArg" );
516- err = kernel_processPixelStage1.setArg (5 , buf_a );
522+ err = kernel_processPixelStage1.setArg (5 , buf_b );
517523 CHECK_CL_ERROR (err, " setArg" );
518- err = kernel_processPixelStage1.setArg (6 , buf_b );
524+ err = kernel_processPixelStage1.setArg (6 , buf_n );
519525 CHECK_CL_ERROR (err, " setArg" );
520- err = kernel_processPixelStage1.setArg (7 , buf_n);
521- CHECK_CL_ERROR (err, " setArg" );
522- err = kernel_processPixelStage1.setArg (8 , buf_ir);
526+ err = kernel_processPixelStage1.setArg (7 , buf_ir);
523527 CHECK_CL_ERROR (err, " setArg" );
524528
525529 kernel_filterPixelStage1 = cl::Kernel (program, " filterPixelStage1" , &err);
@@ -571,14 +575,14 @@ class OpenCLDepthPacketProcessorImpl: public WithPerfLogging
571575 {
572576 cl_int err;
573577 std::vector<cl::Event> eventWrite (1 ), eventPPS1 (1 ), eventFPS1 (1 ), eventPPS2 (1 ), eventFPS2 (1 );
574- cl::Event event0, event1 ;
578+ cl::Event eventReadIr, eventReadDepth ;
575579
576580 err = queue.enqueueWriteBuffer (buf_packet, CL_FALSE, 0 , buf_packet_size, packet.buffer , NULL , &eventWrite[0 ]);
577- CHECK_CL_ERROR (err, " enqueueMapBuffer " );
581+ CHECK_CL_ERROR (err, " enqueueWriteBuffer " );
578582
579583 err = queue.enqueueNDRangeKernel (kernel_processPixelStage1, cl::NullRange, cl::NDRange (image_size), cl::NullRange, &eventWrite, &eventPPS1[0 ]);
580584 CHECK_CL_ERROR (err, " enqueueNDRangeKernel" );
581- err = queue.enqueueReadBuffer (buf_ir, CL_FALSE, 0 , buf_ir_size, ir_frame->data , &eventPPS1, &event0 );
585+ err = queue.enqueueReadBuffer (buf_ir, CL_FALSE, 0 , buf_ir_size, ir_frame->data , &eventPPS1, &eventReadIr );
582586 CHECK_CL_ERROR (err, " enqueueReadBuffer" );
583587
584588 if (config.EnableBilateralFilter )
@@ -597,20 +601,50 @@ class OpenCLDepthPacketProcessorImpl: public WithPerfLogging
597601 if (config.EnableEdgeAwareFilter )
598602 {
599603 err = queue.enqueueNDRangeKernel (kernel_filterPixelStage2, cl::NullRange, cl::NDRange (image_size), cl::NullRange, &eventPPS2, &eventFPS2[0 ]);
600- CHECK_CL_ERROR (err, " enqueueWriteBuffer " );
604+ CHECK_CL_ERROR (err, " enqueueNDRangeKernel " );
601605 }
602606 else
603607 {
604608 eventFPS2[0 ] = eventPPS2[0 ];
605609 }
606610
607- err = queue.enqueueReadBuffer (config.EnableEdgeAwareFilter ? buf_filtered : buf_depth, CL_FALSE, 0 , buf_depth_size, depth_frame->data , &eventFPS2, &event1 );
611+ err = queue.enqueueReadBuffer (config.EnableEdgeAwareFilter ? buf_filtered : buf_depth, CL_FALSE, 0 , buf_depth_size, depth_frame->data , &eventFPS2, &eventReadDepth );
608612 CHECK_CL_ERROR (err, " enqueueReadBuffer" );
609- err = event0 .wait ();
613+ err = eventReadIr .wait ();
610614 CHECK_CL_ERROR (err, " wait" );
611- err = event1 .wait ();
615+ err = eventReadDepth .wait ();
612616 CHECK_CL_ERROR (err, " wait" );
613617
618+ #if WITH_PROFILING
619+ if (count == 0 )
620+ {
621+ timings.clear ();
622+ timings.resize (7 , 0.0 );
623+ }
624+
625+ timings[0 ] += eventWrite[0 ].getProfilingInfo <CL_PROFILING_COMMAND_END>() - eventWrite[0 ].getProfilingInfo <CL_PROFILING_COMMAND_START>();
626+ timings[1 ] += eventPPS1[0 ].getProfilingInfo <CL_PROFILING_COMMAND_END>() - eventPPS1[0 ].getProfilingInfo <CL_PROFILING_COMMAND_START>();
627+ timings[2 ] += eventFPS1[0 ].getProfilingInfo <CL_PROFILING_COMMAND_END>() - eventFPS1[0 ].getProfilingInfo <CL_PROFILING_COMMAND_START>();
628+ timings[3 ] += eventPPS2[0 ].getProfilingInfo <CL_PROFILING_COMMAND_END>() - eventPPS2[0 ].getProfilingInfo <CL_PROFILING_COMMAND_START>();
629+ timings[4 ] += eventFPS2[0 ].getProfilingInfo <CL_PROFILING_COMMAND_END>() - eventFPS2[0 ].getProfilingInfo <CL_PROFILING_COMMAND_START>();
630+ timings[5 ] += eventReadIr.getProfilingInfo <CL_PROFILING_COMMAND_END>() - eventReadIr.getProfilingInfo <CL_PROFILING_COMMAND_START>();
631+ timings[6 ] += eventReadDepth.getProfilingInfo <CL_PROFILING_COMMAND_END>() - eventReadDepth.getProfilingInfo <CL_PROFILING_COMMAND_START>();
632+
633+ if (++count == 100 )
634+ {
635+ double sum = timings[0 ] + timings[1 ] + timings[2 ] + timings[3 ] + timings[4 ] + timings[5 ] + timings[6 ];
636+ LOG_INFO << " writing package: " << timings[0 ] / 100000000.0 << " ms." ;
637+ LOG_INFO << " stage 1: " << timings[1 ] / 100000000.0 << " ms." ;
638+ LOG_INFO << " filter 1: " << timings[2 ] / 100000000.0 << " ms." ;
639+ LOG_INFO << " stage 2: " << timings[3 ] / 100000000.0 << " ms." ;
640+ LOG_INFO << " filter 2: " << timings[4 ] / 100000000.0 << " ms." ;
641+ LOG_INFO << " reading ir: " << timings[5 ] / 100000000.0 << " ms." ;
642+ LOG_INFO << " reading depth: " << timings[6 ] / 100000000.0 << " ms." ;
643+ LOG_INFO << " overall: " << sum / 100000000.0 << " ms." ;
644+ count = 0 ;
645+ }
646+ #endif
647+
614648 return true ;
615649 }
616650
@@ -665,46 +699,32 @@ class OpenCLDepthPacketProcessorImpl: public WithPerfLogging
665699 return ;
666700 }
667701
668- cl_float3 *p0_sin_table = new cl_float3[image_size];
669- cl_float3 *p0_cos_table = new cl_float3[image_size];
702+ cl_float3 *p0_table = new cl_float3[image_size];
670703
671704 for (int r = 0 ; r < 424 ; ++r)
672705 {
673- cl_float3 *itS = &p0_sin_table[r * 512 ];
674- cl_float3 *itC = &p0_cos_table[r * 512 ];
706+ cl_float3 *it = &p0_table[r * 512 ];
675707 const uint16_t *it0 = &p0table->p0table0 [r * 512 ];
676708 const uint16_t *it1 = &p0table->p0table1 [r * 512 ];
677709 const uint16_t *it2 = &p0table->p0table2 [r * 512 ];
678- for (int c = 0 ; c < 512 ; ++c, ++itS, ++itC , ++it0, ++it1, ++it2)
710+ for (int c = 0 ; c < 512 ; ++c, ++it , ++it0, ++it1, ++it2)
679711 {
680- const float x = ((float )*it0) * 0.000031 * M_PI;
681- const float y = ((float )*it1) * 0.000031 * M_PI;
682- const float z = ((float )*it2) * 0.000031 * M_PI;
683- itS->s [0 ] = std::sin (x);
684- itS->s [1 ] = std::sin (y);
685- itS->s [2 ] = std::sin (z);
686- itS->s [3 ] = 0 .0f ;
687- itC->s [0 ] = std::cos (-x);
688- itC->s [1 ] = std::cos (-y);
689- itC->s [2 ] = std::cos (-z);
690- itC->s [3 ] = 0 .0f ;
712+ it->s [0 ] = -((float )*it0) * 0.000031 * M_PI;
713+ it->s [1 ] = -((float )*it1) * 0.000031 * M_PI;
714+ it->s [2 ] = -((float )*it2) * 0.000031 * M_PI;
715+ it->s [3 ] = 0 .0f ;
691716 }
692717 }
693718
694719 cl_int err = CL_SUCCESS;
695- cl::Event event0, event1;
696- err = queue.enqueueWriteBuffer (buf_p0_sin_table, CL_FALSE, 0 , buf_p0_table_size, p0_sin_table, NULL , &event0);
697- LOG_CL_ERROR (err, " enqueueWriteBuffer" );
698- err = queue.enqueueWriteBuffer (buf_p0_cos_table, CL_FALSE, 0 , buf_p0_table_size, p0_cos_table, NULL , &event1);
720+ cl::Event event0;
721+ err = queue.enqueueWriteBuffer (buf_p0_table, CL_FALSE, 0 , buf_p0_table_size, p0_table, NULL , &event0);
699722 LOG_CL_ERROR (err, " enqueueWriteBuffer" );
700723
701724 err = event0.wait ();
702725 LOG_CL_ERROR (err, " wait" );
703- err = event1.wait ();
704- LOG_CL_ERROR (err, " wait" );
705726
706- delete[] p0_sin_table;
707- delete[] p0_cos_table;
727+ delete[] p0_table;
708728 }
709729
710730 void fill_xz_tables (const float *xtable, const float *ztable)
0 commit comments