From 19280739804ebda3fab95dc9c9d0cd0289a74952 Mon Sep 17 00:00:00 2001
From: Hanno Schwalm <hanno@schwalm-bremen.de>
Date: Wed, 20 May 2026 08:13:09 +0200
Subject: [PATCH 1/3] A bit of OpenCL maintenance

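1. Two functions were only internal helpers that are not used outside of opencl.c and thus
   should not be exposed. They are removed and their code is folded into
   dt_opencl_copy_device_to_host() and dt_opencl_write_host_to_device():
     dt_opencl_read_host_from_device_rowpitch()
     dt_opencl_write_host_to_device_rowpitch()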

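2. Use the DT_DEVICE_CPU define instead of a literal 0 in device id checks and constify
   the offset and size parameters of dt_opencl_map_buffer().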
---
 src/common/opencl.c | 65 +++++++++++++++------------------------------
 src/common/opencl.h | 18 ++-----------
 2 files changed, 24 insertions(+), 59 deletions(-)

diff --git a/src/common/opencl.c b/src/common/opencl.c
index 33c354c1f1ad..58d716aeba38 100644
--- a/src/common/opencl.c
+++ b/src/common/opencl.c
@@ -327,7 +327,7 @@ gboolean dt_opencl_use_pinned_memory(const int devid)
 
 void dt_opencl_write_device_config(const int devid)
 {
-  if(devid < 0) return;
+  if(devid <= DT_DEVICE_CPU) return;
 
   /* As we have floats as per-device parameters we keep track of current locale
      and do conversions via "C" here and while reading device config
@@ -366,7 +366,7 @@ void dt_opencl_write_device_config(const int devid)
 
 gboolean dt_opencl_read_device_config(const int devid)
 {
-  if(devid < 0) return FALSE;
+  if(devid <= DT_DEVICE_CPU) return FALSE;
 
   gchar *locale = g_strdup(setlocale(LC_ALL, NULL));
   setlocale(LC_NUMERIC, "C");
@@ -1698,7 +1698,7 @@ static const char *_opencl_get_vendor_by_id(const unsigned int id)
 gboolean dt_opencl_finish(const int devid)
 {
   dt_opencl_t *cl = darktable.opencl;
-  if(!cl->inited || devid < 0) return FALSE;
+  if(!cl->inited || devid <= DT_DEVICE_CPU) return FALSE;
 
   const cl_int err = (cl->dlocl->symbols->dt_clFinish)(cl->dev[devid].cmd_queue);
 
@@ -1713,7 +1713,7 @@ gboolean dt_opencl_finish_sync_pipe(const int devid,
                                     const int pipetype)
 {
   dt_opencl_t *cl = darktable.opencl;
-  if(!cl->inited || devid < 0) return FALSE;
+  if(!cl->inited || devid <= DT_DEVICE_CPU) return FALSE;
 
   const gboolean exporting = pipetype & DT_DEV_PIXELPIPE_EXPORT;
   const gboolean asyncmode = cl->dev[devid].asyncmode;
@@ -2596,7 +2596,7 @@ static gboolean _check_kernel(const int dev,
 {
   dt_opencl_t *cl = darktable.opencl;
 
-  if(!cl->inited || dev < 0) return FALSE;
+  if(!cl->inited || dev <= DT_DEVICE_CPU) return FALSE;
   if(kernel < 0 || kernel >= DT_OPENCL_MAX_KERNELS) return FALSE;
 
   if(cl->dev[dev].kernel_used[kernel]) return TRUE;
@@ -2646,7 +2646,7 @@ int dt_opencl_get_max_work_item_sizes(const int dev,
                                       size_t *sizes)
 {
   dt_opencl_t *cl = darktable.opencl;
-  if(!cl->inited || dev < 0) return CL_DEVICE_NOT_AVAILABLE;
+  if(!cl->inited || dev <= DT_DEVICE_CPU) return CL_DEVICE_NOT_AVAILABLE;
   return (cl->dlocl->symbols->dt_clGetDeviceInfo)(cl->dev[dev].devid,
                                                   CL_DEVICE_MAX_WORK_ITEM_SIZES,
                                                   sizeof(size_t) * 3, sizes, NULL);
@@ -2659,7 +2659,7 @@ int dt_opencl_get_work_group_limits(const int dev,
                                     unsigned long *localmemsize)
 {
   dt_opencl_t *cl = darktable.opencl;
-  if(!cl->inited || dev < 0) return CL_DEVICE_NOT_AVAILABLE;
+  if(!cl->inited || dev <= DT_DEVICE_CPU) return CL_DEVICE_NOT_AVAILABLE;
 
   if(cl->dev[dev].local_size == 0) return CL_INVALID_WORK_DIMENSION;
   *localmemsize = cl->dev[dev].local_size;
@@ -2870,24 +2870,14 @@ int dt_opencl_copy_device_to_host(const int devid,
                                   const int width,
                                   const int height,
                                   const int bpp)
-{
-  return dt_opencl_read_host_from_device_rowpitch(devid, host, device,
-                                                  width, height, bpp * width);
-}
-
-int dt_opencl_read_host_from_device_rowpitch(const int devid,
-                                             void *host,
-                                             void *device,
-                                             const int width,
-                                             const int height,
-                                             const int rowpitch)
 {
   if(!_cldev_running(devid))
     return DT_OPENCL_NODEVICE;
+
   const size_t region[2] = { width, height };
   // blocking.
   return dt_opencl_read_host_from_device_raw(devid, host, device, CLIMG_ORIGIN,
-                                             region, rowpitch, TRUE);
+                                             region, (size_t)width * bpp, TRUE);
 }
 
 int dt_opencl_read_host_from_device_raw(const int devid,
@@ -2931,17 +2921,6 @@ int dt_opencl_write_host_to_device(const int devid,
                                    const int width,
                                    const int height,
                                    const int bpp)
-{
-  return dt_opencl_write_host_to_device_rowpitch(devid, host, device,
-                                                 width, height, width * bpp);
-}
-
-int dt_opencl_write_host_to_device_rowpitch(const int devid,
-                                            const void *host,
-                                            void *device,
-                                            const int width,
-                                            const int height,
-                                            const int rowpitch)
 {
   if(!_cldev_running(devid))
     return DT_OPENCL_NODEVICE;
@@ -2949,7 +2928,7 @@ int dt_opencl_write_host_to_device_rowpitch(const int devid,
   const size_t region[2] = { width, height };
   // blocking.
   return dt_opencl_write_host_to_device_raw(devid, host, device, CLIMG_ORIGIN,
-                                            region, rowpitch, TRUE);
+                                            region, (size_t)width * bpp, TRUE);
 }
 
 int dt_opencl_write_host_to_device_raw(const int devid,
@@ -3276,8 +3255,8 @@ void *dt_opencl_map_buffer(const int devid,
                            cl_mem buffer,
                            const gboolean blocking,
                            const int flags,
-                           size_t offset,
-                           size_t size)
+                           const size_t offset,
+                           const size_t size)
 {
   if(!_cldev_running(devid))
     return NULL;
@@ -3545,10 +3524,10 @@ void dt_opencl_memory_statistics(int devid,
   if(!((darktable.unmuted & DT_DEBUG_MEMORY) && (darktable.unmuted & DT_DEBUG_OPENCL)))
     return;
 
-  if(devid < 0)
+  if(devid <= DT_DEVICE_CPU)
     devid = _opencl_get_mem_context_id(mem);
 
-  if(devid < 0)
+  if(devid <= DT_DEVICE_CPU)
     return;
 
   dt_opencl_t *cl = darktable.opencl;
@@ -3622,7 +3601,7 @@ void dt_opencl_check_tuning(const int devid)
 
 cl_ulong dt_opencl_get_device_available(const int devid)
 {
-  if(!darktable.opencl->inited || devid < 0) return 0;
+  if(!darktable.opencl->inited || devid <= DT_DEVICE_CPU) return 0;
   return darktable.opencl->dev[devid].used_available;
 }
 
@@ -3633,7 +3612,7 @@ static cl_ulong _opencl_get_device_memalloc(const int devid)
 
 cl_ulong dt_opencl_get_device_memalloc(const int devid)
 {
-  if(!darktable.opencl->inited || devid < 0) return 0;
+  if(!darktable.opencl->inited || devid <= DT_DEVICE_CPU) return 0;
   return _opencl_get_device_memalloc(devid);
 }
 
@@ -3784,7 +3763,7 @@ static cl_event *_opencl_events_get_slot(const int devid,
                                          const char *tag)
 {
   dt_opencl_t *cl = darktable.opencl;
-  if(!cl->inited || devid < 0) return NULL;
+  if(!cl->inited || devid <= DT_DEVICE_CPU) return NULL;
   if(!cl->dev[devid].use_events) return NULL;
 
   static const cl_event zeroevent[1]; // implicitly initialized to zero
@@ -3886,7 +3865,7 @@ static cl_event *_opencl_events_get_slot(const int devid,
 void dt_opencl_events_reset(const int devid)
 {
   dt_opencl_t *cl = darktable.opencl;
-  if(!cl->inited || devid < 0) return;
+  if(!cl->inited || devid <= DT_DEVICE_CPU) return;
   if(!cl->dev[devid].use_events) return;
 
   cl_event **eventlist = &(cl->dev[devid].eventlist);
@@ -3920,7 +3899,7 @@ void dt_opencl_events_reset(const int devid)
 static void _opencl_events_wait_for(const int devid)
 {
   dt_opencl_t *cl = darktable.opencl;
-  if(!cl->inited || devid < 0) return;
+  if(!cl->inited || devid <= DT_DEVICE_CPU) return;
   if(!cl->dev[devid].use_events) return;
 
   static const cl_event zeroevent[1]; // implicitly initialized to zero
@@ -3961,7 +3940,7 @@ static void _opencl_events_profiling(const int devid,
                                      const gboolean aggregated)
 {
   dt_opencl_t *cl = darktable.opencl;
-  if(!cl->inited || devid < 0) return;
+  if(!cl->inited || devid <= DT_DEVICE_CPU) return;
   if(!cl->dev[devid].use_events) return;
 
   cl_event **eventlist = &(cl->dev[devid].eventlist);
@@ -4061,7 +4040,7 @@ cl_int dt_opencl_events_flush(const int devid,
                               const gboolean reset)
 {
   dt_opencl_t *cl = darktable.opencl;
-  if(!cl->inited || devid < 0) return CL_SUCCESS;
+  if(!cl->inited || devid <= DT_DEVICE_CPU) return CL_SUCCESS;
   if(!cl->dev[devid].use_events) return CL_SUCCESS;
 
   cl_event **eventlist = &(cl->dev[devid].eventlist);
@@ -4168,7 +4147,7 @@ cl_int dt_opencl_local_buffer_opt(const int devid,
                                   dt_opencl_local_buffer_t *factors)
 {
   dt_opencl_t *cl = darktable.opencl;
-  if(!cl->inited || devid < 0) return DT_OPENCL_NODEVICE;
+  if(!cl->inited || devid <= DT_DEVICE_CPU) return DT_OPENCL_NODEVICE;
 
   size_t maxsizes[3] = { 0 };     // the maximum dimensions for a work group
   size_t workgroupsize = 0;       // the maximum number of items in a work group
diff --git a/src/common/opencl.h b/src/common/opencl.h
index c0c11ac5c690..29a1abffdced 100644
--- a/src/common/opencl.h
+++ b/src/common/opencl.h
@@ -442,13 +442,6 @@ int dt_opencl_copy_device_to_host(const int devid,
                                   const int height,
                                   const int bpp);
 
-int dt_opencl_read_host_from_device_rowpitch(const int devid,
-                                             void *host,
-                                             void *device,
-                                             const int width,
-                                             const int height,
-                                             const int rowpitch);
-
 int dt_opencl_read_host_from_device_raw(const int devid,
                                         void *host,
                                         void *device,
@@ -464,13 +457,6 @@ int dt_opencl_write_host_to_device(const int devid,
                                    const int height,
                                    const int bpp);
 
-int dt_opencl_write_host_to_device_rowpitch(const int devid,
-                                            const void *host,
-                                            void *device,
-                                            const int width,
-                                            const int height,
-                                            const int rowpitch);
-
 int dt_opencl_write_host_to_device_raw(const int devid,
                                        const void *host,
                                        void *device,
@@ -549,8 +535,8 @@ void *dt_opencl_map_buffer(const int devid,
                            cl_mem buffer,
                            const gboolean blocking,
                            const int flags,
-                           size_t offset,
-                           size_t size);
+                           const size_t offset,
+                           const size_t size);
 
 int dt_opencl_unmap_mem_object(const int devid,
                                cl_mem mem_object,

From da08ed332029c192e0dddc26edb2a88ad5f74ef3 Mon Sep 17 00:00:00 2001
From: Hanno Schwalm <hanno@schwalm-bremen.de>
Date: Wed, 20 May 2026 07:57:38 +0200
Subject: [PATCH 2/3] Improved tiling related logs

1. dt_print_pipe() got a minor improvement: it now shows a leading `T` in the pipe name if
   in tiling mode.
2. use dt_print_pipe() variant where automatic display of roi is helpful
3. overlap added to logs where helpful
4. some logs are shown only in -d verbose mode
5. some tiling logs are now also shown with -d pipe
5. some logs are also relevant for -d pipe logs
6. added log for skipped tiles in ptp mode
7. added module instance in control logs
8. the compile-time option DT_TILING_DEBUG now enables extensive debug logs
9. some constify, subtle simplifications and use of float where intended
---
 src/develop/pixelpipe_hb.c |   9 +--
 src/develop/tiling.c       | 143 ++++++++++++++++---------------------
 src/iop/demosaic.c         |   4 +-
 3 files changed, 69 insertions(+), 87 deletions(-)

diff --git a/src/develop/pixelpipe_hb.c b/src/develop/pixelpipe_hb.c
index df4704373cf6..79bc3f7d16cc 100644
--- a/src/develop/pixelpipe_hb.c
+++ b/src/develop/pixelpipe_hb.c
@@ -160,7 +160,8 @@ void dt_print_pipe_ext(const char *title,
 
   if(pipe)
   {
-    snprintf(pname, sizeof(pname), "[%s%s]",
+    snprintf(pname, sizeof(pname), "[%s%s%s]",
+      pipe->tiling ? "T" : "",
       dt_dev_pixelpipe_type_to_str(pipe->type),
       dt_pipe_is_canvas(pipe) && darktable.develop->late_scaling.enabled ? " HQ" : "");
     if(pipe->mask_display == DT_DEV_PIXELPIPE_DISPLAY_PASSTHRU)
@@ -2028,11 +2029,11 @@ static gboolean _dev_pixelpipe_process_rec(dt_dev_pixelpipe_t *pipe,
   /* get tiling requirement of module */
   dt_develop_tiling_t tiling = { 0 };
   // set sentinel value to detect whether callback set sizes
-  tiling.factor_cl = tiling.maxbuf_cl = -1;
+  tiling.factor_cl = tiling.maxbuf_cl = -1.0f;
   module->tiling_callback(module, piece, &roi_in, roi_out, &tiling);
   // default to CPU size if callback didn't set GPU
-  if(tiling.factor_cl < 0) tiling.factor_cl = tiling.factor;
-  if(tiling.maxbuf_cl < 0) tiling.maxbuf_cl = tiling.maxbuf;
+  if(tiling.factor_cl < 0.0f) tiling.factor_cl = tiling.factor;
+  if(tiling.maxbuf_cl < 0.0f) tiling.maxbuf_cl = tiling.maxbuf;
 
   /* does this module involve blending? */
   if(piece->blendop_data
diff --git a/src/develop/tiling.c b/src/develop/tiling.c
index e403a4b9e2a1..09fdfec3ee38 100644
--- a/src/develop/tiling.c
+++ b/src/develop/tiling.c
@@ -162,6 +162,8 @@ static double _nm_fitness(double x[], void *rest[])
 #define BETA 0.5    /* contraction coefficient */
 #define GAMMA 2.0   /* expansion coefficient */
 
+// #define DT_TILING_DEBUG
+
 static int _simplex(double (*objfunc)(double[], void *[]),
                     const double start[],
                     const int n,
@@ -244,7 +246,7 @@ static int _simplex(double (*objfunc)(double[], void *[]),
     f[j] = objfunc(v[j], rest);
   }
 
-#if 0
+#ifdef DT_TILING_DEBUG
   /* print out the initial values */
   printf ("Initial Values\n");
   for(j = 0; j <= n; j++)
@@ -431,7 +433,7 @@ static int _simplex(double (*objfunc)(double[], void *[]),
       }
     }
 
-#if 0
+#ifdef DT_TILING_DEBUG
     /* print out the value at each iteration */
     printf ("Iteration %d\n", itr);
     for(j = 0; j <= n; j++)
@@ -470,7 +472,7 @@ static int _simplex(double (*objfunc)(double[], void *[]),
     }
   }
 
-#if 0
+#ifdef DT_TILING_DEBUG
   printf ("The minimum was found at\n");
   for(j = 0; j < n; j++)
   {
@@ -507,10 +509,10 @@ static int _nm_fit_output_to_input_roi(dt_iop_module_t *self,
   void *rest[4] = { (void *)self, (void *)piece, (void *)iroi, (void *)oroi };
   double start[4] = { (float)oroi->x / piece->iwidth, (float)oroi->y / piece->iheight,
                       (float)oroi->width / piece->iwidth, (float)oroi->height / piece->iheight };
-  double epsilon = (double)delta / MIN(piece->iwidth, piece->iheight);
-  int maxiter = 1000;
+  const double epsilon = (double)delta / MIN(piece->iwidth, piece->iheight);
+  const int maxiter = 1000;
 
-  int iter = _simplex(_nm_fitness, start, 4, epsilon, 1.0, maxiter, NULL, rest);
+  const int iter = _simplex(_nm_fitness, start, 4, epsilon, 1.0, maxiter, NULL, rest);
 
   dt_print(DT_DEBUG_TILING | DT_DEBUG_VERBOSE,
            "[_nm_fit_output_to_input_roi] _simplex: %d, delta: %d, epsilon: %f",
@@ -605,7 +607,7 @@ static void _default_process_tiling_ptp(dt_iop_module_t *self,
   if((tiling.factor < 2.2f)
      && (tiling.overhead < 0.2f * roi_in->width * roi_in->height * max_bpp))
   {
-    dt_print(DT_DEBUG_TILING,
+    dt_print(DT_DEBUG_PIPE | DT_DEBUG_TILING,
              "[default_process_tiling_ptp] [%s]  no need to use tiling for module '%s%s' "
              "as no real memory saving to be expected",
              dt_dev_pixelpipe_type_to_str(piece->pipe->type), self->op, dt_iop_get_instance_id(self));
@@ -670,7 +672,6 @@ static void _default_process_tiling_ptp(dt_iop_module_t *self,
      that is identical to image width/height no special alignment is needed. */
 
   const unsigned int align = tiling.align;
-
   assert(align != 0);
 
   /* properly align tile width and height by making them smaller if needed */
@@ -692,17 +693,13 @@ static void _default_process_tiling_ptp(dt_iop_module_t *self,
   /* sanity check: don't run wild on too many tiles */
   if(tiles_x * tiles_y > _maximum_number_tiles())
   {
-    dt_print(DT_DEBUG_TILING,
+    dt_print(DT_DEBUG_PIPE | DT_DEBUG_TILING,
              "[default_process_tiling_ptp] [%s] gave up tiling for module '%s%s'. too many tiles: %d x %d",
              dt_dev_pixelpipe_type_to_str(piece->pipe->type),
              self->op, dt_iop_get_instance_id(self), tiles_x, tiles_y);
     goto error;
   }
 
-  dt_print(DT_DEBUG_TILING,
-           "[default_process_tiling_ptp] [%s] (%dx%d) tiles with max dimensions %dx%d and overlap %d",
-           dt_dev_pixelpipe_type_to_str(piece->pipe->type), tiles_x, tiles_y, width, height, overlap);
-
   /* reserve input and output buffers for tiles */
   input = dt_alloc_aligned((size_t)width * height * in_bpp);
   if(input == NULL)
@@ -726,10 +723,11 @@ static void _default_process_tiling_ptp(dt_iop_module_t *self,
   dt_aligned_pixel_t processed_maximum_new = { 1.0f };
   for_four_channels(k) processed_maximum_saved[k] = piece->pipe->dsc.processed_maximum[k];
 
+  piece->pipe->tiling = TRUE;
   dt_print_pipe(DT_DEBUG_PIPE | DT_DEBUG_TILING,
-                        "process *tiled* ptp", piece->pipe, piece->module, DT_DEVICE_CPU, roi_in, roi_out,
-                        "%dx%d tiles, size=%dx%d",
-                        tiles_x, tiles_y, tile_wd, tile_ht);
+                        "default *tiled* ptp", piece->pipe, piece->module, DT_DEVICE_CPU, roi_in, roi_out,
+                        "%dx%d tiles, size=%dx%d, overlap=%d",
+                        tiles_x, tiles_y, tile_wd, tile_ht, overlap);
 
   /* iterate over tiles */
   for(size_t tx = 0; tx < tiles_x; tx++)
@@ -737,12 +735,10 @@ static void _default_process_tiling_ptp(dt_iop_module_t *self,
     const size_t wd = tx * tile_wd + width > roi_in->width ? roi_in->width - tx * tile_wd : width;
     for(size_t ty = 0; ty < tiles_y; ty++)
     {
-      piece->pipe->tiling = TRUE;
-
       const size_t ht = ty * tile_ht + height > roi_in->height ? roi_in->height - ty * tile_ht : height;
 
       /* no need to process end-tiles that are smaller than the total overlap area */
-      if((wd <= 2 * overlap && tx > 0) || (ht <= 2 * overlap && ty > 0)) continue;
+      const gboolean skipped = (wd <= 2 * overlap && tx > 0) || (ht <= 2 * overlap && ty > 0);
 
       /* origin and region of effective part of tile, which we want to store later */
       size_t origin[2] = { 0, 0 };
@@ -756,11 +752,12 @@ static void _default_process_tiling_ptp(dt_iop_module_t *self,
       const size_t ioffs = (ty * tile_ht) * ipitch + (tx * tile_wd) * in_bpp;
       size_t ooffs = (ty * tile_ht) * opitch + (tx * tile_wd) * out_bpp;
 
-      dt_print(DT_DEBUG_TILING,
-               "[default_process_tiling_ptp] [%s] tile (%zu,%zu) with %zux%zu at origin [%zu,%zu]",
-               dt_dev_pixelpipe_type_to_str(piece->pipe->type), tx, ty, wd, ht, tx * tile_wd, ty * tile_ht);
+      dt_print_pipe(DT_DEBUG_TILING,
+               skipped ? "  tile ptp skipped" : "  tile ptp", piece->pipe, piece->module, DT_DEVICE_CPU, &iroi, &oroi,
+               "tile (%zu,%zu)", tx, ty);
+      if(skipped) continue;
 
-/* prepare input tile buffer */
+      /* prepare input tile buffer */
       DT_OMP_FOR()
       for(size_t j = 0; j < ht; j++)
         memcpy((char *)input + j * wd * in_bpp, (char *)ivoid + ioffs + j * ipitch, (size_t)wd * in_bpp);
@@ -800,7 +797,7 @@ static void _default_process_tiling_ptp(dt_iop_module_t *self,
         ooffs += (size_t)overlap * opitch;
       }
 
-/* copy "good" part of tile to output buffer */
+      /* copy "good" part of tile to output buffer */
       DT_OMP_FOR(shared(origin, region))
       for(size_t j = 0; j < region[1]; j++)
         memcpy((char *)ovoid + ooffs + j * opitch,
@@ -817,7 +814,8 @@ static void _default_process_tiling_ptp(dt_iop_module_t *self,
   return;
 
 error:
-  dt_control_log(_("tiling failed for module '%s'. the output most likely will be OK, but you might want to check."), self->op);
+  dt_control_log(_("tiling failed for module '%s%s'. the output most likely will be OK, but you might want to check."),
+    self->op, dt_iop_get_instance_id(self));
 // fall through
 
 fallback:
@@ -904,7 +902,6 @@ static void _default_process_tiling_roi(dt_iop_module_t *self,
      Modules will report alignment requirements via align within tiling_callback(). */
 
   const unsigned int align = tiling.align;
-
   assert(align != 0);
 
   /* shrink tile size in case it would exceed singlebuffer size */
@@ -1085,10 +1082,9 @@ static void _default_process_tiling_roi(dt_iop_module_t *self,
       const size_t ioffs = ((size_t)iroi_full.y - roi_in->y)  * ipitch + ((size_t)iroi_full.x - roi_in->x) * in_bpp;
             size_t ooffs = ((size_t)oroi_good.y - roi_out->y) * opitch + ((size_t)oroi_good.x - roi_out->x) * out_bpp;
 
-      dt_print(DT_DEBUG_TILING,
-               "[default_process_tiling_roi] [%s] process tile (%zu,%zu) size %dx%d at origin [%d,%d]",
-               dt_dev_pixelpipe_type_to_str(piece->pipe->type), tx, ty,
-               iroi_full.width, iroi_full.height, iroi_full.x, iroi_full.y);
+      dt_print_pipe(DT_DEBUG_TILING,
+               "  tile roi", piece->pipe, piece->module, DT_DEVICE_CPU, &iroi_full, &oroi_full,
+               "tile (%zu,%zu)", tx, ty);
 
       /* prepare input tile buffer */
       input = dt_alloc_aligned((size_t)iroi_full.width * iroi_full.height * in_bpp);
@@ -1155,7 +1151,8 @@ static void _default_process_tiling_roi(dt_iop_module_t *self,
   return;
 
 error:
-  dt_control_log(_("tiling failed for module '%s'. the output most likely will be OK, but you might want to check."), self->op);
+  dt_control_log(_("tiling failed for module '%s%s'. the output most likely will be OK, but you might want to check."),
+    self->op,dt_iop_get_instance_id(self));
 // fall through
 
 fallback:
@@ -1203,7 +1200,7 @@ float dt_tiling_estimate_cpumem(dt_develop_tiling_t *tiling,
   if(dt_tiling_piece_fits_host_memory(piece, m_dx, m_dy, max_bpp, tiling->factor, tiling->overhead))
     return (float)m_dx * m_dy * max_bpp * tiling->factor + tiling->overhead;
 
-  float fullscale = fmaxf(roi_in->scale / roi_out->scale, sqrtf(((float)roi_in->width * roi_in->height)
+  const float fullscale = fmaxf(roi_in->scale / roi_out->scale, sqrtf(((float)roi_in->width * roi_in->height)
                                                               / ((float)roi_out->width * roi_out->height)));
   float available = dt_get_available_pipe_mem(piece->pipe);
   available = fmaxf(available - ((float)roi_out->width * roi_out->height * max_bpp)
@@ -1263,7 +1260,7 @@ float dt_tiling_estimate_clmem(dt_develop_tiling_t *tiling,
   const float fullscale = fmaxf(roi_in->scale / roi_out->scale, sqrtf(((float)roi_in->width * roi_in->height)
                                                               / ((float)roi_out->width * roi_out->height)));
   const gboolean use_pinned_memory = dt_opencl_use_pinned_memory(devid);
-  const int pinned_buffer_overhead = use_pinned_memory ? 2 : 0;
+  const float pinned_buffer_overhead = use_pinned_memory ? 2.0f : 0.0f;
   const float pinned_buffer_slack = use_pinned_memory ? 0.85f : 1.0f;
   const float available = (float)dt_opencl_get_device_available(devid);
   const float factor = fmaxf(tiling->factor_cl + pinned_buffer_overhead, 1.0f);
@@ -1274,8 +1271,7 @@ float dt_tiling_estimate_clmem(dt_develop_tiling_t *tiling,
   int width = MIN(MAX(roi_in->width, roi_out->width), darktable.opencl->dev[devid].max_image_width);
   int height = MIN(MAX(roi_in->height, roi_out->height), darktable.opencl->dev[devid].max_image_height);
 
-  unsigned int align = tiling->align;
-  align = _lcm(align, CL_ALIGNMENT);
+  const unsigned int align = _lcm(tiling->align, CL_ALIGNMENT);
 
   if((float)width * height * max_bpp * maxbuf > singlebuffer)
   {
@@ -1344,14 +1340,14 @@ static int _default_process_tiling_cl_ptp(dt_iop_module_t *self,
 
   /* get tiling requirements of module */
   dt_develop_tiling_t tiling = { 0 };
-  tiling.factor_cl = tiling.maxbuf_cl = -1;
+  tiling.factor_cl = tiling.maxbuf_cl = -1.0f;
   self->tiling_callback(self, piece, roi_in, roi_out, &tiling);
-  if(tiling.factor_cl < 0) tiling.factor_cl = tiling.factor;
-  if(tiling.maxbuf_cl < 0) tiling.maxbuf_cl = tiling.maxbuf;
+  if(tiling.factor_cl < 0.0f) tiling.factor_cl = tiling.factor;
+  if(tiling.maxbuf_cl < 0.0f) tiling.maxbuf_cl = tiling.maxbuf;
 
   /* shall we use pinned memory transfers? */
   gboolean use_pinned_memory = dt_opencl_use_pinned_memory(devid);
-  const int pinned_buffer_overhead = use_pinned_memory ? 2 : 0; // add two additional pinned memory buffers
+  const float pinned_buffer_overhead = use_pinned_memory ? 2.0f : 0.0f; // add two additional pinned memory buffers
                                                                 // which seemingly get allocated not only on
                                                                 // host but also on device (why???)
   // avoid problems when pinned buffer size gets too close to max_mem_alloc size
@@ -1400,14 +1396,15 @@ static int _default_process_tiling_cl_ptp(dt_iop_module_t *self,
      Modules will report alignment requirements via align within tiling_callback().
      Additional alignment requirements are set via definition of CL_ALIGNMENT.
      We guarantee alignment by selecting image width/height and overlap accordingly. For a tile width/height
-     that is identical to image width/height no special alignment is done. */
+     that is identical to image width/height no special alignment is done.
+  */
   const unsigned int align = tiling.align;
 
   /* determining alignment requirement for tile width/height.
-     in case of tile width also align according to definition of CL_ALIGNMENT */
+     in case of tile width also align according to definition of CL_ALIGNMENT
+  */
   const unsigned int walign = _lcm(align, CL_ALIGNMENT);
   const unsigned int halign = align;
-
   assert(align != 0 && walign != 0 && halign != 0);
 
   /* properly align tile width and height by making them smaller if needed */
@@ -1418,12 +1415,10 @@ static int _default_process_tiling_cl_ptp(dt_iop_module_t *self,
   const int overlap = tiling.overlap % align != 0 ? (tiling.overlap / align + 1) * align
                                                     : tiling.overlap;
 
-
   /* calculate effective tile size */
   const int tile_wd = width - 2 * overlap > 0 ? width - 2 * overlap : 1;
   const int tile_ht = height - 2 * overlap > 0 ? height - 2 * overlap : 1;
 
-
   /* calculate number of tiles */
   const int tiles_x = width < roi_in->width ? ceilf(roi_in->width / (float)tile_wd) : 1;
   const int tiles_y = height < roi_in->height ? ceilf(roi_in->height / (float)tile_ht) : 1;
@@ -1439,11 +1434,6 @@ static int _default_process_tiling_cl_ptp(dt_iop_module_t *self,
     return DT_OPENCL_PROCESS_CL;
   }
 
-  dt_print_pipe(DT_DEBUG_PIPE | DT_DEBUG_TILING,
-                        "process *tiled* ptp", piece->pipe, piece->module, devid, roi_in, roi_out,
-                        "%dx%d tiles%s, size=%dx%d",
-                        tiles_x, tiles_y, (use_pinned_memory) ? ", pinned" : "", tile_wd, tile_ht);
-
   /* store processed_maximum to be re-used and aggregated */
   dt_aligned_pixel_t processed_maximum_saved;
   dt_aligned_pixel_t processed_maximum_new = { 1.0f };
@@ -1466,7 +1456,6 @@ static int _default_process_tiling_cl_ptp(dt_iop_module_t *self,
 
   if(use_pinned_memory)
   {
-
     input_buffer = dt_opencl_map_buffer(devid, pinned_input, TRUE, CL_MAP_WRITE, 0,
                                         (size_t)width * height * in_bpp);
     if(input_buffer == NULL)
@@ -1481,7 +1470,6 @@ static int _default_process_tiling_cl_ptp(dt_iop_module_t *self,
 
   if(use_pinned_memory)
   {
-
     pinned_output = dt_opencl_alloc_device_buffer_with_flags(devid, (size_t)width * height * out_bpp,
                                                              CL_MEM_WRITE_ONLY | CL_MEM_ALLOC_HOST_PTR);
     if(pinned_output == NULL)
@@ -1496,7 +1484,6 @@ static int _default_process_tiling_cl_ptp(dt_iop_module_t *self,
 
   if(use_pinned_memory)
   {
-
     output_buffer = dt_opencl_map_buffer(devid, pinned_output, TRUE, CL_MAP_READ, 0,
                                          (size_t)width * height * out_bpp);
     if(output_buffer == NULL)
@@ -1509,18 +1496,21 @@ static int _default_process_tiling_cl_ptp(dt_iop_module_t *self,
     }
   }
 
+  dt_print_pipe(DT_DEBUG_PIPE | DT_DEBUG_TILING,
+                        "default *tiled* cl_ptp", piece->pipe, piece->module, devid, roi_in, roi_out,
+                        "%dx%d tiles%s, size=%dx%d, overlap=%d",
+                        tiles_x, tiles_y, (use_pinned_memory) ? ", pinned" : "", tile_wd, tile_ht, overlap);
   /* iterate over tiles */
+  piece->pipe->tiling = TRUE;
   for(size_t tx = 0; tx < tiles_x; tx++)
   {
     for(size_t ty = 0; ty < tiles_y; ty++)
     {
-      piece->pipe->tiling = TRUE;
-
       const size_t wd = tx * tile_wd + width > roi_in->width ? roi_in->width - tx * tile_wd : width;
       const size_t ht = ty * tile_ht + height > roi_in->height ? roi_in->height - ty * tile_ht : height;
 
       /* no need to process (end)tiles that are smaller than the total overlap area */
-      if((wd <= 2 * overlap && tx > 0) || (ht <= 2 * overlap && ty > 0)) continue;
+      const gboolean skipped = (wd <= 2 * overlap && tx > 0) || (ht <= 2 * overlap && ty > 0);
 
       /* origin and region of effective part of tile, which we want to store later */
       size_t origin[2] = { 0, 0 };
@@ -1535,11 +1525,10 @@ static int _default_process_tiling_cl_ptp(dt_iop_module_t *self,
       const size_t ioffs = (ty * tile_ht) * ipitch + (tx * tile_wd) * in_bpp;
       size_t ooffs = (ty * tile_ht) * opitch + (tx * tile_wd) * out_bpp;
 
-
-      dt_print(DT_DEBUG_TILING,
-               "[default_process_tiling_cl_ptp] [%s] tile (%zu,%zu) size %zux%zu at origin [%zu,%zu]",
-               dt_dev_pixelpipe_type_to_str(piece->pipe->type), tx, ty,
-               wd, ht, tx * tile_wd, ty * tile_ht);
+      dt_print_pipe(DT_DEBUG_TILING,
+               skipped ? "  tile cl_ptp skipped" : "  tile cl_ptp", piece->pipe, piece->module, devid, &iroi, &oroi,
+               "tile (%zu,%zu)", tx, ty);
+      if(skipped) continue;
 
       /* get input and output buffers */
       if(cltile_w != wd || cltile_h != ht)
@@ -1728,7 +1717,7 @@ static int _default_process_tiling_cl_roi(dt_iop_module_t *self,
 
   /* shall we use pinned memory transfers? */
   gboolean use_pinned_memory = dt_opencl_use_pinned_memory(devid);
-  const int pinned_buffer_overhead = use_pinned_memory ? 2 : 0; // add two additional pinned memory buffers
+  const float pinned_buffer_overhead = use_pinned_memory ? 2.0f : 0.0f; // add two additional pinned memory buffers
                                                                 // which seemingly get allocated not only on
                                                                 // host but also on device (why???)
   // avoid problems when pinned buffer size gets too close to max_mem_alloc size
@@ -1745,9 +1734,7 @@ static int _default_process_tiling_cl_roi(dt_iop_module_t *self,
   /* Alignment rules: we need to make sure that alignment requirements of module are fulfilled.
      Modules will report alignment requirements via align within tiling_callback().
   */
-  unsigned int align = tiling.align;
-  align = _lcm(align, CL_ALIGNMENT);
-
+  const unsigned int align = _lcm(tiling.align, CL_ALIGNMENT);
   assert(align != 0);
 
   /* shrink tile size in case it would exceed singlebuffer size */
@@ -1894,12 +1881,11 @@ static int _default_process_tiling_cl_roi(dt_iop_module_t *self,
 
 
   /* iterate over tiles */
+  piece->pipe->tiling = TRUE;
   for(size_t tx = 0; tx < tiles_x; tx++)
   {
     for(size_t ty = 0; ty < tiles_y; ty++)
     {
-      piece->pipe->tiling = TRUE;
-
       /* the output dimensions of the good part of this specific tile */
       const size_t wd = (tx + 1) * tile_wd > roi_out->width ? (size_t)roi_out->width - tx * tile_wd : tile_wd;
       const size_t ht = (ty + 1) * tile_ht > roi_out->height ? (size_t)roi_out->height - ty * tile_ht : tile_ht;
@@ -2002,15 +1988,12 @@ static int _default_process_tiling_cl_roi(dt_iop_module_t *self,
       size_t oorigin[2] = { oroi_good.x - oroi_full.x, oroi_good.y - oroi_full.y };
       size_t oregion[2] = { oroi_good.width, oroi_good.height };
 
-      dt_print(DT_DEBUG_TILING,
-               "[default_process_tiling_cl_roi] [%s] process tile (%zu,%zu) size %dx%d at origin [%d,%d]",
-               dt_dev_pixelpipe_type_to_str(piece->pipe->type), tx, ty,
-               iroi_full.width, iroi_full.height, iroi_full.x, iroi_full.y);
-      dt_print(DT_DEBUG_TILING | DT_DEBUG_VERBOSE,
-               "[default_process_tiling_cl_roi]    dest [%lu,%lu] at [%lu,%lu], "
-               "offsets [%i,%i] -> [%i,%i], delta=%i\n",
-               oregion[0], oregion[1], oorigin[0], oorigin[1], in_dx, in_dy,
-               out_dx, out_dy, delta);
+      dt_print_pipe(DT_DEBUG_TILING,
+              "  tile cl_roi", piece->pipe, piece->module, devid, &iroi_full, &oroi_full,
+              "tile (%zu,%zu)", tx, ty);
+      dt_print_pipe(DT_DEBUG_TILING | DT_DEBUG_VERBOSE,
+             "  tile cl_roi", piece->pipe, piece->module, devid, &iroi_full, &oroi_full,
+              "tile (%zu,%zu)  offsets=[%i,%i] delta=%i", tx, ty, out_dx, out_dy, delta);
 
       /* get opencl input and output buffers */
       if(cltile_iw != iroi_full.width || cltile_ih != iroi_full.height)
@@ -2129,12 +2112,10 @@ static int _default_process_tiling_cl_roi(dt_iop_module_t *self,
   dt_opencl_release_mem_object(output);
   piece->pipe->tiling = FALSE;
   const gboolean pinning_error = (use_pinned_memory == FALSE) && dt_opencl_use_pinned_memory(devid);
-  dt_print(DT_DEBUG_OPENCL | DT_DEBUG_TILING,
-           "[default_process_tiling_opencl_roi] [%s] couldn't run process_cl() "
-           "for module '%s%s' in tiling mode:%s %s",
-           dt_dev_pixelpipe_type_to_str(piece->pipe->type),
-           self->op, dt_iop_get_instance_id(self),
-           (pinning_error) ? " pinning problem" : "", cl_errstr(err));
+  dt_print_pipe(DT_DEBUG_OPENCL | DT_DEBUG_TILING,
+           "default tiling_cl_roi error", piece->pipe, piece->module, devid, roi_in, roi_out,
+           "%serror=%s",
+           (pinning_error) ? "pinning problem " : "", cl_errstr(err));
 
   if(pinning_error) darktable.opencl->dev[devid].pinned_error = TRUE;
   return err;
diff --git a/src/iop/demosaic.c b/src/iop/demosaic.c
index d6ded6b8ed6f..cfd6328ba953 100644
--- a/src/iop/demosaic.c
+++ b/src/iop/demosaic.c
@@ -831,7 +831,7 @@ void process(dt_iop_module_t *self,
     if(out_height > 0)
     {
       if(tiling)
-        dt_print(DT_DEBUG_TILING, "tile=%.3d/%.3d, group=%.5d first=%.5d last=%.5d rows=%.4d",
+        dt_print(DT_DEBUG_TILING | DT_DEBUG_VERBOSE, "tile=%.3d/%.3d, group=%.5d first=%.5d last=%.5d rows=%.4d",
                tile_nr, num_tiles, group, first_in, last_in, t_rows);
 
       float *t_in = in + width * first_in * ch;
@@ -1104,7 +1104,7 @@ int process_cl(dt_iop_module_t *self,
     {
       if(tiling)
       {
-        dt_print(DT_DEBUG_TILING,
+        dt_print(DT_DEBUG_TILING | DT_DEBUG_VERBOSE,
               "tile=%.3d/%.3d, group=%.5d first=%.5d last=%.5d rows=%.4d",
                tile_nr, num_tiles, group, first_in, last_in, t_rows);
 

From 8bb62f18769e499e66a1111cf1e753f133ff7e20 Mon Sep 17 00:00:00 2001
From: Hanno Schwalm <hanno@schwalm-bremen.de>
Date: Wed, 20 May 2026 09:11:17 +0200
Subject: [PATCH 3/3] Fix tiling requirements for blending

When the output of a piece is blended, additional memory requirements must be taken into account for tiling.
Some of the calculations resulted in higher values than necessary when no feathering was involved.
---
 src/develop/blend.c | 52 ++++++++++++++++++++++++++-------------------
 1 file changed, 30 insertions(+), 22 deletions(-)

diff --git a/src/develop/blend.c b/src/develop/blend.c
index 2a145f8f89a6..e2dc4f30d662 100644
--- a/src/develop/blend.c
+++ b/src/develop/blend.c
@@ -1461,40 +1461,48 @@ void tiling_callback_blendop(dt_iop_module_t *self,
                              dt_develop_tiling_t *tiling)
 {
   tiling->factor = 0.0f;
+  tiling->factor_cl = 0.0f;
   tiling->maxbuf = 1.0f;
+  tiling->maxbuf_cl = 1.0f;
   tiling->overhead = 0;
   tiling->overlap = 0;
   tiling->align = 1;
 
   dt_develop_blend_params_t *const bldata = piece->blendop_data;
-  if(bldata)
+  if(bldata == NULL)
+    return;
+
+  if(bldata->details != 0.0f)
   {
-    if(bldata->details != 0.0f)
+    // details mask requires 2 additional quarter buffers of details data size
+    // so normalize to roi_size
+    dt_dev_detail_mask_t *details = &piece->pipe->scharr;
+    if(details->data)
     {
-      // details mask requires 2 additional quarter buffers of details data size
-      // so normalize to roi_size
-      dt_dev_detail_mask_t *details = &piece->pipe->scharr;
-      if(details->data)
-        tiling->factor = 0.5f * (float)(details->roi.width * details->roi.height) / (roi_in->width * roi_in->height);
-     }
-
-    if(bldata->feathering_radius > 0.1f) // we don't feather below that
+      tiling->factor = 0.5f * (float)(details->roi.width * details->roi.height) / (roi_in->width * roi_in->height);
+      tiling->factor_cl = tiling->factor;
+    }
+  }
+
+  if(bldata->feathering_radius > 0.1f) // we don't feather below that
+  {
+    const int devid = piece->pipe->devid;
+    if(devid > DT_DEVICE_CPU)
     {
-      const int devid = piece->pipe->devid;
-      if(devid > DT_DEVICE_CPU)
-      {
-        /* OpenCL feathering does simple internal tiling for less mem pressure,
-           we still need some mem here for this.
-        */
-        tiling->factor_cl = MAX(tiling->factor, 1.0f);
-      }
-      tiling->factor = MAX(tiling->factor, 18.0f * 0.25f); // we need all 18 intermediate guided filter mask buffers
+      /* OpenCL feathering does simple internal tiling for less mem pressure,
+         we still need some mem here for this.
+      */
+      tiling->factor_cl = MAX(tiling->factor_cl, 1.0f);
     }
+    tiling->factor = MAX(tiling->factor, 18.0f * 0.25f); // we need all 18 intermediate guided filter mask buffers
+
+    tiling->factor += 1.5f; // in + (guide, tmp) + two quarter buffers for the mask
+    tiling->factor_cl += 1.5f;
   }
+
   const float outnorm = (float)(roi_out->width * roi_out->height) / (roi_in->width * roi_in->height);
-  const float basic = 2.5f + outnorm; // in + out + (guide, tmp) + two quarter buffers for the mask
-  tiling->factor += basic;
-  tiling->factor_cl += basic;
+  tiling->factor += outnorm;
+  tiling->factor_cl += outnorm;
 }
 
 /** check if content of params is all zero, indicating a