Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
25 changes: 25 additions & 0 deletions .github/workflows/pr_gate.yml
Original file line number Diff line number Diff line change
Expand Up @@ -96,6 +96,31 @@ jobs:
- name: Python test
run: pytest -vv -s

windows-x86:
runs-on: windows-latest
strategy:
matrix:
python_version: ["3.12"]
steps:
- name: Checkout
uses: actions/checkout@v6

- name: Use Python ${{matrix.python_version}} (x86)
uses: actions/setup-python@v6
with:
python-version: ${{matrix.python_version}}
architecture: x86

- name: Get dependencies
run: |
python -m pip install --upgrade pip

- name: Python build
run: pip install -e .[test] --verbose

- name: Python test
run: pytest -vv -s

macos-arm64:
runs-on: macos-latest
strategy:
Expand Down
40 changes: 40 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -83,6 +83,46 @@ the free-threaded build and the relevant CPython APIs stabilise.
[pep703]: https://peps.python.org/pep-0703/
[issue5]: https://github.com/microsoft/bocpy/issues/5

### Scaling with cores

The chart below shows BOC runtime throughput as the worker count grows from
1 to 8, plotted as **speedup relative to a single worker**. Numbers come
from [examples/benchmark.py](examples/benchmark.py) — a chain-ring workload
that exercises the scheduler, two-phase locking, sub-interpreter crossings
and the message queue together — run on CPython 3.14 (mean of 3 repeats,
8 s each).

<!-- pypi-skip-start -->
```mermaid
---
config:
xyChart:
width: 700
height: 360
---
xychart-beta
title "bocpy speedup vs. worker count (chain-ring benchmark, CPython 3.14)"
x-axis "Workers" [1, 2, 3, 4, 5, 6, 7, 8]
y-axis "Speedup vs. 1 worker" 0 --> 9
bar [1.00, 1.97, 2.94, 3.90, 4.87, 5.82, 6.75, 7.54]
line [1, 2, 3, 4, 5, 6, 7, 8]
```
<!-- pypi-skip-end -->

The line is the ideal `y = x` reference; the bars are measured speedup. Up
to 8 workers, BOC delivers roughly linear scaling on this microbenchmark
(≈7.5× at 8 workers). Real applications carry serial costs that this
benchmark deliberately strips out — see the docstring at the top of
[examples/benchmark.py](examples/benchmark.py) for the load-bearing
caveats. To reproduce:

```bash
python examples/benchmark.py \
--sweep-axis workers --sweep-values 1,2,3,4,5,6,7,8 \
--duration 8 --warmup 2 --repeats 3 \
--output scaling.json
```

A behavior can be thought of as a function which depends on zero or more
concurrently-owned data objects (which we call **cowns**). As a programmer, you
indicate that you want the function to be called once all of those resources are
Expand Down
45 changes: 37 additions & 8 deletions examples/benchmark.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@

from bocpy import (Cown, Matrix, noticeboard, notice_write, receive, send,
start, wait, when)
from bocpy import _core

# Sentinels for the parent/child JSON protocol. Uppercase so the
# transpiler keeps them as module-level constants in the worker export.
Expand All @@ -42,6 +43,21 @@
SCHEMA_VERSION = 1


def _physical_cpu_count() -> int:
    """Return the number of physical CPU cores, with fallbacks.

    Prefers the runtime's native physical-core probe; when that probe
    reports a non-positive (i.e. unknown) value, degrades to the
    logical core count, and finally to 1 so callers always receive a
    usable positive threshold. Used as the oversubscription limit so
    warnings fire as soon as requested workers start landing on SMT
    siblings instead of waiting for logical cores to run out.

    :return: A positive integer.
    """
    physical = _core.physical_cpu_count()
    if physical <= 0:
        # Probe could not determine the physical count; fall back to
        # the logical count (os.cpu_count() may itself return None).
        physical = os.cpu_count() or 1
    return physical


# ---------------------------------------------------------------------------
# Behavior code (chain workload)
# ---------------------------------------------------------------------------
Expand Down Expand Up @@ -203,11 +219,22 @@ def derive_sizes(cfg: BenchConfig) -> BenchConfig:
:return: The same config with ``rings`` / ``chains_per_ring`` set.
"""
if cfg.chains_per_ring is None:
# Use a small per-ring chain count (4) so chains never collide
# on adjacent slots as they advance. Independent rings carry
# the load instead.
cfg.chains_per_ring = max(
1, cfg.ring_size // (cfg.group_size * cfg.stride * 2))
1, cfg.ring_size // (cfg.group_size * cfg.stride * 8))
if cfg.rings is None:
cfg.rings = max(cfg.workers * 4 // cfg.chains_per_ring,
cfg.workers * 2)
# Bias toward more *rings* rather than more chains-per-ring:
# chains on the same ring contend for adjacent slots as they
# advance, so per-ring concurrency is bounded well below
# ``chains_per_ring``. Independent rings, by contrast, never
# collide. Provision at least ``workers * 4`` rings so every
# worker sees a deep, independent supply of ready chains and
# the measured throughput reflects scheduler scaling rather
# than workload starvation.
cfg.rings = max(cfg.workers * 16 // cfg.chains_per_ring,
cfg.workers * 4)
return cfg


Expand Down Expand Up @@ -244,8 +271,9 @@ def emit_soft_warnings(cfg: BenchConfig, cpu_count: int) -> None:
print(f"warning: duration={cfg.duration}s is short; results will "
"be noisy", file=sys.stderr)
if cfg.workers > cpu_count:
print(f"warning: workers={cfg.workers} exceeds cpu_count="
f"{cpu_count}; oversubscribed", file=sys.stderr)
print(f"warning: workers={cfg.workers} exceeds physical core "
f"count={cpu_count}; oversubscribed (SMT siblings or "
f"hyperthreads will be used)", file=sys.stderr)


# ---------------------------------------------------------------------------
Expand Down Expand Up @@ -862,6 +890,7 @@ def collect_metadata(argv: list, git_sha: Optional[str]) -> dict:
"hostname": socket.gethostname(),
"platform": sys.platform,
"cpu_count": os.cpu_count() or 0,
"physical_cpu_count": _physical_cpu_count(),
"python_version": sys.version.split()[0],
"python_implementation": sys.implementation.name,
"free_threaded": free_threaded,
Expand Down Expand Up @@ -1054,7 +1083,7 @@ def _default_sweep_values(axis: str) -> list:
:param axis: The sweep axis name.
:return: A list of default values.
"""
cpu = os.cpu_count() or 1
cpu = _physical_cpu_count()
if axis == "workers":
return sorted(set([1, 2, 4, 8, min(16, cpu)]))
if axis == "iters":
Expand Down Expand Up @@ -1158,7 +1187,7 @@ def child_main(args) -> int:
if err is not None:
print(f"benchmark: invalid config: {err}", file=sys.stderr)
return 2
emit_soft_warnings(cfg, os.cpu_count() or 1)
emit_soft_warnings(cfg, _physical_cpu_count())
rep = run_single_point_body(cfg, repeat_index=0)
payload = {
"inputs": asdict(cfg),
Expand Down Expand Up @@ -1197,7 +1226,7 @@ def parent_main(args) -> int:
return 2

# Pre-spawn validation across every sweep point.
cpu = os.cpu_count() or 1
cpu = _physical_cpu_count()
derived_points = []
for value in sweep_values:
cfg = cfg_for_axis(base, args.sweep_axis, value)
Expand Down
49 changes: 45 additions & 4 deletions examples/boids.py
Original file line number Diff line number Diff line change
Expand Up @@ -352,22 +352,28 @@ class Boids(pyglet.window.Window):
"""Pyglet window that renders a boids simulation."""

def __init__(self, width: int, height: int, num_boids: int,
show_overlay: bool = True):
show_overlay: bool = True, scale: float = 1.0):
"""Initialize the window and create boids.

:param width: Window width in pixels.
:param height: Window height in pixels.
:param num_boids: The number of boids to simulate.
:param show_overlay: Whether to render the boid count and
behavior-rate overlay in the bottom-left corner.
:param scale: Multiplier applied to the drawn boid triangle
size. ``1.0`` keeps the original 20x14 pixel triangle.
"""
pyglet.window.Window.__init__(self, width, height, "Boids")
pyglet.gl.glClearColor(1, 1, 1, 1)
self.batch = pyglet.graphics.Batch()
self.elapsed = 0
self.simulation = Simulation(num_boids, width, height)
self.num_behaviors = 0
self.num_frames = 0
self.total_behaviors = 0
self.total_elapsed = 0.0
self.samples = deque()
self.fps_samples = deque()
self.show_overlay = show_overlay

if show_overlay:
Expand All @@ -380,13 +386,24 @@ def __init__(self, width: int, height: int, num_boids: int,
"behavior/s: ",
font_size=24, x=5, y=50,
color=(100, 100, 100, 255))

self.fps_label = pyglet.text.Label(
"fps: ",
font_size=24, x=5, y=95,
color=(100, 100, 100, 255))
else:
self.num_boids_label = None
self.behaviors_label = None
self.fps_label = None

self.triangles: pyglet.shapes.Triangle = []
tip_x = 0.0
base_x = -20.0 * scale
base_y = 7.0 * scale
for _ in range(num_boids):
tri = pyglet.shapes.Triangle(0, 0, -20, +7, -20, -7,
tri = pyglet.shapes.Triangle(tip_x, 0,
base_x, +base_y,
base_x, -base_y,
color=(55, 255, 255, 255),
batch=self.batch)
tri.anchor_position = 0, 0
Expand All @@ -399,6 +416,7 @@ def on_draw(self):
if self.show_overlay:
self.num_boids_label.draw()
self.behaviors_label.draw()
self.fps_label.draw()

def on_close(self):
wait()
Expand All @@ -412,17 +430,26 @@ def update(self, delta_time: float):
self.elapsed += delta_time
self.simulation.step(self.width, self.height)
self.num_behaviors += self.simulation.num_behaviors
self.num_frames += 1
self.total_behaviors += self.simulation.num_behaviors
self.total_elapsed += delta_time

if self.elapsed > 1:
self.samples.append(self.num_behaviors / self.elapsed)
self.fps_samples.append(self.num_frames / self.elapsed)
self.num_behaviors = 0
self.num_frames = 0
self.elapsed = 0
if len(self.samples) > 10:
self.samples.popleft()
if len(self.fps_samples) > 10:
self.fps_samples.popleft()

if len(self.samples) > 3 and self.behaviors_label is not None:
behavior_rate = sum(self.samples) / len(self.samples)
self.behaviors_label.text = f"behavior/s: {behavior_rate:.0f}"
fps = sum(self.fps_samples) / len(self.fps_samples)
self.fps_label.text = f"fps: {fps:.1f}"

positions = self.simulation.positions
velocities = self.simulation.velocities
Expand Down Expand Up @@ -459,6 +486,9 @@ def update(self, delta_time: float):
parser.add_argument("--workers", type=int, default=None,
help="Number of BOC worker sub-interpreters. "
"Defaults to bocpy's default (CPU count - 1).")
parser.add_argument("--scale", type=float, default=1.0,
help="Multiplier applied to the drawn boid "
"triangle size (default: 1.0).")
args = parser.parse_args()

# Validate at the boundary; downstream code (Matrix sizing, hash modulo,
Expand All @@ -473,6 +503,8 @@ def update(self, delta_time: float):
parser.error("--fps must be positive")
if args.workers is not None and args.workers <= 0:
parser.error("--workers must be positive")
if args.scale <= 0:
parser.error("--scale must be positive")

# Start the BOC runtime explicitly so --workers takes effect for every
# mode.
Expand All @@ -486,7 +518,7 @@ def update(self, delta_time: float):
# The overlay (boid count / behavior rate) is suppressed in video
# mode so the rendered output stays clean.
boids = Boids(args.width, args.height, args.boids,
show_overlay=False)
show_overlay=False, scale=args.scale)

# Allow graceful close: override on_close to set a flag and return
# True so pyglet does not destroy the window mid-frame. The loop
Expand Down Expand Up @@ -630,11 +662,20 @@ def _on_close():
print(ff_stderr.decode("utf-8", errors="replace"), end="")
return

# Successful render: report the average behavior rate over the
# entire run, computed from cumulative counters (not the rolling
# 1s window used for the on-screen overlay).
if boids.total_elapsed > 0:
avg_rate = boids.total_behaviors / boids.total_elapsed
print(f"behavior/s (avg over {boids.total_elapsed:.1f}s of "
f"simulated time): {avg_rate:.0f} "
f"({boids.total_behaviors} behaviors)")

print(f"Wrote {args.output} ({frames_written} frames)"
f"{' (interrupted)' if boids.bocpy_video_closing else ''}")
return

boids = Boids(args.width, args.height, args.boids)
boids = Boids(args.width, args.height, args.boids, scale=args.scale)
pyglet.clock.schedule_interval(boids.update, 1 / args.fps)
pyglet.app.run()

Expand Down
File renamed without changes.
File renamed without changes.
4 changes: 2 additions & 2 deletions examples/sketches.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@

def all_known_cheeses() -> list[str]:
    """Load the cheese inventory from disk.

    Reads ``cheese.txt`` stored alongside this module (the file was
    moved out of the former ``assets/`` subdirectory).

    :return: One cheese name per line, with surrounding whitespace
        stripped.
    """
    path = os.path.join(os.path.dirname(__file__), "cheese.txt")
    with open(path) as file:
        return [line.strip() for line in file]

Expand All @@ -26,7 +26,7 @@ def is_available(logger, name: str) -> bool:

def menu() -> list[str]:
    """Load the menu and shuffle it.

    Reads ``menu.txt`` stored alongside this module (the file was
    moved out of the former ``assets/`` subdirectory) and returns the
    entries in a random order.

    :return: The stripped menu lines, shuffled in place.
    """
    path = os.path.join(os.path.dirname(__file__), "menu.txt")
    with open(path) as file:
        menu = [line.strip() for line in file]
    random.shuffle(menu)
    return menu
Expand Down
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -53,4 +53,4 @@ packages = ["bocpy", "bocpy.examples"]
"bocpy.examples" = "examples"

[tool.setuptools.package-data]
"bocpy.examples" = ["assets/*.txt"]
"bocpy.examples" = ["*.txt"]
Loading
Loading