Merge branch 'gcarreno:main' into main

EagleAglow · web-flow · commit 2c571ada8320 · 2024-03-25T15:03:33.000-07:00
diff --git a/entries/abouchez/README.md b/entries/abouchez/README.md
@@ -20,7 +20,7 @@ I am very happy to share decades of server-side performance coding techniques us
 
 Here are the main ideas behind this implementation proposal:
 
-- **mORMot** makes cross-platform and cross-compiler support simple - e.g. `TMemMap`, `TDynArray.Sort`,`TTextWriter`, `SetThreadCpuAffinity`, `crc32c`, `ConsoleWrite` or command-line parsing;
+- **mORMot** makes cross-platform and cross-compiler support simple - e.g. `TMemMap`, `TDynArray`, `TTextWriter`, `SetThreadCpuAffinity`, `crc32c`, `ConsoleWrite` or command-line parsing;
 - The entire 16GB file is `memmap`ed at once into memory - it won't work on 32-bit OS, but avoid any `read` syscall or memory copy;
 - Process file in parallel using several threads - configurable via the `-t=` switch, default being the total number of CPUs reported by the OS;
 - Input is fed into each thread as 64MB chunks: because thread scheduling is unbalanced, it is inefficient to pre-divide the size of the whole input file into the number of threads;
@@ -32,20 +32,22 @@ Here are the main ideas behind this implementation proposal:
 - Parse temperatures with a dedicated code (expects single decimal input values);
 - The station names are stored as UTF-8 pointers to the memmap location where they appear first, in `StationName[]`, to be emitted eventually for the final output, not during temperature parsing;
 - No memory allocation (e.g. no transient `string` or `TBytes`) nor any syscall is done during the parsing process to reduce contention and ensure the process is only CPU-bound and RAM-bound (we checked this with `strace` on Linux);
-- Pascal code was tuned to generate the best possible asm output on FPC x86_64 (which is our target);
+- Pascal code was tuned to generate the best possible asm output on FPC x86_64 (which is our target) - perhaps making it less readable, because we used pointer arithmetic where it matters (I like to think of such low-level Pascal code as [portable assembly](https://sqlite.org/whyc.html#performance), similar to "unsafe" code in managed languages);
 - Can optionally output timing statistics and resultset hash value on the console to debug and refine settings (with the `-v` command line switch);
 - Can optionally set each thread affinity to a single core (with the `-a` command line switch).
 
 If you are not convinced by the "perfect hash" trick, you can define the `NOPERFECTHASH` conditional, which forces full name comparison, but is noticeably slower. Our algorithm is safe with the official dataset, and gives the expected final result - which was the goal of this challenge: compute the right data reduction with as little time as possible, with all possible hacks and tricks. A "perfect hash" is a well known hacking pattern, when the dataset is validated in advance. And since our CPUs offers `crc32c` which is perfect for our dataset... let's use it! https://en.wikipedia.org/wiki/Perfect_hash_function ;)
 
 ## Why L1 Cache Matters
 
-Take great care of the "64 bytes cache line" is quite unique among all implementations of the "1brc" I have seen in any language - and it does make a noticeable difference in performance.
+Taking special care of the "64 bytes cache line" is quite unique among all implementations of the "1brc" I have seen in any language - and it does make a noticeable difference in performance.
 
 The L1 cache is well known in the performance hacking litterature to be the main bottleneck for any efficient in-memory process. If you want things to go fast, you should flatter your CPU L1 cache.
 
 Min/max values will be reduced as 16-bit smallint - resulting in temperature range of -3276.7..+3276.8 which seems fair on our planet according to the IPCC. ;)
 
+As a result, each `Station[]` entry takes only 16 bytes, so we can fit exactly 4 entries in a single CPU L1 cache line. To be fair, if we put some more data into the record (e.g. use `Int64` instead of `smallint`/`integer`), the performance degrades by only a few percent. The main point seems to be that an entry is likely to fit into a single cache line, even if filling two cache lines may sometimes be needed for misaligned data.
+
 In our first attempt (see "Old Version" below), we stored the name into the `Station[]` array, so that each entry is 64 bytes long exactly. But since `crc32c` is a perfect hash function for our dataset, it is enough to just store the 32-bit hash instead, and not the actual name.
 
 Note that if we reduce the number of stations from 41343 to 400, the performance is much higher, also with a 16GB file as input. The reason is that since 400x16 = 6400, each dataset could fit entirely in each core L1 cache. No slower L2/L3 cache is involved, therefore performance is better. The cache memory seems to be the bottleneck of our code. Which is a good sign.
@@ -236,6 +238,6 @@ Benchmark 1: abouchez
 ```
 It is a known fact from experiment that forcing thread affinity is not a good idea, and it is always much better to let any modern Operating System do  the threads scheduling to the CPU cores, because it has a much better knowledge of the actual system load and status. Even on a "fair" CPU architecture like AMD Zen. For a "pure CPU" process, affinity may help a very little. But for our "old" process working outside of the L1 cache limits, we better let the OS decide.
 
-So with this "old" version, it was decided to use `-t=16`. The "old" version is using a whole cache line (16 bytes) for its `Station[]` record, so it may be the responsible of using too much CPU cache, so more than 16 threads does not make a difference with it. Whereas our "new" version, with its `Station[]` of only 16 bytes, could use `-t=32` with benefits. The cache memory access is likely to be the bottleneck from now on.
+So with this "old" version, it was decided to use `-t=16`. The "old" version is using a whole cache line (64 bytes) for its `Station[]` record, so it may be responsible for using too much CPU cache, and more than 16 threads do not make a difference with it. Whereas our "new" version, with its `Station[]` of only 16 bytes, could use `-t=32` with benefits. The cache memory access is likely to be the bottleneck from now on.
 
 Arnaud :D
diff --git a/entries/abouchez/src/brcmormot.lpr b/entries/abouchez/src/brcmormot.lpr
@@ -327,29 +327,31 @@ function Average(sum, count: PtrInt): PtrInt;
   //ConsoleWrite([sum / (count * 10), ' ', result / 10]);
 end;
 
-function ByStationName(const A, B): integer;
+function ByStationName(const A, B): integer; // sort comparer, = StrComp() but names end with ';' - returns 0 if equal, -1 if A < B, 1 if A > B
 var
   pa, pb: PByte;
+  c: byte; // current byte read from the A side
 begin
   result := 0;
   pa := pointer(A);
   pb := pointer(B);
-  if pa = pb then
+  dec(pa, {%H-}PtrUInt(pb)); // pa now holds the offset (A - B), so only pb has to advance in the loop
+  if pa = nil then // zero offset = both names share the same address
     exit;
   repeat
-    if pa^ <> pb^ then
+    c := PByteArray(pa)[{%H-}PtrUInt(pb)]; // offset + pb = address of the current A byte
+    if c <> pb^ then
       break
-    else if pa^ = ord(';') then
+    else if c = ord(';') then
       exit; // Str1 = Str2
-    inc(pa);
     inc(pb);
   until false;
-  if pa^ = ord(';') then
+  if (c = ord(';')) or // A ended first, so A is a prefix of B
+     ((pb^ <> ord(';')) and // B has not ended either...
+      (c < pb^)) then // ...and the first differing byte is lower in A
     result := -1
-  else if pb^ = ord(';') then
-    result := 1
   else
-    result := pa^ - pb^;
+    result := 1; // B ended first, or the first differing byte is higher in A
 end;
 
 function TBrcMain.SortedText: RawUtf8;
@@ -368,36 +370,39 @@ function TBrcMain.SortedText: RawUtf8;
   assert(c <> 0);
   DynArraySortIndexed(
     pointer(fList.StationName), SizeOf(PUtf8Char), c, ndx, ByStationName);
-  // generate output
-  FastSetString(result, nil, 1200000); // pre-allocate result
-  st := TRawByteStringStream.Create(result);
   try
-    w := TTextWriter.Create(st, @tmp, SizeOf(tmp));
+    // generate output
+    FastSetString(result, nil, 1200000); // pre-allocate result
+    st := TRawByteStringStream.Create(result);
     try
-      w.Add('{');
-      n := ndx.buf;
-      repeat
-        s := @fList.Station[n^];
-        assert(s^.Count <> 0);
-        p := fList.StationName[n^];
-        w.AddNoJsonEscape(p, NameLen(p));
-        AddTemp(w, '=', s^.Min);
-        AddTemp(w, '/', Average(s^.Sum, s^.Count));
-        AddTemp(w, '/', s^.Max);
-        dec(c);
-        if c = 0 then
-          break;
-        w.Add(',', ' ');
-        inc(n);
-      until false;
-      w.Add('}');
-      w.FlushFinal;
-      FakeLength(result, w.WrittenBytes);
+      w := TTextWriter.Create(st, @tmp, SizeOf(tmp));
+      try
+        w.Add('{');
+        n := ndx.buf;
+        repeat
+          s := @fList.Station[n^];
+          assert(s^.Count <> 0);
+          p := fList.StationName[n^];
+          w.AddNoJsonEscape(p, NameLen(p));
+          AddTemp(w, '=', s^.Min);
+          AddTemp(w, '/', Average(s^.Sum, s^.Count));
+          AddTemp(w, '/', s^.Max);
+          dec(c);
+          if c = 0 then
+            break;
+          w.Add(',', ' ');
+          inc(n);
+        until false;
+        w.Add('}');
+        w.FlushFinal;
+        FakeLength(result, w.WrittenBytes);
+      finally
+        w.Free;
+      end;
     finally
-      w.Free;
+      st.Free;
     end;
   finally
-    st.Free;
     ndx.Done;
   end;
 end;