Merge pull request #113 from georges-hatem/main

gcarreno · web-flow · commit 3705e590f4eb · 2024-04-21T20:05:11.000+01:00
own dictionary implementation, apparently has better collision resolution. 35s -&gt; 30s
diff --git a/entries/ghatem-fpc/README.md b/entries/ghatem-fpc/README.md
@@ -207,3 +207,17 @@ So the problem (on my computer at least) does not seem to be related to the load
 As a last attempt, I tried again accumulating data in a shared memory, protecting all data accumulation with `InterlockedInc`, `InterlockedExchangeAdd`, and `TCriticalSection`. In order to avoid too many contentions on the critical section, I also tried to maintain a large array of critical sections, acquiring only the index for which we are accumulating data. All of these attempts under-performed on 4 threads, and likely will perform even worse as thread-count increases. The only way this would work is by having ﬁner-grained control over the locking, such that a thread would only be blocked if it tried to write into a record that is already locked.
 
 Lastly, the `TDictionary.TryGetValue` has shown to be quite costly, around `1/4th` of the total cost. And although it is currently so much better than when using the station name as key, evaluating the `mod` of all those hashes, there is a lot of collisions. So if the dictionary key-storage is implemented as an array, and `mod` is used to transform those `CRC32` into indexes ranging in `[0, 45k]`, those collisions will be the cause of slowness. If there is a way to reduce the number of collisions, then maybe a custom dictionary implementation might help.
+
+
+## Multi-Threaded attempt v.3 (2024-04-21)
+
+Using performance profiler ValGrind, it identified that:
+ - 30% of the time was spent on `TryGetValue` of the generic `TDictionary`.
+ - 14% of the time is on computing the crc32 hash
+ - 15% of the time on extracting the line data
+ - surprisingly, 9% of the time is spent on looking for the #13 (new-line) character
+
+I implemented my own Dictionary class consisting of two arrays. We compute the modulus of the incoming key (Cardinal) to fit it in the correct bucket. A first attempt at collision resolution was to store as values a TList, but performance was worse than the generic TDictionary.  Next attempt was a linear probing, with circular indexing in case the index goes out of bounds. Performance improved from 35s to 30s.  Will later try quadratic probing, as it apparently reduces clustering.
+
+edit:
+quadratic probing improved performance even further. we could probably do better with 2-level hashing, but finding such a hash function is going to take a lot of trials, this is probably acceptable results
diff --git a/entries/ghatem-fpc/src/onebrc.pas b/entries/ghatem-fpc/src/onebrc.pas
@@ -10,6 +10,9 @@ interface
 
 function RoundExDouble(const ATemp: Double): Double; inline;
 
+const
+  cDictSize: Integer = 45000;
+
 type
 
   // record is packed to minimize its size
@@ -24,6 +27,24 @@ function RoundExDouble(const ATemp: Double): Double; inline;
   PStationData = ^TStationData;
   TStationsDict = specialize TDictionary<Cardinal, PStationData>;
 
+  TKeys = array of Cardinal;
+  TValues = array of PStationData;
+
+  { TMyDictionary }
+
+  TMyDictionary = class
+  private
+    FHashes: TKeys;
+    FData  : TValues;
+    procedure InternalFind(const aKey: Cardinal; out aFound: Boolean; out aIndex: Integer); inline;
+  public
+    constructor Create;
+    property Keys: TKeys read FHashes;
+    property Values: TValues read FData;
+    function TryGetValue (const aKey: Cardinal; out aValue: PStationData): Boolean; inline;
+    procedure Add (const aKey: Cardinal; const aValue: PStationData); inline;
+  end;
+
   { TOneBRC }
 
   TOneBRC = class
@@ -36,7 +57,7 @@   TOneBRC = class
 
     FThreadCount: UInt16;
     FThreads: array of TThread;
-    FStationsDicts: array of TStationsDict;
+    FStationsDicts: array of TMyDictionary;
 
     procedure ExtractLineData(const aStart: Int64; const aEnd: Int64; out aLength: ShortInt; out aTemp: SmallInt); inline;
 
@@ -106,6 +127,78 @@ function Compare(AList: TStringList; AIndex1, AIndex2: Integer): Integer;
   Result := CompareStr(Str1, Str2);
 end;
 
+{ TMyDictionary }
+
+procedure TMyDictionary.InternalFind(const aKey: Cardinal; out aFound: Boolean; out aIndex: Integer);
+var vIdx: Integer;
+    vOffset: Integer;
+begin
+  vIdx := aKey mod cDictSize;
+  aFound := False;
+
+  if FHashes[vIdx] = aKey then begin
+    aIndex := vIdx;
+    aFound := True;
+  end
+  else begin
+    vOffset := 1;
+    while True do begin
+      // quadratic probing, by incrementing vOffset
+      Inc (vIdx, vOffset);
+      Inc (vOffset);
+
+      // exceeded boundary, loop back
+      if vIdx >= cDictSize then
+        Dec (vIdx, cDictSize);
+
+      if FHashes[vIdx] = aKey then begin
+        // found match
+        aIndex := vIdx;
+        aFound := True;
+        break;
+      end
+      else if FHashes[vIdx] = 0 then begin
+        // found empty bucket to use
+        aIndex := vIdx;
+        aFound := False;
+        break;
+      end;
+    end;
+  end;
+end;
+
+constructor TMyDictionary.Create;
+begin
+  SetLength (FHashes, cDictSize);
+  SetLength (FData, cDictSize);
+end;
+
+function TMyDictionary.TryGetValue(const aKey: Cardinal; out aValue: PStationData): Boolean;
+var
+  vIdx: Integer;
+begin
+  InternalFind (aKey, Result, vIdx);
+
+  if Result then
+    aValue := FData[vIdx]
+  else
+    aValue := nil;
+end;
+
+procedure TMyDictionary.Add(const aKey: Cardinal; const aValue: PStationData);
+var
+  vIdx: Integer;
+  vFound: Boolean;
+begin
+  InternalFind (aKey, vFound, vIdx);
+  if not vFound then begin
+    FHashes[vIdx] := aKey;
+    FData[vIdx] := aValue;
+  end
+  else
+    raise Exception.Create ('TMyDict: cannot add, duplicate key');
+end;
+
 procedure TOneBRC.ExtractLineData(const aStart: Int64; const aEnd: Int64; out aLength: ShortInt; out aTemp: SmallInt);
 // given a line of data, extract the length of station name, and temperature as Integer.
 var
@@ -155,8 +248,8 @@ constructor TOneBRC.Create (const aThreadCount: UInt16);
   SetLength (FThreads, aThreadCount);
 
   for I := 0 to aThreadCount - 1 do begin
-    FStationsDicts[I] := TStationsDict.Create;
-    FStationsDicts[I].Capacity := 45000;
+    FStationsDicts[I] := TMyDictionary.Create;
+    //FStationsDicts[I].Capacity := 45000;
   end;
 end;
 
@@ -280,6 +373,10 @@ procedure TOneBRC.Merge(aLeft: UInt16; aRight: UInt16);
     vDataL: PStationData;
 begin
   for iHash in FStationsDicts[aRight].Keys do begin
+    // zero means empty slot: skip
+    if iHash = 0 then
+      continue;
+
     FStationsDicts[aRight].TryGetValue(iHash, vDataR);
 
     if FStationsDicts[aLeft].TryGetValue(iHash, vDataL) then begin
@@ -322,7 +419,9 @@ procedure TOneBRC.GenerateOutput;
   try
     vStations.BeginUpdate;
     for vData in FStationsDicts[0].Values do begin
-      vStations.Add(vData^.Name);
+      // nil value means empty slot: skip
+      if vData <> nil then
+        vStations.Add(vData^.Name);
     end;
     vStations.EndUpdate;