Skip to content

Commit 3705e59

Browse files
authored
Merge pull request #113 from georges-hatem/main
own dictionary implementation, apparently has better collision resolution. 35s -> 30s
2 parents 1bc5c8c + a8a3cab commit 3705e59

2 files changed

Lines changed: 117 additions & 4 deletions

File tree

entries/ghatem-fpc/README.md

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -207,3 +207,17 @@ So the problem (on my computer at least) does not seem to be related to the load
207207
As a last attempt, I tried again accumulating data in a shared memory, protecting all data accumulation with `InterlockedInc`, `InterlockedExchangeAdd`, and `TCriticalSection`. In order to avoid too many contentions on the critical section, I also tried to maintain a large array of critical sections, acquiring only the index for which we are accumulating data. All of these attempts under-performed on 4 threads, and likely will perform even worse as thread-count increases. The only way this would work is by having finer-grained control over the locking, such that a thread would only be blocked if it tried to write into a record that is already locked.
208208

209209
Lastly, the `TDictionary.TryGetValue` has shown to be quite costly, around `1/4th` of the total cost. And although it is currently so much better than when using the station name as key, evaluating the `mod` of all those hashes, there is a lot of collisions. So if the dictionary key-storage is implemented as an array, and `mod` is used to transform those `CRC32` into indexes ranging in `[0, 45k]`, those collisions will be the cause of slowness. If there is a way to reduce the number of collisions, then maybe a custom dictionary implementation might help.
210+
211+
212+
## Multi-Threaded attempt v.3 (2024-04-21)
213+
214+
Using performance profiler ValGrind, it identified that:
215+
- 30% of the time was spent on `TryGetValue` of the generic `TDictionary`.
216+
- 14% of the time is on computing the crc32 hash
217+
- 15% of the time on extracting the line data
218+
- surprisingly, 9% of the time is spent on looking for the #13 (new-line) character
219+
220+
I implemented my own Dictionary class consisting of two arrays. We compute the modulus of the incoming key (Cardinal) to fit it in the correct bucket. A first attempt at collision resolution was to store as values a TList, but performance was worse than the generic TDictionary. Next attempt was a linear probing, with circular indexing in case the index goes out of bounds. Performance improved from 35s to 30s. Will later try quadratic probing, as it apparently reduces clustering.
221+
222+
edit:
223+
quadratic probing improved performance even further. we could probably do better with 2-level hashing, but finding such a hash function is going to take a lot of trials, this is probably acceptable results

entries/ghatem-fpc/src/onebrc.pas

Lines changed: 103 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,9 @@ interface
1010

1111
function RoundExDouble(const ATemp: Double): Double; inline;
1212

13+
const
14+
cDictSize: Integer = 45000;
15+
1316
type
1417

1518
// record is packed to minimize its size
@@ -24,6 +27,24 @@ function RoundExDouble(const ATemp: Double): Double; inline;
2427
PStationData = ^TStationData;
2528
TStationsDict = specialize TDictionary<Cardinal, PStationData>;
2629

30+
TKeys = array of Cardinal;
31+
TValues = array of PStationData;
32+
33+
{ TMyDictionary }
34+
35+
TMyDictionary = class
36+
private
37+
FHashes: TKeys;
38+
FData : TValues;
39+
procedure InternalFind(const aKey: Cardinal; out aFound: Boolean; out aIndex: Integer); inline;
40+
public
41+
constructor Create;
42+
property Keys: TKeys read FHashes;
43+
property Values: TValues read FData;
44+
function TryGetValue (const aKey: Cardinal; out aValue: PStationData): Boolean; inline;
45+
procedure Add (const aKey: Cardinal; const aValue: PStationData); inline;
46+
end;
47+
2748
{ TOneBRC }
2849

2950
TOneBRC = class
@@ -36,7 +57,7 @@ TOneBRC = class
3657

3758
FThreadCount: UInt16;
3859
FThreads: array of TThread;
39-
FStationsDicts: array of TStationsDict;
60+
FStationsDicts: array of TMyDictionary;
4061

4162
procedure ExtractLineData(const aStart: Int64; const aEnd: Int64; out aLength: ShortInt; out aTemp: SmallInt); inline;
4263

@@ -106,6 +127,78 @@ function Compare(AList: TStringList; AIndex1, AIndex2: Integer): Integer;
106127
Result := CompareStr(Str1, Str2);
107128
end;
108129

130+
{ TMyDictionary }
131+
132+
procedure TMyDictionary.InternalFind(const aKey: Cardinal; out aFound: Boolean; out aIndex: Integer);
133+
var vIdx: Integer;
134+
vOffset: Integer;
135+
begin
136+
vIdx := aKey mod cDictSize;
137+
aFound := False;
138+
139+
if FHashes[vIdx] = aKey then begin
140+
aIndex := vIdx;
141+
aFound := True;
142+
end
143+
else begin
144+
vOffset := 1;
145+
while True do begin
146+
// quadratic probing, by incrementing vOffset
147+
Inc (vIdx, vOffset);
148+
Inc (vOffset);
149+
150+
// exceeded boundary, loop back
151+
if vIdx >= cDictSize then
152+
Dec (vIdx, cDictSize);
153+
154+
if FHashes[vIdx] = aKey then begin
155+
// found match
156+
aIndex := vIdx;
157+
aFound := True;
158+
break;
159+
end
160+
else if FHashes[vIdx] = 0 then begin
161+
// found empty bucket to use
162+
aIndex := vIdx;
163+
aFound := False;
164+
break;
165+
end;
166+
end;
167+
end;
168+
end;
169+
170+
constructor TMyDictionary.Create;
171+
begin
172+
SetLength (FHashes, cDictSize);
173+
SetLength (FData, cDictSize);
174+
end;
175+
176+
function TMyDictionary.TryGetValue(const aKey: Cardinal; out aValue: PStationData): Boolean;
177+
var
178+
vIdx: Integer;
179+
begin
180+
InternalFind (aKey, Result, vIdx);
181+
182+
if Result then
183+
aValue := FData[vIdx]
184+
else
185+
aValue := nil;
186+
end;
187+
188+
procedure TMyDictionary.Add(const aKey: Cardinal; const aValue: PStationData);
189+
var
190+
vIdx: Integer;
191+
vFound: Boolean;
192+
begin
193+
InternalFind (aKey, vFound, vIdx);
194+
if not vFound then begin
195+
FHashes[vIdx] := aKey;
196+
FData[vIdx] := aValue;
197+
end
198+
else
199+
raise Exception.Create ('TMyDict: cannot add, duplicate key');
200+
end;
201+
109202
procedure TOneBRC.ExtractLineData(const aStart: Int64; const aEnd: Int64; out aLength: ShortInt; out aTemp: SmallInt);
110203
// given a line of data, extract the length of station name, and temperature as Integer.
111204
var
@@ -155,8 +248,8 @@ constructor TOneBRC.Create (const aThreadCount: UInt16);
155248
SetLength (FThreads, aThreadCount);
156249

157250
for I := 0 to aThreadCount - 1 do begin
158-
FStationsDicts[I] := TStationsDict.Create;
159-
FStationsDicts[I].Capacity := 45000;
251+
FStationsDicts[I] := TMyDictionary.Create;
252+
//FStationsDicts[I].Capacity := 45000;
160253
end;
161254
end;
162255

@@ -280,6 +373,10 @@ procedure TOneBRC.Merge(aLeft: UInt16; aRight: UInt16);
280373
vDataL: PStationData;
281374
begin
282375
for iHash in FStationsDicts[aRight].Keys do begin
376+
// zero means empty slot: skip
377+
if iHash = 0 then
378+
continue;
379+
283380
FStationsDicts[aRight].TryGetValue(iHash, vDataR);
284381

285382
if FStationsDicts[aLeft].TryGetValue(iHash, vDataL) then begin
@@ -322,7 +419,9 @@ procedure TOneBRC.GenerateOutput;
322419
try
323420
vStations.BeginUpdate;
324421
for vData in FStationsDicts[0].Values do begin
325-
vStations.Add(vData^.Name);
422+
// nil value means empty slot: skip
423+
if vData <> nil then
424+
vStations.Add(vData^.Name);
326425
end;
327426
vStations.EndUpdate;
328427

0 commit comments

Comments
 (0)