Skip to content

Commit 48f1f23

Browse files
committed
more work
1 parent 74a3fb1 commit 48f1f23

44 files changed

Lines changed: 1882 additions & 203 deletions

File tree

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

src/ManagedCode.GraphRag.Postgres/PostgresGraphStore.cs

Lines changed: 303 additions & 27 deletions
Large diffs are not rendered by default.

src/ManagedCode.GraphRag.Postgres/ServiceCollectionExtensions.cs

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,7 @@ public static IServiceCollection AddPostgresGraphStore(this IServiceCollection s
2121
{
2222
var opts = sp.GetRequiredKeyedService<PostgresGraphStoreOptions>(serviceKey);
2323
var logger = sp.GetRequiredService<ILogger<PostgresGraphStore>>();
24-
return new PostgresGraphStore(opts.ConnectionString, opts.GraphName, logger);
24+
return new PostgresGraphStore(opts, logger);
2525
});
2626
services.AddKeyedSingleton<IGraphStore>(key, (sp, serviceKey) => sp.GetRequiredKeyedService<PostgresGraphStore>(serviceKey));
2727

@@ -40,4 +40,6 @@ public sealed class PostgresGraphStoreOptions
4040
public string ConnectionString { get; set; } = string.Empty;
4141

4242
public string GraphName { get; set; } = "graphrag";
43+
44+
public bool AutoCreateIndexes { get; set; } = true;
4345
}

src/ManagedCode.GraphRag/Chunking/MarkdownTextChunker.cs

Lines changed: 8 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -3,10 +3,11 @@
33
using System.Text;
44
using GraphRag.Config;
55
using GraphRag.Tokenization;
6+
using Microsoft.ML.Tokenizers;
67

78
namespace GraphRag.Chunking;
89

9-
public sealed class MarkdownTextChunker(ITokenizerProvider tokenizerProvider) : ITextChunker
10+
public sealed class MarkdownTextChunker : ITextChunker
1011
{
1112
public IReadOnlyList<TextChunk> Chunk(IReadOnlyList<ChunkSlice> slices, ChunkingConfig config)
1213
{
@@ -18,7 +19,7 @@ public IReadOnlyList<TextChunk> Chunk(IReadOnlyList<ChunkSlice> slices, Chunking
1819
return Array.Empty<TextChunk>();
1920
}
2021

21-
var tokenizer = tokenizerProvider.GetTokenizer(config.EncodingModel);
22+
var tokenizer = TokenizerRegistry.GetTokenizer(config.EncodingModel);
2223
var options = new MarkdownChunkerOptions
2324
{
2425
MaxTokensPerChunk = Math.Max(MinChunkSize, config.Size),
@@ -32,7 +33,7 @@ public IReadOnlyList<TextChunk> Chunk(IReadOnlyList<ChunkSlice> slices, Chunking
3233
var fragments = Split(slice.Text, options, tokenizer);
3334
foreach (var fragment in fragments)
3435
{
35-
var tokens = tokenizer.Encode(fragment);
36+
var tokens = tokenizer.EncodeToIds(fragment);
3637
if (tokens.Count == 0)
3738
{
3839
continue;
@@ -45,7 +46,7 @@ public IReadOnlyList<TextChunk> Chunk(IReadOnlyList<ChunkSlice> slices, Chunking
4546
return results;
4647
}
4748

48-
private List<string> Split(string text, MarkdownChunkerOptions options, ITextTokenizer tokenizer)
49+
private List<string> Split(string text, MarkdownChunkerOptions options, Tokenizer tokenizer)
4950
{
5051
text = NormalizeNewlines(text);
5152
var firstChunkDone = false;
@@ -60,7 +61,7 @@ private List<string> Split(string text, MarkdownChunkerOptions options, ITextTok
6061

6162
for (var index = 1; index < rawChunks.Count; index++)
6263
{
63-
var previousTokens = tokenizer.Encode(rawChunks[index - 1]);
64+
var previousTokens = tokenizer.EncodeToIds(rawChunks[index - 1]);
6465
var overlapTokens = previousTokens.Skip(Math.Max(0, previousTokens.Count - options.Overlap)).ToArray();
6566
var overlapText = tokenizer.Decode(overlapTokens);
6667
newChunks.Add(string.Concat(overlapText, rawChunks[index]));
@@ -77,7 +78,7 @@ private List<string> RecursiveSplit(
7778
int maxChunk1Size,
7879
int maxChunkNSize,
7980
SeparatorType separatorType,
80-
ITextTokenizer tokenizer,
81+
Tokenizer tokenizer,
8182
ref bool firstChunkDone)
8283
{
8384
if (string.IsNullOrWhiteSpace(text))
@@ -110,7 +111,7 @@ private List<string> GenerateChunks(
110111
int maxChunk1Size,
111112
int maxChunkNSize,
112113
SeparatorType separatorType,
113-
ITextTokenizer tokenizer,
114+
Tokenizer tokenizer,
114115
ref bool firstChunkDone)
115116
{
116117
if (fragments.Count == 0)

src/ManagedCode.GraphRag/Chunking/TokenTextChunker.cs

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@
55

66
namespace GraphRag.Chunking;
77

8-
public sealed class TokenTextChunker(ITokenizerProvider tokenizerProvider) : ITextChunker
8+
public sealed class TokenTextChunker : ITextChunker
99
{
1010
public IReadOnlyList<TextChunk> Chunk(IReadOnlyList<ChunkSlice> slices, ChunkingConfig config)
1111
{
@@ -17,12 +17,12 @@ public IReadOnlyList<TextChunk> Chunk(IReadOnlyList<ChunkSlice> slices, Chunking
1717
return Array.Empty<TextChunk>();
1818
}
1919

20-
var tokenizer = tokenizerProvider.GetTokenizer(config.EncodingModel);
20+
var tokenizer = TokenizerRegistry.GetTokenizer(config.EncodingModel);
2121
var flattened = new List<(int SliceIndex, int Token)>();
2222
for (var index = 0; index < slices.Count; index++)
2323
{
2424
var slice = slices[index];
25-
var encoded = tokenizer.Encode(slice.Text);
25+
var encoded = tokenizer.EncodeToIds(slice.Text);
2626
foreach (var token in encoded)
2727
{
2828
flattened.Add((index, token));
Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,12 @@
1+
namespace GraphRag.Constants;

/// <summary>
/// Centralised parameter-name constants for Cypher query construction, so the
/// query-building code and its callers agree on a single spelling for each name.
/// NOTE(review): presumably consumed by <c>PostgresGraphStore</c> when invoking
/// Cypher against the graph — confirm against that class.
/// </summary>
public static class CypherParameterNames
{
    /// <summary>Name of the target graph.</summary>
    public const string GraphName = "graph_name";

    /// <summary>The Cypher query text itself.</summary>
    public const string Query = "query";

    /// <summary>Bag of query parameters.</summary>
    public const string Parameters = "params";

    /// <summary>Identifier of a single node.</summary>
    public const string NodeId = "node_id";

    /// <summary>Identifier of an edge's source node.</summary>
    public const string SourceId = "source_id";

    /// <summary>Identifier of an edge's target node.</summary>
    public const string TargetId = "target_id";

    /// <summary>Property map attached to a node or edge.</summary>
    public const string Properties = "props";
}
Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
1+
namespace GraphRag.Constants;

/// <summary>
/// Well-known property names carried by graph entities.
/// Currently only the identity property is needed; add further names here
/// rather than scattering string literals through query code.
/// </summary>
public static class EntityPropertyNames
{
    /// <summary>The entity's unique identifier property.</summary>
    public const string Id = "id";
}
Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,8 @@
1+
namespace GraphRag.Constants;

/// <summary>
/// Field labels passed to <c>Hashing.GenerateSha512Hash</c> when deriving
/// deterministic record ids (e.g. a text unit's id is hashed from its
/// <c>document</c> id and <c>text</c> content). Changing any value changes
/// every derived hash, so treat these as part of the persisted data format.
/// </summary>
public static class HashFieldNames
{
    /// <summary>Label for the owning document's id.</summary>
    public const string Document = "document";

    /// <summary>Label for the chunk/unit text content.</summary>
    public const string Text = "text";

    /// <summary>Label for a file or resource path.</summary>
    public const string Path = "path";
}
Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,9 @@
1+
namespace GraphRag.Constants;

/// <summary>
/// Names of the tables the indexing pipeline reads and writes via
/// <c>OutputStorage.LoadTableAsync</c> / <c>WriteTableAsync</c>. These values
/// name persisted storage artifacts, so renaming them invalidates any
/// previously written pipeline output.
/// </summary>
public static class PipelineTableNames
{
    /// <summary>Source documents fed into the pipeline.</summary>
    public const string Documents = "documents";

    /// <summary>Chunked text units derived from documents.</summary>
    public const string TextUnits = "text_units";

    /// <summary>Entities extracted from text units.</summary>
    public const string Entities = "entities";

    /// <summary>Relationships extracted between entities.</summary>
    public const string Relationships = "relationships";
}
Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,21 @@
1+
namespace GraphRag.Constants;

/// <summary>
/// Fallback model and encoding names used when resolving a tokenizer
/// (presumably consumed by <c>TokenizerRegistry.GetTokenizer</c> — confirm).
/// NOTE(review): "gpt-4" is conventionally paired with the cl100k_base
/// encoding, while o200k_base belongs to the gpt-4o family — confirm the
/// DefaultModel/DefaultEncoding pairing here is intentional.
/// </summary>
public static class TokenizerDefaults
{
    /// <summary>Model name assumed when none is configured.</summary>
    public const string DefaultModel = "gpt-4";

    /// <summary>Encoding name assumed when none can be resolved.</summary>
    public const string DefaultEncoding = "o200k_base";

    /// <summary>Model names to try, in declaration order, when probing for a usable tokenizer.</summary>
    public static readonly string[] PreferredModels =
    {
        "gpt-4",
        "gpt-4o",
        "gpt-4-turbo",
        "gpt-4-turbo-preview",
    };

    /// <summary>Encoding names to try, in declaration order, when model lookup fails.</summary>
    public static readonly string[] PreferredEncodings =
    {
        "o200k_base",
        "cl100k_base",
    };
}

src/ManagedCode.GraphRag/Indexing/Workflows/CreateBaseTextUnitsWorkflow.cs

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@
99
using GraphRag.Utils;
1010
using Microsoft.Extensions.DependencyInjection;
1111
using GraphRag.Tokenization;
12+
using GraphRag.Constants;
1213
using GraphRag.Storage;
1314

1415
namespace GraphRag.Indexing.Workflows;
@@ -21,14 +22,13 @@ public static WorkflowDelegate Create()
2122
{
2223
return async (config, context, cancellationToken) =>
2324
{
24-
var documents = await context.OutputStorage.LoadTableAsync<DocumentRecord>("documents", cancellationToken).ConfigureAwait(false);
25+
var documents = await context.OutputStorage.LoadTableAsync<DocumentRecord>(PipelineTableNames.Documents, cancellationToken).ConfigureAwait(false);
2526
context.Stats.NumDocuments = documents.Count;
2627

2728
var textUnits = new List<TextUnitRecord>();
2829
var callbacks = context.Callbacks;
2930
var chunkingConfig = config.Chunks;
3031
var chunkerResolver = context.Services.GetRequiredService<IChunkerResolver>();
31-
var tokenizerProvider = context.Services.GetRequiredService<ITokenizerProvider>();
3232
var chunker = chunkerResolver.Resolve(chunkingConfig.Strategy);
3333

3434
foreach (var document in documents)
@@ -39,7 +39,7 @@ public static WorkflowDelegate Create()
3939
? FormatMetadata(document.Metadata)
4040
: null;
4141

42-
var tokenizer = tokenizerProvider.GetTokenizer(chunkingConfig.EncodingModel);
42+
var tokenizer = TokenizerRegistry.GetTokenizer(chunkingConfig.EncodingModel);
4343
var metadataTokens = metadataPrefix is null ? 0 : tokenizer.CountTokens(metadataPrefix);
4444
var chunkConfig = CreateEffectiveConfig(chunkingConfig, metadataTokens);
4545

@@ -49,7 +49,7 @@ public static WorkflowDelegate Create()
4949
foreach (var chunk in chunks)
5050
{
5151
var text = metadataPrefix is null ? chunk.Text : metadataPrefix + chunk.Text;
52-
var id = Hashing.GenerateSha512Hash(("document", document.Id), ("text", text));
52+
var id = Hashing.GenerateSha512Hash((HashFieldNames.Document, document.Id), (HashFieldNames.Text, text));
5353
textUnits.Add(new TextUnitRecord
5454
{
5555
Id = id,
@@ -69,7 +69,7 @@ public static WorkflowDelegate Create()
6969
.Where(unit => !string.IsNullOrWhiteSpace(unit.Text))
7070
.ToArray();
7171

72-
await context.OutputStorage.WriteTableAsync("text_units", filtered, cancellationToken).ConfigureAwait(false);
72+
await context.OutputStorage.WriteTableAsync(PipelineTableNames.TextUnits, filtered, cancellationToken).ConfigureAwait(false);
7373

7474
return new WorkflowResult(filtered);
7575
};

0 commit comments

Comments (0)