diff --git a/docs/modules/servers/nav.adoc b/docs/modules/servers/nav.adoc index b0030e78704..1e619f3879e 100644 --- a/docs/modules/servers/nav.adoc +++ b/docs/modules/servers/nav.adoc @@ -14,6 +14,7 @@ *** xref:distributed/architecture/index.adoc[] **** xref:distributed/architecture/implemented-standards.adoc[] **** xref:distributed/architecture/consistency-model.adoc[] +**** xref:distributed/architecture/blobstore.adoc[] **** xref:distributed/architecture/specialized-instances.adoc[] **** xref:distributed/architecture/data-tiering.adoc[] *** xref:distributed/run/index.adoc[Run] @@ -88,6 +89,7 @@ *** xref:postgres/architecture/index.adoc[] **** xref:postgres/architecture/implemented-standards.adoc[] **** xref:postgres/architecture/consistency-model.adoc[] +**** xref:postgres/architecture/blobstore.adoc[] **** xref:postgres/architecture/specialized-instances.adoc[] *** xref:postgres/run/index.adoc[] **** xref:postgres/run/run-java.adoc[Run with Java] diff --git a/docs/modules/servers/pages/distributed/architecture/blobstore.adoc b/docs/modules/servers/pages/distributed/architecture/blobstore.adoc new file mode 100644 index 00000000000..ea6c3469dcf --- /dev/null +++ b/docs/modules/servers/pages/distributed/architecture/blobstore.adoc @@ -0,0 +1,4 @@ += Distributed James Server — BlobStore +:navtitle: BlobStore
+
+include::partial$architecture/blobstore.adoc[]
diff --git a/docs/modules/servers/pages/postgres/architecture/blobstore.adoc b/docs/modules/servers/pages/postgres/architecture/blobstore.adoc new file mode 100644 index 00000000000..db1e165838a --- /dev/null +++ b/docs/modules/servers/pages/postgres/architecture/blobstore.adoc @@ -0,0 +1,4 @@ += PostgreSQL James Server — BlobStore +:navtitle: BlobStore
+
+include::partial$architecture/blobstore.adoc[]
diff --git a/docs/modules/servers/partials/architecture/blobstore.adoc b/docs/modules/servers/partials/architecture/blobstore.adoc new file mode 100644 index 00000000000..fb4ef523d1e --- /dev/null +++ b/docs/modules/servers/partials/architecture/blobstore.adoc @@ -0,0 +1,108 @@ +James stores large, non-indexable binary payloads in a BlobStore. Typical examples
are message bodies, attachments, deleted messages retained by the vault, and mail
queue payloads. The Mailbox, Mail Queue, and Deleted Messages Vault components all
rely on it.

Server components usually depend on the higher-level `BlobStore`. `BlobStoreDAO`
is the lower-level virtual storage abstraction implemented by concrete storage
connectors such as memory, file, Cassandra, Postgres, and S3-compatible object
stores. This layering lets James compose storage features such as encryption or
compression independently of the storage connector.

== Abstraction layers

Most James components use `BlobStore`, which is responsible for saving content
and returning a `BlobId`. `BlobStoreDAO` is the lower-level persistence contract:
it stores, reads, lists, and deletes blobs for a given `BucketName` and `BlobId`.

A `BlobStore` exposes a default logical bucket through `getDefaultBucketName()`.
Callers can explicitly pass another `BucketName` when they need to store data in
another logical bucket. More advanced organization rules inside a logical bucket
are not part of the generic `BlobStore` contract; callers need to model them
explicitly or provide a custom `BlobStore` implementation.

Cross-cutting storage features can be composed around this DAO contract. For
example, deduplication decides blob identifiers at the `BlobStore` level, while
wrappers such as compression or encryption can transform payloads and metadata
before delegating to the concrete storage connector.
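The following sketch illustrates this split from a caller's perspective. It is
illustrative only: the shapes of `save(...)` and `read(...)` are simplified
assumptions, not the exact (reactive) James signatures.

[source,java]
----
// Illustrative sketch only: signatures simplified, error handling omitted.
BlobId store(BlobStore blobStore, byte[] payload) {
    BucketName bucket = blobStore.getDefaultBucketName();
    // The BlobStore decides the BlobId (for example a content hash when
    // deduplicating) and delegates persistence to the BlobStoreDAO chain.
    return blobStore.save(bucket, payload);
}

InputStream load(BlobStore blobStore, BucketName bucket, BlobId blobId) {
    // Reading back only needs the logical bucket and the BlobId reference.
    return blobStore.read(bucket, blobId);
}
----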
=== BlobStore implementations

James composes several behaviors at the `BlobStore` level:

* `PassThroughBlobStore` is the non-deduplicating strategy. It generates a new
  `BlobId` for each save, delegates persistence to the configured
  `BlobStoreDAO`, and deletes blobs directly.
* `DeDuplicationBlobStore` is the deduplicating strategy. It derives `BlobId`
  values from content hashes, so identical content can share the same stored
  blob. A single delete does not remove the underlying blob immediately; garbage
  collection is responsible for eventually removing unreferenced blobs.
* `MetricableBlobStore` decorates another `BlobStore` with timing metrics.
* `CachedBlobStore` decorates another `BlobStore` with a Cassandra-backed cache
  for small, frequently read blobs.

=== BlobStoreDAO implementations

Concrete `BlobStoreDAO` implementations persist payloads in a storage backend,
for example memory, file, Cassandra, Postgres, or S3-compatible object storage.

Some `BlobStoreDAO` implementations are wrappers rather than final storage
connectors:

* `AESBlobStoreDAO` encrypts payload bytes before delegating writes to the
  underlying DAO, and decrypts them transparently on reads. This protects blob
  content at rest, especially when James stores blobs in third-party object
  storage.
* `ZstdBlobStoreDAO` can compress payload bytes before delegating writes to the
  underlying DAO. When it stores compressed bytes, it records metadata such as
  `content-encoding` and the original size. On reads, it uses this metadata to
  transparently decompress the payload. This reduces storage usage and network
  transfer for compressible blob content.

AES and Zstd can be enabled together. In the Guice binding chain, compression
wraps encryption: `ZstdBlobStoreDAO` delegates to `AESBlobStoreDAO`, which then
delegates to the concrete storage DAO. Writes therefore compress first and
encrypt afterwards; reads decrypt first and decompress afterwards. This ordering
preserves the benefit of compression, as encrypted payloads are generally not
compressible.
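Hand-wiring the same chain would look roughly like the sketch below. The
constructor shapes, `openS3Dao()` and `cryptoConfig` are hypothetical
placeholders: James assembles this chain through Guice bindings rather than
direct construction.

[source,java]
----
// openS3Dao() and cryptoConfig are hypothetical placeholders for illustration.
BlobStoreDAO s3Dao = openS3Dao();                                   // concrete S3-compatible connector
BlobStoreDAO encrypting = new AESBlobStoreDAO(s3Dao, cryptoConfig); // inner wrapper: encryption
BlobStoreDAO compressing = new ZstdBlobStoreDAO(encrypting);        // outer wrapper: compression

// Write path: compress, then encrypt, then store.
// Read path: load, then decrypt, then decompress.
----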
== Logical buckets

`BucketName` is a James logical namespace in the `BlobStoreDAO` contract. It is
not an AWS S3 bucket name, even when the selected connector stores data in an
S3-compatible object store.

Each connector maps this logical namespace to its own storage model. Depending
on the implementation and configuration, a logical bucket can be stored as a
directory, an object-storage bucket, a database partition, or another
connector-specific representation. Code using `BlobStoreDAO` should only rely on
the James `BucketName` abstraction.

== Metadata

Blob metadata stores side information needed to interpret a blob payload without
changing the payload bytes or blob identifier. One use case is object store
compression: James uses a marker such as `content-encoding` to detect a
compressed payload and transparently decompress it when reading.

James uses a hybrid metadata model:

* Metadata actively interpreted by James should expose typed helpers or constants
  in the API. For example, `BlobMetadata.contentEncoding()` reads the
  `content-encoding` entry.
* Other metadata stays available through the underlying map as an extension
  point: James library users and custom storage implementations can rely on it
  to support additional use cases.

Metadata-aware storage implementations and wrappers should preserve metadata
entries they do not understand.

=== Metadata names

`BlobMetadataName` defines the portable metadata key convention, illustrated in
the sketch after this list:

* names are case-insensitive and are canonicalized to lowercase;
* names must be non-empty;
* names must be shorter than 128 characters;
* names can contain only ASCII letters, digits, and `-`.
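A short sketch of these rules in action, matching the constructor preconditions
and the unit tests added in this change:

[source,java]
----
// Valid names are canonicalized to lowercase at construction time.
new BlobStoreDAO.BlobMetadataName("X-Test").name();   // -> "x-test"
new BlobStoreDAO.BlobMetadataName("A1-B2-C3").name(); // -> "a1-b2-c3"

// Invalid names are rejected eagerly with IllegalArgumentException.
new BlobStoreDAO.BlobMetadataName("");                // empty
new BlobStoreDAO.BlobMetadataName("metadata_name");   // '_' is not allowed
new BlobStoreDAO.BlobMetadataName("a".repeat(128));   // must be shorter than 128 chars
----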
diff --git a/docs/modules/servers/partials/architecture/index.adoc b/docs/modules/servers/partials/architecture/index.adoc index 449a31c99e3..634004753c0 100644 --- a/docs/modules/servers/partials/architecture/index.adoc +++ b/docs/modules/servers/partials/architecture/index.adoc @@ -279,6 +279,8 @@ the same content will be stored one once. The downside is that deletion is more complicated, and a garbage collection needs to be run. A first implementation based on bloom filters can be used and triggered using the WebAdmin REST API. +See xref:{xref-base}/architecture/blobstore.adoc[BlobStore architecture page] for more details. + === Task Manager Allows to control and schedule long running tasks run by other diff --git a/server/blob/blob-api/src/main/java/org/apache/james/blob/api/BlobStore.java b/server/blob/blob-api/src/main/java/org/apache/james/blob/api/BlobStore.java index 8d5101e0db7..0493e27743b 100644 --- a/server/blob/blob-api/src/main/java/org/apache/james/blob/api/BlobStore.java +++ b/server/blob/blob-api/src/main/java/org/apache/james/blob/api/BlobStore.java @@ -27,6 +27,19 @@ import reactor.util.function.Tuple2; +/**
+ * High-level James blob storage abstraction.
+ *
+ * <p>A {@link BlobStore} stores binary payloads in a James logical {@link BucketName}
+ * and returns {@link BlobId} references. A configured {@link BlobStore} exposes a default
+ * logical bucket through {@link #getDefaultBucketName()}.
+ *
+ * <p>A {@link BucketName} is a James-specific logical bucket. Each storage connector decides how this logical
+ * bucket is represented in its backend. It should not be conflated with an S3 bucket name and does not have to map one-to-one
+ * to a physical bucket.
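+ *
+ * <p>Illustrative usage sketch; the signatures are simplified here (the real methods are
+ * reactive) and the {@code payload} variable is a placeholder for the caller's bytes:
+ * <pre>{@code
+ * BucketName bucket = blobStore.getDefaultBucketName();
+ * BlobId blobId = blobStore.save(bucket, payload);      // persist, keep the reference
+ * InputStream content = blobStore.read(bucket, blobId); // read it back later
+ * }</pre>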
+ *
+ * <p>See {@code docs/modules/servers/partials/architecture/blobstore.adoc} for more details.
+ */ public interface BlobStore {

String DEFAULT_BUCKET_NAME_QUALIFIER = "defaultBucket"; diff --git a/server/blob/blob-api/src/main/java/org/apache/james/blob/api/BlobStoreDAO.java b/server/blob/blob-api/src/main/java/org/apache/james/blob/api/BlobStoreDAO.java index ed0679bd510..a527dad8259 100644 --- a/server/blob/blob-api/src/main/java/org/apache/james/blob/api/BlobStoreDAO.java +++ b/server/blob/blob-api/src/main/java/org/apache/james/blob/api/BlobStoreDAO.java @@ -38,6 +38,19 @@ import com.google.common.io.ByteSource; import com.google.common.io.FileBackedOutputStream; +/**
+ * Low-level James virtual blob storage abstraction, implemented by concrete storage connectors.
+ *
+ * <p>A {@link BucketName} is a James-specific logical bucket. Each storage connector decides how this logical
+ * bucket is represented in its backend. It should not be conflated with an S3 bucket name and does not have to map one-to-one
+ * to a physical bucket.
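+ *
+ * <p>For example (an illustrative mapping, not a guarantee of any connector): the logical bucket
+ * {@code BucketName.of("deleted-messages")} could be stored as a directory by a file-based DAO
+ * and as a prefixed physical bucket by an S3-backed DAO.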
+ *
+ * <p>{@link BlobMetadata} is part of the contract so wrapper DAOs and storage implementations can keep side information
+ * needed to interpret a payload, such as compression markers. Metadata actively used by James should expose typed
+ * helpers, while the underlying metadata map remains an extension point for James library users and custom implementations.
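+ *
+ * <p>Illustrative sketch of a metadata-aware wrapper; the map shape and the
+ * {@code upstreamMetadata} variable are assumptions for illustration:
+ * <pre>{@code
+ * // A wrapper adding its own marker must preserve entries it does not understand.
+ * Map<BlobMetadataName, String> metadata = new HashMap<>(upstreamMetadata);
+ * metadata.put(new BlobMetadataName("content-encoding"), "zstd");
+ * }</pre>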
+ *
+ * <p>See {@code docs/modules/servers/partials/architecture/blobstore.adoc} for more details.
+ */ public interface BlobStoreDAO {

record BlobMetadataName(String name) { private static final CharMatcher CHAR_MATCHER = CharMatcher.inRange('a', 'z')
    .or(CharMatcher.inRange('A', 'Z'))
    .or(CharMatcher.inRange('0', '9'))
    .or(CharMatcher.is('-'));

public BlobMetadataName { + Preconditions.checkArgument(!name.isEmpty(), "Metadata name cannot be empty"); Preconditions.checkArgument(CHAR_MATCHER.matchesAllOf(name), "Invalid char in metadata name. Must be a-z,A-Z,0-9 or - got " + name); Preconditions.checkArgument(name.length() < 128, "Metadata name is too long. Size exceed 128 chars"); name = name.toLowerCase(Locale.US); diff --git a/server/blob/blob-api/src/test/java/org/apache/james/blob/api/BlobMetadataTest.java b/server/blob/blob-api/src/test/java/org/apache/james/blob/api/BlobMetadataTest.java index 3cabf6991a5..4f36eea2a52 100644 --- a/server/blob/blob-api/src/test/java/org/apache/james/blob/api/BlobMetadataTest.java +++ b/server/blob/blob-api/src/test/java/org/apache/james/blob/api/BlobMetadataTest.java @@ -20,10 +20,17 @@ package org.apache.james.blob.api; import static org.assertj.core.api.Assertions.assertThat; +import static org.assertj.core.api.Assertions.assertThatThrownBy; import org.junit.jupiter.api.Test; +import org.junit.jupiter.params.ParameterizedTest; +import org.junit.jupiter.params.provider.CsvSource; +import org.junit.jupiter.params.provider.ValueSource; class BlobMetadataTest { + private static final String ONE_HUNDRED_TWENTY_SEVEN_CHARS_METADATA_NAME = "a".repeat(127); + private static final String ONE_HUNDRED_TWENTY_EIGHT_CHARS_METADATA_NAME = "a".repeat(128); + @Test void blobMetadataNameShouldBeCaseInsensitive() { assertThat(new BlobStoreDAO.BlobMetadataName("X-Test").name()) @@ -31,4 +38,41 @@ .isEqualTo("x-test"); assertThat(new BlobStoreDAO.BlobMetadataName("X-Test")) .isEqualTo(new BlobStoreDAO.BlobMetadataName("x-test")); } + + @ParameterizedTest + @CsvSource({ + "metadata, metadata", + "CONTENT-ENCODING, content-encoding", + "x-test-123, x-test-123", + "A1-B2-C3, a1-b2-c3" + }) + void blobMetadataNameShouldAcceptLettersDigitsAndDash(String rawName, String expectedName) { + assertThat(new BlobStoreDAO.BlobMetadataName(rawName).name()) + .isEqualTo(expectedName); + } + + @ParameterizedTest + @ValueSource(strings = {"metadata_name", "metadata.name", "metadata name", "metadata/name", "metadata:name", "metadata#name"}) + void blobMetadataNameShouldRejectUnsupportedCharacters(String rawName) { + assertThatThrownBy(() -> new BlobStoreDAO.BlobMetadataName(rawName)) + .isInstanceOf(IllegalArgumentException.class); + } + + @Test + void blobMetadataNameShouldRejectEmptyName() { + assertThatThrownBy(() -> new BlobStoreDAO.BlobMetadataName("")) + .isInstanceOf(IllegalArgumentException.class); + } + + @Test + void blobMetadataNameShouldAcceptNameBelowOneHundredTwentyEightCharacters() { + assertThat(new BlobStoreDAO.BlobMetadataName(ONE_HUNDRED_TWENTY_SEVEN_CHARS_METADATA_NAME).name()) + .isEqualTo(ONE_HUNDRED_TWENTY_SEVEN_CHARS_METADATA_NAME); + } + + @Test + void blobMetadataNameShouldRejectNameOfOneHundredTwentyEightCharacters() { + assertThatThrownBy(() -> new BlobStoreDAO.BlobMetadataName(ONE_HUNDRED_TWENTY_EIGHT_CHARS_METADATA_NAME)) + .isInstanceOf(IllegalArgumentException.class); + } } \ No newline at end of file