Skip to content
This repository was archived by the owner on Mar 23, 2026. It is now read-only.

Commit 8bf37e3

Browse files
committed
BLZG-1777 Patch for blazegraph/bigdata#315 into 2.0.1
1 parent e0320f2 commit 8bf37e3

1 file changed

Lines changed: 65 additions & 20 deletions

File tree

bigdata-core/bigdata-rdf/src/java/com/bigdata/rdf/store/DataLoader.java

Lines changed: 65 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,7 @@
2727

2828
package com.bigdata.rdf.store;
2929

30+
import java.io.BufferedInputStream;
3031
import java.io.BufferedReader;
3132
import java.io.File;
3233
import java.io.FileInputStream;
@@ -94,8 +95,11 @@ public class DataLoader {
9495
protected static final transient Logger log = Logger.getLogger(DataLoader.class);
9596

9697
private final RDFParserOptions parserOptions;
98+
99+
97100

98101
/**
102+
*
99103
* The {@link StatementBuffer} capacity.
100104
*/
101105
private final int bufferCapacity;
@@ -400,6 +404,24 @@ public static enum ClosureEnum {
400404
*/
401405
public static interface Options extends RDFParserOptions.Options {
402406

407+
/**
408+
*
409+
* Java property to override the default GZIP buffer size used for
410+
* {@link GZipInputStream} and {@link GZipOutputStream}.
411+
*
412+
* This specifies the size in Bytes to use. The default is 65535.
413+
*
414+
* -Dcom.bigdata.journal.DataLoader.gzipBufferSize=65535
415+
*
416+
* See BLZG-1777
417+
*
418+
*/
419+
420+
static final String GZIP_BUFFER_SIZE = DataLoader.class
421+
.getClass().getName() + ".gzipBufferSize";
422+
423+
static final int DEFAULT_GZIP_BUFFER_SIZE = 65535;
424+
403425
/**
404426
* Optional property specifying whether and when the {@link DataLoader}
405427
* will {@link ITripleStore#commit()} the database (default
@@ -408,9 +430,9 @@ public static interface Options extends RDFParserOptions.Options {
408430
* Note: commit semantics vary depending on the specific backing store.
409431
* See {@link ITripleStore#commit()}.
410432
*/
411-
String COMMIT = DataLoader.class.getName()+".commit";
433+
static final String COMMIT = DataLoader.class.getName()+".commit";
412434

413-
String DEFAULT_COMMIT = CommitEnum.Batch.toString();
435+
static final String DEFAULT_COMMIT = CommitEnum.Batch.toString();
414436

415437
/**
416438
* Optional property specifying the capacity of the
@@ -426,9 +448,9 @@ public static interface Options extends RDFParserOptions.Options {
426448
* will increase the GC burden and could require a larger heap, but the
427449
* net throughput might also increase.
428450
*/
429-
String BUFFER_CAPACITY = DataLoader.class.getName()+".bufferCapacity";
451+
static final String BUFFER_CAPACITY = DataLoader.class.getName()+".bufferCapacity";
430452

431-
String DEFAULT_BUFFER_CAPACITY = "100000";
453+
static final String DEFAULT_BUFFER_CAPACITY = "100000";
432454

433455
/**
434456
* Optional property specifying the capacity of blocking queue used by
@@ -441,9 +463,9 @@ public static interface Options extends RDFParserOptions.Options {
441463
*
442464
* @see BLZG-1552
443465
*/
444-
String QUEUE_CAPACITY = DataLoader.class.getName() + ".queueCapacity";
466+
static final String QUEUE_CAPACITY = DataLoader.class.getName() + ".queueCapacity";
445467

446-
String DEFAULT_QUEUE_CAPACITY = "10";
468+
static final String DEFAULT_QUEUE_CAPACITY = "10";
447469

448470
/**
449471
* Optional property controls whether and when the RDFS(+) closure is
@@ -465,9 +487,9 @@ public static interface Options extends RDFParserOptions.Options {
465487
* @see InferenceEngine
466488
* @see InferenceEngine.Options
467489
*/
468-
String CLOSURE = DataLoader.class.getName()+".closure";
490+
static final String CLOSURE = DataLoader.class.getName()+".closure";
469491

470-
String DEFAULT_CLOSURE = ClosureEnum.Batch.toString();
492+
static final String DEFAULT_CLOSURE = ClosureEnum.Batch.toString();
471493

472494
/**
473495
*
@@ -496,12 +518,12 @@ public static interface Options extends RDFParserOptions.Options {
496518
* flushes the {@link DataLoader} when statement identifiers are
497519
* enabled. </strong>
498520
*/
499-
String FLUSH = DataLoader.class.getName()+".flush";
521+
static final String FLUSH = DataLoader.class.getName()+".flush";
500522

501523
/**
502524
* The default value (<code>true</code>) for {@link #FLUSH}.
503525
*/
504-
String DEFAULT_FLUSH = "true";
526+
static final String DEFAULT_FLUSH = "true";
505527

506528
/**
507529
* When <code>true</code>, the loader will not break on unresolvable
@@ -515,12 +537,12 @@ public static interface Options extends RDFParserOptions.Options {
515537
* @see BLZG-1531 (Add option to make the DataLoader robust to files
516538
* that cause rio to throw a fatal exception)
517539
*/
518-
String IGNORE_INVALID_FILES = DataLoader.class.getName()+".ignoreInvalidFiles";
540+
static final String IGNORE_INVALID_FILES = DataLoader.class.getName()+".ignoreInvalidFiles";
519541

520542
/**
521543
* The default value (<code>false</code>) for {@link #IGNORE_INVALID_FILES}.
522544
*/
523-
String DEFAULT_IGNORE_INVALID_FILES = "false";
545+
static final String DEFAULT_IGNORE_INVALID_FILES = "false";
524546

525547
/**
526548
* When <code>true</code>, the data loader will rename each file as it
@@ -532,24 +554,24 @@ public static interface Options extends RDFParserOptions.Options {
532554
*
533555
* @see BLZG-1534 (durable queues)
534556
*/
535-
String DURABLE_QUEUES = DataLoader.class.getName() + ".durableQueues";
557+
static final String DURABLE_QUEUES = DataLoader.class.getName() + ".durableQueues";
536558

537559
/**
538560
* The default value (<code>false</code>) for {@link #DURABLE_QUEUES}.
539561
*/
540-
String DEFAULT_DURABLE_QUEUES = "false";
562+
static final String DEFAULT_DURABLE_QUEUES = "false";
541563

542564
/**
543565
* When true, runs DumpJournal after each commit (with the -pages option) to obtain a distribution of the BTree index page sizes.
544566
*
545567
* @see BLZG-1535 (support dump journal in data loader)
546568
*/
547-
String DUMP_JOURNAL = DataLoader.class.getName() + ".dumpJournal";
569+
static final String DUMP_JOURNAL = DataLoader.class.getName() + ".dumpJournal";
548570

549571
/**
550572
* The default value (<code>false</code>) for {@link #DUMP_JOURNAL}.
551573
*/
552-
String DEFAULT_DUMP_JOURNAL = "false";
574+
static final String DEFAULT_DUMP_JOURNAL = "false";
553575

554576
/**
555577
* When greater than ZERO (0), significant information may be reported
@@ -560,12 +582,12 @@ public static interface Options extends RDFParserOptions.Options {
560582
* the assertion buffers each time it reports on the incremental parser
561583
* performance.
562584
*/
563-
String VERBOSE = DataLoader.class.getName() + ".verbose";
585+
static final String VERBOSE = DataLoader.class.getName() + ".verbose";
564586

565587
/**
566588
* The default value (<code>0</code>) for {@link #VERBOSE}.
567589
*/
568-
String DEFAULT_VERBOSE = "0";
590+
static final String DEFAULT_VERBOSE = "0";
569591

570592
}
571593

@@ -1309,11 +1331,12 @@ public void loadFiles(final MyLoadStats totals, final int depth,
13091331

13101332
if (n.endsWith(".gz")) {
13111333

1312-
is = new GZIPInputStream(is);
1334+
is = new GZIPInputStream(is, getGzipBuffer());
13131335

13141336
} else if (n.endsWith(".zip")) {
13151337

1316-
is = new ZipInputStream(is);
1338+
is = new ZipInputStream(new BufferedInputStream(is,
1339+
getGzipBuffer()));
13171340

13181341
}
13191342

@@ -1849,6 +1872,28 @@ public ClosureStats doClosure() {
18491872

18501873
}
18511874

1875+
/**
1876+
* Utility to return the gzip buffer either from the
1877+
* default or the {@link Options#GZIP_BUFFER_SIZE}
1878+
*
1879+
* See BLZG-1777
1880+
*
1881+
* @return
1882+
* int with the buffer size
1883+
*/
1884+
private static int getGzipBuffer() {
1885+
1886+
final String s = System.getProperty(Options.GZIP_BUFFER_SIZE);
1887+
1888+
if (s == null || s.isEmpty()) {
1889+
return Options.DEFAULT_GZIP_BUFFER_SIZE;
1890+
} else {
1891+
return Integer.parseInt(s);
1892+
}
1893+
1894+
}
1895+
1896+
18521897
/**
18531898
* Utility method may be used to create and/or load RDF data into a local
18541899
* database instance. Directories will be recursively processed. The data

0 commit comments

Comments
 (0)