2727
2828package com .bigdata .rdf .store ;
2929
30+ import java .io .BufferedInputStream ;
3031import java .io .BufferedReader ;
3132import java .io .File ;
3233import java .io .FileInputStream ;
@@ -94,8 +95,11 @@ public class DataLoader {
9495 protected static final transient Logger log = Logger .getLogger (DataLoader .class );
9596
9697 private final RDFParserOptions parserOptions ;
98+
99+
97100
98101 /**
102+ *
99103 * The {@link StatementBuffer} capacity.
100104 */
101105 private final int bufferCapacity ;
@@ -400,6 +404,24 @@ public static enum ClosureEnum {
400404 */
401405 public static interface Options extends RDFParserOptions .Options {
402406
407+ /**
408+ *
409+ * Java property to override the default GZIP buffer size used for
410+ * {@link GZipInputStream} and {@link GZipOutputStream}.
411+ *
412+ * This specifies the size in Bytes to use. The default is 65535.
413+ *
414+ * -Dcom.bigdata.journal.DataLoader.gzipBufferSize=65535
415+ *
416+ * See BLZG-1777
417+ *
418+ */
419+
420+ static final String GZIP_BUFFER_SIZE = DataLoader .class
421+ .getClass ().getName () + ".gzipBufferSize" ;
422+
423+ static final int DEFAULT_GZIP_BUFFER_SIZE = 65535 ;
424+
403425 /**
404426 * Optional property specifying whether and when the {@link DataLoader}
405427 * will {@link ITripleStore#commit()} the database (default
@@ -408,9 +430,9 @@ public static interface Options extends RDFParserOptions.Options {
408430 * Note: commit semantics vary depending on the specific backing store.
409431 * See {@link ITripleStore#commit()}.
410432 */
411- String COMMIT = DataLoader .class .getName ()+".commit" ;
433+ static final String COMMIT = DataLoader .class .getName ()+".commit" ;
412434
413- String DEFAULT_COMMIT = CommitEnum .Batch .toString ();
435+ static final String DEFAULT_COMMIT = CommitEnum .Batch .toString ();
414436
415437 /**
416438 * Optional property specifying the capacity of the
@@ -426,9 +448,9 @@ public static interface Options extends RDFParserOptions.Options {
426448 * will increase the GC burden and could require a larger heap, but the
427449 * net throughput might also increase.
428450 */
429- String BUFFER_CAPACITY = DataLoader .class .getName ()+".bufferCapacity" ;
451+ static final String BUFFER_CAPACITY = DataLoader .class .getName ()+".bufferCapacity" ;
430452
431- String DEFAULT_BUFFER_CAPACITY = "100000" ;
453+ static final String DEFAULT_BUFFER_CAPACITY = "100000" ;
432454
433455 /**
434456 * Optional property specifying the capacity of blocking queue used by
@@ -441,9 +463,9 @@ public static interface Options extends RDFParserOptions.Options {
441463 *
442464 * @see BLZG-1552
443465 */
444- String QUEUE_CAPACITY = DataLoader .class .getName () + ".queueCapacity" ;
466+ static final String QUEUE_CAPACITY = DataLoader .class .getName () + ".queueCapacity" ;
445467
446- String DEFAULT_QUEUE_CAPACITY = "10" ;
468+ static final String DEFAULT_QUEUE_CAPACITY = "10" ;
447469
448470 /**
449471 * Optional property controls whether and when the RDFS(+) closure is
@@ -465,9 +487,9 @@ public static interface Options extends RDFParserOptions.Options {
465487 * @see InferenceEngine
466488 * @see InferenceEngine.Options
467489 */
468- String CLOSURE = DataLoader .class .getName ()+".closure" ;
490+ static final String CLOSURE = DataLoader .class .getName ()+".closure" ;
469491
470- String DEFAULT_CLOSURE = ClosureEnum .Batch .toString ();
492+ static final String DEFAULT_CLOSURE = ClosureEnum .Batch .toString ();
471493
472494 /**
473495 *
@@ -496,12 +518,12 @@ public static interface Options extends RDFParserOptions.Options {
496518 * flushes the {@link DataLoader} when statement identifiers are
497519 * enabled. </strong>
498520 */
499- String FLUSH = DataLoader .class .getName ()+".flush" ;
521+ static final String FLUSH = DataLoader .class .getName ()+".flush" ;
500522
501523 /**
502524 * The default value (<code>true</code>) for {@link #FLUSH}.
503525 */
504- String DEFAULT_FLUSH = "true" ;
526+ static final String DEFAULT_FLUSH = "true" ;
505527
506528 /**
507529 * When <code>true</code>, the loader will not break on unresolvable
@@ -515,12 +537,12 @@ public static interface Options extends RDFParserOptions.Options {
515537 * @see BLZG-1531 (Add option to make the DataLoader robust to files
516538 * that cause rio to throw a fatal exception)
517539 */
518- String IGNORE_INVALID_FILES = DataLoader .class .getName ()+".ignoreInvalidFiles" ;
540+ static final String IGNORE_INVALID_FILES = DataLoader .class .getName ()+".ignoreInvalidFiles" ;
519541
520542 /**
521543 * The default value (<code>false</code>) for {@link #IGNORE_INVALID_FILES}
522544 */
523- String DEFAULT_IGNORE_INVALID_FILES = "false" ;
545+ static final String DEFAULT_IGNORE_INVALID_FILES = "false" ;
524546
525547 /**
526548 * When <code>true</code>, the data loader will rename each file as it
@@ -532,24 +554,24 @@ public static interface Options extends RDFParserOptions.Options {
532554 *
533555 * @see BLZG-1534 (durable queues)
534556 */
535- String DURABLE_QUEUES = DataLoader .class .getName () + ".durableQueues" ;
557+ static final String DURABLE_QUEUES = DataLoader .class .getName () + ".durableQueues" ;
536558
537559 /**
538560 * The default value (<code>false</code>) for {@link #DURABLE_QUEUES}
539561 */
540- String DEFAULT_DURABLE_QUEUES = "false" ;
562+ static final String DEFAULT_DURABLE_QUEUES = "false" ;
541563
542564 /**
543565 * When true, runs DumpJournal after each commit (with the -pages option) to obtain a distribution of the BTree index page sizes.
544566 *
545567 * @see BLZG-1535 (support dump journal in data loader)
546568 */
547- String DUMP_JOURNAL = DataLoader .class .getName () + ".dumpJournal" ;
569+ static final String DUMP_JOURNAL = DataLoader .class .getName () + ".dumpJournal" ;
548570
549571 /**
550572 * The default value (<code>false</code>) for {@link #DUMP_JOURNAL}
551573 */
552- String DEFAULT_DUMP_JOURNAL = "false" ;
574+ static final String DEFAULT_DUMP_JOURNAL = "false" ;
553575
554576 /**
555577 * When greater than ZERO (0), significant information may be reported
@@ -560,12 +582,12 @@ public static interface Options extends RDFParserOptions.Options {
560582 * the assertion buffers each time it reports on the incremental parser
561583 * performance.
562584 */
563- String VERBOSE = DataLoader .class .getName () + ".verbose" ;
585+ static final String VERBOSE = DataLoader .class .getName () + ".verbose" ;
564586
565587 /**
566588 * The default value (<code>0</code>) for {@link #VERBOSE}
567589 */
568- String DEFAULT_VERBOSE = "0" ;
590+ static final String DEFAULT_VERBOSE = "0" ;
569591
570592 }
571593
@@ -1309,11 +1331,12 @@ public void loadFiles(final MyLoadStats totals, final int depth,
13091331
13101332 if (n .endsWith (".gz" )) {
13111333
1312- is = new GZIPInputStream (is );
1334+ is = new GZIPInputStream (is , getGzipBuffer () );
13131335
13141336 } else if (n .endsWith (".zip" )) {
13151337
1316- is = new ZipInputStream (is );
1338+ is = new ZipInputStream (new BufferedInputStream (is ,
1339+ getGzipBuffer ()));
13171340
13181341 }
13191342
@@ -1849,6 +1872,28 @@ public ClosureStats doClosure() {
18491872
18501873 }
18511874
1875+ /**
1876+ * Utility to return the gzip buffer either from the
1877+ * default or the {@link Options#GZIP_BUFFER_SIZE}
1878+ *
1879+ * See BLZG-1777
1880+ *
1881+ * @return
1882+ * int with the buffer size
1883+ */
1884+ private static int getGzipBuffer () {
1885+
1886+ final String s = System .getProperty (Options .GZIP_BUFFER_SIZE );
1887+
1888+ if (s == null || s .isEmpty ()) {
1889+ return Options .DEFAULT_GZIP_BUFFER_SIZE ;
1890+ } else {
1891+ return Integer .parseInt (s );
1892+ }
1893+
1894+ }
1895+
1896+
18521897 /**
18531898 * Utility method may be used to create and/or load RDF data into a local
18541899 * database instance. Directories will be recursively processed. The data
0 commit comments