@@ -72,19 +72,28 @@ wreck_the_warc: build ensure_jwarc
7272 rm -f data/testing.warc
7373 gzip -d data/testing.warc.gz # use "gzip -d" because gunzip does not work on Windows
7474 @echo
75- @echo iterate over this uncompressed warc: works
76- mvn -q exec:java -Dexec.mainClass=org.commoncrawl.whirlwind.ReadWARC -Dexec.args=" data/testing.warc"
77- @echo
7875 @echo compress it the wrong way
7976 gzip data/testing.warc
8077 @echo
81- @echo iterating over this compressed warc fails
82- mvn -q exec:java -Dexec.mainClass=org.commoncrawl.whirlwind.ReadWARC -Dexec.args=" data/testing.warc.gz" || /usr/bin/true
78+ @echo showing the records in the compressed warc - note the offsets of request and response
79+ java -jar jwarc-0.33.0.jar ls data/testing.warc.gz
80+ @echo
81+ @echo access the request record - failing
82+ java -jar jwarc-0.33.0.jar extract data/testing.warc.gz 3734 || /usr/bin/true
83+ @echo
84+ @echo access the response record - failing
85+ java -jar jwarc-0.33.0.jar extract data/testing.warc.gz 3734 || /usr/bin/true
8386 @echo
8487 @echo " now let's do it the right way"
8588 gzip -d data/testing.warc.gz
8689 mvn -q exec:java -Dexec.mainClass=org.commoncrawl.whirlwind.RecompressWARC -Dexec.args=" data/testing.warc data/testing.warc.gz"
8790 @echo
88- @echo and now iterating works
89- mvn -q exec:java -Dexec.mainClass=org.commoncrawl.whirlwind.ReadWARC -Dexec.args=" data/testing.warc.gz"
91+ @echo showing the records in the compressed warc - note the skewed offsets of request and response
92+ java -jar jwarc-0.33.0.jar ls data/testing.warc.gz
93+ @echo
94+ @echo access the request record - works
95+ java -jar jwarc-0.33.0.jar extract data/testing.warc.gz 518 | head
96+ @echo
97+ @echo access the response record - works
98+ java -jar jwarc-0.33.0.jar extract data/testing.warc.gz 1027 | head -n 20
9099 @echo
0 commit comments