Skip to content

Commit 40bb84a

Browse files
committed
Merge branch 'main' into luca/feature/part3
# Conflicts:
#	Makefile
#	README.md
#	src/main/java/org/commoncrawl/whirlwind/ValidateWARC.java
2 parents b3c7252 + 8244615 commit 40bb84a

3 files changed

Lines changed: 392 additions & 292 deletions

File tree

Makefile

Lines changed: 16 additions & 7 deletions
Original file line number | Diff line number | Diff line change
@@ -72,19 +72,28 @@ wreck_the_warc: build ensure_jwarc
7272
rm -f data/testing.warc
7373
gzip -d data/testing.warc.gz # windows gunzip no work-a
7474
@echo
75-
@echo iterate over this uncompressed warc: works
76-
mvn -q exec:java -Dexec.mainClass=org.commoncrawl.whirlwind.ReadWARC -Dexec.args="data/testing.warc"
77-
@echo
7875
@echo compress it the wrong way
7976
gzip data/testing.warc
8077
@echo
81-
@echo iterating over this compressed warc fails
82-
mvn -q exec:java -Dexec.mainClass=org.commoncrawl.whirlwind.ReadWARC -Dexec.args="data/testing.warc.gz" || /usr/bin/true
78+
@echo showing the records in the compressed warc - note the offsets of request and response are
79+
java -jar jwarc-0.33.0.jar ls data/testing.warc.gz
80+
@echo
81+
@echo access the request record - failing
82+
java -jar jwarc-0.33.0.jar extract data/testing.warc.gz 3734 || /usr/bin/true
83+
@echo
84+
@echo access the response record - failing
85+
java -jar jwarc-0.33.0.jar extract data/testing.warc.gz 3734 || /usr/bin/true
8386
@echo
8487
@echo "now let's do it the right way"
8588
gzip -d data/testing.warc.gz
8689
mvn -q exec:java -Dexec.mainClass=org.commoncrawl.whirlwind.RecompressWARC -Dexec.args="data/testing.warc data/testing.warc.gz"
8790
@echo
88-
@echo and now iterating works
89-
mvn -q exec:java -Dexec.mainClass=org.commoncrawl.whirlwind.ReadWARC -Dexec.args="data/testing.warc.gz"
91+
@echo showing the records in the compressed warc - note the skewed offsets of request and response
92+
java -jar jwarc-0.33.0.jar ls data/testing.warc.gz
93+
@echo
94+
@echo access the request record - works
95+
java -jar jwarc-0.33.0.jar extract data/testing.warc.gz 518 | head
96+
@echo
97+
@echo access the response record - works
98+
java -jar jwarc-0.33.0.jar extract data/testing.warc.gz 1027 | head -n 20
9099
@echo

0 commit comments

Comments
 (0)