Skip to content

Commit 81edcc2

Browse files
committed
Merge branch 'main' into luca/feature/part5
# Conflicts: # Makefile # README.md # pom.xml
2 parents f88927a + 4c97de4 commit 81edcc2

2 files changed

Lines changed: 395 additions & 229 deletions

File tree

Makefile

Lines changed: 21 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -3,9 +3,9 @@ build:
33

44
cdxj: build jwarc.jar
55
@echo "creating *.cdxj index files from the local warcs"
6-
java -jar jwarc.jar cdxj data/whirlwind.warc.gz > whirlwind.warc.cdxj
7-
mvn -q exec:java -Dexec.mainClass=org.commoncrawl.whirlwind.CdxjIndexer -Dexec.args="data/whirlwind.warc.wet.gz --records conversion" > whirlwind.warc.wet.cdxj
8-
mvn -q exec:java -Dexec.mainClass=org.commoncrawl.whirlwind.CdxjIndexer -Dexec.args="data/whirlwind.warc.wat.gz --records metadata" > whirlwind.warc.wat.cdxj
6+
java -jar jwarc.jar cdxj data/whirlwind.warc.gz > data/whirlwind.warc.cdxj
7+
mvn -q exec:java -Dexec.mainClass=org.commoncrawl.whirlwind.CdxjIndexer -Dexec.args="data/whirlwind.warc.wet.gz --records conversion" > data/whirlwind.warc.wet.cdxj
8+
mvn -q exec:java -Dexec.mainClass=org.commoncrawl.whirlwind.CdxjIndexer -Dexec.args="data/whirlwind.warc.wat.gz --records metadata" > data/whirlwind.warc.wat.cdxj
99

1010
extract:
1111
@echo "creating extraction.* from local warcs, the offset numbers are from the cdxj index"
@@ -49,39 +49,39 @@ duck_cloudfront: build
4949
mvn -q exec:java -Dexec.mainClass=org.commoncrawl.whirlwind.Duck -Dexec.args="cloudfront"
5050

5151

52-
ensure_jwarc:
53-
@echo "Ensuring JWarc JAR is present"
54-
@if [ ! -f jwarc.jar ] ; then \
55-
echo "jwarc.jar not found, downloading..." ; \
56-
curl -fL -o jwarc.jar https://github.com/iipc/jwarc/releases/download/v0.33.0/jwarc-0.33.0.jar ; \
57-
else \
58-
echo "jwarc.jar found." ; \
59-
fi
60-
61-
get_jwarc:
52+
jwarc.jar:
6253
@echo "downloading JWarc JAR"
6354
curl -fL -o jwarc.jar https://github.com/iipc/jwarc/releases/download/v0.33.0/jwarc-0.33.0.jar
6455

65-
wreck_the_warc: build ensure_jwarc
56+
wreck_the_warc: build jwarc.jar
6657
@echo
6758
@echo we will break and then fix this warc
6859
cp data/whirlwind.warc.gz data/testing.warc.gz
6960
rm -f data/testing.warc
7061
gzip -d data/testing.warc.gz # windows gunzip no work-a
7162
@echo
72-
@echo iterate over this uncompressed warc: works
73-
mvn -q exec:java -Dexec.mainClass=org.commoncrawl.whirlwind.ReadWARC -Dexec.args="data/testing.warc"
74-
@echo
7563
@echo compress it the wrong way
7664
gzip data/testing.warc
7765
@echo
78-
@echo iterating over this compressed warc fails
79-
mvn -q exec:java -Dexec.mainClass=org.commoncrawl.whirlwind.ReadWARC -Dexec.args="data/testing.warc.gz" || /usr/bin/true
66+
@echo showing the records in the compressed warc - note the offsets of request and response are the same
67+
java -jar jwarc.jar ls data/testing.warc.gz
68+
@echo
69+
@echo access the request record - failing
70+
java -jar jwarc.jar extract data/testing.warc.gz 3734 || /usr/bin/true
71+
@echo
72+
@echo access the response record - failing
73+
java -jar jwarc.jar extract data/testing.warc.gz 3734 || /usr/bin/true
8074
@echo
8175
@echo "now let's do it the right way"
8276
gzip -d data/testing.warc.gz
8377
mvn -q exec:java -Dexec.mainClass=org.commoncrawl.whirlwind.RecompressWARC -Dexec.args="data/testing.warc data/testing.warc.gz"
8478
@echo
85-
@echo and now iterating works
86-
mvn -q exec:java -Dexec.mainClass=org.commoncrawl.whirlwind.ReadWARC -Dexec.args="data/testing.warc.gz"
79+
@echo showing the records in the compressed warc - note the skewed offsets of request and response
80+
java -jar jwarc.jar ls data/testing.warc.gz
81+
@echo
82+
@echo access the request record - works
83+
java -jar jwarc.jar extract data/testing.warc.gz 518 | head
84+
@echo
85+
@echo access the response record - works
86+
java -jar jwarc.jar extract data/testing.warc.gz 1027 | head -n 20
8787
@echo

0 commit comments

Comments
 (0)