|
3 | 3 |
|
4 | 4 | cdxj: build jwarc.jar |
5 | 5 | @echo "creating *.cdxj index files from the local warcs" |
6 | | - java -jar jwarc.jar cdxj data/whirlwind.warc.gz > whirlwind.warc.cdxj |
7 | | - mvn -q exec:java -Dexec.mainClass=org.commoncrawl.whirlwind.CdxjIndexer -Dexec.args="data/whirlwind.warc.wet.gz --records conversion" > whirlwind.warc.wet.cdxj |
8 | | - mvn -q exec:java -Dexec.mainClass=org.commoncrawl.whirlwind.CdxjIndexer -Dexec.args="data/whirlwind.warc.wat.gz --records metadata" > whirlwind.warc.wat.cdxj |
| 6 | + java -jar jwarc.jar cdxj data/whirlwind.warc.gz > data/whirlwind.warc.cdxj |
| 7 | + mvn -q exec:java -Dexec.mainClass=org.commoncrawl.whirlwind.CdxjIndexer -Dexec.args="data/whirlwind.warc.wet.gz --records conversion" > data/whirlwind.warc.wet.cdxj |
| 8 | + mvn -q exec:java -Dexec.mainClass=org.commoncrawl.whirlwind.CdxjIndexer -Dexec.args="data/whirlwind.warc.wat.gz --records metadata" > data/whirlwind.warc.wat.cdxj |
9 | 9 |
|
10 | 10 | extract: |
11 | 11 | @echo "creating extraction.* from local warcs, the offset numbers are from the cdxj index" |
@@ -49,39 +49,39 @@ duck_cloudfront: build |
49 | 49 | mvn -q exec:java -Dexec.mainClass=org.commoncrawl.whirlwind.Duck -Dexec.args="cloudfront" |
50 | 50 |
|
51 | 51 |
|
52 | | -ensure_jwarc: |
53 | | - @echo "Ensuring JWarc JAR is present" |
54 | | - @if [ ! -f jwarc.jar ] ; then \ |
55 | | - echo "jwarc.jar not found, downloading..." ; \ |
56 | | - curl -fL -o jwarc.jar https://github.com/iipc/jwarc/releases/download/v0.33.0/jwarc-0.33.0.jar ; \ |
57 | | - else \ |
58 | | - echo "jwarc.jar found." ; \ |
59 | | - fi |
60 | | - |
61 | | -get_jwarc: |
| 52 | +jwarc.jar: |
62 | 53 | @echo "downloading JWarc JAR" |
63 | 54 | curl -fL -o jwarc.jar https://github.com/iipc/jwarc/releases/download/v0.33.0/jwarc-0.33.0.jar |
64 | 55 |
|
65 | | -wreck_the_warc: build ensure_jwarc |
| 56 | +wreck_the_warc: build jwarc.jar |
66 | 57 | @echo |
67 | 58 | @echo we will break and then fix this warc |
68 | 59 | cp data/whirlwind.warc.gz data/testing.warc.gz |
69 | 60 | rm -f data/testing.warc |
70 | 61 | gzip -d data/testing.warc.gz # use gzip -d here: gunzip does not work on Windows |
71 | 62 | @echo |
72 | | - @echo iterate over this uncompressed warc: works |
73 | | - mvn -q exec:java -Dexec.mainClass=org.commoncrawl.whirlwind.ReadWARC -Dexec.args="data/testing.warc" |
74 | | - @echo |
75 | 63 | @echo compress it the wrong way |
76 | 64 | gzip data/testing.warc |
77 | 65 | @echo |
78 | | - @echo iterating over this compressed warc fails |
79 | | - mvn -q exec:java -Dexec.mainClass=org.commoncrawl.whirlwind.ReadWARC -Dexec.args="data/testing.warc.gz" || /usr/bin/true |
| 66 | + @echo showing the records in the compressed warc - note the offsets of request and response are identical |
| 67 | + java -jar jwarc.jar ls data/testing.warc.gz |
| 68 | + @echo |
| 69 | + @echo access the request record - failing |
| 70 | + java -jar jwarc.jar extract data/testing.warc.gz 3734 || /usr/bin/true |
| 71 | + @echo |
| 72 | + @echo access the response record - failing |
| 73 | + java -jar jwarc.jar extract data/testing.warc.gz 3734 || /usr/bin/true |
80 | 74 | @echo |
81 | 75 | @echo "now let's do it the right way" |
82 | 76 | gzip -d data/testing.warc.gz |
83 | 77 | mvn -q exec:java -Dexec.mainClass=org.commoncrawl.whirlwind.RecompressWARC -Dexec.args="data/testing.warc data/testing.warc.gz" |
84 | 78 | @echo |
85 | | - @echo and now iterating works |
86 | | - mvn -q exec:java -Dexec.mainClass=org.commoncrawl.whirlwind.ReadWARC -Dexec.args="data/testing.warc.gz" |
| 79 | + @echo showing the records in the compressed warc - note the skewed offsets of request and response |
| 80 | + java -jar jwarc.jar ls data/testing.warc.gz |
| 81 | + @echo |
| 82 | + @echo access the request record - works |
| 83 | + java -jar jwarc.jar extract data/testing.warc.gz 518 | head |
| 84 | + @echo |
| 85 | + @echo access the response record - works |
| 86 | + java -jar jwarc.jar extract data/testing.warc.gz 1027 | head -n 20 |
87 | 87 | @echo |
0 commit comments