Skip to content

Commit c10f363

Browse files
author
Greg Lindahl
committed
doc: update instructions, add example output to readme, etc
1 parent 29983d1 commit c10f363

5 files changed

Lines changed: 366 additions & 115 deletions

File tree

.gitignore

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,4 +5,6 @@ TEST*.gz
55
extraction.*
66
testing.*
77
whirlwind.parquet
8-
collinfo.json
8+
collinfo.json
9+
.venv/
10+
.vscode/

Makefile

Lines changed: 11 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -4,8 +4,7 @@ venv:
44
virtualenv -p python ~/venv/whirlwind
55
@echo
66
@echo "now you have to activate it:"
7-
@echo ". ~/venv/whirlwind/bin/activate"
8-
@echo
7+
@echo "source ~/venv/whirlwind/bin/activate"
98

109
install:
1110
pip install -r requirements.txt
@@ -39,12 +38,10 @@ extract:
3938
cdx_toolkit:
4039
@echo look up this capture in the commoncrawl cdx index
4140
cdxt --cc --from 20240518015810 --to 20240518015810 iter an.wikipedia.org/wiki/Escopete
42-
sleep 5
4341
@echo
4442
@echo extract the content from the commoncrawl s3 bucket
4543
rm -f TEST-000000.extracted.warc.gz
4644
cdxt --cc --from 20240518015810 --to 20240518015810 warc an.wikipedia.org/wiki/Escopete
47-
sleep 5
4845
@echo
4946
@echo index this new warc
5047
cdxj-indexer TEST-000000.extracted.warc.gz > TEST-000000.extracted.warc.cdxj
@@ -55,21 +52,25 @@ cdx_toolkit:
5552
@echo
5653

5754
download_collinfo:
58-
@echo downloading collinfo.json so we can find out the crawl name
59-
wget https://index.commoncrawl.org/collinfo.json
60-
sleep 5
55+
@echo "downloading collinfo.json so we can find out the crawl name"
56+
curl -O https://index.commoncrawl.org/collinfo.json
57+
58+
CC-MAIN-2024-22.warc.paths.gz:
59+
@echo "downloading the list from s3, requires s3 auth even though it is free"
60+
@echo "note that this file should be in the repo"
61+
aws s3 ls s3://commoncrawl/cc-index/table/cc-main/warc/crawl=CC-MAIN-2024-22/subset=warc/ | awk '{print $$4}' | gzip -9 > CC-MAIN-2024-22.warc.paths.gz
6162

6263
duck_local_files:
6364
@echo "warning! 300 gigabyte download"
6465
python duck.py local_files
6566

6667
duck_ccf_local_files:
67-
@echo "warning! only works on greg's development machine"
68+
@echo "warning! only works on Common Crawl Foundation's development machine"
6869
python duck.py ccf_local_files
6970

70-
duck_s3_ls_then_cloudfront:
71+
duck_cloudfront:
7172
@echo "warning! this might take 1-10 minutes"
72-
python duck.py s3_ls_then_cloudfront
73+
python duck.py cloudfront
7374

7475
wreck_the_warc:
7576
@echo

0 commit comments

Comments
 (0)