44 virtualenv -p python ~/venv/whirlwind
55 @echo
66 @echo " now you have to activate it:"
7- @echo " . ~/venv/whirlwind/bin/activate"
8- @echo
7+ @echo " source ~/venv/whirlwind/bin/activate"
98
109install :
1110 pip install -r requirements.txt
@@ -39,12 +38,10 @@ extract:
3938cdx_toolkit :
4039 @echo look up this capture in the commoncrawl cdx index
4140 cdxt --cc --from 20240518015810 --to 20240518015810 iter an.wikipedia.org/wiki/Escopete
42- sleep 5
4341 @echo
4442 @echo extract the content from the commoncrawl s3 bucket
4543 rm -f TEST-000000.extracted.warc.gz
4644 cdxt --cc --from 20240518015810 --to 20240518015810 warc an.wikipedia.org/wiki/Escopete
47- sleep 5
4845 @echo
4946 @echo index this new warc
5047 cdxj-indexer TEST-000000.extracted.warc.gz > TEST-000000.extracted.warc.cdxj
@@ -55,21 +52,25 @@ cdx_toolkit:
5552 @echo
5653
5754download_collinfo :
58- @echo downloading collinfo.json so we can find out the crawl name
59- wget https://index.commoncrawl.org/collinfo.json
60- sleep 5
55+ @echo " downloading collinfo.json so we can find out the crawl name"
56+ curl -O https://index.commoncrawl.org/collinfo.json
57+
58+ CC-MAIN-2024-22.warc.paths.gz :
59+ @echo " downloading the list from s3, requires s3 auth even though it is free"
60+ @echo " note that this file should be in the repo"
61+ aws s3 ls s3://commoncrawl/cc-index/table/cc-main/warc/crawl=CC-MAIN-2024-22/subset=warc/ | awk ' {print $$4}' | gzip -9 > CC-MAIN-2024-22.warc.paths.gz
6162
6263duck_local_files :
6364 @echo " warning! 300 gigabyte download"
6465 python duck.py local_files
6566
6667duck_ccf_local_files :
67- @echo " warning! only works on greg 's development machine"
68+ @echo " warning! only works on Common Crawl Foundation's development machine"
6869 python duck.py ccf_local_files
6970
70- duck_s3_ls_then_cloudfront :
71+ duck_cloudfront :
7172 @echo " warning! this might take 1-10 minutes"
72- python duck.py s3_ls_then_cloudfront
73+ python duck.py cloudfront
7374
7475wreck_the_warc :
7576 @echo
0 commit comments