We read every piece of feedback, and take your input very seriously.
To see all available qualifiers, see our documentation.
1 parent a91e52f, commit f88927a (Copy full SHA for f88927a)
2 files changed
Makefile
@@ -38,7 +38,7 @@ download_collinfo:
38
CC-MAIN-2024-22.warc.paths.gz:
39
@echo "downloading the list from s3, requires s3 auth even though it is free"
40
@echo "note that this file should be in the repo"
41
- aws s3 ls s3://commoncrawl/cc-index/table/cc-main/warc/crawl=CC-MAIN-2024-22/subset=warc/ | awk '{print $$4}' | gzip -9 > CC-MAIN-2024-22.warc.paths.gz
+ aws s3 ls s3://commoncrawl/cc-index/table/cc-main/warc/crawl=CC-MAIN-2024-22/subset=warc/ | awk '{print $$4}' | gzip -9 > data/CC-MAIN-2024-22.warc.paths.gz
42
43
duck_ccf_local_files: build
44
@echo "warning! only works on Common Crawl Foundadtion's development machine"
CC-MAIN-2024-22.warc.paths.gz → data/CC-MAIN-2024-22.warc.paths.gz: CC-MAIN-2024-22.warc.paths.gz renamed to data/CC-MAIN-2024-22.warc.paths.gz
0 commit comments