Skip to content
Merged
Show file tree
Hide file tree
Changes from 36 commits
Commits
Show all changes
43 commits
Select commit Hold shift + click to select a range
7c63ebb
ignore .idea, target
lfoppiano Dec 16, 2025
3f584e5
add pom.xml, Readme.md and the data files
lfoppiano Dec 16, 2025
f9d929e
add makefile
lfoppiano Dec 16, 2025
7e7b3f5
add read warc
lfoppiano Dec 16, 2025
a998133
add CI + spotless
lfoppiano Dec 16, 2025
c808d8c
add figures, editorconfig, .gitignore from the python repository brother
lfoppiano Dec 16, 2025
fa3f707
remove unclear make install, remove venv info from readme
lfoppiano Dec 16, 2025
f6d62bb
update read class, add recompress,
lfoppiano Dec 17, 2025
4aa252a
cleanup, removing the rest of the python stuff for task 0,1,2
lfoppiano Dec 17, 2025
5b018e9
fix missing make install
lfoppiano Dec 18, 2025
817862c
move data under 'data' directory
lfoppiano Dec 18, 2025
620ebee
add Apache header in the code
lfoppiano Dec 18, 2025
886ff0b
make sure we build before running
lfoppiano Dec 18, 2025
6180fce
update .gitignore
lfoppiano Dec 19, 2025
d35e3d8
Implement WARC compression validation for Task 5
lfoppiano Dec 20, 2025
e20c81e
Ignore gzip validation if is uncompressed
lfoppiano Dec 20, 2025
07c9f8b
Merge branch 'main' into luca/feature/part2
lfoppiano Dec 22, 2025
0fa930e
fix compression check, update Readme.md
lfoppiano Dec 22, 2025
78fbac6
add missing apache licence
lfoppiano Dec 22, 2025
6f97782
add commons-compress library
lfoppiano Dec 22, 2025
52fca8c
place Github Actions in the correct directory
lfoppiano Dec 22, 2025
75af0e1
Add CDJX indexer using unreleased JARC code
lfoppiano Dec 23, 2025
077f904
Implement Task 3 and 4
lfoppiano Dec 28, 2025
3a2791a
fix: CI build
lfoppiano Dec 28, 2025
df257e4
fix: Reformat with spotless
lfoppiano Dec 28, 2025
3ed8d61
fix: Rename class
lfoppiano Dec 29, 2025
b3c7252
feat: task 7
lfoppiano Dec 29, 2025
e55c48a
feat: Task 8, duck DB with local file
lfoppiano Jan 5, 2026
43fc088
chore: Run spotless
lfoppiano Jan 5, 2026
893788e
chore(docu): minor changes
lfoppiano Jan 12, 2026
10dd303
Merge branch 'main' into luca/feature/part4
lfoppiano Jan 16, 2026
04505f7
fix: move stuff in data
lfoppiano Jan 16, 2026
e46fdd1
fix: path to the local paths file
lfoppiano Jan 19, 2026
b0c35f3
feat: add support for local parquet files in Duck application
lfoppiano Jan 19, 2026
ec39f21
feat: add download instructions
lfoppiano Jan 26, 2026
32b954f
chore: minor
lfoppiano Jan 26, 2026
2d73b37
Merge branch 'main' into luca/feature/support-local-path
lfoppiano Feb 11, 2026
0f12c7f
fix: make ifndef indentation
lfoppiano Feb 11, 2026
ce9547c
fix: recursive iteration on the local directory
lfoppiano Feb 11, 2026
5cfc5e9
fix: add crawl parameter in the queries, remove useless concatenation
lfoppiano Feb 11, 2026
358813f
fix: correct instructions for downloading from S3 and http
lfoppiano Feb 11, 2026
4d5b1c5
fix: code formatting
lfoppiano Feb 11, 2026
68ba160
doc: add example of data structure
lfoppiano Feb 12, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
20 changes: 13 additions & 7 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -9,10 +9,10 @@ cdxj: build jwarc.jar

extract: jwarc.jar
@echo "creating extraction.* from local warcs, the offset numbers are from the cdxj index"
java -jar jwarc.jar extract --payload data/whirlwind.warc.gz 1023 > extraction.html
java -jar jwarc.jar extract --payload data/whirlwind.warc.wet.gz 466 > extraction.txt
java -jar jwarc.jar extract --payload data/whirlwind.warc.wat.gz 443 > extraction.json
@echo "hint: python -m json.tool extraction.json"
java -jar jwarc.jar extract --payload data/whirlwind.warc.gz 1023 > data/extraction.html
java -jar jwarc.jar extract --payload data/whirlwind.warc.wet.gz 466 > data/extraction.txt
java -jar jwarc.jar extract --payload data/whirlwind.warc.wat.gz 443 > data/extraction.json
@echo "hint: python -m json.tool data/extraction.json"

cdx_toolkit: jwarc.jar
@echo demonstrate that we have this entry in the index
Expand Down Expand Up @@ -41,12 +41,18 @@ CC-MAIN-2024-22.warc.paths.gz:
aws s3 ls s3://commoncrawl/cc-index/table/cc-main/warc/crawl=CC-MAIN-2024-22/subset=warc/ | awk '{print $$4}' | gzip -9 > data/CC-MAIN-2024-22.warc.paths.gz

duck_ccf_local_files: build
@echo "warning! only works on Common Crawl Foundadtion's development machine"
mvn -q exec:java -Dexec.mainClass=org.commoncrawl.whirlwind.Duck -Dexec.args"ccf_local_files"
@echo "warning! only works on Common Crawl Foundation's development machine"
mvn -q exec:java -Dexec.mainClass=org.commoncrawl.whirlwind.Duck -Dexec.args="ccf_local_files"

duck_local_files: build
ifndef LOCAL_DIR
Comment thread
lfoppiano marked this conversation as resolved.
Outdated
$(error LOCAL_DIR is required. Usage: make duck_local_files LOCAL_DIR=/path/to/data)
endif
mvn -q exec:java -Dexec.mainClass=org.commoncrawl.whirlwind.Duck -Dexec.args="local_files $(LOCAL_DIR)"

duck_cloudfront: build
@echo "warning! this might take 1-10 minutes"
mvn -q exec:java -Dexec.mainClass=org.commoncrawl.whirlwind.Duck -Dexec.args"cloudfront"
mvn -q exec:java -Dexec.mainClass=org.commoncrawl.whirlwind.Duck -Dexec.args="cloudfront"

jwarc.jar:
@echo "downloading JWarc JAR"
Expand Down
123 changes: 120 additions & 3 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -700,11 +700,128 @@ The date of our test record is 20240518015810, which is

## Task 8: Query using the columnar index + DuckDB from outside AWS

TBA
A single crawl columnar index is around 300 gigabytes. If you don't have a lot of disk space, but you do have a lot of time, you can directly access the index stored on AWS S3. We're going to do just that, and then use [DuckDB](https://duckdb.org) to make an SQL query against the index to find our webpage. We'll be running the following query:

```sql
SELECT
*
FROM ccindex
WHERE subset = 'warc'
AND crawl = 'CC-MAIN-2024-22'
AND url_host_tld = 'org' -- help the query optimizer
AND url_host_registered_domain = 'wikipedia.org' -- ditto
AND url = 'https://an.wikipedia.org/wiki/Escopete'
;
```

Run

```make duck_cloudfront```

On a machine with a 1 gigabit network connection and many cores, this should take about one minute total and use 8 cores. The output should look like:

<details>
<summary>Click to view output</summary>

```
Using algorithm: cloudfront
Total records for crawl: CC-MAIN-2024-22
100% ▕████████████████████████████████████████████████████████████▏
2709877975

Our one row:
100% ▕████████████████████████████████████████████████████████████▏
url_surtkey | url | url_host_name | url_host_tld | url_host_2nd_last_part | url_host_3rd_last_part | url_host_4th_last_part | url_host_5th_last_part | url_host_registry_suffix | url_host_registered_domain | url_host_private_suffix | url_host_private_domain | url_host_name_reversed | url_protocol | url_port | url_path | url_query | fetch_time | fetch_status | fetch_redirect | content_digest | content_mime_type | content_mime_detected | content_charset | content_languages | content_truncated | warc_filename | warc_record_offset | warc_record_length | warc_segment | crawl | subset
--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
org,wikipedia,an)/wiki/escopete | https://an.wikipedia.org/wiki/Escopete | an.wikipedia.org | org | wikipedia | an | NULL | NULL | org | wikipedia.org | org | wikipedia.org | org.wikipedia.an | https | NULL | /wiki/Escopete | NULL | 2024-05-18T01:58:10Z | 200 | NULL | RY7PLBUFQNI2FFV5FTUQK72W6SNPXLQU | text/html | text/html | UTF-8 | spa | NULL | crawl-data/CC-MAIN-2024-22/segments/1715971057216.39/warc/CC-MAIN-20240517233122-20240518023122-00000.warc.gz | 80610731 | 17423 | 1715971057216.39 | CC-MAIN-2024-22 | warc

Writing our one row to a local parquet file, whirlwind.parquet
100% ▕████████████████████████████████████████████████████████████▏
Total records for local whirlwind.parquet should be 1:
1

Our one row, locally:
url_surtkey | url | url_host_name | url_host_tld | url_host_2nd_last_part | url_host_3rd_last_part | url_host_4th_last_part | url_host_5th_last_part | url_host_registry_suffix | url_host_registered_domain | url_host_private_suffix | url_host_private_domain | url_host_name_reversed | url_protocol | url_port | url_path | url_query | fetch_time | fetch_status | fetch_redirect | content_digest | content_mime_type | content_mime_detected | content_charset | content_languages | content_truncated | warc_filename | warc_record_offset | warc_record_length | warc_segment | crawl | subset
--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
org,wikipedia,an)/wiki/escopete | https://an.wikipedia.org/wiki/Escopete | an.wikipedia.org | org | wikipedia | an | NULL | NULL | org | wikipedia.org | org | wikipedia.org | org.wikipedia.an | https | NULL | /wiki/Escopete | NULL | 2024-05-18T01:58:10Z | 200 | NULL | RY7PLBUFQNI2FFV5FTUQK72W6SNPXLQU | text/html | text/html | UTF-8 | spa | NULL | crawl-data/CC-MAIN-2024-22/segments/1715971057216.39/warc/CC-MAIN-20240517233122-20240518023122-00000.warc.gz | 80610731 | 17423 | 1715971057216.39 | CC-MAIN-2024-22 | warc

Complete row:
url_surtkey org,wikipedia,an)/wiki/escopete
url https://an.wikipedia.org/wiki/Escopete
url_host_name an.wikipedia.org
url_host_tld org
url_host_2nd_last_part wikipedia
url_host_3rd_last_part an
url_host_4th_last_part null
url_host_5th_last_part null
url_host_registry_suffix org
url_host_registered_domain wikipedia.org
url_host_private_suffix org
url_host_private_domain wikipedia.org
url_host_name_reversed org.wikipedia.an
url_protocol https
url_port null
url_path /wiki/Escopete
url_query null
fetch_time 2024-05-18T01:58:10Z
fetch_status 200
fetch_redirect null
content_digest RY7PLBUFQNI2FFV5FTUQK72W6SNPXLQU
content_mime_type text/html
content_mime_detected text/html
content_charset UTF-8
content_languages spa
content_truncated null
warc_filename crawl-data/CC-MAIN-2024-22/segments/1715971057216.39/warc/CC-MAIN-20240517233122-20240518023122-00000.warc.gz
warc_record_offset 80610731
warc_record_length 17423
warc_segment 1715971057216.39
crawl CC-MAIN-2024-22
subset warc

Equivalent to CDXJ:
org,wikipedia,an)/wiki/escopete 20240518015810 {"url":"https://an.wikipedia.org/wiki/Escopete","mime":"text/html","status":"200","digest":"sha1:RY7PLBUFQNI2FFV5FTUQK72W6SNPXLQU","length":"17423","offset":"80610731","filename":"crawl-data/CC-MAIN-2024-22/segments/1715971057216.39/warc/CC-MAIN-20240517233122-20240518023122-00000.warc.gz"}
```
</details>

The above command runs code in `Duck.java`, which accesses the relevant part of the index for our crawl (CC-MAIN-2024-22) and then counts the number of records in that crawl (2709877975!). The code runs the SQL query we saw before which should match the single response record we want.

The program then writes that one record into a local Parquet file, does a second query that returns that one record, and shows the full contents of the record. We can see that the complete row contains many columns containing different information associated with our record. Finally, it converts the row to the CDXJ format we saw before.

### Bonus: download a full crawl index and query with DuckDB

TBA
If you want to run many of these queries, and you have a lot of disk space, you'll want to download the 300 gigabyte index and query it repeatedly. Run

```shell
aws s3 sync s3://commoncrawl/cc-index/table/cc-main/warc/crawl=CC-MAIN-2024-22/subset=warc/ .
```

or (if you don't have access through the AWS CLI):

```shell
mkdir -p cc-main-2024-22
cd cc-main-2024-22

wget https://data.commoncrawl.org/crawl-data/CC-MAIN-2024-22/cc-index-table.paths.gz
gunzip cc-index-table.paths.gz

grep 'subset=warc' cc-index-table.paths | \
awk '{print "https://data.commoncrawl.org/" $1, $1}' | \
xargs -n 2 -P 10 sh -c '
echo "Downloading: $2"
mkdir -p "$(dirname "$2")" &&
wget -O "$2" "$1"
' _

cd -
```

then you can run `make duck_local_files LOCAL_DIR=/path/to/the/downloaded/data` to run the same query as above, but this time using your local copy of the index files.

> [!IMPORTANT]
> If you happen to be using the Common Crawl Foundation development server, we've already downloaded these files, and you can run ```make duck_ccf_local_files```

All of these scripts run the same SQL query and should return the same record (written as a parquet file).

## Bonus 2: combine some steps

Expand All @@ -726,7 +843,7 @@ We make more datasets available than just the ones discussed in this Whirlwind T

Common Crawl regularly releases Web Graphs which are graphs describing the structure and connectivity of the web as captured in the crawl releases. We provide two levels of graph: host-level and domain-level. Both are available to download [from our website](https://commoncrawl.org/web-graphs).

The host-level graph describes links between pages on the web at the level of hostnames (e.g. `en.wikipedia.org`). The domain-level graph aggregates this information in the host-level graph, describing links at the pay-level domain (PLD) level (based on the public suffix list maintained on [publicsuffix.org](publicsuffix.org)). The PLD is the subdomain directly under the top-level domain (TLD): e.g. for `en.wikipedia.org`, the TLD would be `.org` and the PLD would be `wikipedia.org`.
The host-level graph describes links between pages on the web at the level of hostnames (e.g. `en.wikipedia.org`). The domain-level graph aggregates this information in the host-level graph, describing links at the pay-level domain (PLD) level (based on the public suffix list maintained on [publicsuffix.org](https://publicsuffix.org)). The PLD is the subdomain directly under the top-level domain (TLD): e.g. for `en.wikipedia.org`, the TLD would be `.org` and the PLD would be `wikipedia.org`.

As an example, let's look at the [Web Graph release for March, April and May 2025](https://data.commoncrawl.org/projects/hyperlinkgraph/cc-main-2025-mar-apr-may/index.html). This page provides links to download data associated with the host- and domain-level graph for those months. The key files needed to construct the graphs are the files containing the vertices or nodes (the hosts or domains), and the files containing the edges (the links between the hosts/domains). These are currently the top two links in each of the tables.

Expand Down
55 changes: 47 additions & 8 deletions src/main/java/org/commoncrawl/whirlwind/Duck.java
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.sql.*;
import java.time.format.DateTimeFormatter;
import java.util.*;
Expand All @@ -38,7 +39,7 @@ public class Duck {
private static final DateTimeFormatter TIMESTAMP_FORMATTER = DateTimeFormatter.ofPattern("yyyyMMddHHmmss");

public enum Algorithm {
CCF_LOCAL_FILES("ccf_local_files"), CLOUDFRONT("cloudfront");
CCF_LOCAL_FILES("ccf_local_files"), CLOUDFRONT("cloudfront"), LOCAL_FILES("local_files");

private final String name;

Expand Down Expand Up @@ -113,8 +114,13 @@ public static void printRowAsKvList(ResultSet rs, PrintStream out) throws SQLExc
/**
* Gets the list of parquet files to query based on the algorithm.
*/
public static List<String> getFiles(Algorithm algo, String crawl) throws IOException {
public static List<String> getFiles(Algorithm algo, String crawl, String localPrefix) throws IOException {
switch (algo) {
case LOCAL_FILES: {
Path indexPath = Path.of(localPrefix);
return getLocalParquetFiles(indexPath);
}

case CCF_LOCAL_FILES: {
Path indexPath = Path.of("/home/cc-pds/commoncrawl/cc-index/table/cc-main/warc", "crawl=" + crawl,
"subset=warc");
Expand All @@ -124,7 +130,7 @@ public static List<String> getFiles(Algorithm algo, String crawl) throws IOExcep
case CLOUDFRONT: {
String externalPrefix = String
.format("https://data.commoncrawl.org/cc-index/table/cc-main/warc/crawl=%s/subset=warc/", crawl);
String pathsFile = crawl + ".warc.paths.gz";
String pathsFile = Paths.get("data", crawl + ".warc.paths.gz").toString();

List<String> files = new ArrayList<>();
try (GZIPInputStream gzis = new GZIPInputStream(new FileInputStream(pathsFile));
Expand All @@ -142,6 +148,23 @@ public static List<String> getFiles(Algorithm algo, String crawl) throws IOExcep
}
}

private static List<String> getLocalParquetFiles(Path indexPath) throws IOException {
if (!Files.isDirectory(indexPath)) {
System.err.println("Directory not found: " + indexPath);
System.exit(1);
}

List<String> files = Files.list(indexPath).filter(p -> p.toString().endsWith(".parquet")).map(Path::toString)
Comment thread
lfoppiano marked this conversation as resolved.
Outdated
.collect(Collectors.toList());

if (files.isEmpty()) {
System.err.println("No parquet files found in: " + indexPath);
System.exit(1);
}

return files;
}

private static List<String> getLocalParquetFiles(Path indexPath, String prefix, String crawl) throws IOException {
if (!Files.isDirectory(indexPath)) {
printIndexDownloadAdvice(prefix, crawl);
Expand Down Expand Up @@ -189,6 +212,7 @@ private static ResultSet executeWithRetry(Statement stmt, String sql) throws SQL
public static void main(String[] args) {
String crawl = "CC-MAIN-2024-22";
Algorithm algo = Algorithm.CLOUDFRONT;
String localPrefix = "/home/cc-pds/commoncrawl/cc-index/table/cc-main/warc";

if (args.length > 0) {
if ("help".equalsIgnoreCase(args[0]) || "--help".equals(args[0]) || "-h".equals(args[0])) {
Expand All @@ -200,20 +224,30 @@ public static void main(String[] args) {
System.out.println("Using algorithm: " + algo.getName());
}

if (algo == Algorithm.LOCAL_FILES) {
if (args.length < 2) {
System.err.println("Error: local_files algorithm requires a directory argument.");
printUsage();
System.exit(1);
}
localPrefix = args[1];
}

try {
run(algo, crawl);
run(algo, crawl, localPrefix);
} catch (Exception e) {
System.err.println("Error: " + e.getMessage());
printUsage();
System.exit(1);
}
}

public static void run(Algorithm algo, String crawl) throws IOException, SQLException, InterruptedException {
public static void run(Algorithm algo, String crawl, String localPrefix)
throws IOException, SQLException, InterruptedException {
// Ensure stdout uses UTF-8
PrintStream out = new PrintStream(System.out, true, StandardCharsets.UTF_8);

List<String> files = getFiles(algo, crawl);
List<String> files = getFiles(algo, crawl, localPrefix);
String filesList = files.stream().map(f -> "'" + f + "'").collect(Collectors.joining(", "));

// Use in-memory DuckDB
Expand Down Expand Up @@ -304,14 +338,19 @@ private static void printResultSet(ResultSet rs, PrintStream out) throws SQLExce
}

private static void printUsage() {
System.err.println("Usage: Duck [algorithm]");
System.err.println("Usage: Duck [algorithm] [local-directory]");
System.err.println();
System.err.println("Query Common Crawl index using DuckDB.");
System.err.println();
System.err.println("Algorithms:");
System.err.println(" ccf_local_files Use local parquet files from /home/cc-pds/commoncrawl/...");
System.err.println(" local_files Use local parquet files (from specified local directory)");
System.err.println(
" ccf_local_files Use local parquet files (default: /home/cc-pds/commoncrawl/cc-index/table/cc-main/warc)");
System.err.println(" cloudfront Use CloudFront URLs (requires <crawl>.warc.paths.gz file)");
System.err.println();
System.err.println("Arguments:");
System.err.println(" local-directory Local directory prefix for 'local_files' algorithm");
System.err.println();
System.err.println("Options:");
System.err.println(" help, --help, -h Show this help message");
}
Expand Down