Skip to content

Commit 0a3d2ba

Browse files
committed
Fix formatting
1 parent 2bda962 commit 0a3d2ba

4 files changed

Lines changed: 125 additions & 137 deletions

File tree

pom.xml

Lines changed: 24 additions & 24 deletions
Original file line number | Diff line number | Diff line change
@@ -1,33 +1,33 @@
11
<?xml version="1.0" encoding="UTF-8"?>
22
<project
3-
xmlns="http://maven.apache.org/POM/4.0.0"
4-
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
5-
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 https://maven.apache.org/xsd/maven-4.0.0.xsd">
6-
<modelVersion>4.0.0</modelVersion>
3+
xmlns="http://maven.apache.org/POM/4.0.0"
4+
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
5+
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 https://maven.apache.org/xsd/maven-4.0.0.xsd">
6+
<modelVersion>4.0.0</modelVersion>
77

8-
<groupId>org.commoncrawl</groupId>
9-
<artifactId>whirlwind-java</artifactId>
10-
<version>0.1.0-SNAPSHOT</version>
11-
<name>Whirlwind Tour Java Tools</name>
8+
<groupId>org.commoncrawl</groupId>
9+
<artifactId>whirlwind-java</artifactId>
10+
<version>0.1.0-SNAPSHOT</version>
11+
<name>Whirlwind Tour Java Tools</name>
1212

13-
<properties>
14-
<maven.compiler.release>11</maven.compiler.release>
15-
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
16-
</properties>
13+
<properties>
14+
<maven.compiler.release>11</maven.compiler.release>
15+
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
16+
</properties>
1717

18-
<dependencies>
19-
<dependency>
20-
<groupId>org.apache.commons</groupId>
21-
<artifactId>commons-compress</artifactId>
22-
<version>1.28.0</version>
23-
</dependency>
24-
<dependency>
25-
<groupId>org.netpreserve</groupId>
26-
<artifactId>jwarc</artifactId>
27-
<version>0.32.0</version>
28-
</dependency>
18+
<dependencies>
19+
<dependency>
20+
<groupId>org.apache.commons</groupId>
21+
<artifactId>commons-compress</artifactId>
22+
<version>1.28.0</version>
23+
</dependency>
24+
<dependency>
25+
<groupId>org.netpreserve</groupId>
26+
<artifactId>jwarc</artifactId>
27+
<version>0.32.0</version>
28+
</dependency>
2929

30-
</dependencies>
30+
</dependencies>
3131

3232
<build>
3333
<plugins>

src/main/java/org/commoncrawl/whirlwind/ReadWARC.java

Lines changed: 33 additions & 40 deletions
Original file line number | Diff line number | Diff line change
@@ -29,49 +29,42 @@
2929

3030
public class ReadWARC {
3131

32-
private static final List<String> RESPONSE_TYPES = Arrays.asList("request", "response", "conversion", "metadata");
32+
private static final List<String> RESPONSE_TYPES = Arrays.asList("request", "response", "conversion", "metadata");
3333

34-
public static void main(String[] args) throws IOException {
34+
public static void main(String[] args) throws IOException {
3535

36-
if (args.length != 1) {
37-
System.err.println("Usage: java ReadWARC <input-warc-file>");
38-
System.exit(1);
39-
}
36+
if (args.length != 1) {
37+
System.err.println("Usage: java ReadWARC <input-warc-file>");
38+
System.exit(1);
39+
}
4040

41-
Path requested = Path.of(args[0]).toAbsolutePath().normalize();
42-
if (!Files.isRegularFile(requested)) {
43-
throw new SecurityException("Invalid WARC path");
44-
}
41+
Path requested = Path.of(args[0]).toAbsolutePath().normalize();
42+
if (!Files.isRegularFile(requested)) {
43+
throw new SecurityException("Invalid WARC path");
44+
}
4545

46-
if (requested.toString().endsWith("gz") || requested.toString().endsWith("gzip")) {
47-
try {
48-
ValidateWARC.validateRandomAccessWarcOrFail(requested);
49-
} catch (IOException e) {
50-
System.out.println("This file is probably not a multi-member gzip but a single gzip file." +
51-
"\n" +
52-
"To allow seek, a gzipped WARC must have each record compressed into a single gzip member and concatenated together." +
53-
"\n" +
54-
"\n" +
55-
"This file is likely still valid and can be fixed by running:" +
56-
"\n" +
57-
"mvn -q exec:java -Dexec.mainClass=org.commoncrawl.whirlwind.RecompressWARC -Dexec.args=\"testing.warc testing.warc.gz\"");
58-
System.exit(-1);
59-
}
60-
}
46+
if (requested.toString().endsWith("gz") || requested.toString().endsWith("gzip")) {
47+
try {
48+
ValidateWARC.validateRandomAccessWarcOrFail(requested);
49+
} catch (IOException e) {
50+
System.out.println("This file is probably not a multi-member gzip but a single gzip file." + "\n"
51+
+ "To allow seek, a gzipped WARC must have each record compressed into a single gzip member and concatenated together."
52+
+ "\n" + "\n" + "This file is likely still valid and can be fixed by running:" + "\n"
53+
+ "mvn -q exec:java -Dexec.mainClass=org.commoncrawl.whirlwind.RecompressWARC -Dexec.args=\"testing.warc testing.warc.gz\"");
54+
System.exit(-1);
55+
}
56+
}
6157

62-
try (
63-
InputStream in = Files.newInputStream(requested);
64-
WarcReader reader = new WarcReader(in)
65-
) {
66-
reader.records().forEach(record -> {
67-
System.out.println(" WARC-Type: " + record.type());
68-
if (RESPONSE_TYPES.contains(record.type())) {
69-
MessageHeaders headers = record.headers();
70-
for (String header : headers.all("WARC-Target-URI")) {
71-
System.out.println(" WARC-Target-URI " + header);
72-
}
73-
}
74-
});
75-
}
76-
}
58+
try (InputStream in = Files.newInputStream(requested); WarcReader reader = new WarcReader(in)) {
59+
reader.records().forEach(record -> {
60+
System.out.println(" WARC-Type: " + record.type());
61+
if (RESPONSE_TYPES.contains(record.type())) {
62+
MessageHeaders headers = record.headers();
63+
for (String header : headers.all("WARC-Target-URI")) {
64+
System.out.println(" WARC-Target-URI " + header);
65+
}
66+
}
67+
});
68+
}
69+
}
7770
}

src/main/java/org/commoncrawl/whirlwind/RecompressWARC.java

Lines changed: 29 additions & 30 deletions
Original file line number | Diff line number | Diff line change
@@ -32,39 +32,38 @@
3232

3333
public class RecompressWARC {
3434

35-
public static void main(String[] args) throws IOException {
35+
public static void main(String[] args) throws IOException {
3636

37-
if (args.length != 2) {
38-
System.err.println("Usage: java RecompressWarc <input-uncompressed-warc-file> <output-compressed-warc-file>");
39-
System.exit(1);
40-
}
37+
if (args.length != 2) {
38+
System.err
39+
.println("Usage: java RecompressWarc <input-uncompressed-warc-file> <output-compressed-warc-file>");
40+
System.exit(1);
41+
}
4142

42-
Path inputPath = Path.of(args[0]).toAbsolutePath().normalize();
43-
Path outputPath = Path.of(args[1]).toAbsolutePath().normalize();
43+
Path inputPath = Path.of(args[0]).toAbsolutePath().normalize();
44+
Path outputPath = Path.of(args[1]).toAbsolutePath().normalize();
4445

45-
if (!Files.isRegularFile(inputPath)) {
46-
throw new SecurityException("Invalid input WARC path");
47-
}
46+
if (!Files.isRegularFile(inputPath)) {
47+
throw new SecurityException("Invalid input WARC path");
48+
}
4849

49-
if (inputPath.endsWith(".gz")) {
50-
System.out.println("Input WARC file is already compressed");
51-
System.exit(1);
52-
}
50+
if (inputPath.endsWith(".gz")) {
51+
System.out.println("Input WARC file is already compressed");
52+
System.exit(1);
53+
}
5354

54-
try (
55-
InputStream in = Files.newInputStream(inputPath);
56-
WarcReader reader = new WarcReader(in);
57-
OutputStream out = Files.newOutputStream(outputPath);
58-
WritableByteChannel outChannel = Channels.newChannel(out);
59-
WarcWriter writer = new WarcWriter(outChannel, WarcCompression.GZIP)
60-
) {
61-
reader.forEach(record -> {
62-
try {
63-
writer.write(record);
64-
} catch (IOException e) {
65-
throw new UncheckedIOException(e);
66-
}
67-
});
68-
}
69-
}
55+
try (InputStream in = Files.newInputStream(inputPath);
56+
WarcReader reader = new WarcReader(in);
57+
OutputStream out = Files.newOutputStream(outputPath);
58+
WritableByteChannel outChannel = Channels.newChannel(out);
59+
WarcWriter writer = new WarcWriter(outChannel, WarcCompression.GZIP)) {
60+
reader.forEach(record -> {
61+
try {
62+
writer.write(record);
63+
} catch (IOException e) {
64+
throw new UncheckedIOException(e);
65+
}
66+
});
67+
}
68+
}
7069
}

src/main/java/org/commoncrawl/whirlwind/ValidateWARC.java

Lines changed: 39 additions & 43 deletions
Original file line number | Diff line number | Diff line change
@@ -26,57 +26,53 @@
2626
import java.util.concurrent.atomic.AtomicInteger;
2727

2828
public class ValidateWARC {
29-
public static void main(String[] args) throws Exception {
30-
if (args.length != 1) {
31-
System.err.println("Usage: java ValidateWARC <file.gz>");
32-
System.exit(2);
33-
}
29+
public static void main(String[] args) throws Exception {
30+
if (args.length != 1) {
31+
System.err.println("Usage: java ValidateWARC <file.gz>");
32+
System.exit(2);
33+
}
3434

35-
Path requested = Path.of(args[0]).toAbsolutePath().normalize();
36-
if (!Files.isRegularFile(requested)) {
37-
throw new SecurityException("Invalid WARC path");
38-
}
35+
Path requested = Path.of(args[0]).toAbsolutePath().normalize();
36+
if (!Files.isRegularFile(requested)) {
37+
throw new SecurityException("Invalid WARC path");
38+
}
3939

40-
int n = getWarcCompressionInformation(requested);
41-
if (n <= 1) {
42-
System.out.println("Single-member gzip (likely whole-file gzip). members=" + n);
43-
} else {
44-
System.out.println("Concatenated multi-member gzip (record-compressed). members=" + n);
45-
}
40+
int n = getWarcCompressionInformation(requested);
41+
if (n <= 1) {
42+
System.out.println("Single-member gzip (likely whole-file gzip). members=" + n);
43+
} else {
44+
System.out.println("Concatenated multi-member gzip (record-compressed). members=" + n);
45+
}
4646

47-
}
47+
}
4848

49-
public static int getWarcCompressionInformation(Path inputWarc) throws IOException {
50-
final AtomicInteger memberCount = new AtomicInteger(0);
49+
public static int getWarcCompressionInformation(Path inputWarc) throws IOException {
50+
final AtomicInteger memberCount = new AtomicInteger(0);
5151

52-
try (
53-
InputStream fis = Files.newInputStream(inputWarc);
54-
BufferedInputStream bis = new BufferedInputStream(fis);
55-
GzipCompressorInputStream gz = GzipCompressorInputStream.builder()
56-
.setDecompressConcatenated(true)
57-
.setOnMemberEnd(x -> memberCount.incrementAndGet())
58-
.setInputStream(bis).get()
59-
) {
52+
try (InputStream fis = Files.newInputStream(inputWarc);
53+
BufferedInputStream bis = new BufferedInputStream(fis);
54+
GzipCompressorInputStream gz = GzipCompressorInputStream.builder().setDecompressConcatenated(true)
55+
.setOnMemberEnd(x -> memberCount.incrementAndGet()).setInputStream(bis).get()) {
6056

61-
byte[] buf = new byte[64 * 1024];
62-
while (gz.read(buf) != -1) {
63-
// Read the entire stream to trigger member processing
64-
// We might not need to read the whole stream, just enough to get an idea
65-
}
66-
} catch (IOException e) {
67-
throw new IllegalArgumentException("The file is either not a gzip file or is corrupted.", e);
68-
}
57+
byte[] buf = new byte[64 * 1024];
58+
while (gz.read(buf) != -1) {
59+
// Read the entire stream to trigger member processing
60+
// We might not need to read the whole stream, just enough to get an idea
61+
}
62+
} catch (IOException e) {
63+
throw new IllegalArgumentException("The file is either not a gzip file or is corrupted.", e);
64+
}
6965

70-
return memberCount.get();
71-
}
66+
return memberCount.get();
67+
}
7268

73-
public static void validateRandomAccessWarcOrFail(Path inputWarc) throws IOException {
74-
int n = getWarcCompressionInformation(inputWarc);
69+
public static void validateRandomAccessWarcOrFail(Path inputWarc) throws IOException {
70+
int n = getWarcCompressionInformation(inputWarc);
7571

76-
if (n <= 1) {
77-
throw new IOException("Non-chunked gzip file detected, gzip block continues\n" +
78-
" beyond single record. " + n);
79-
}
72+
if (n <= 1) {
73+
throw new IOException(
74+
"Non-chunked gzip file detected, gzip block continues\n" + " beyond single record. " + n);
75+
}
8076

81-
}
77+
}
8278
}

0 commit comments

Comments (0)