Skip to content

Commit 111c3dd

Browse files
committed
implement org.apache HttpClient
1 parent 14757ec commit 111c3dd

8 files changed

Lines changed: 111 additions & 41 deletions

lib/commons-codec-1.9.jar

258 KB
Binary file not shown.

lib/fetch.sh

Lines changed: 68 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,68 @@
1+
#!/usr/bin/env bash
2+
3+
# Absolute path of the directory containing this script; downloaded jars are
# stored here. Quoted (and `--`-terminated) so that paths containing spaces,
# glob characters or a leading dash are handled correctly.
ROOT=$(cd -- "$(dirname -- "$0")" && pwd)
4+
5+
# Download every third-party jar into $ROOT (the directory of this script).
#
# The Maven-hosted artifacts are resolved, together with their transitive
# dependencies, by generating a throwaway pom.xml and running
# `mvn dependency:copy-dependencies`. The two redis jars are GitHub release
# assets and are downloaded directly with curl.
fetch() {
    # Resolve Maven coordinates and move the resulting jars into $ROOT.
    # Arguments: one or more 'groupId:artifactId:version' strings.
    fetch_maven_deps() {
        local dir
        # Assigned separately: `local dir=$(...)` would report the exit status
        # of `local` and hide a failing mktemp.
        dir=$(mktemp -d)

        (
            cd "$dir"

            # The heredoc bodies are kept at column 0 so they do not depend on
            # tab indentation (as `<<-` would).
            cat > pom.xml <<'EOF'
<project>
<modelVersion>4.0.0</modelVersion>
<groupId>temp</groupId>
<artifactId>temp</artifactId>
<version>master</version>
<repositories>
<repository>
<id>jitpack.io</id>
<url>https://jitpack.io</url>
</repository>
</repositories>
<dependencies>
EOF

            # Turn each 'group:artifact:version' into a <dependency> element.
            for package in "$@"; do
                echo "$package" | \
                    sed -E 's!([^:]+):([^:]+):([^:]+)!<dependency><groupId>\1</groupId><artifactId>\2</artifactId><version>\3</version></dependency>!g' \
                    >> pom.xml
            done

            cat >> pom.xml <<'EOF'
</dependencies>
</project>
EOF

            mvn dependency:copy-dependencies
            mv target/dependency/* "$ROOT"
        )
        rm -rf "$dir"
    }

    fetch_maven_deps \
        'org.apache.httpcomponents:httpclient:4.5.3' \
        'junit:junit:4.12' \
        'org.slf4j:slf4j-log4j12:1.7.7' \
        'org.jsoup:jsoup:1.11.3'

    # -f makes curl exit non-zero on an HTTP error instead of saving the error
    # page as a ".jar"; -S still prints the error despite -s. The files are
    # written to $ROOT so they end up next to the Maven-resolved jars
    # regardless of the caller's working directory.
    curl -fsSL https://github.com/drm/java-redis-client/releases/download/v2.0.2/java-redis-client-v2.0.2--javac-11.0.2.jar -o "$ROOT/java-redis-client-v2.0.2.jar"
    curl -fsSL https://github.com/drm/java-redis-collections/releases/download/v.0.3.0-beta.3/java-redis-collections-v.0.3.0-beta.3--javac-11.0.3.jar -o "$ROOT/java-redis-collections-v0.3.0-beta.3.jar"
}
52+
53+
# Remove all jar files from $ROOT (the directory of this script), i.e. the
# location `fetch` stores them in. Targeting $ROOT rather than the current
# directory keeps `clean` from deleting unrelated jars when the script is
# invoked from elsewhere. `-f` keeps this silent when there is nothing to remove.
clean() {
    rm -f "$ROOT"/*.jar
}
56+
57+
# At least one command is required; print the usage line and fail otherwise.
if [[ "$1" == "" ]]; then
    echo "Usage: ${0} [clean] fetch"
    exit 1
fi

# From here on, trace every command and abort on the first failure.
set -x
set -e

# Run each argument, in order, as a command (per the usage line: `clean`
# and/or `fetch`). Both expansions are quoted so an argument is executed
# exactly as given, without word splitting or glob expansion.
for a in "$@"; do
    "$a"
done

lib/httpclient-4.5.3.jar

730 KB
Binary file not shown.

lib/httpcore-4.4.6.jar

316 KB
Binary file not shown.
20.7 KB
Binary file not shown.
32.9 KB
Binary file not shown.

lib/jsoup-1.11.3.jar

0 Bytes
Binary file not shown.

src/nl/melp/linkchecker/LinkChecker.java

Lines changed: 43 additions & 41 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,12 @@
22

33
import nl.melp.redis.Redis;
44
import nl.melp.redis.collections.*;
5+
import org.apache.http.Header;
6+
import org.apache.http.HttpEntity;
7+
import org.apache.http.client.methods.CloseableHttpResponse;
8+
import org.apache.http.client.methods.HttpGet;
9+
import org.apache.http.impl.client.CloseableHttpClient;
10+
import org.apache.http.impl.client.HttpClients;
511
import org.jsoup.Jsoup;
612
import org.jsoup.nodes.Document;
713
import org.jsoup.nodes.Element;
@@ -17,12 +23,8 @@
1723
import java.net.ConnectException;
1824
import java.net.Socket;
1925
import java.net.URI;
20-
import java.net.http.HttpClient;
21-
import java.net.http.HttpRequest;
22-
import java.net.http.HttpResponse;
2326
import java.security.KeyManagementException;
2427
import java.security.NoSuchAlgorithmException;
25-
import java.time.Duration;
2628
import java.time.ZoneOffset;
2729
import java.time.ZonedDateTime;
2830
import java.time.format.DateTimeFormatter;
@@ -35,15 +37,15 @@ public class LinkChecker {
3537
private static Logger logger = LoggerFactory.getLogger(LinkChecker.class);
3638
private static int timeout = 30;
3739

38-
private final BlockingDeque<HttpClient> clients;
40+
private final BlockingDeque<CloseableHttpClient> clients;
3941
private final Map<URI, Integer> statuses;
4042
private final Collection<URI> urls;
4143
private final Map<URI, Set<URI>> reverseLinks;
4244
private final Map<String, Set<URI>> invalidUrls;
4345
private final ExecutorService executor;
4446
private final Set<Future> running;
4547
private final BiPredicate<URI, URI> shouldFollowLinks;
46-
private final BiPredicate<URI, HttpResponse<String>> shouldExtractLinks;
48+
private final BiPredicate<URI, HttpEntity> shouldExtractLinks;
4749
private final int msDelay;
4850
private long startTimeMs;
4951

@@ -53,7 +55,7 @@ public LinkChecker(
5355
Map<URI, Set<URI>> reverseLinks,
5456
Map<String, Set<URI>> invalidUrls,
5557
BiPredicate<URI, URI> shouldFollowLinks,
56-
BiPredicate<URI, HttpResponse<String>> shouldExtractLinks,
58+
BiPredicate<URI, HttpEntity> shouldExtractLinks,
5759
int numThreads,
5860
int msDelay
5961
) {
@@ -69,7 +71,7 @@ public LinkChecker(
6971
this.msDelay = msDelay;
7072

7173
for (int i = 0; i < numThreads; i++) {
72-
clients.offer(HttpClient.newHttpClient());
74+
clients.offer(HttpClients.createMinimal());
7375
}
7476
}
7577

@@ -133,7 +135,7 @@ public Map<URI, Integer> run() throws InterruptedException {
133135
continue;
134136
}
135137
statuses.put(url, -1);
136-
HttpClient httpClient = clients.take();
138+
CloseableHttpClient httpClient = clients.take();
137139

138140
Future<?> task = executor.submit(
139141
() -> {
@@ -147,42 +149,42 @@ public Map<URI, Integer> run() throws InterruptedException {
147149
}
148150

149151
logger.trace("OPENING " + url);
150-
HttpRequest request = HttpRequest.newBuilder()
151-
.uri(url)
152-
.timeout(Duration.ofSeconds(timeout))
153-
.build();
154-
155-
HttpResponse<String> response = httpClient.send(request, HttpResponse.BodyHandlers.ofString());
156-
int status = response.statusCode();
157-
statuses.put(url, status);
158-
159-
if (status >= 400) {
160-
logger.info("Got status " + status + " at " + url + "; so far referred to by " + reverseLinks.get(url));
161-
} else {
162-
logger.trace("Got status " + status + " at " + url);
163-
}
164152

165-
if (shouldExtractLinks.test(url, response)) {
166-
String contentType = response.headers().firstValue("Content-Type").orElse("");
167-
if (status == 200) {
168-
if (contentType.startsWith("text/html")) {
169-
response.body();
170-
Document d = Jsoup.parse(response.body());
171-
Elements links = d.select("a[href]");
172-
logger.trace("Found " + links.size() + " on " + url);
173-
for (Element link : links) {
174-
addUrl(url, link.attr("href"));
153+
var request = new HttpGet(url);
154+
try (CloseableHttpResponse response = httpClient.execute(request)) {
155+
int status = response.getStatusLine().getStatusCode();
156+
statuses.put(url, status);
157+
158+
if (status >= 400) {
159+
logger.info("Got status " + status + " at " + url + "; so far referred to by " + reverseLinks.get(url));
160+
} else {
161+
logger.trace("Got status " + status + " at " + url);
162+
}
163+
HttpEntity responseEntity = response.getEntity();
164+
165+
if (shouldExtractLinks.test(url, responseEntity)) {
166+
Header contentTypeHeader = response.getFirstHeader("Content-Type");
167+
String contentType = contentTypeHeader == null ? "UNKNOWN" : contentTypeHeader.getValue();
168+
169+
if (status == 200) {
170+
if (contentType.startsWith("text/html")) {
171+
Document d = Jsoup.parse(responseEntity.getContent(), "UTF-8", url.toString());
172+
Elements links = d.select("a[href]");
173+
logger.trace("Found " + links.size() + " on " + url);
174+
for (Element link : links) {
175+
addUrl(url, link.attr("href"));
176+
}
177+
} else {
178+
logger.trace("Not following links in content type " + contentType);
179+
}
180+
} else if (response.getFirstHeader("Location") != null) {
181+
String location = response.getFirstHeader("Location").getValue();
182+
if (addUrl(url, location)) {
183+
logger.trace("Following redirect (" + status + ") [" + url + " => " + location);
175184
}
176185
} else {
177-
logger.trace("Not following links in content type " + response.headers().firstValue("Content-Type").orElse("UNKNOWN"));
178-
}
179-
} else if (response.headers().firstValue("Location").isPresent()) {
180-
String location = response.headers().firstValue("Location").get();
181-
if (addUrl(url, location)) {
182-
logger.trace("Following redirect (" + status + ") [" + url + " => " + location);
186+
logger.debug("Skipping {}, content-type: {}", url, contentType);
183187
}
184-
} else {
185-
logger.debug("Skipping {}, content-type: {}", url, contentType);
186188
}
187189
}
188190
} catch (java.lang.IllegalArgumentException | IOException e) {

0 commit comments

Comments
 (0)