22
33import nl .melp .redis .Redis ;
44import nl .melp .redis .collections .*;
5+ import org .apache .http .Header ;
6+ import org .apache .http .HttpEntity ;
7+ import org .apache .http .client .methods .CloseableHttpResponse ;
8+ import org .apache .http .client .methods .HttpGet ;
9+ import org .apache .http .impl .client .CloseableHttpClient ;
10+ import org .apache .http .impl .client .HttpClients ;
511import org .jsoup .Jsoup ;
612import org .jsoup .nodes .Document ;
713import org .jsoup .nodes .Element ;
1723import java .net .ConnectException ;
1824import java .net .Socket ;
1925import java .net .URI ;
20- import java .net .http .HttpClient ;
21- import java .net .http .HttpRequest ;
22- import java .net .http .HttpResponse ;
2326import java .security .KeyManagementException ;
2427import java .security .NoSuchAlgorithmException ;
25- import java .time .Duration ;
2628import java .time .ZoneOffset ;
2729import java .time .ZonedDateTime ;
2830import java .time .format .DateTimeFormatter ;
@@ -35,15 +37,15 @@ public class LinkChecker {
3537 private static Logger logger = LoggerFactory .getLogger (LinkChecker .class );
3638 private static int timeout = 30 ;
3739
38- private final BlockingDeque <HttpClient > clients ;
40+ private final BlockingDeque <CloseableHttpClient > clients ;
3941 private final Map <URI , Integer > statuses ;
4042 private final Collection <URI > urls ;
4143 private final Map <URI , Set <URI >> reverseLinks ;
4244 private final Map <String , Set <URI >> invalidUrls ;
4345 private final ExecutorService executor ;
4446 private final Set <Future > running ;
4547 private final BiPredicate <URI , URI > shouldFollowLinks ;
46- private final BiPredicate <URI , HttpResponse < String > > shouldExtractLinks ;
48+ private final BiPredicate <URI , HttpEntity > shouldExtractLinks ;
4749 private final int msDelay ;
4850 private long startTimeMs ;
4951
@@ -53,7 +55,7 @@ public LinkChecker(
5355 Map <URI , Set <URI >> reverseLinks ,
5456 Map <String , Set <URI >> invalidUrls ,
5557 BiPredicate <URI , URI > shouldFollowLinks ,
56- BiPredicate <URI , HttpResponse < String > > shouldExtractLinks ,
58+ BiPredicate <URI , HttpEntity > shouldExtractLinks ,
5759 int numThreads ,
5860 int msDelay
5961 ) {
@@ -69,7 +71,7 @@ public LinkChecker(
6971 this .msDelay = msDelay ;
7072
7173 for (int i = 0 ; i < numThreads ; i ++) {
72- clients .offer (HttpClient . newHttpClient ());
74+ clients .offer (HttpClients . createMinimal ());
7375 }
7476 }
7577
@@ -133,7 +135,7 @@ public Map<URI, Integer> run() throws InterruptedException {
133135 continue ;
134136 }
135137 statuses .put (url , -1 );
136- HttpClient httpClient = clients .take ();
138+ CloseableHttpClient httpClient = clients .take ();
137139
138140 Future <?> task = executor .submit (
139141 () -> {
@@ -147,42 +149,42 @@ public Map<URI, Integer> run() throws InterruptedException {
147149 }
148150
149151 logger .trace ("OPENING " + url );
150- HttpRequest request = HttpRequest .newBuilder ()
151- .uri (url )
152- .timeout (Duration .ofSeconds (timeout ))
153- .build ();
154-
155- HttpResponse <String > response = httpClient .send (request , HttpResponse .BodyHandlers .ofString ());
156- int status = response .statusCode ();
157- statuses .put (url , status );
158-
159- if (status >= 400 ) {
160- logger .info ("Got status " + status + " at " + url + "; so far referred to by " + reverseLinks .get (url ));
161- } else {
162- logger .trace ("Got status " + status + " at " + url );
163- }
164152
165- if (shouldExtractLinks .test (url , response )) {
166- String contentType = response .headers ().firstValue ("Content-Type" ).orElse ("" );
167- if (status == 200 ) {
168- if (contentType .startsWith ("text/html" )) {
169- response .body ();
170- Document d = Jsoup .parse (response .body ());
171- Elements links = d .select ("a[href]" );
172- logger .trace ("Found " + links .size () + " on " + url );
173- for (Element link : links ) {
174- addUrl (url , link .attr ("href" ));
153+ var request = new HttpGet (url );
154+ try (CloseableHttpResponse response = httpClient .execute (request )) {
155+ int status = response .getStatusLine ().getStatusCode ();
156+ statuses .put (url , status );
157+
158+ if (status >= 400 ) {
159+ logger .info ("Got status " + status + " at " + url + "; so far referred to by " + reverseLinks .get (url ));
160+ } else {
161+ logger .trace ("Got status " + status + " at " + url );
162+ }
163+ HttpEntity responseEntity = response .getEntity ();
164+
165+ if (shouldExtractLinks .test (url , responseEntity )) {
166+ Header contentTypeHeader = response .getFirstHeader ("Content-Type" );
167+ String contentType = contentTypeHeader == null ? "UNKNOWN" : contentTypeHeader .getValue ();
168+
169+ if (status == 200 ) {
170+ if (contentType .startsWith ("text/html" )) {
171+ Document d = Jsoup .parse (responseEntity .getContent (), "UTF-8" , url .toString ());
172+ Elements links = d .select ("a[href]" );
173+ logger .trace ("Found " + links .size () + " on " + url );
174+ for (Element link : links ) {
175+ addUrl (url , link .attr ("href" ));
176+ }
177+ } else {
178+ logger .trace ("Not following links in content type " + contentType );
179+ }
180+ } else if (response .getFirstHeader ("Location" ) != null ) {
181+ String location = response .getFirstHeader ("Location" ).getValue ();
182+ if (addUrl (url , location )) {
183+ logger .trace ("Following redirect (" + status + ") [" + url + " => " + location );
175184 }
176185 } else {
177- logger .trace ("Not following links in content type " + response .headers ().firstValue ("Content-Type" ).orElse ("UNKNOWN" ));
178- }
179- } else if (response .headers ().firstValue ("Location" ).isPresent ()) {
180- String location = response .headers ().firstValue ("Location" ).get ();
181- if (addUrl (url , location )) {
182- logger .trace ("Following redirect (" + status + ") [" + url + " => " + location );
186+ logger .debug ("Skipping {}, content-type: {}" , url , contentType );
183187 }
184- } else {
185- logger .debug ("Skipping {}, content-type: {}" , url , contentType );
186188 }
187189 }
188190 } catch (java .lang .IllegalArgumentException | IOException e ) {
0 commit comments