@@ -87,18 +87,29 @@ impl Tool for WebFetchTool {
8787 return Ok ( ToolOutput :: error ( reason) ) ;
8888 }
8989
90- // TODO: Replace with real HTTP fetch + HTML-to-text in Phase 2.
91- // Implementation plan:
92- // 1. reqwest::Client with timeout_secs, redirect policy (max 5)
93- // 2. Check Content-Length against max_size before downloading
94- // 3. Download body with streaming size limit
95- // 4. Detect Content-Type: HTML → strip tags, JSON → pretty-print,
96- // plain text → return as-is
97- // 5. Truncate to ~100k chars to avoid context overflow
98- // 6. Apply prompt as extraction instruction (optional LLM pass)
99-
100- let stub_content = stub_fetch ( & url, & prompt, timeout_secs, max_size) ;
101- Ok ( ToolOutput :: success ( stub_content) )
90+ let content = fetch_url ( & url, timeout_secs, max_size) . await ?;
91+
92+ // Strip HTML tags if the response looks like HTML
93+ let text = if content. contains ( "<html" ) || content. contains ( "<!DOCTYPE" ) {
94+ strip_html_tags ( & content)
95+ } else {
96+ content
97+ } ;
98+
99+ // Truncate to prevent context overflow (~100k chars)
100+ let truncated = if text. len ( ) > 100_000 {
101+ format ! (
102+ "{}...\n \n [truncated — full page was {} chars]" ,
103+ & text[ ..100_000 ] ,
104+ text. len( )
105+ )
106+ } else {
107+ text
108+ } ;
109+
110+ Ok ( ToolOutput :: success ( format ! (
111+ "# Web Fetch: {url}\n \n **Prompt:** {prompt}\n \n ---\n \n {truncated}"
112+ ) ) )
102113 } )
103114 }
104115
@@ -128,23 +139,78 @@ fn validate_url(url: &str) -> std::result::Result<(), String> {
128139 Ok ( ( ) )
129140}
130141
131- /// Generate a stub response for development/testing.
132- fn stub_fetch ( url : & str , prompt : & str , timeout_secs : u64 , max_size : u64 ) -> String {
133- format ! (
134- "# Web Fetch Result\n \n \
135- **URL:** {url}\n \
136- **Prompt:** {prompt}\n \
137- **Timeout:** {timeout_secs}s\n \
138- **Max size:** {max_size} bytes\n \n \
139- ---\n \n \
140- This is a placeholder response. Web fetching is not yet connected to a \
141- real HTTP client. In Phase 2, this tool will:\n \
142- - Fetch the page via reqwest with configurable timeout\n \
143- - Convert HTML to plain text (strip tags, extract main content)\n \
144- - Apply size limits to prevent context overflow\n \
145- - Optionally use the prompt to guide content extraction\n \n \
146- To test with real content, configure an HTTP client in the tools crate."
147- )
142+ /// Fetch a URL using curl subprocess.
143+ async fn fetch_url ( url : & str , timeout_secs : u64 , max_size : u64 ) -> crab_common:: Result < String > {
144+ let cmd = format ! (
145+ "curl -sS -L --max-time {timeout_secs} --max-filesize {max_size} -A 'CrabCode/1.0' '{url}'"
146+ ) ;
147+ let mut opts = crab_process:: spawn:: shell_command ( & cmd) ;
148+ opts. timeout = Some ( std:: time:: Duration :: from_secs ( timeout_secs + 5 ) ) ;
149+
150+ let output = crab_process:: spawn:: run ( opts) . await ?;
151+ if output. exit_code != 0 {
152+ return Err ( crab_common:: Error :: Other ( format ! (
153+ "curl failed (exit {}): {}" ,
154+ output. exit_code,
155+ output. stderr. trim( )
156+ ) ) ) ;
157+ }
158+ Ok ( output. stdout )
159+ }
160+
/// Strip HTML tags to extract plain text content.
///
/// Everything between a `<` and the next `>` is dropped, as is the content of
/// `<script>` and `<style>` elements (tag names are matched ASCII
/// case-insensitively). Afterwards every line is trimmed and runs of blank
/// lines are collapsed into a single empty line; each emitted line ends in
/// `\n`.
///
/// Limitations: character entities such as `&amp;` are not decoded, and a bare
/// `<` in text is treated as the start of a tag.
fn strip_html_tags(html: &str) -> String {
    /// ASCII case-insensitive equivalent of `str::starts_with`.
    fn starts_with_ignore_case(haystack: &str, prefix: &str) -> bool {
        match haystack.as_bytes().get(..prefix.len()) {
            Some(head) => head.eq_ignore_ascii_case(prefix.as_bytes()),
            None => false,
        }
    }

    /// ASCII case-insensitive equivalent of `str::ends_with`.
    fn ends_with_ignore_case(haystack: &str, suffix: &str) -> bool {
        let bytes = haystack.as_bytes();
        bytes.len() >= suffix.len()
            && bytes[bytes.len() - suffix.len()..].eq_ignore_ascii_case(suffix.as_bytes())
    }

    let mut text = String::with_capacity(html.len() / 2);
    let mut in_tag = false;
    let mut in_script = false;
    let mut in_style = false;

    // Compare against the original string rather than a lowercased copy:
    // `to_lowercase` can change the number of characters (e.g. `İ`), which
    // would misalign positions between the two and break tag detection.
    for (idx, ch) in html.char_indices() {
        if !in_tag && ch == '<' {
            let rest = &html[idx..];
            if starts_with_ignore_case(rest, "<script") {
                in_script = true;
            } else if starts_with_ignore_case(rest, "<style") {
                in_style = true;
            }
            in_tag = true;
        } else if in_tag && ch == '>' {
            // `>` is a single byte, so `..=idx` ends on a char boundary.
            let seen = &html[..=idx];
            if ends_with_ignore_case(seen, "</script>") {
                in_script = false;
            } else if ends_with_ignore_case(seen, "</style>") {
                in_style = false;
            }
            in_tag = false;
        } else if !in_tag && !in_script && !in_style {
            text.push(ch);
        }
    }

    // Clean up excessive whitespace: trim each line and keep at most one
    // blank line between non-empty lines.
    let mut cleaned = String::with_capacity(text.len());
    let mut prev_blank = false;
    for line in text.lines() {
        let trimmed = line.trim();
        if trimmed.is_empty() {
            if !prev_blank {
                cleaned.push('\n');
                prev_blank = true;
            }
        } else {
            cleaned.push_str(trimmed);
            cleaned.push('\n');
            prev_blank = false;
        }
    }
    cleaned
}
149215
150216#[ cfg( test) ]
0 commit comments