Skip to content

Commit b73eedd

Browse files
committed
feat(actions): add an action to transform html to markdown
1 parent 0424bb5 commit b73eedd

9 files changed

Lines changed: 1393 additions & 2 deletions

File tree

Cargo.toml

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@ crate-type = ["staticlib", "cdylib", "rlib"]
1515
[build-dependencies]
1616
cbindgen = "0.29.2"
1717
glob = "^0.3.3"
18+
html-to-markdown-rs = "3.0.2"
1819
libtool = "0.1.1"
1920
linked_hash_set = { version = "0.1.6", features = ["serde"] }
2021
serde = { version = "1.0.228", features = ["derive"] }
@@ -45,6 +46,7 @@ cidr = { version = "0.3.2", features = ["serde"] }
4546
dot_graph = { version = "0.2.3", optional = true }
4647
flate2 = { version = "1.1.9", optional = true }
4748
heck = "0.5.0"
49+
html-to-markdown-rs = "3.0.2"
4850
http = "1.4.0"
4951
lazy_static = "1.5.0"
5052
linked_hash_set = { version = "0.1.6", features = ["serde"] }
@@ -63,6 +65,7 @@ url = "2.5.8"
6365
[dev-dependencies]
6466
pprof = { version = "0.15.0", features = ["flamegraph"] }
6567
criterion = { version = "0.8.2", default-features = false }
68+
tracing-subscriber = "0.3.17"
6669

6770
[[bench]]
6871
name = "match_rule_benchmark"

src/action/mod.rs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -226,6 +226,7 @@ impl Action {
226226
id: text_body_filter.id.clone(),
227227
target_hash: text_body_filter.target_hash.clone(),
228228
}),
229+
BodyFilter::HTMLToMarkdown(html_to_markdown_filter) => BodyFilter::HTMLToMarkdown(html_to_markdown_filter.clone()),
229230
BodyFilter::Other(_) => {
230231
continue;
231232
}

src/api/body_filter.rs

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
use html_to_markdown_rs::ConversionOptions;
12
use serde::{Deserialize, Serialize};
23

34
use crate::{api::VariableValue, marker::StaticOrDynamic};
@@ -145,11 +146,24 @@ pub enum TextAction {
145146
Replace,
146147
}
147148

149+
/// API payload describing an HTML-to-Markdown body filter.
///
/// * `action` - discriminator; see `HTMLToMarkdownAction`.
/// * `options` - optional `html_to_markdown_rs::ConversionOptions`; may be absent,
///   in which case `None` is carried through unchanged.
#[derive(Serialize, Deserialize, Debug, Clone)]
150+
pub struct HTMLToMarkdownFilter {
151+
pub action: HTMLToMarkdownAction,
152+
pub options: Option<ConversionOptions>,
153+
}
154+
155+
/// Action discriminator carried by `HTMLToMarkdownFilter`.
///
/// The single variant `Filter` is (de)serialized as the string `"html_to_markdown"`.
#[derive(Serialize, Deserialize, Debug, Clone)]
156+
pub enum HTMLToMarkdownAction {
157+
#[serde(rename = "html_to_markdown")]
158+
Filter,
159+
}
160+
148161
/// A body filter definition, deserialized without a tag.
///
/// Because the enum is `#[serde(untagged)]`, serde tries the variants in declaration
/// order and keeps the first one that deserializes successfully; the order of the
/// variants below is therefore significant. `Other` holds an arbitrary
/// `serde_json::Value` and comes last, so it receives any input that none of the typed
/// variants accepted.
// NOTE(review): the variant-level `#[serde(untagged)]` on `Other` looks redundant with
// the container-level attribute - confirm before removing.
#[derive(Serialize, Deserialize, Debug, Clone)]
149162
#[serde(untagged)]
150163
pub enum BodyFilter {
151164
Text(TextBodyFilter),
152165
HTML(HTMLBodyFilter),
166+
HTMLToMarkdown(HTMLToMarkdownFilter),
153167
#[serde(untagged)]
154168
Other(serde_json::Value),
155169
}

src/build.rs

Lines changed: 32 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -11,10 +11,11 @@ use std::{
1111
};
1212

1313
use glob::glob;
14+
use html_to_markdown_rs::ConversionOptions;
1415
use linked_hash_set::LinkedHashSet;
1516
use serde::{Deserialize, Serialize};
1617
use serde_yaml::from_str as yaml_decode;
17-
use tera::{Context, Tera};
18+
use tera::{Context, Tera, Value, try_get_value};
1819
#[derive(Serialize, Deserialize, Debug, Clone)]
1920
struct RuleSet {
2021
#[serde(default)]
@@ -180,6 +181,8 @@ struct BodyFilter {
180181
css_selector: Option<String>,
181182
#[serde(skip_serializing_if = "Option::is_none")]
182183
ignore_css_selector: Option<String>,
184+
#[serde(skip_serializing_if = "Option::is_none")]
185+
options: Option<ConversionOptions>,
183186
}
184187

185188
#[derive(Serialize, Deserialize, Debug, Clone)]
@@ -270,10 +273,23 @@ struct RuleTestHeader {
270273
/// Body-filtering expectation read from a rule test definition.
///
/// * `enable` - flag deserialized from the test definition (required).
/// * `is_binary` - optional; falls back to `default_false()` (i.e. `false`) when the
///   field is missing. Presumably selects byte-escaped rendering of the bodies in the
///   generated test - TODO confirm against the test template.
/// * `original_body` / `expected_body` - body before and after filtering, as strings.
#[derive(Serialize, Deserialize, Debug, Clone)]
271274
struct ShouldFilterBody {
272275
enable: bool,
276+
#[serde(default = "default_false")]
277+
is_binary: bool,
273278
original_body: String,
274279
expected_body: String,
275280
}
276281

282+
/// A body value that is either text or raw bytes.
///
/// Untagged: serde tries `String` first, then `Bytes`.
// NOTE(review): no use of this type is visible in the surrounding diff - confirm it is
// actually referenced, otherwise it is dead code in the build script.
#[derive(Serialize, Deserialize, Debug, Clone)]
283+
#[serde(untagged)]
284+
pub enum Body {
285+
String(String),
286+
Bytes(Vec<u8>),
287+
}
288+
289+
/// Serde default provider for boolean fields that are off unless explicitly set.
///
/// Always returns `false`.
fn default_false() -> bool {
    bool::default()
}
292+
277293
#[derive(Serialize, Deserialize, Debug, Clone)]
278294
struct ShouldFilterHeader {
279295
enable: bool,
@@ -347,11 +363,13 @@ fn make_router_tests() {
347363
return;
348364
}
349365

350-
let templating = match Tera::new("tests/templates/**/*") {
366+
let mut templating = match Tera::new("tests/templates/**/*") {
351367
Ok(t) => t,
352368
Err(e) => panic!("{}", e),
353369
};
354370

371+
templating.register_filter("as_bytes", filter_as_bytes_str);
372+
355373
let rule_sets_list = RuleSetList { rule_sets };
356374
let test_path = Path::new("tests/redirectionio_router_test.rs");
357375

@@ -372,6 +390,18 @@ fn make_router_tests() {
372390
}
373391
}
374392

393+
fn filter_as_bytes_str(value: &Value, _: &HashMap<String, Value>) -> tera::Result<Value> {
394+
let body = try_get_value!("as_bytes", "value", String, value);
395+
396+
Ok(Value::String(
397+
body.as_bytes()
398+
.iter()
399+
.map(|b| format!("\\x{b:02x}"))
400+
.collect::<Vec<String>>()
401+
.join(""),
402+
))
403+
}
404+
375405
fn read_router_tests(path: &str) -> HashMap<String, RuleSet> {
376406
let mut rule_sets = HashMap::new();
377407

src/filter/filter_body.rs

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@ use crate::{
1313
HtmlBodyVisitor,
1414
body_capture::{BodyCapture, CaptureRegistry},
1515
},
16+
html_to_markdown::HtmlToMarkdownFilter,
1617
text_filter_body::{TextFilterAction, TextFilterBodyAction},
1718
},
1819
http::Header,
@@ -27,6 +28,7 @@ pub struct FilterBodyAction {
2728
#[derive(Debug)]
2829
pub enum FilterBodyActionItem {
2930
Buffer(BufferFilterBody),
31+
HtmlToMarkdown(Box<HtmlToMarkdownFilter>),
3032
Html(Box<HtmlFilterBodyAction>),
3133
Text(TextFilterBodyAction),
3234
#[cfg(feature = "compress")]
@@ -218,6 +220,24 @@ impl FilterBodyActionItem {
218220
},
219221
text_body_filter.content,
220222
))),
223+
BodyFilter::HTMLToMarkdown(html_to_md_filter) => match content_type {
224+
Some(content_type) if content_type.contains("text/html") => {
225+
// @TODO Support charset
226+
Some(Self::HtmlToMarkdown(Box::new(HtmlToMarkdownFilter::new(html_to_md_filter.options))))
227+
}
228+
None => {
229+
// Assume HTML if no content type
230+
Some(Self::HtmlToMarkdown(Box::new(HtmlToMarkdownFilter::new(html_to_md_filter.options))))
231+
}
232+
_ => {
233+
log::warn!(
234+
"html to markdown is only supported for text/html content type, {} received",
235+
content_type.unwrap_or_default()
236+
);
237+
238+
None
239+
}
240+
},
221241
BodyFilter::Other(_) => {
222242
log::warn!("unsupported body filter: {filter:?}, you may need to update your agent or module");
223243
None
@@ -229,6 +249,7 @@ impl FilterBodyActionItem {
229249
Ok(match self {
230250
FilterBodyActionItem::Buffer(buffer) => buffer.filter(data),
231251
FilterBodyActionItem::Html(html_body_filter) => html_body_filter.filter(data)?,
252+
FilterBodyActionItem::HtmlToMarkdown(html_to_md_filter) => html_to_md_filter.filter(data),
232253
FilterBodyActionItem::Text(text_body_filter) => text_body_filter.filter(data, unit_trace),
233254
#[cfg(feature = "compress")]
234255
FilterBodyActionItem::Decode(decode_body_filter) => decode_body_filter.filter(data)?,
@@ -241,6 +262,7 @@ impl FilterBodyActionItem {
241262
Ok(match self {
242263
FilterBodyActionItem::Buffer(buffer) => buffer.end(),
243264
FilterBodyActionItem::Html(html_body_filter) => html_body_filter.end(),
265+
FilterBodyActionItem::HtmlToMarkdown(html_to_md_filter) => html_to_md_filter.end(),
244266
FilterBodyActionItem::Text(text_body_filter) => text_body_filter.end(),
245267
#[cfg(feature = "compress")]
246268
FilterBodyActionItem::Decode(decode_body_filter) => decode_body_filter.end()?,

src/filter/html_to_markdown.rs

Lines changed: 41 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,41 @@
1+
use html_to_markdown_rs::{ConversionOptions, convert};
2+
3+
#[derive(Debug)]
4+
pub struct HtmlToMarkdownFilter {
5+
buffer: Vec<u8>,
6+
options: Option<ConversionOptions>,
7+
}
8+
9+
impl HtmlToMarkdownFilter {
10+
pub fn new(options: Option<ConversionOptions>) -> Self {
11+
Self {
12+
buffer: Vec::new(),
13+
options,
14+
}
15+
}
16+
17+
pub fn filter(&mut self, input: Vec<u8>) -> Vec<u8> {
18+
self.buffer.extend_from_slice(&input);
19+
Vec::new()
20+
}
21+
22+
pub fn end(self) -> Vec<u8> {
23+
let html = match String::from_utf8(self.buffer) {
24+
Err(e) => {
25+
log::error!("error while converting to utf8: {}", e);
26+
27+
return e.into_bytes();
28+
}
29+
Ok(html) => html,
30+
};
31+
32+
match convert(html.as_str(), self.options) {
33+
Ok(md) => md.content.unwrap_or_default().into_bytes(),
34+
Err(e) => {
35+
log::error!("error while converting html to markdown: {}", e);
36+
37+
html.into_bytes()
38+
}
39+
}
40+
}
41+
}

src/filter/mod.rs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@ mod filter_header;
88
mod header_action;
99
mod html_body_action;
1010
mod html_filter_body;
11+
mod html_to_markdown;
1112
mod text_filter_body;
1213

1314
pub use buffer::Buffer;

0 commit comments

Comments
 (0)