use scraper::{ElementRef, Html, Node}; /* The goal here is to flatten the DOM as much as possible. paragraphs with fancy formatting are turned into markdown, same with */ //Supported content #[derive(Debug,Clone)] pub enum Content { Markdown(String), Image(String), Audio(String), Video(String) } //double recursion? This seems dumb. fn markdownify(item: &Item) -> String{ match markdown_content(item) { Content::Markdown(s) => { s.to_owned() } _ => {"".to_owned()} } } fn markdown_content(item: &Item) -> Content { let mut markdown = String::new(); match item { Item::Title(n,t) => { markdown = markdown + &"#".repeat(*n); let _ = t.iter().map(|i|{ markdown = "".to_owned() + &markdown + &markdownify(i); }); }, Item::BoldedText(b) => { let _ = b.iter().map(|i|{ markdown = "**".to_owned() + &markdown + &markdownify(i) + "**"; }); }, Item::EmphasisText(e) => { let _ = e.iter().map(|i|{ markdown = "*".to_owned() + &markdown + &markdownify(i) + "*"; }); } Item::Text(s) => { markdown = markdown + s; }, Item::Link(href, children) => { markdown = markdown + &markdownify(item); } Item::Paragraph(p) => { let _ = p.iter().map(|i|{ markdown = "".to_owned() + &markdown + &markdownify(i); }); } Item::UnorderedList(u) => { let _ = u.iter().map(|i|{ markdown = "".to_owned() + &markdown + &markdownify(i); }); } Item::OrderedList(o) => { let _ = o.iter().map(|i|{ markdown = "".to_owned() + &markdown + &markdownify(i); }); } _ => {} } Content::Markdown(markdown) } fn media_content(item: &Item) -> Content{ Content::Markdown("Media not supported yet".to_owned()) } pub fn process_content(content: &str) -> Vec { let items = itemize_content(content); let mut result: Vec = Vec::new(); let _ = items.iter().map(|i| { match i { Item::Paragraph(children) => { result.push(markdown_content(i)); }, Item::UnorderedList(children) => { result.push(markdown_content(i)); } Item::OrderedList(children) => { result.push(markdown_content(i)); } Item::Image(src) => { result.push(Content::Image(src.to_owned())); } Item::Video(children) => { result.push(media_content(i)); } Item::Audio(children) => { result.push(media_content(i)); } _ => {} } }); [Content::Markdown("Ayy lmao".to_owned())].to_vec() } #[derive(Debug)] enum Item { Ignore, Title(usize,Vec), Text(String), //text, links, formatting are all markdown //arguably, for better control it will be best to turn markdown into its own set of items Image(String), Svg(String),// wont' support for a while I think. Video(Vec), Audio(Vec), Source(String), BoldedText(Vec), EmphasisText(Vec), UnorderedList(Vec), OrderedList(Vec), ListItem(Vec), Paragraph(Vec),//gotta replace this with specific items, needlessly flexible Link(String,Vec), Table(Vec) } pub fn itemize_content(content: &str) -> Vec { let frag = Html::parse_fragment(content); frag.root_element().children().map(|e|{ parse_items(e) }).collect() } fn get_children(el: &ElementRef) -> Vec{ el.children().map(|c|{parse_items(c)}).collect() } fn parse_items(n: ego_tree::NodeRef<'_,Node>) -> Item{ if n.value().is_text(){ return Item::Text((&n.value().as_text().unwrap()).to_string()) } if n.value().is_element(){ let el = ElementRef::wrap(n).unwrap(); let tag_name = el.value().name(); let mut item: Item; match tag_name { "h1" => {return Item::Title(1, get_children(&el))}, "h2" => {return Item::Title(2, get_children(&el))}, "h3" => {return Item::Title(3, get_children(&el))}, "h4" => {return Item::Title(4, get_children(&el))}, "h5" => {return Item::Title(5, get_children(&el))}, "h6" => {return Item::Title(6, get_children(&el))}, "strong" => {return Item::BoldedText(get_children(&el))}, "em" => {return Item::EmphasisText(get_children(&el))}, "br" => {return Item::Text("\n".to_owned())}, "hr" => {return Item::Text("---".to_owned())} "p" => { return Item::Paragraph(get_children(&el)) }, "a" => { let href = match el.attr("href") { Some(link) => {link} None => {""} }; return Item::Link(href.to_owned(),get_children(&el)) } "img" => { match el.attr("src") { Some(src) => { return Item::Image(src.to_owned()) }, None => {return Item::Ignore} } } "source" => { match el.attr("src") { Some(src) => { return Item::Source(src.to_owned()) }, None => {return Item::Ignore} } } "video" => { return Item::Video(get_children(&el)) } "ol" => { return Item::OrderedList(get_children(&el)) } "ul" => { return Item::UnorderedList(get_children(&el)) } "li" => { return Item::ListItem(get_children(&el)) } _ => {} }; } Item::Ignore } /* Ideally I would verify what works and write tests for it. I also need a function to process markdown items. */ /* pub fn add(left: u64, right: u64) -> u64 { left + right } #[cfg(test)] mod tests { use super::*; #[test] fn it_works() { let result = add(2, 2); assert_eq!(result, 4); } } */ #[cfg(test)] mod tests { mod example_data; use crate::{itemize_content, process_content, tests::example_data::FEEDS}; fn get_feed(u: &str) -> rss::Channel { rss::Channel::read_from(u.as_bytes()).unwrap() } #[test] fn itemize_feeds(){ let _ = FEEDS.map(|u|{ let feed = get_feed(u); let results: Vec<_> = feed.items.into_iter().map(|item| { itemize_content(&item.content.unwrap()); }).collect(); //let results: Vec<_> = itemize_content(u); println!("Evaluated feed\nScanned {} items without errors",results.len()) }); } #[test] fn markdownify_feeds(){ let _ = FEEDS.map(|u|{ let feed = get_feed(u); let results: Vec<_> = feed.items.into_iter().map(|item|{ process_content(&item.content.unwrap()); }).collect(); println!("Processed {} items without errors",results.len()) }); } }