use scraper::{ElementRef, Html, Node}; /* The goal here is to flatten the DOM as much as possible. paragraphs with fancy formatting are turned into markdown, same with */ //Supported content #[derive(Debug,Clone)] pub enum Content { Markdown(String), Image(String), Audio(String), Video(String), Ignore } pub fn parse_content(c: &str) -> Vec{ process_content(itemize_content(c)) } fn markdownify_child(item: &Item) -> String { let mut result = "".to_owned(); match markdown_content(&item) { Content::Markdown(s) => { result = result + &s; }, _ => {} } result } fn process_children(children: &Vec) -> String { let mut result = "".to_owned(); for c in children{ result = result + &markdownify_child(c); } result } fn markdown_content(item: &Item) -> Content { let mut markdown = String::new(); match item { Item::Title(n,children) => { markdown = markdown + &"#".repeat(*n) + " " +&process_children(children); }, Item::BoldedText(children) => { markdown = format!("**{}**",process_children(children)); }, Item::EmphasisText(children) => { markdown = format!("*{}*",process_children(children)); } Item::Text(s) => { markdown = markdown + s; }, Item::Link(href, children) => { markdown = markdown + &format!("[{}]({})",process_children(children),href); } Item::Paragraph(children) => { markdown = markdown + &process_children(children); } Item::UnorderedList(children) => { markdown = markdown + &process_children(children); } Item::OrderedList(children) => { markdown = markdown + &process_children(children); } Item::ListItem(children) => { markdown = "\n- ".to_owned() + &process_children(children); } _ => {} } Content::Markdown(markdown) } fn media_content(item: &Item) -> Content{ Content::Markdown("Media not supported yet".to_owned()) } fn process_content(items: Vec) -> Vec { let mut result: Vec = Vec::new(); //println!("Converting {} items into Content",items.len()); for i in &items { match i { Item::Title(_,_) => { result.push(markdown_content(i)); } Item::Paragraph(children) => { result.push(markdown_content(i)); }, Item::Link(href,children) => { result.push(markdown_content(i)) } Item::UnorderedList(children) => { result.push(markdown_content(i)); } Item::OrderedList(children) => { result.push(markdown_content(i)); } Item::ListItem(children) => { result.push(markdown_content(i)); } Item::Image(src) => { result.push(Content::Image(src.to_owned())); } Item::Video(children) => { result.push(media_content(i)); } Item::Audio(children) => { result.push(media_content(i)); } _ => { result.push(Content::Ignore); } } } result } #[derive(Debug,Clone)] enum Item { Ignore, Title(usize,Vec), Text(String), //text, links, formatting are all markdown //arguably, for better control it will be best to turn markdown into its own set of items Image(String), Svg(String),// wont' support for a while I think. Video(Vec), Audio(Vec), Source(String), BoldedText(Vec), EmphasisText(Vec), UnorderedList(Vec), OrderedList(Vec), ListItem(Vec), Paragraph(Vec),//gotta replace this with specific items, needlessly flexible Link(String,Vec), Table(Vec) } fn itemize_content(content: &str) -> Vec { let frag = Html::parse_fragment(content); frag.root_element().children().map(|e|{ parse_items(e) }).collect() } fn get_children(el: &ElementRef) -> Vec{ el.children().map(|c|{parse_items(c)}).collect() } fn parse_items(n: ego_tree::NodeRef<'_,Node>) -> Item{ if n.value().is_text(){ return Item::Text((&n.value().as_text().unwrap()).to_string()) } if n.value().is_element(){ let el = ElementRef::wrap(n).unwrap(); let tag_name = el.value().name(); match tag_name { "h1" => {return Item::Title(1, get_children(&el))}, "h2" => {return Item::Title(2, get_children(&el))}, "h3" => {return Item::Title(3, get_children(&el))}, "h4" => {return Item::Title(4, get_children(&el))}, "h5" => {return Item::Title(5, get_children(&el))}, "h6" => {return Item::Title(6, get_children(&el))}, "strong" => {return Item::BoldedText(get_children(&el))}, "em" => {return Item::EmphasisText(get_children(&el))}, "br" => {return Item::Text("\n".to_owned())}, "hr" => {return Item::Text("---".to_owned())} "p" => { return Item::Paragraph(get_children(&el)) }, "a" => { let href = match el.attr("href") { Some(link) => {link} None => {""} }; return Item::Link(href.to_owned(),get_children(&el)) } "img" => { match el.attr("src") { Some(src) => { return Item::Image(src.to_owned()) }, None => {return Item::Ignore} } } "source" => { match el.attr("src") { Some(src) => { return Item::Source(src.to_owned()) }, None => {return Item::Ignore} } } "video" => { return Item::Video(get_children(&el)) } "ol" => { return Item::OrderedList(get_children(&el)) } "ul" => { return Item::UnorderedList(get_children(&el)) } "li" => { return Item::ListItem(get_children(&el)) } _ => {} }; } Item::Ignore } /* Ideally I would verify what works and write tests for it. I also need a function to process markdown items. */ /* pub fn add(left: u64, right: u64) -> u64 { left + right } #[cfg(test)] mod tests { use super::*; #[test] fn it_works() { let result = add(2, 2); assert_eq!(result, 4); } } */ #[cfg(test)] mod tests { mod example_data; use crate::{itemize_content, process_content, tests::example_data::FEEDS,Content,Item}; fn get_feed(u: &str) -> rss::Channel { rss::Channel::read_from(u.as_bytes()).unwrap() } #[test] fn content_test(){ let example_text = Item::Text("Example.com".to_owned()); let example_link = Item::Link("https://example.com".to_owned(),[example_text].to_vec()); let result = process_content([example_link].to_vec()); println!("Items to content parse result:\n{:#?}",result); } #[test] fn content_display() { let feed = get_feed(example_data::GABE_ROCKS); let content: Vec<_> = process_content( itemize_content(feed.items.first().unwrap().content().unwrap()) ); println!("Content: {:#?}",content) } #[test] fn itemize_feeds(){ let _ = FEEDS.map(|u|{ let feed = get_feed(u); let results: Vec<_> = feed.items.into_iter().map(|item| { itemize_content(&item.content.unwrap()); }).collect(); //let results: Vec<_> = itemize_content(u); println!("Evaluated feed\nScanned {} items without errors",results.len()) }); } #[test] fn markdownify_feeds(){ let _ = FEEDS.map(|u|{ let feed = get_feed(u); let results: Vec<_> = feed.items.into_iter().map(|item|{ process_content( itemize_content(&item.content.unwrap()) ); }).collect(); println!("Processed {} items without errors",results.len()) }); } }