rss_content/src/lib.rs

use scraper::{ElementRef, Html, Node};

/*
    The goal here is to flatten the DOM as much as possible.
    Paragraphs with fancy formatting are turned into markdown, same with lists.
*/

/// The kinds of flattened content this crate hands back to callers.
#[derive(Debug, Clone)]
pub enum Content {
    /// Markdown source text.
    Markdown(String),
    /// An image reference (the `src` value taken from an `<img>` element).
    Image(String),
    /// Audio payload. NOTE(review): nothing in this file constructs this variant yet;
    /// presumably a source URL — confirm once audio support lands.
    Audio(String),
    /// Video payload. NOTE(review): nothing in this file constructs this variant yet;
    /// presumably a source URL — confirm once video support lands.
    Video(String),
}

/// Renders `item`, together with everything nested inside it, as Markdown text.
///
/// * `Text` is emitted verbatim.
/// * `Title(n, ..)` becomes `n` `#` characters, a space, then the rendered children.
/// * `BoldedText` / `EmphasisText` wrap their rendered children once in `**` / `*`.
/// * `Link(href, ..)` becomes `[children](href)`.
/// * `Paragraph` and a stand-alone `ListItem` are the concatenation of their children.
/// * `UnorderedList` / `OrderedList` emit one line per `ListItem` child.
///
/// Items without a Markdown form (`Ignore`, media, `Table`) render as an empty
/// string, so media nested inside a paragraph is dropped from the output.
fn markdownify(item: &Item) -> String {
    match item {
        Item::Text(text) => text.clone(),
        Item::Title(level, children) => {
            // CommonMark only recognises an ATX heading when a space follows the `#` run.
            format!("{} {}", "#".repeat(*level), markdownify_all(children))
        }
        Item::BoldedText(children) => format!("**{}**", markdownify_all(children)),
        Item::EmphasisText(children) => format!("*{}*", markdownify_all(children)),
        // Render the link's children, never the link itself, so the recursion terminates.
        Item::Link(href, children) => format!("[{}]({})", markdownify_all(children), href),
        Item::Paragraph(children) | Item::ListItem(children) => markdownify_all(children),
        Item::UnorderedList(entries) => markdownify_list(entries, false),
        Item::OrderedList(entries) => markdownify_list(entries, true),
        Item::Ignore
        | Item::Image(_)
        | Item::Svg(_)
        | Item::Video(_)
        | Item::Audio(_)
        | Item::Source(_)
        | Item::Table(_) => String::new(),
    }
}

/// Renders every item in `items` and concatenates the results in order.
fn markdownify_all(items: &[Item]) -> String {
    // `collect` drives the iterator; a bare `map` is lazy and would render nothing.
    items.iter().map(markdownify).collect()
}

/// Renders the `ListItem` children of a list, one per line.
///
/// Ordered lists are numbered from 1; unordered lists use `- `. Children that are
/// not `ListItem`s (for example text nodes sitting between `<li>` tags) are skipped.
/// Each entry's text is trimmed so surrounding whitespace cannot break the marker.
fn markdownify_list(entries: &[Item], ordered: bool) -> String {
    let list_items = entries.iter().filter_map(|entry| match entry {
        Item::ListItem(children) => Some(children),
        _ => None,
    });

    let mut out = String::new();
    for (index, children) in list_items.enumerate() {
        if ordered {
            out.push_str(&format!("{}. ", index + 1));
        } else {
            out.push_str("- ");
        }
        out.push_str(markdownify_all(children).trim());
        out.push('\n');
    }
    out
}

/// Renders `item` as Markdown and wraps the text in [`Content::Markdown`].
///
/// Always returns the `Markdown` variant; see [`markdownify`] for the rendering rules.
fn markdown_content(item: &Item) -> Content {
    Content::Markdown(markdownify(item))
}

/// Placeholder for audio/video handling.
///
/// The `item` argument is currently ignored; every call yields the same Markdown notice.
fn media_content(item: &Item) -> Content {
    let notice = String::from("Media not supported yet");
    Content::Markdown(notice)
}


/// Converts an HTML fragment into a flat list of renderable [`Content`] pieces.
///
/// `content` is split into top-level items with `itemize_content`, then:
/// * headings, paragraphs and lists are rendered through `markdown_content`;
/// * images become [`Content::Image`] carrying the image's `src`;
/// * video and audio elements go through `media_content`;
/// * every other top-level item is skipped.
///
/// The output preserves the order of the input. Empty input yields an empty vector.
pub fn process_content(content: &str) -> Vec<Content> {
    let items = itemize_content(content);
    let mut result: Vec<Content> = Vec::new();

    // A `for` loop (not a bare, lazy `map`) so each item is actually processed.
    for item in &items {
        match item {
            Item::Title(..)
            | Item::Paragraph(_)
            | Item::UnorderedList(_)
            | Item::OrderedList(_) => result.push(markdown_content(item)),
            Item::Image(src) => result.push(Content::Image(src.clone())),
            Item::Video(_) | Item::Audio(_) => result.push(media_content(item)),
            _ => {}
        }
    }

    result
}

/// Intermediate tree produced while walking the parsed HTML.
#[derive(Debug)]
enum Item {
    /// A node that is not represented in the output.
    Ignore,
    /// Heading: level, then the heading's child items.
    Title(usize, Vec<Item>),
    /// Literal text.
    Text(String),
    // Text, links and formatting all end up as markdown.
    // Arguably, for better control, markdown should get its own set of items.
    /// Image, identified by its `src`.
    Image(String),
    /// SVG content; not expected to be supported for a while.
    Svg(String),
    /// Video element and its child items.
    Video(Vec<Item>),
    /// Audio element and its child items.
    Audio(Vec<Item>),
    /// Media source, identified by its `src`.
    Source(String),
    /// Bold text and its child items.
    BoldedText(Vec<Item>),
    /// Emphasised text and its child items.
    EmphasisText(Vec<Item>),
    /// Unordered list and its child items.
    UnorderedList(Vec<Item>),
    /// Ordered list and its child items.
    OrderedList(Vec<Item>),
    /// A single list entry and its child items.
    ListItem(Vec<Item>),
    /// Paragraph and its child items.
    /// TODO: replace with more specific items; this is needlessly flexible.
    Paragraph(Vec<Item>),
    /// Hyperlink: target `href`, then the link's child items.
    Link(String, Vec<Item>),
    /// Table and its child items.
    Table(Vec<Item>),
}


/// Parses `content` as an HTML fragment and converts each child node of the
/// fragment's root element into an [`Item`], preserving document order.
pub fn itemize_content(content: &str) -> Vec<Item> {
    let document = Html::parse_fragment(content);
    let mut items = Vec::new();
    for node in document.root_element().children() {
        items.push(parse_items(node));
    }
    items
}

fn get_children(el: &ElementRef) -> Vec<Item>{
    el.children().map(|c|{parse_items(c)}).collect()
}

fn parse_items(n: ego_tree::NodeRef<'_,Node>) -> Item{
    if n.value().is_text(){
        return Item::Text((&n.value().as_text().unwrap()).to_string())
    }
    
    if n.value().is_element(){
        let el = ElementRef::wrap(n).unwrap();  
        let tag_name = el.value().name();
        let mut item: Item;
        match tag_name {
            "h1" => {return Item::Title(1, get_children(&el))},
            "h2" => {return Item::Title(2, get_children(&el))},
            "h3" => {return Item::Title(3, get_children(&el))},
            "h4" => {return Item::Title(4, get_children(&el))},
            "h5" => {return Item::Title(5, get_children(&el))},
            "h6" => {return Item::Title(6, get_children(&el))},
            "strong" => {return Item::BoldedText(get_children(&el))},
            "em" => {return Item::EmphasisText(get_children(&el))},
            "br" => {return Item::Text("\n".to_owned())},
            "hr" => {return Item::Text("---".to_owned())}
            "p" => {
                return Item::Paragraph(get_children(&el))
            },
            "a" => {
                let href = match el.attr("href") {
                    Some(link) => {link}
                    None => {""}
                };
                return Item::Link(href.to_owned(),get_children(&el))
                                            
                
            }
            "img" => {
                match el.attr("src") {
                    Some(src) => {
                        return Item::Image(src.to_owned())
                    },
                    None => {return Item::Ignore}
                }
            }
            "source" => {
                match el.attr("src") {
                    Some(src) => {
                        return Item::Source(src.to_owned())
                    },
                    None => {return Item::Ignore}
                }
            }
            "video" => {
                return Item::Video(get_children(&el))                
            }
            "ol" => {
                return Item::OrderedList(get_children(&el))
            }
            "ul" => {
                return Item::UnorderedList(get_children(&el))
            }
            "li" => {
                return Item::ListItem(get_children(&el))
            }

            _ => {}
        };  
    }
    
    Item::Ignore


}

/*
Ideally I would verify what works and write tests for it. 
I also need a function to process markdown items.
*/
/*
pub fn add(left: u64, right: u64) -> u64 {
    left + right
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn it_works() {
        let result = add(2, 2);
        assert_eq!(result, 4);
    }
}
 */
#[cfg(test)]
mod tests {
    mod example_data;
    use crate::{itemize_content, process_content, tests::example_data::FEEDS};

    /// Parses the feed document held in `u`, panicking if it cannot be read as a channel.
    fn get_feed(u: &str) -> rss::Channel {
        let parsed = rss::Channel::read_from(u.as_bytes());
        parsed.unwrap()
    }

    /// Smoke test: itemizing the content of every item in every example feed must not panic.
    /// An item without content fails the test through `unwrap`.
    #[test]
    fn itemize_feeds() {
        let _ = FEEDS.map(|u| {
            let feed = get_feed(u);
            let mut scanned = 0usize;
            for entry in feed.items {
                itemize_content(&entry.content.unwrap());
                scanned += 1;
            }
            println!("Evaluated feed\nScanned {} items without errors", scanned)
        });
    }

    /// Smoke test: running the full content pipeline over every example feed must not panic.
    /// An item without content fails the test through `unwrap`.
    #[test]
    fn markdownify_feeds() {
        let _ = FEEDS.map(|u| {
            let feed = get_feed(u);
            let mut processed = 0usize;
            for entry in feed.items {
                process_content(&entry.content.unwrap());
                processed += 1;
            }
            println!("Processed {} items without errors", processed)
        });
    }
}
Parses content reasonably well for MVP status, will update as I try to integrate it into the main project 2025-07-10 18:58:11 -04:00			`use scraper::{ElementRef, Html, Node};`

			`/*`
			`The goal here is to flatten the DOM as much as possible.`
			`paragraphs with fancy formatting are turned into markdown, same with`
			`*/`
Figured out how to make using the enum a lot less annoying! 2025-07-11 10:00:21 -04:00
move markdown handling to this crate 2025-07-15 09:59:38 -04:00			`//Supported content`
			`#[derive(Debug,Clone)]`
			`pub enum Content {`
			`Markdown(String),`
			`Image(String),`
			`Audio(String),`
			`Video(String)`

			`}`

			`//double recursion? This seems dumb.`
			`fn markdownify(item: &Item) -> String{`
			`match markdown_content(item) {`
			`Content::Markdown(s) => {`
			`s.to_owned()`
			`}`
			`_ => {"".to_owned()}`
			`}`

			`}`

			`fn markdown_content(item: &Item) -> Content {`
			`let mut markdown = String::new();`
			`match item {`
			`Item::Title(n,t) => {`
			`markdown = markdown + &"#".repeat(*n);`
			`let _ = t.iter().map(\|i\|{`
			`markdown = "".to_owned() + &markdown + &markdownify(i);`
			`});`
			`},`
			`Item::BoldedText(b) => {`
			`let _ = b.iter().map(\|i\|{`
			`markdown = "".to_owned() + &markdown + &markdownify(i) + "";`
			`});`
			`},`
			`Item::EmphasisText(e) => {`
			`let _ = e.iter().map(\|i\|{`
			`markdown = "".to_owned() + &markdown + &markdownify(i) + "";`
			`});`
			`}`
			`Item::Text(s) => {`
			`markdown = markdown + s;`
			`},`
			`Item::Link(href, children) => {`
			`markdown = markdown + &markdownify(item);`
			`}`
			`Item::Paragraph(p) => {`
			`let _ = p.iter().map(\|i\|{`
			`markdown = "".to_owned() + &markdown + &markdownify(i);`
			`});`
			`}`
			`Item::UnorderedList(u) => {`
			`let _ = u.iter().map(\|i\|{`
			`markdown = "".to_owned() + &markdown + &markdownify(i);`
			`});`
			`}`
			`Item::OrderedList(o) => {`
			`let _ = o.iter().map(\|i\|{`
			`markdown = "".to_owned() + &markdown + &markdownify(i);`
			`});`
			`}`
			`_ => {}`
			`}`
			`Content::Markdown(markdown)`
			`}`

			`fn media_content(item: &Item) -> Content{`
			`Content::Markdown("Media not supported yet".to_owned())`
			`}`


			`pub fn process_content(content: &str) -> Vec<Content> {`
			`let items = itemize_content(content);`
			`let mut result: Vec<Content> = Vec::new();`
			`let _ = items.iter().map(\|i\| {`
			`match i {`
			`Item::Paragraph(children) => {`
			`result.push(markdown_content(i));`
			`},`
			`Item::UnorderedList(children) => {`
			`result.push(markdown_content(i));`
			`}`
			`Item::OrderedList(children) => {`
			`result.push(markdown_content(i));`
			`}`
			`Item::Image(src) => {`
			`result.push(Content::Image(src.to_owned()));`
			`}`
			`Item::Video(children) => {`
			`result.push(media_content(i));`
			`}`
			`Item::Audio(children) => {`
			`result.push(media_content(i));`
			`}`
			`_ => {}`
			`}`
			`});`

			`[Content::Markdown("Ayy lmao".to_owned())].to_vec()`
			`}`
Figured out how to make using the enum a lot less annoying! 2025-07-11 10:00:21 -04:00
			`#[derive(Debug)]`
move markdown handling to this crate 2025-07-15 09:59:38 -04:00			`enum Item {`
Figured out how to make using the enum a lot less annoying! 2025-07-11 10:00:21 -04:00			`Ignore,`
move markdown handling to this crate 2025-07-15 09:59:38 -04:00			`Title(usize,Vec<Item>),`
Figured out how to make using the enum a lot less annoying! 2025-07-11 10:00:21 -04:00			`Text(String),`
			`//text, links, formatting are all markdown`
			`//arguably, for better control it will be best to turn markdown into its own set of items`
			`Image(String),`
			`Svg(String),// wont' support for a while I think.`
			`Video(Vec<Item>),`
			`Audio(Vec<Item>),`
			`Source(String),`
			`BoldedText(Vec<Item>),`
			`EmphasisText(Vec<Item>),`
			`UnorderedList(Vec<Item>),`
			`OrderedList(Vec<Item>),`
			`ListItem(Vec<Item>),`
			`Paragraph(Vec<Item>),//gotta replace this with specific items, needlessly flexible`
			`Link(String,Vec<Item>),`
move markdown handling to this crate 2025-07-15 09:59:38 -04:00			`Table(Vec<Item>)`
Figured out how to make using the enum a lot less annoying! 2025-07-11 10:00:21 -04:00			`}`


Parses content reasonably well for MVP status, will update as I try to integrate it into the main project 2025-07-10 18:58:11 -04:00			`pub fn itemize_content(content: &str) -> Vec<Item> {`
			`let frag = Html::parse_fragment(content);`
			`frag.root_element().children().map(\|e\|{`
			`parse_items(e)`
			`}).collect()`
			`}`

move markdown handling to this crate 2025-07-15 09:59:38 -04:00			`fn get_children(el: &ElementRef) -> Vec<Item>{`
Parses content reasonably well for MVP status, will update as I try to integrate it into the main project 2025-07-10 18:58:11 -04:00			`el.children().map(\|c\|{parse_items(c)}).collect()`
			`}`

			`fn parse_items(n: ego_tree::NodeRef<'_,Node>) -> Item{`
			`if n.value().is_text(){`
			`return Item::Text((&n.value().as_text().unwrap()).to_string())`
			`}`

			`if n.value().is_element(){`
			`let el = ElementRef::wrap(n).unwrap();`
			`let tag_name = el.value().name();`
			`let mut item: Item;`
			`match tag_name {`
move markdown handling to this crate 2025-07-15 09:59:38 -04:00			`"h1" => {return Item::Title(1, get_children(&el))},`
			`"h2" => {return Item::Title(2, get_children(&el))},`
			`"h3" => {return Item::Title(3, get_children(&el))},`
			`"h4" => {return Item::Title(4, get_children(&el))},`
			`"h5" => {return Item::Title(5, get_children(&el))},`
			`"h6" => {return Item::Title(6, get_children(&el))},`
			`"strong" => {return Item::BoldedText(get_children(&el))},`
			`"em" => {return Item::EmphasisText(get_children(&el))},`
Parses content reasonably well for MVP status, will update as I try to integrate it into the main project 2025-07-10 18:58:11 -04:00			`"br" => {return Item::Text("\n".to_owned())},`
			`"hr" => {return Item::Text("---".to_owned())}`
			`"p" => {`
			`return Item::Paragraph(get_children(&el))`
			`},`
			`"a" => {`
			`let href = match el.attr("href") {`
			`Some(link) => {link}`
			`None => {""}`
			`};`
Figured out how to make using the enum a lot less annoying! 2025-07-11 10:00:21 -04:00			`return Item::Link(href.to_owned(),get_children(&el))`


Parses content reasonably well for MVP status, will update as I try to integrate it into the main project 2025-07-10 18:58:11 -04:00			`}`
			`"img" => {`
			`match el.attr("src") {`
			`Some(src) => {`
			`return Item::Image(src.to_owned())`
			`},`
			`None => {return Item::Ignore}`
			`}`
			`}`
			`"source" => {`
			`match el.attr("src") {`
			`Some(src) => {`
			`return Item::Source(src.to_owned())`
			`},`
			`None => {return Item::Ignore}`
			`}`
			`}`
			`"video" => {`
Figured out how to make using the enum a lot less annoying! 2025-07-11 10:00:21 -04:00			`return Item::Video(get_children(&el))`
Parses content reasonably well for MVP status, will update as I try to integrate it into the main project 2025-07-10 18:58:11 -04:00			`}`
			`"ol" => {`
			`return Item::OrderedList(get_children(&el))`
			`}`
			`"ul" => {`
			`return Item::UnorderedList(get_children(&el))`
			`}`
			`"li" => {`
			`return Item::ListItem(get_children(&el))`
			`}`

			`_ => {}`
			`};`
			`}`

			`Item::Ignore`



			`}`

			`/*`
			`Ideally I would verify what works and write tests for it.`
			`I also need a function to process markdown items.`
			`*/`
			`/*`
			`pub fn add(left: u64, right: u64) -> u64 {`
			`left + right`
			`}`

			`#[cfg(test)]`
			`mod tests {`
			`use super::*;`

			`#[test]`
			`fn it_works() {`
			`let result = add(2, 2);`
			`assert_eq!(result, 4);`
			`}`
			`}`
			`*/`
			`#[cfg(test)]`
			`mod tests {`
			`mod example_data;`
move markdown handling to this crate 2025-07-15 09:59:38 -04:00			`use crate::{itemize_content, process_content, tests::example_data::FEEDS};`
Parses content reasonably well for MVP status, will update as I try to integrate it into the main project 2025-07-10 18:58:11 -04:00
move markdown handling to this crate 2025-07-15 09:59:38 -04:00
			`fn get_feed(u: &str) -> rss::Channel {`
			`rss::Channel::read_from(u.as_bytes()).unwrap()`
			`}`
Parses content reasonably well for MVP status, will update as I try to integrate it into the main project 2025-07-10 18:58:11 -04:00			`#[test]`
move markdown handling to this crate 2025-07-15 09:59:38 -04:00			`fn itemize_feeds(){`
Parses content reasonably well for MVP status, will update as I try to integrate it into the main project 2025-07-10 18:58:11 -04:00			`let _ = FEEDS.map(\|u\|{`
move markdown handling to this crate 2025-07-15 09:59:38 -04:00			`let feed = get_feed(u);`
Parses content reasonably well for MVP status, will update as I try to integrate it into the main project 2025-07-10 18:58:11 -04:00			`let results: Vec<_> = feed.items.into_iter().map(\|item\| {`
			`itemize_content(&item.content.unwrap());`
			`}).collect();`
			`//let results: Vec<_> = itemize_content(u);`
			`println!("Evaluated feed\nScanned {} items without errors",results.len())`
			`});`
			`}`
move markdown handling to this crate 2025-07-15 09:59:38 -04:00			`#[test]`
			`fn markdownify_feeds(){`
			`let _ = FEEDS.map(\|u\|{`
			`let feed = get_feed(u);`
			`let results: Vec<_> = feed.items.into_iter().map(\|item\|{`
			`process_content(&item.content.unwrap());`
			`}).collect();`
			`println!("Processed {} items without errors",results.len())`
			`});`
			`}`



Parses content reasonably well for MVP status, will update as I try to integrate it into the main project 2025-07-10 18:58:11 -04:00			`}`