use scraper::{ElementRef, Html, Node};

pub mod elements;
use elements::*;

/* The goal here is to flatten the DOM as much as possible. Paragraphs with
   fancy formatting are turned into markdown; the same goes for lists, links,
   and embedded media. */

/// Parse an HTML fragment into a flat list of `Item`s.
pub fn itemize_content(content: &str) -> Vec<Item> {
    let frag = Html::parse_fragment(content);
    frag.root_element().children().map(parse_items).collect()
}

/// Itemize the direct children of an element.
pub fn get_children(el: &ElementRef) -> Vec<Item> {
    el.children().map(parse_items).collect()
}

fn parse_items(n: ego_tree::NodeRef<'_, Node>) -> Item {
    if n.value().is_text() {
        return Item::Text(n.value().as_text().unwrap().to_string());
    }
    if n.value().is_element() {
        let el = ElementRef::wrap(n).unwrap();
        match el.value().name() {
            "br" => return Item::Text("\n".to_owned()),
            "hr" => return Item::Text("---".to_owned()),
            "p" => return Item::Paragraph(get_children(&el)),
            "a" => {
                // A missing href still yields a link so the anchor text is kept.
                let href = el.attr("href").unwrap_or("");
                return Item::Link(Link {
                    href: href.to_owned(),
                    children: get_children(&el),
                });
            }
            "img" => {
                return match el.attr("src") {
                    Some(src) => Item::Image(src.to_owned()),
                    None => Item::Ignore,
                }
            }
            "source" => {
                return match el.attr("src") {
                    Some(src) => Item::Source(src.to_owned()),
                    None => Item::Ignore,
                }
            }
            "video" => {
                return Item::Video(Video {
                    children: get_children(&el),
                })
            }
            "ol" => return Item::OrderedList(get_children(&el)),
            "ul" => return Item::UnorderedList(get_children(&el)),
            "li" => return Item::ListItem(get_children(&el)),
            _ => {}
        }
    }
    // Comments, unknown tags, and anything else are dropped.
    Item::Ignore
}

/* Ideally I would verify what works and write tests for it.
   I also need a function to process markdown items. */

#[cfg(test)]
mod tests {
    use rss::Channel;

    mod example_data;
    use crate::{itemize_content, tests::example_data::FEEDS};

    #[test]
    fn real_feeds() {
        let _ = FEEDS.map(|u| {
            let feed = Channel::read_from(u.as_bytes()).unwrap();
            let results: Vec<_> = feed
                .items
                .into_iter()
                .map(|item| itemize_content(&item.content.unwrap()))
                .collect();
            println!(
                "Evaluated feed\nScanned {} items without errors",
                results.len()
            );
        });
    }
}
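
/* A minimal sketch of the markdown-processing step mentioned in the note above.
   It only relies on the Item variants that parse_items already produces; the
   function names and the exact markdown output are illustrative, not an
   existing API in the elements module. */
pub fn render_markdown(items: &[Item]) -> String {
    items.iter().map(render_item).collect()
}

fn render_item(item: &Item) -> String {
    match item {
        Item::Text(t) => t.clone(),
        Item::Paragraph(children) => format!("{}\n\n", render_markdown(children)),
        Item::Link(link) => format!("[{}]({})", render_markdown(&link.children), link.href),
        Item::Image(src) => format!("![]({})", src),
        // A bare <source> URL is emitted as-is; a real renderer might pick one
        // source per <video> instead.
        Item::Source(src) => src.clone(),
        Item::Video(video) => render_markdown(&video.children),
        // Markdown renumbers ordered lists, so a constant "1." prefix is enough.
        Item::OrderedList(children) => children
            .iter()
            .map(|c| format!("1. {}\n", render_item(c)))
            .collect(),
        Item::UnorderedList(children) => children
            .iter()
            .map(|c| format!("- {}\n", render_item(c)))
            .collect(),
        Item::ListItem(children) => render_markdown(children),
        // Ignore, and any variants not covered above, render as nothing.
        _ => String::new(),
    }
}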