rss_content/src/lib.rs

143 lines
3.8 KiB
Rust
Raw Normal View History

use scraper::{ElementRef, Html, Node};
/*
The goal here is to flatten the DOM as much as possible.
Paragraphs with fancy formatting are turned into markdown; the same is
presumably intended for links (TODO: the original note was left unfinished).
*/
/// One flattened piece of feed content.
///
/// Variants carrying a `Vec<Item>` are containers holding the items built
/// from the element's child nodes; variants carrying a `String` hold either
/// literal text or a URL-like attribute value.
///
/// `Clone`, `PartialEq` and `Eq` are derived so that item trees can be copied
/// and compared (e.g. with `assert_eq!` in tests).
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum Item {
    /// A node with no supported representation.
    Ignore,
    /// Literal text.
    // Text, links and formatting are all meant to be markdown.
    // Arguably, for better control it would be best to turn markdown into
    // its own set of items.
    Text(String),
    /// An image, identified by its source string.
    Image(String),
    /// A GIF. A GIF cannot be told apart from a regular image here, so this
    /// has to be handled on the front-end.
    Gif(String),
    /// An SVG. Probably won't be supported for a while.
    Svg(String),
    /// A video container and its child items.
    Video(Vec<Item>),
    /// An audio container and its child items.
    Audio(Vec<Item>),
    /// A media source, identified by its source string.
    Source(String),
    /// Bold content.
    BoldedText(Vec<Item>),
    /// Emphasised content.
    EmphasisText(Vec<Item>),
    /// An unordered list and its child items.
    UnorderedList(Vec<Item>),
    /// An ordered list and its child items.
    OrderedList(Vec<Item>),
    /// A single list entry and its child items.
    ListItem(Vec<Item>),
    /// A paragraph and its child items.
    // TODO: replace with more specific items; this is needlessly flexible.
    Paragraph(Vec<Item>),
    /// A link: the target string followed by the items making up its content.
    Link(String, Vec<Item>),
}
pub fn itemize_content(content: &str) -> Vec<Item> {
let frag = Html::parse_fragment(content);
frag.root_element().children().map(|e|{
parse_items(e)
}).collect()
}
pub fn get_children(el: &ElementRef) -> Vec<Item>{
el.children().map(|c|{parse_items(c)}).collect()
}
/// Converts a single DOM node into an [`Item`].
///
/// * Text nodes become [`Item::Text`].
/// * `br` and `hr` become text (`"\n"` and `"---"` respectively).
/// * `p`, `video`, `audio`, `b`/`strong`, `i`/`em`, `ol`, `ul` and `li`
///   become the matching container variant holding their converted children.
/// * `a` becomes [`Item::Link`]; a missing `href` is stored as an empty string.
/// * `img` and `source` become [`Item::Image`] / [`Item::Source`] when they
///   carry a `src` attribute, and [`Item::Ignore`] otherwise.
/// * Every other node yields [`Item::Ignore`].
fn parse_items(n: ego_tree::NodeRef<'_, Node>) -> Item {
    if let Some(text) = n.value().as_text() {
        return Item::Text(text.to_string());
    }

    // `wrap` only succeeds for element nodes, so this also filters out
    // every remaining non-element node kind.
    let el = match ElementRef::wrap(n) {
        Some(el) => el,
        None => return Item::Ignore,
    };

    match el.value().name() {
        "br" => Item::Text("\n".to_owned()),
        "hr" => Item::Text("---".to_owned()),
        "p" => Item::Paragraph(get_children(&el)),
        "a" => {
            let href = el.attr("href").unwrap_or("");
            Item::Link(href.to_owned(), get_children(&el))
        }
        "img" => el
            .attr("src")
            .map_or(Item::Ignore, |src| Item::Image(src.to_owned())),
        "source" => el
            .attr("src")
            .map_or(Item::Ignore, |src| Item::Source(src.to_owned())),
        "video" => Item::Video(get_children(&el)),
        // The variants below exist in `Item`; map their tags so they are
        // actually produced, mirroring how `video` is handled.
        "audio" => Item::Audio(get_children(&el)),
        "b" | "strong" => Item::BoldedText(get_children(&el)),
        "i" | "em" => Item::EmphasisText(get_children(&el)),
        "ol" => Item::OrderedList(get_children(&el)),
        "ul" => Item::UnorderedList(get_children(&el)),
        "li" => Item::ListItem(get_children(&el)),
        _ => Item::Ignore,
    }
}
/*
Ideally I would verify what works and write tests for it.
I also need a function to process markdown items.
*/
/*
pub fn add(left: u64, right: u64) -> u64 {
left + right
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn it_works() {
let result = add(2, 2);
assert_eq!(result, 4);
}
}
*/
#[cfg(test)]
mod tests {
    use rss::Channel;
    mod example_data;
    use crate::{itemize_content, tests::example_data::FEEDS};

    /// Smoke test: parses every feed in `FEEDS` and itemizes the content of
    /// each of its items. It asserts nothing about the output; it passes as
    /// long as parsing and itemizing complete without panicking.
    #[test]
    fn real_feeds() {
        for feed_xml in FEEDS.iter() {
            let feed = Channel::read_from(feed_xml.as_bytes())
                .expect("example feed should parse as an RSS channel");
            let results: Vec<_> = feed
                .items
                .into_iter()
                .map(|item| {
                    let content = item
                        .content
                        .expect("feed item should have a content element");
                    itemize_content(&content)
                })
                .collect();
            println!(
                "Evaluated feed\nScanned {} items without errors",
                results.len()
            )
        }
    }
}