2025-07-10 18:58:11 -04:00
|
|
|
use scraper::{ElementRef, Html, Node};
|
2025-07-10 20:30:35 -04:00
|
|
|
pub mod elements;
|
2025-07-10 18:58:11 -04:00
|
|
|
use elements::*;
|
|
|
|
|
|
|
|
/*
The goal here is to flatten the DOM as much as possible.

Paragraphs with fancy formatting are turned into markdown; the same
applies to the other inline elements handled below.
*/
|
|
|
|
pub fn itemize_content(content: &str) -> Vec<Item> {
|
|
|
|
let frag = Html::parse_fragment(content);
|
|
|
|
frag.root_element().children().map(|e|{
|
|
|
|
parse_items(e)
|
|
|
|
}).collect()
|
|
|
|
}
|
|
|
|
|
|
|
|
pub fn get_children(el: &ElementRef) -> Vec<Item>{
|
|
|
|
el.children().map(|c|{parse_items(c)}).collect()
|
|
|
|
}
|
|
|
|
|
|
|
|
fn parse_items(n: ego_tree::NodeRef<'_,Node>) -> Item{
|
|
|
|
if n.value().is_text(){
|
|
|
|
return Item::Text((&n.value().as_text().unwrap()).to_string())
|
|
|
|
}
|
|
|
|
|
|
|
|
if n.value().is_element(){
|
|
|
|
let el = ElementRef::wrap(n).unwrap();
|
|
|
|
let tag_name = el.value().name();
|
|
|
|
let mut item: Item;
|
|
|
|
match tag_name {
|
|
|
|
"br" => {return Item::Text("\n".to_owned())},
|
|
|
|
"hr" => {return Item::Text("---".to_owned())}
|
|
|
|
"p" => {
|
|
|
|
return Item::Paragraph(get_children(&el))
|
|
|
|
},
|
|
|
|
"a" => {
|
|
|
|
let href = match el.attr("href") {
|
|
|
|
Some(link) => {link}
|
|
|
|
None => {""}
|
|
|
|
};
|
|
|
|
return Item::Link(
|
|
|
|
Link{
|
|
|
|
href: href.to_owned(),
|
|
|
|
children: get_children(&el)
|
|
|
|
}
|
|
|
|
)
|
|
|
|
}
|
|
|
|
"img" => {
|
|
|
|
match el.attr("src") {
|
|
|
|
Some(src) => {
|
|
|
|
return Item::Image(src.to_owned())
|
|
|
|
},
|
|
|
|
None => {return Item::Ignore}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
"source" => {
|
|
|
|
match el.attr("src") {
|
|
|
|
Some(src) => {
|
|
|
|
return Item::Source(src.to_owned())
|
|
|
|
},
|
|
|
|
None => {return Item::Ignore}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
"video" => {
|
|
|
|
return Item::Video(
|
|
|
|
Video{
|
|
|
|
children: get_children(&el)
|
|
|
|
}
|
|
|
|
)
|
|
|
|
}
|
|
|
|
"ol" => {
|
|
|
|
return Item::OrderedList(get_children(&el))
|
|
|
|
}
|
|
|
|
"ul" => {
|
|
|
|
return Item::UnorderedList(get_children(&el))
|
|
|
|
}
|
|
|
|
"li" => {
|
|
|
|
return Item::ListItem(get_children(&el))
|
|
|
|
}
|
|
|
|
|
|
|
|
_ => {}
|
|
|
|
};
|
|
|
|
}
|
|
|
|
|
|
|
|
Item::Ignore
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
Ideally I would verify what works and write tests for it.
|
|
|
|
I also need a function to process markdown items.
|
|
|
|
*/
|
|
|
|
/*
|
|
|
|
pub fn add(left: u64, right: u64) -> u64 {
|
|
|
|
left + right
|
|
|
|
}
|
|
|
|
|
|
|
|
#[cfg(test)]
|
|
|
|
mod tests {
|
|
|
|
use super::*;
|
|
|
|
|
|
|
|
#[test]
|
|
|
|
fn it_works() {
|
|
|
|
let result = add(2, 2);
|
|
|
|
assert_eq!(result, 4);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
*/
|
|
|
|
#[cfg(test)]
mod tests {
    mod example_data;

    use crate::{itemize_content, tests::example_data::FEEDS};

    /// Smoke test: every bundled example feed parses and every item's
    /// content is itemized without panicking.
    #[test]
    fn real_feeds() {
        for u in FEEDS.iter() {
            let feed = rss::Channel::read_from(u.as_bytes()).unwrap();
            // Items without content are skipped instead of panicking on
            // unwrap; the itemized output is kept (the original closure had
            // a stray semicolon, so it collected a Vec<()>).
            let results: Vec<Vec<_>> = feed
                .items
                .into_iter()
                .filter_map(|item| item.content)
                .map(|content| itemize_content(&content))
                .collect();
            println!(
                "Evaluated feed\nScanned {} items without errors",
                results.len()
            );
        }
    }
}
|