Parses content reasonably well for MVP status, will update as I try to integrate it into the main project

This commit is contained in:
Gabriel 2025-07-10 18:58:11 -04:00
commit 099d891c80
7 changed files with 22051 additions and 0 deletions

1
.gitignore vendored Normal file
View file

@ -0,0 +1 @@
/target

2115
Cargo.lock generated Normal file

File diff suppressed because it is too large Load diff

23
Cargo.toml Normal file
View file

@ -0,0 +1,23 @@
[package]
name = "rss_content"
version = "0.1.1"
edition = "2024"
[dependencies]
# Re-exported tree types (NodeRef) used directly in src/lib.rs signatures.
ego-tree = "0.10.0"
# NOTE(review): reqwest is not referenced in the visible sources — confirm it's needed.
reqwest = {features=["blocking"], version="0.12.22"}
#iced = { git = "https://github.com/iced-rs/iced", version = "0.14.0-dev" , features=["markdown"]}
# RSS envelope parsing; this crate only itemizes the <content:encoded> HTML.
rss = "2.0.12"
# HTML fragment parsing (Html, ElementRef, Node).
scraper = "0.23.1"
[profile.dev]
debug=true
incremental = true
codegen-units = 16
# Release profile tuned for minimum binary size.
[profile.release]
opt-level = "z"
lto = true
codegen-units = 1
panic = "abort"
strip=true

6
readme.md Normal file
View file

@ -0,0 +1,6 @@
# rss_content
A simple Rust crate to parse `<content:encoded>` into useful objects using the `scraper` crate. If you need to parse general RSS, you probably just want the [rss crate](https://crates.io/crates/rss).
This is part of my personal learning of Rust & [iced](https://iced.rs).
The purpose of this library is to be useful for selecting particular iced widgets.

46
src/elements.rs Normal file
View file

@ -0,0 +1,46 @@
/// A flattened, renderer-friendly representation of one DOM node taken from
/// an RSS `<content:encoded>` HTML fragment.
#[derive(Debug)]
pub enum Item {
// Node that the parser chose to drop (unknown tag, missing required attribute).
Ignore,
// Plain text content of a text node.
Text(String),
//text, links, formatting are all markdown
//arguably, for better control it will be best to turn markdown into its own set of items
// `src` URL of an <img> element.
Image(String),
Gif(String), //can't detect gif from image, has to be handled on front-end
Svg(String),// won't support for a while I think.
Video(Video),
Audio(Audio),
// `src` URL of a <source> element (child of <video>/<audio>).
Source(String),
BoldedText(Vec<Item>),
EmphasisText(Vec<Item>),
UnorderedList(Vec<Item>),
OrderedList(Vec<Item>),
ListItem(Vec<Item>),
Paragraph(Vec<Item>),//gotta replace this with specific items, needlessly flexible
Link(Link),
}
/// An <a> element: its target URL plus its itemized children.
#[derive(Debug)]
pub struct Link{
// `href` attribute; empty string when the anchor had none.
pub href: String,
pub children: Vec<Item>
}
/// A <video> element; children typically include `Item::Source` entries.
#[derive(Debug)]
pub struct Video{
pub children: Vec<Item>
//might have to do fancy things to detect autoplay...
}
/// An <audio> element; children typically include `Item::Source` entries.
#[derive(Debug)]
pub struct Audio{
pub children: Vec<Item>
//might have to do fancy things to detect autoplay...
}
/// Container-level HTML tags.
/// NOTE(review): not referenced anywhere in the visible sources — presumably
/// reserved for a future layout pass; confirm before relying on it.
#[derive(Debug)]
pub enum ContainerTag{
P,
Div,
Button,//arguably redundant
Table,
}

128
src/lib.rs Normal file
View file

@ -0,0 +1,128 @@
use scraper::{ElementRef, Html, Node};
mod elements;
use elements::*;
/*
The goal here is to flatten the DOM as much as possible.
paragraphs with fancy formatting are turned into markdown, same with
*/
pub fn itemize_content(content: &str) -> Vec<Item> {
let frag = Html::parse_fragment(content);
frag.root_element().children().map(|e|{
parse_items(e)
}).collect()
}
/// Convert every direct child node of `el` into an `Item`.
pub fn get_children(el: &ElementRef) -> Vec<Item>{
    let mut items = Vec::new();
    for child in el.children() {
        items.push(parse_items(child));
    }
    items
}
fn parse_items(n: ego_tree::NodeRef<'_,Node>) -> Item{
if n.value().is_text(){
return Item::Text((&n.value().as_text().unwrap()).to_string())
}
if n.value().is_element(){
let el = ElementRef::wrap(n).unwrap();
let tag_name = el.value().name();
let mut item: Item;
match tag_name {
"br" => {return Item::Text("\n".to_owned())},
"hr" => {return Item::Text("---".to_owned())}
"p" => {
return Item::Paragraph(get_children(&el))
},
"a" => {
let href = match el.attr("href") {
Some(link) => {link}
None => {""}
};
return Item::Link(
Link{
href: href.to_owned(),
children: get_children(&el)
}
)
}
"img" => {
match el.attr("src") {
Some(src) => {
return Item::Image(src.to_owned())
},
None => {return Item::Ignore}
}
}
"source" => {
match el.attr("src") {
Some(src) => {
return Item::Source(src.to_owned())
},
None => {return Item::Ignore}
}
}
"video" => {
return Item::Video(
Video{
children: get_children(&el)
}
)
}
"ol" => {
return Item::OrderedList(get_children(&el))
}
"ul" => {
return Item::UnorderedList(get_children(&el))
}
"li" => {
return Item::ListItem(get_children(&el))
}
_ => {}
};
}
Item::Ignore
}
/*
Ideally I would verify what works and write tests for it.
I also need a function to process markdown items.
*/
/*
pub fn add(left: u64, right: u64) -> u64 {
left + right
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn it_works() {
let result = add(2, 2);
assert_eq!(result, 4);
}
}
*/
#[cfg(test)]
mod tests {
    // Large fixture file at src/tests/example_data.rs (not compiled outside tests).
    mod example_data;
    use crate::{itemize_content, tests::example_data::FEEDS};
    /// Smoke test: every bundled feed parses as RSS and every item's
    /// `<content:encoded>` body itemizes without panicking.
    #[test]
    fn real_feeds(){
        for feed_xml in FEEDS {
            let feed = rss::Channel::read_from(feed_xml.as_bytes())
                .expect("example feed should be valid RSS");
            // Skip items without content instead of unwrap-panicking on them,
            // and actually count the itemized results.
            let scanned = feed
                .items
                .into_iter()
                .filter_map(|item| item.content)
                .map(|content| itemize_content(&content))
                .count();
            println!("Evaluated feed\nScanned {scanned} items without errors");
        }
    }
}

19732
src/tests/example_data.rs Normal file

File diff suppressed because it is too large Load diff