Parses content reasonably well for MVP status; will update as I try to integrate it into the main project
commit 099d891c80
7 changed files with 22051 additions and 0 deletions
1  .gitignore  vendored  Normal file
@@ -0,0 +1 @@
/target
2115  Cargo.lock  generated  Normal file
File diff suppressed because it is too large
23  Cargo.toml  Normal file
@@ -0,0 +1,23 @@
[package]
name = "rss_content"
version = "0.1.1"
edition = "2024"

[dependencies]
ego-tree = "0.10.0"
reqwest = { version = "0.12.22", features = ["blocking"] }
#iced = { git = "https://github.com/iced-rs/iced", version = "0.14.0-dev", features = ["markdown"] }
rss = "2.0.12"
scraper = "0.23.1"

[profile.dev]
debug = true
incremental = true
codegen-units = 16

[profile.release]
opt-level = "z"
lto = true
codegen-units = 1
panic = "abort"
strip = true
6  readme.md  Normal file
@@ -0,0 +1,6 @@
# rss_content

A simple Rust crate to parse `<content:encoded>` into useful objects using the `scraper` crate. If you just need to parse general RSS, you probably want the [rss crate](https://crates.io/crates/rss).

This is part of my personal learning of Rust & [iced](https://iced.rs).
The purpose of this library is to make it easy to select a particular iced widget for each piece of feed content.
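For context, a minimal sketch of how the crate might be consumed as a library, assuming the reqwest (blocking) and rss dependencies already declared in Cargo.toml; the feed URL is a placeholder:

// Hypothetical consumer of rss_content; the feed URL is a placeholder.
use rss::Channel;
use rss_content::itemize_content;

fn main() -> Result<(), Box<dyn std::error::Error>> {
    // Fetch the feed with reqwest's blocking client (the "blocking" feature is enabled in Cargo.toml).
    let bytes = reqwest::blocking::get("https://example.com/feed.xml")?.bytes()?;
    let channel = Channel::read_from(&bytes[..])?;
    for item in channel.items() {
        if let Some(html) = item.content() {
            // Flatten the <content:encoded> HTML into a Vec<Item>.
            let items = itemize_content(html);
            println!("{items:?}");
        }
    }
    Ok(())
}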
46  src/elements.rs  Normal file
@@ -0,0 +1,46 @@
#[derive(Debug)]
pub enum Item {
    Ignore,
    Text(String),
    // Text, links, and formatting are all markdown.
    // Arguably, for better control it would be best to turn markdown into its own set of items.
    Image(String),
    Gif(String),  // can't detect a gif from an image; has to be handled on the front-end
    Svg(String),  // won't support this for a while, I think
    Video(Video),
    Audio(Audio),
    Source(String),
    BoldedText(Vec<Item>),
    EmphasisText(Vec<Item>),
    UnorderedList(Vec<Item>),
    OrderedList(Vec<Item>),
    ListItem(Vec<Item>),
    Paragraph(Vec<Item>), // TODO: replace this with specific items; needlessly flexible
    Link(Link),
}

#[derive(Debug)]
pub struct Link {
    pub href: String,
    pub children: Vec<Item>,
}

#[derive(Debug)]
pub struct Video {
    pub children: Vec<Item>,
    // might have to do fancy things to detect autoplay...
}

#[derive(Debug)]
pub struct Audio {
    pub children: Vec<Item>,
    // might have to do fancy things to detect autoplay...
}

#[derive(Debug)]
pub enum ContainerTag {
    P,
    Div,
    Button, // arguably redundant
    Table,
}
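As an illustration of how a front-end might walk this tree before choosing widgets, a rough sketch; the render_text helper is hypothetical and not part of this crate, and it assumes Item is re-exported from the crate root:

// Hypothetical helper on the consumer side: collapse an Item tree into plain text.
use rss_content::Item;

fn render_text(items: &[Item]) -> String {
    items
        .iter()
        .map(|item| match item {
            Item::Text(s) => s.clone(),
            Item::Image(src) | Item::Gif(src) | Item::Svg(src) => format!("[image: {src}]"),
            Item::Link(link) => format!("[{}]({})", render_text(&link.children), link.href),
            Item::Paragraph(children)
            | Item::BoldedText(children)
            | Item::EmphasisText(children)
            | Item::OrderedList(children)
            | Item::UnorderedList(children)
            | Item::ListItem(children) => render_text(children),
            // Video, Audio, Source, and Ignore carry no inline text here.
            _ => String::new(),
        })
        .collect()
}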
128  src/lib.rs  Normal file
@@ -0,0 +1,128 @@
use scraper::{ElementRef, Html, Node};

pub mod elements;
pub use elements::*; // re-export Item, Link, Video, and Audio so callers can use the parsed tree
/*
The goal here is to flatten the DOM as much as possible.
Paragraphs with fancy formatting are turned into markdown, same with
*/
pub fn itemize_content(content: &str) -> Vec<Item> {
    let frag = Html::parse_fragment(content);
    frag.root_element().children().map(parse_items).collect()
}

pub fn get_children(el: &ElementRef) -> Vec<Item> {
    el.children().map(parse_items).collect()
}

fn parse_items(n: ego_tree::NodeRef<'_, Node>) -> Item {
    if n.value().is_text() {
        return Item::Text(n.value().as_text().unwrap().to_string());
    }

    if n.value().is_element() {
        let el = ElementRef::wrap(n).unwrap();
        match el.value().name() {
            "br" => return Item::Text("\n".to_owned()),
            "hr" => return Item::Text("---".to_owned()),
            "p" => return Item::Paragraph(get_children(&el)),
            "a" => {
                // A missing href becomes an empty string rather than dropping the link.
                let href = el.attr("href").unwrap_or("");
                return Item::Link(Link {
                    href: href.to_owned(),
                    children: get_children(&el),
                });
            }
            "img" => {
                return match el.attr("src") {
                    Some(src) => Item::Image(src.to_owned()),
                    None => Item::Ignore,
                };
            }
            "source" => {
                return match el.attr("src") {
                    Some(src) => Item::Source(src.to_owned()),
                    None => Item::Ignore,
                };
            }
            "video" => {
                return Item::Video(Video {
                    children: get_children(&el),
                });
            }
            "ol" => return Item::OrderedList(get_children(&el)),
            "ul" => return Item::UnorderedList(get_children(&el)),
            "li" => return Item::ListItem(get_children(&el)),
            _ => {}
        }
    }

    Item::Ignore
}
/*
Ideally I would verify what works and write tests for it.
I also need a function to process markdown items.
*/
/*
pub fn add(left: u64, right: u64) -> u64 {
    left + right
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn it_works() {
        let result = add(2, 2);
        assert_eq!(result, 4);
    }
}
*/
#[cfg(test)]
mod tests {
    use rss::Channel;

    mod example_data;

    use crate::{itemize_content, tests::example_data::FEEDS};

    #[test]
    fn real_feeds() {
        let _ = FEEDS.map(|u| {
            let feed = Channel::read_from(u.as_bytes()).unwrap();
            let results: Vec<_> = feed
                .items
                .into_iter()
                .map(|item| itemize_content(&item.content.unwrap()))
                .collect();
            //let results: Vec<_> = itemize_content(u);
            println!("Evaluated feed\nScanned {} items without errors", results.len());
        });
    }
}
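As a starting point for the verification mentioned in the comment above, a hedged sketch of a unit test over a hand-written fragment; the expected shape reflects my reading of parse_items and is not a guaranteed contract:

// A possible addition to the tests module; assumes `use crate::{itemize_content, Item};` is in scope there.
#[test]
fn parses_simple_paragraph() {
    let items = itemize_content("<p>Hello <a href=\"https://example.com\">world</a></p>");
    // Expect a single paragraph whose children are a text node followed by a link.
    match &items[..] {
        [Item::Paragraph(children)] => {
            assert!(matches!(children[0], Item::Text(ref t) if t == "Hello "));
            assert!(matches!(children[1], Item::Link(ref l) if l.href == "https://example.com"));
        }
        other => panic!("unexpected items: {other:?}"),
    }
}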
19732  src/tests/example_data.rs  Normal file
File diff suppressed because it is too large