Created
April 12, 2026 02:36
-
-
Save copyleftdev/b913cf69e079d3a6d5212e0a39dcea2b to your computer and use it in GitHub Desktop.
Palimpsest: Seeded PRNG (CrawlSeed)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| //! Core domain types shared across all crates. | |
| use rand_chacha::ChaCha8Rng; | |
| use rand_chacha::rand_core::SeedableRng; | |
| use serde::{Deserialize, Serialize}; | |
| use url::Url; | |
| use crate::hash::ContentHash; | |
| use crate::time::CaptureInstant; | |
| /// Seed for all deterministic operations in a crawl. | |
| /// | |
| /// Law 1 (Determinism): All randomness flows from this seed through a seeded | |
| /// PRNG. No `rand::thread_rng()`, no `OsRng`, no entropy sources in core paths. | |
| /// Call [`CrawlSeed::rng()`] to get a deterministic `ChaCha8Rng`. | |
| #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)] | |
| pub struct CrawlSeed { | |
| pub value: u64, | |
| } | |
| impl CrawlSeed { | |
| /// Create a new seed from a `u64` value. | |
| pub fn new(value: u64) -> Self { | |
| Self { value } | |
| } | |
| /// Produce a deterministic PRNG from this seed. | |
| /// | |
| /// Same seed value always yields the same RNG sequence. | |
| /// This is the **only** sanctioned way to obtain randomness in the kernel. | |
| pub fn rng(&self) -> ChaCha8Rng { | |
| ChaCha8Rng::seed_from_u64(self.value) | |
| } | |
| /// Derive a child seed for a sub-operation (e.g., per-host frontier shard). | |
| /// | |
| /// Deterministic: same parent seed + same index = same child seed. | |
| pub fn derive(&self, index: u64) -> Self { | |
| // Mix the parent seed with the index using BLAKE3 to avoid correlation. | |
| let mut input = [0u8; 16]; | |
| input[..8].copy_from_slice(&self.value.to_le_bytes()); | |
| input[8..].copy_from_slice(&index.to_le_bytes()); | |
| let hash = blake3::hash(&input); | |
| let b = hash.as_bytes(); | |
| Self { | |
| value: u64::from_le_bytes([b[0], b[1], b[2], b[3], b[4], b[5], b[6], b[7]]), | |
| } | |
| } | |
| } | |
| /// A target URL to be crawled, with its depth in the crawl graph and optional | |
| /// parent artifact that discovered it. | |
| #[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)] | |
| pub struct CrawlTarget { | |
| pub url: Url, | |
| pub depth: u32, | |
| pub parent: Option<ContentHash>, | |
| } | |
| /// Unique identifier for a captured artifact, binding content hash to its | |
| /// temporal and crawl context. | |
| #[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)] | |
| pub struct ArtifactId { | |
| pub content_hash: ContentHash, | |
| pub captured_at: CaptureInstant, | |
| pub crawl_context: CrawlContextId, | |
| } | |
| /// Opaque identifier for a crawl execution context. | |
| #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)] | |
| pub struct CrawlContextId(pub u64); | |
#[cfg(test)]
mod tests {
    use super::*;
    use rand_chacha::rand_core::RngCore;

    /// Pull `count` consecutive `u64` values out of `rng`.
    fn draw(rng: &mut impl RngCore, count: usize) -> Vec<u64> {
        std::iter::repeat_with(|| rng.next_u64())
            .take(count)
            .collect()
    }

    // --- CrawlSeed::rng -----------------------------------------------------

    #[test]
    fn test_seed_rng_same_seed_produces_identical_sequence() {
        let seed = CrawlSeed::new(42);
        let first = draw(&mut seed.rng(), 1000);
        let second = draw(&mut seed.rng(), 1000);
        assert_eq!(
            first, second,
            "same seed must produce identical RNG sequence"
        );
    }

    #[test]
    fn test_seed_rng_different_seeds_produce_different_sequences() {
        let from_one = draw(&mut CrawlSeed::new(1).rng(), 100);
        let from_two = draw(&mut CrawlSeed::new(2).rng(), 100);
        assert_ne!(
            from_one, from_two,
            "different seeds must produce different sequences"
        );
    }

    // --- CrawlSeed::derive --------------------------------------------------

    #[test]
    fn test_seed_derive_is_deterministic() {
        let parent = CrawlSeed::new(42);
        assert_eq!(
            parent.derive(0),
            parent.derive(0),
            "same parent + same index = same child seed"
        );
    }

    #[test]
    fn test_seed_derive_different_indices_differ() {
        let parent = CrawlSeed::new(42);
        assert_ne!(
            parent.derive(0),
            parent.derive(1),
            "different indices must yield different child seeds"
        );
    }

    #[test]
    fn test_seed_derive_different_parents_differ() {
        let from_one = CrawlSeed::new(1).derive(0);
        let from_two = CrawlSeed::new(2).derive(0);
        assert_ne!(
            from_one, from_two,
            "different parent seeds must yield different children"
        );
    }

    #[test]
    fn test_seed_derive_child_rng_is_independent() {
        let parent = CrawlSeed::new(99);
        let child = parent.derive(7);
        let parent_vals = draw(&mut parent.rng(), 100);
        let child_vals = draw(&mut child.rng(), 100);
        assert_ne!(
            parent_vals, child_vals,
            "child RNG must differ from parent RNG"
        );
    }

    // --- Trait behaviour ----------------------------------------------------

    #[test]
    fn test_seed_is_copy() {
        let seed = CrawlSeed::new(42);
        // If this were a move, using `seed` on the next line would not compile.
        let duplicate = seed;
        assert_eq!(seed, duplicate);
    }

    #[test]
    fn test_seed_serialization_roundtrip() {
        let original = CrawlSeed::new(42);
        let encoded = serde_json::to_string(&original).unwrap();
        let decoded: CrawlSeed = serde_json::from_str(&encoded).unwrap();
        assert_eq!(original, decoded);
    }

    #[test]
    fn test_crawl_context_id_is_copy_and_eq() {
        let id = CrawlContextId(1);
        let duplicate = id;
        assert_eq!(id, duplicate);
    }

    #[test]
    fn test_crawl_target_equality() {
        // Build two targets from identical inputs and compare them.
        let make_target = || CrawlTarget {
            url: Url::parse("https://example.com").unwrap(),
            depth: 0,
            parent: None,
        };
        let target_a = make_target();
        let target_b = make_target();
        assert_eq!(target_a, target_b);
    }
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment