Before running text extraction, check whether the PDF was digitally created or is a scan of a physical document.
// 1.0-compatible heuristic: count pages with near-empty extracted text.
// See deferred_note above for the full raster-analysis approach (1.1).
use pdfluent::prelude::*;
/// Prints whether `maybe_scan.pdf` is likely a scan, judged by how few of its
/// pages yield meaningful extracted text.
///
/// A page counts as a text page when its extracted text, trimmed of surrounding
/// whitespace, is longer than 20 bytes; a page whose text is unavailable counts
/// as having none. The document is reported as a likely scan when text pages
/// make up less than 10% of all pages (integer arithmetic).
fn main() -> Result<()> {
    let doc = PdfDocument::open("maybe_scan.pdf")?;
    let page_total = doc.page_count();

    let mut pages_with_text: usize = 0;
    for page in doc.pages() {
        let has_text = page
            .text()
            .map(|text| text.trim().len() > 20)
            .unwrap_or(false);
        if has_text {
            pages_with_text += 1;
        }
    }

    // `max(1)` keeps the division defined when the document reports zero pages.
    let text_percent = pages_with_text * 100 / page_total.max(1);
    let is_likely_scan = text_percent < 10;
    println!("likely scanned: {is_likely_scan}");
    Ok(())
}
No additional features are required. Page inspection is part of the base crate.
# Cargo.toml
[dependencies]
pdfluent = "0.9"
Use doc.pages() to get an iterator over all pages. Each Page gives you access to content stream analysis.
use pdfluent::PdfDocument;
// Open the document and print every page's content type, numbering pages from 1.
let doc = PdfDocument::open("document.pdf")?;
let mut page_number: usize = 0;
for page in doc.pages() {
    page_number += 1;
    println!("Page {}: {:?}", page_number, page.content_type());
}
has_selectable_text() returns true if the page content stream contains any text operators. has_raster_images() returns true if the page contains XObject images.
// Report each page that has raster images but no selectable text.
for page in doc.pages() {
    let (has_text, has_images) = (page.has_selectable_text(), page.has_raster_images());
    // Pages with text, or with no images at all, are not treated as scans.
    if has_text || !has_images {
        continue;
    }
    println!("This page appears to be a scan.");
}
Count pages without text. A score above 80% is a strong indicator that the document is a scan or a mix.
// Compute the fraction of pages without selectable text and suggest OCR when
// that fraction exceeds 80%.
let total = doc.page_count() as f32;
let no_text = doc
    .pages()
    .filter(|p| !p.has_selectable_text())
    .count() as f32;
// A document with zero pages would give 0.0 / 0.0 = NaN and print "NaN%";
// report a ratio of 0 in that case instead.
let scan_ratio = if total > 0.0 { no_text / total } else { 0.0 };
println!("Scan ratio: {:.0}%", scan_ratio * 100.0);
if scan_ratio > 0.8 {
    println!("Likely a scanned document. Consider running OCR.");
}
Some scanned PDFs have a hidden text layer added by OCR software. Use has_invisible_text() to detect this.
for (i, page) in doc.pages().enumerate() {
if page.has_invisible_text() {
println!(
"Page {} has an OCR text layer (invisible text).",
i + 1
);
}
}No JVM, no runtime, no DLL dependencies. Ships as a single native binary or WASM module.
Rust's ownership model prevents buffer overflows and use-after-free. No segfaults in PDF parsing.
Same code runs server-side, in Docker, on AWS Lambda, on Cloudflare Workers, or in the browser via WASM.