Add PDF support for document mapping

JordanCoin · claude · JordanCoin · commit aa205b4e42a1 · 2026-01-29T16:22:44.000-05:00
- Parse PDF outlines/bookmarks into section structure
- Fall back to page-based structure for PDFs without outlines
- Handle scanned/image-only PDFs gracefully
- Fix long filename rendering crash in tree output
- Update help text and README with PDF documentation

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
diff --git a/README.md b/README.md
@@ -8,9 +8,9 @@
 
 ## The Problem
 
-Documentation files are everywhere — READMEs, design docs, changelogs, API references. But:
+Documentation files are everywhere — READMEs, design docs, changelogs, API references, PDFs. But:
 
-- LLMs can't open large markdown files (token limits)
+- LLMs can't open large markdown files or PDFs (token limits)
 - Humans have to open each file to see what's inside
 - There's no "file tree" for documentation *content*
 
@@ -61,8 +61,9 @@ scoop install docmap
 ## Usage
 
 ```bash
-docmap .                          # All markdown files in directory
-docmap README.md                  # Single file deep dive
+docmap .                          # All markdown and PDF files in directory
+docmap README.md                  # Single markdown file deep dive
+docmap report.pdf                 # Single PDF file structure
 docmap docs/                      # Specific folder
 docmap README.md --section "API"  # Filter to section
 docmap README.md --expand "API"   # Show section content
@@ -73,7 +74,7 @@ docmap . --refs                   # Show cross-references between docs
 
 ### Directory Mode
 
-Map all markdown files in a project:
+Map all markdown and PDF files in a project:
 
 ```bash
 docmap /path/to/project
@@ -123,6 +124,28 @@ See the actual content:
 docmap docs/API.md --expand "Authentication"
 ```
 
+### PDF Support
+
+Map PDF documents by their outline (table of contents):
+
+```bash
+docmap report.pdf
+```
+
+```
+╭──────────────────── report.pdf ────────────────────╮
+│             Sections: 7 | ~16.9k tokens            │
+╰────────────────────────────────────────────────────╯
+
+└── Claude Code-Powered Writing Editor (9.1k)
+    ├── Introduction and Feasibility Overview (1.3k)
+    ├── Key Features and Benefits (1.3k)
+    ├── Designing the Interface (1.3k)
+    └── Development Steps (1.3k)
+```
+
+**Note:** For PDFs with outlines/bookmarks, docmap shows the document structure with an estimated token distribution: the token estimate for the whole document is divided among the outline entries rather than measured per section. For PDFs without outlines, it falls back to a page-by-page structure. Scanned/image-only PDFs show a page count but no text content.
+
 ### References Mode
 
 See how docs link to each other (like `codemap --deps`):
@@ -172,12 +195,19 @@ Together: complete spatial awareness of any repository.
 
 ## How It Works
 
-1. **Parse** markdown headings into a tree structure
+**Markdown:**
+1. **Parse** headings into a tree structure
 2. **Estimate** tokens per section (~4 chars/token)
 3. **Extract** key terms (bold text, inline code)
 4. **Render** as a navigable tree
 
-No external dependencies. No API calls. Just fast, local parsing.
+**PDF:**
+1. **Extract** outline/bookmarks if available, otherwise use pages
+2. **Parse** text content from each page
+3. **Estimate** tokens (~4 chars/token)
+4. **Render** as a navigable tree
+
+No API calls. Just fast, local parsing.
 
 ## Roadmap
 
diff --git a/go.mod b/go.mod
@@ -1,3 +1,5 @@
 module github.com/JordanCoin/docmap
 
 go 1.25.5
+
+require github.com/ledongthuc/pdf v0.0.0-20250511090121-5959a4027728
diff --git a/go.sum b/go.sum
@@ -0,0 +1,2 @@
+github.com/ledongthuc/pdf v0.0.0-20250511090121-5959a4027728 h1:QwWKgMY28TAXaDl+ExRDqGQltzXqN/xypdKP86niVn8=
+github.com/ledongthuc/pdf v0.0.0-20250511090121-5959a4027728/go.mod h1:1fEHWurg7pvf5SG6XNE5Q8UZmOwex51Mkx3SLhrW5B4=
diff --git a/main.go b/main.go
@@ -97,7 +97,7 @@ func main() {
 		// Multi-file mode: find all .md files
 		docs := parseDirectory(target)
 		if len(docs) == 0 {
-			fmt.Println("No markdown files found")
+			fmt.Println("No markdown or PDF files found")
 			os.Exit(1)
 		}
 		if jsonMode {
@@ -110,13 +110,26 @@ func main() {
 		}
 	} else {
 		// Single file mode
-		content, err := os.ReadFile(target)
-		if err != nil {
-			fmt.Fprintf(os.Stderr, "Error reading file: %v\n", err)
-			os.Exit(1)
+		var doc *parser.Document
+
+		if strings.HasSuffix(strings.ToLower(target), ".pdf") {
+			// PDF file
+			var err error
+			doc, err = parser.ParsePDF(target)
+			if err != nil {
+				fmt.Fprintf(os.Stderr, "Error parsing PDF: %v\n", err)
+				os.Exit(1)
+			}
+		} else {
+			// Markdown file
+			content, err := os.ReadFile(target)
+			if err != nil {
+				fmt.Fprintf(os.Stderr, "Error reading file: %v\n", err)
+				os.Exit(1)
+			}
+			doc = parser.Parse(string(content))
 		}
 
-		doc := parser.Parse(string(content))
 		parts := strings.Split(target, "/")
 		doc.Filename = parts[len(parts)-1]
 
@@ -143,21 +156,38 @@ func parseDirectory(dir string) []*parser.Document {
 		if info.IsDir() {
 			return nil
 		}
-		if !strings.HasSuffix(strings.ToLower(path), ".md") {
+
+		lowerPath := strings.ToLower(path)
+		isMd := strings.HasSuffix(lowerPath, ".md")
+		isPdf := strings.HasSuffix(lowerPath, ".pdf")
+
+		if !isMd && !isPdf {
 			return nil
 		}
-		// Skip hidden files and common non-doc files
+
+		// Skip hidden files
 		base := filepath.Base(path)
 		if strings.HasPrefix(base, ".") {
 			return nil
 		}
 
-		content, err := os.ReadFile(path)
-		if err != nil {
-			return nil
+		var doc *parser.Document
+
+		if isPdf {
+			var err error
+			doc, err = parser.ParsePDF(path)
+			if err != nil {
+				// Skip PDFs that can't be parsed
+				return nil
+			}
+		} else {
+			content, err := os.ReadFile(path)
+			if err != nil {
+				return nil
+			}
+			doc = parser.Parse(string(content))
 		}
 
-		doc := parser.Parse(string(content))
 		// Get relative path from dir
 		relPath, _ := filepath.Rel(dir, path)
 		doc.Filename = relPath
@@ -217,11 +247,12 @@ func printUsage() {
 	fmt.Println(`docmap - instant documentation structure for LLMs and humans
 
 Usage:
-  docmap <file.md|dir> [flags]
+  docmap <file.md|file.pdf|dir> [flags]
 
 Examples:
-  docmap .                          # All markdown files in directory
-  docmap README.md                  # Single file deep dive
+  docmap .                          # All markdown and PDF files in directory
+  docmap README.md                  # Single markdown file deep dive
+  docmap report.pdf                 # Single PDF file structure
   docmap docs/                      # Specific folder
   docmap README.md --section "API"  # Filter to section
   docmap README.md --expand "API"   # Show section content
@@ -235,5 +266,9 @@ Flags:
   -v, --version          Print version
   -h, --help             Show this help
 
+PDF Support:
+  PDFs with outlines show document structure; tokens are estimated.
+  PDFs without outlines fall back to page-by-page structure.
+
 More info: https://github.com/JordanCoin/docmap`)
 }
diff --git a/parser/pdf.go b/parser/pdf.go
@@ -0,0 +1,172 @@
+package parser
+
+import (
+	"fmt"
+	"strings"
+
+	"github.com/ledongthuc/pdf"
+)
+
+// ParsePDF parses the PDF file at filepath into a Document.
+//
+// When the PDF carries an outline (bookmarks), the outline becomes the section
+// tree and addPageTokens spreads a token estimate derived from the page text
+// over it. Otherwise parseByPage produces one section per readable page.
+// TotalTokens is the sum of Tokens over doc.GetAllSections().
+//
+// A non-nil error is returned when the file cannot be opened or when the PDF
+// reader panics while walking the document; in both cases the returned
+// Document is nil.
+func ParsePDF(filepath string) (doc *Document, err error) {
+	// The underlying PDF reader reports malformed input by panicking rather
+	// than returning an error. Convert that into an ordinary error so a single
+	// damaged file cannot abort a whole directory scan.
+	defer func() {
+		if rec := recover(); rec != nil {
+			doc, err = nil, fmt.Errorf("failed to parse PDF: %v", rec)
+		}
+	}()
+
+	f, r, err := pdf.Open(filepath)
+	if err != nil {
+		return nil, fmt.Errorf("failed to open PDF: %w", err)
+	}
+	defer f.Close()
+
+	// Build into a local so a panic part-way through never leaks a
+	// half-populated Document through the named result.
+	parsed := &Document{}
+
+	// Try to extract outline (bookmarks) first
+	outline := r.Outline()
+	if hasOutline(outline) {
+		parsed.Sections = parseOutline(outline)
+		// Add page content as token estimates
+		addPageTokens(r, parsed)
+	} else {
+		// Fall back to page-based structure
+		parsed.Sections = parseByPage(r)
+	}
+
+	// Calculate total tokens
+	for _, s := range parsed.GetAllSections() {
+		parsed.TotalTokens += s.Tokens
+	}
+
+	return parsed, nil
+}
+
+// hasOutline reports whether the outline root has at least one child entry,
+// i.e. whether the PDF provides any bookmarks to build a section tree from.
+func hasOutline(outline pdf.Outline) bool {
+	entries := outline.Child
+	return len(entries) != 0
+}
+
+// parseOutline converts the children of the outline root into level-1
+// sections, each carrying its own nested entries. It returns nil when the
+// root has no children.
+func parseOutline(outline pdf.Outline) []*Section {
+	var top []*Section
+	for i := range outline.Child {
+		top = append(top, outlineItemToSection(outline.Child[i], 1))
+	}
+	return top
+}
+
+// outlineItemToSection builds a Section for one outline entry at the given
+// heading level. The title is the entry's title with surrounding whitespace
+// removed. Nested entries are converted recursively at level+1 and attached
+// as Children, each with its Parent pointing back at the new section.
+func outlineItemToSection(item pdf.Outline, level int) *Section {
+	node := &Section{
+		Level: level,
+		Title: strings.TrimSpace(item.Title),
+	}
+
+	for i := range item.Child {
+		sub := outlineItemToSection(item.Child[i], level+1)
+		sub.Parent = node
+		node.Children = append(node.Children, sub)
+	}
+
+	return node
+}
+
+// addPageTokens adds token estimates to outlined documents by reading all pages
+func addPageTokens(r *pdf.Reader, doc *Document) {
+	numPages := r.NumPage()
+	totalText := strings.Builder{}
+
+	for i := 1; i <= numPages; i++ {
+		page := r.Page(i)
+		if page.V.IsNull() {
+			continue
+		}
+		text, err := page.GetPlainText(nil)
+		if err == nil {
+			totalText.WriteString(text)
+		}
+	}
+
+	// Distribute tokens across top-level sections proportionally
+	allText := totalText.String()
+	totalTokens := estimateTokens(allText)
+
+	if len(doc.Sections) > 0 && totalTokens > 0 {
+		tokensPerSection := totalTokens / len(doc.Sections)
+		for _, section := range doc.Sections {
+			distributeTokens(section, tokensPerSection)
+		}
+	}
+}
+
+// distributeTokens assigns a token budget to section and, recursively, to its
+// descendants.
+//
+// A section without children receives the whole budget. Otherwise the budget
+// is divided by (number of children + 1) using integer division; the section
+// and each child are given that same share, and calculateCumulativeTokens is
+// then invoked on the section.
+func distributeTokens(section *Section, tokens int) {
+	kids := section.Children
+	if len(kids) == 0 {
+		section.Tokens = tokens
+		return
+	}
+
+	// One share for the section itself plus one per child.
+	share := tokens / (len(kids) + 1)
+	section.Tokens = share
+	for _, kid := range kids {
+		distributeTokens(kid, share)
+	}
+
+	// Refresh the section's cumulative figure now that the subtree is filled in.
+	calculateCumulativeTokens(section)
+}
+
+// parseByPage builds the fallback structure used when a PDF has no outline:
+// one level-1 section per page that yields non-blank text.
+//
+// Pages whose value is null, whose text extraction fails, or whose trimmed
+// text is empty are skipped. Each section is titled "Page N", holds the
+// trimmed page text with its token estimate, and stores the 1-based page
+// number in both LineStart and LineEnd. A reader reporting zero pages yields
+// nil. When there are pages but none produced text, a single placeholder
+// section spanning all pages with zero tokens is returned instead.
+func parseByPage(r *pdf.Reader) []*Section {
+	pageCount := r.NumPage()
+	if pageCount == 0 {
+		return nil
+	}
+
+	var pages []*Section
+	for n := 1; n <= pageCount; n++ {
+		pg := r.Page(n)
+		if pg.V.IsNull() {
+			continue
+		}
+
+		raw, err := pg.GetPlainText(nil)
+		if err != nil {
+			continue
+		}
+
+		body := strings.TrimSpace(raw)
+		if body == "" {
+			continue
+		}
+
+		pages = append(pages, &Section{
+			Level:     1,
+			Title:     fmt.Sprintf("Page %d", n),
+			Content:   body,
+			Tokens:    estimateTokens(body),
+			LineStart: n, // page number, not a text line
+			LineEnd:   n,
+		})
+	}
+
+	if len(pages) > 0 {
+		return pages
+	}
+
+	// No page produced any text (e.g. a scanned, image-only document): emit a
+	// single placeholder so the page count is still reported.
+	return []*Section{{
+		Level:     1,
+		Title:     fmt.Sprintf("(%d pages - no extractable text)", pageCount),
+		Content:   "",
+		Tokens:    0,
+		LineStart: 1,
+		LineEnd:   pageCount,
+	}}
+}
diff --git a/parser/pdf_test.go b/parser/pdf_test.go
diff --git a/render/tree.go b/render/tree.go

Original file line number	Diff line number	Diff line change
`@@ -0,0 +1,2 @@`
	`1`	`+github.com/ledongthuc/pdf v0.0.0-20250511090121-5959a4027728 h1:QwWKgMY28TAXaDl+ExRDqGQltzXqN/xypdKP86niVn8=`
	`2`	`+github.com/ledongthuc/pdf v0.0.0-20250511090121-5959a4027728/go.mod h1:1fEHWurg7pvf5SG6XNE5Q8UZmOwex51Mkx3SLhrW5B4=`