Skip to content

Commit aa205b4

Browse files
JordanCoin and claude committed
Add PDF support for document mapping
- Parse PDF outlines/bookmarks into section structure
- Fall back to page-based structure for PDFs without outlines
- Handle scanned/image-only PDFs gracefully
- Fix long filename rendering crash in tree output
- Update help text and README with PDF documentation

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
1 parent cd3d1d6 commit aa205b4

File tree

7 files changed

+419
-23
lines changed

7 files changed

+419
-23
lines changed

README.md

Lines changed: 37 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -8,9 +8,9 @@
88

99
## The Problem
1010

11-
Documentation files are everywhere — READMEs, design docs, changelogs, API references. But:
11+
Documentation files are everywhere — READMEs, design docs, changelogs, API references, PDFs. But:
1212

13-
- LLMs can't open large markdown files (token limits)
13+
- LLMs can't open large markdown files or PDFs (token limits)
1414
- Humans have to open each file to see what's inside
1515
- There's no "file tree" for documentation *content*
1616

@@ -61,8 +61,9 @@ scoop install docmap
6161
## Usage
6262

6363
```bash
64-
docmap . # All markdown files in directory
65-
docmap README.md # Single file deep dive
64+
docmap . # All markdown and PDF files in directory
65+
docmap README.md # Single markdown file deep dive
66+
docmap report.pdf # Single PDF file structure
6667
docmap docs/ # Specific folder
6768
docmap README.md --section "API" # Filter to section
6869
docmap README.md --expand "API" # Show section content
@@ -73,7 +74,7 @@ docmap . --refs # Show cross-references between docs
7374

7475
### Directory Mode
7576

76-
Map all markdown files in a project:
77+
Map all markdown and PDF files in a project:
7778

7879
```bash
7980
docmap /path/to/project
@@ -123,6 +124,28 @@ See the actual content:
123124
docmap docs/API.md --expand "Authentication"
124125
```
125126

127+
### PDF Support
128+
129+
Map PDF documents by their outline (table of contents):
130+
131+
```bash
132+
docmap report.pdf
133+
```
134+
135+
```
136+
╭──────────────────── report.pdf ────────────────────╮
137+
│ Sections: 7 | ~16.9k tokens │
138+
╰────────────────────────────────────────────────────╯
139+
140+
└── Claude Code-Powered Writing Editor (9.1k)
141+
├── Introduction and Feasibility Overview (1.3k)
142+
├── Key Features and Benefits (1.3k)
143+
├── Designing the Interface (1.3k)
144+
└── Development Steps (1.3k)
145+
```
146+
147+
**Note:** For PDFs with outlines/bookmarks, docmap shows the document structure with estimated token distribution. For PDFs without outlines, it falls back to page-by-page structure. Scanned/image-only PDFs will show a page count but no text content.
148+
126149
### References Mode
127150

128151
See how docs link to each other (like `codemap --deps`):
@@ -172,12 +195,19 @@ Together: complete spatial awareness of any repository.
172195

173196
## How It Works
174197

175-
1. **Parse** markdown headings into a tree structure
198+
**Markdown:**
199+
1. **Parse** headings into a tree structure
176200
2. **Estimate** tokens per section (~4 chars/token)
177201
3. **Extract** key terms (bold text, inline code)
178202
4. **Render** as a navigable tree
179203

180-
No external dependencies. No API calls. Just fast, local parsing.
204+
**PDF:**
205+
1. **Extract** outline/bookmarks if available, otherwise use pages
206+
2. **Parse** text content from each page
207+
3. **Estimate** tokens (~4 chars/token)
208+
4. **Render** as a navigable tree
209+
210+
No API calls. Just fast, local parsing.
181211

182212
## Roadmap
183213

go.mod

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,5 @@
11
module github.com/JordanCoin/docmap
22

33
go 1.25.5
4+
5+
require github.com/ledongthuc/pdf v0.0.0-20250511090121-5959a4027728

go.sum

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
github.com/ledongthuc/pdf v0.0.0-20250511090121-5959a4027728 h1:QwWKgMY28TAXaDl+ExRDqGQltzXqN/xypdKP86niVn8=
2+
github.com/ledongthuc/pdf v0.0.0-20250511090121-5959a4027728/go.mod h1:1fEHWurg7pvf5SG6XNE5Q8UZmOwex51Mkx3SLhrW5B4=

main.go

Lines changed: 50 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -97,7 +97,7 @@ func main() {
9797
// Multi-file mode: find all .md files
9898
docs := parseDirectory(target)
9999
if len(docs) == 0 {
100-
fmt.Println("No markdown files found")
100+
fmt.Println("No markdown or PDF files found")
101101
os.Exit(1)
102102
}
103103
if jsonMode {
@@ -110,13 +110,26 @@ func main() {
110110
}
111111
} else {
112112
// Single file mode
113-
content, err := os.ReadFile(target)
114-
if err != nil {
115-
fmt.Fprintf(os.Stderr, "Error reading file: %v\n", err)
116-
os.Exit(1)
113+
var doc *parser.Document
114+
115+
if strings.HasSuffix(strings.ToLower(target), ".pdf") {
116+
// PDF file
117+
var err error
118+
doc, err = parser.ParsePDF(target)
119+
if err != nil {
120+
fmt.Fprintf(os.Stderr, "Error parsing PDF: %v\n", err)
121+
os.Exit(1)
122+
}
123+
} else {
124+
// Markdown file
125+
content, err := os.ReadFile(target)
126+
if err != nil {
127+
fmt.Fprintf(os.Stderr, "Error reading file: %v\n", err)
128+
os.Exit(1)
129+
}
130+
doc = parser.Parse(string(content))
117131
}
118132

119-
doc := parser.Parse(string(content))
120133
parts := strings.Split(target, "/")
121134
doc.Filename = parts[len(parts)-1]
122135

@@ -143,21 +156,38 @@ func parseDirectory(dir string) []*parser.Document {
143156
if info.IsDir() {
144157
return nil
145158
}
146-
if !strings.HasSuffix(strings.ToLower(path), ".md") {
159+
160+
lowerPath := strings.ToLower(path)
161+
isMd := strings.HasSuffix(lowerPath, ".md")
162+
isPdf := strings.HasSuffix(lowerPath, ".pdf")
163+
164+
if !isMd && !isPdf {
147165
return nil
148166
}
149-
// Skip hidden files and common non-doc files
167+
168+
// Skip hidden files
150169
base := filepath.Base(path)
151170
if strings.HasPrefix(base, ".") {
152171
return nil
153172
}
154173

155-
content, err := os.ReadFile(path)
156-
if err != nil {
157-
return nil
174+
var doc *parser.Document
175+
176+
if isPdf {
177+
var err error
178+
doc, err = parser.ParsePDF(path)
179+
if err != nil {
180+
// Skip PDFs that can't be parsed
181+
return nil
182+
}
183+
} else {
184+
content, err := os.ReadFile(path)
185+
if err != nil {
186+
return nil
187+
}
188+
doc = parser.Parse(string(content))
158189
}
159190

160-
doc := parser.Parse(string(content))
161191
// Get relative path from dir
162192
relPath, _ := filepath.Rel(dir, path)
163193
doc.Filename = relPath
@@ -217,11 +247,12 @@ func printUsage() {
217247
fmt.Println(`docmap - instant documentation structure for LLMs and humans
218248
219249
Usage:
220-
docmap <file.md|dir> [flags]
250+
docmap <file.md|file.pdf|dir> [flags]
221251
222252
Examples:
223-
docmap . # All markdown files in directory
224-
docmap README.md # Single file deep dive
253+
docmap . # All markdown and PDF files in directory
254+
docmap README.md # Single markdown file deep dive
255+
docmap report.pdf # Single PDF file structure
225256
docmap docs/ # Specific folder
226257
docmap README.md --section "API" # Filter to section
227258
docmap README.md --expand "API" # Show section content
@@ -235,5 +266,9 @@ Flags:
235266
-v, --version Print version
236267
-h, --help Show this help
237268
269+
PDF Support:
270+
PDFs with outlines show document structure; tokens are estimated.
271+
PDFs without outlines fall back to page-by-page structure.
272+
238273
More info: https://github.com/JordanCoin/docmap`)
239274
}

parser/pdf.go

Lines changed: 172 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,172 @@
1+
package parser
2+
3+
import (
4+
"fmt"
5+
"strings"
6+
7+
"github.com/ledongthuc/pdf"
8+
)
9+
10+
// ParsePDF parses a PDF file into a Document structure
11+
func ParsePDF(filepath string) (*Document, error) {
12+
f, r, err := pdf.Open(filepath)
13+
if err != nil {
14+
return nil, fmt.Errorf("failed to open PDF: %w", err)
15+
}
16+
defer f.Close()
17+
18+
doc := &Document{}
19+
20+
// Try to extract outline (bookmarks) first
21+
outline := r.Outline()
22+
if hasOutline(outline) {
23+
doc.Sections = parseOutline(outline)
24+
// Add page content as token estimates
25+
addPageTokens(r, doc)
26+
} else {
27+
// Fall back to page-based structure
28+
doc.Sections = parseByPage(r)
29+
}
30+
31+
// Calculate total tokens
32+
for _, s := range doc.GetAllSections() {
33+
doc.TotalTokens += s.Tokens
34+
}
35+
36+
return doc, nil
37+
}
38+
39+
// hasOutline checks if the PDF has a meaningful outline structure
40+
func hasOutline(outline pdf.Outline) bool {
41+
return len(outline.Child) > 0
42+
}
43+
44+
// parseOutline extracts document structure from PDF bookmarks
45+
func parseOutline(outline pdf.Outline) []*Section {
46+
var sections []*Section
47+
48+
for _, item := range outline.Child {
49+
section := outlineItemToSection(item, 1)
50+
sections = append(sections, section)
51+
}
52+
53+
return sections
54+
}
55+
56+
// outlineItemToSection converts a PDF outline item to a Section
57+
func outlineItemToSection(item pdf.Outline, level int) *Section {
58+
section := &Section{
59+
Level: level,
60+
Title: strings.TrimSpace(item.Title),
61+
}
62+
63+
// Process children recursively
64+
for _, child := range item.Child {
65+
childSection := outlineItemToSection(child, level+1)
66+
childSection.Parent = section
67+
section.Children = append(section.Children, childSection)
68+
}
69+
70+
return section
71+
}
72+
73+
// addPageTokens adds token estimates to outlined documents by reading all pages
74+
func addPageTokens(r *pdf.Reader, doc *Document) {
75+
numPages := r.NumPage()
76+
totalText := strings.Builder{}
77+
78+
for i := 1; i <= numPages; i++ {
79+
page := r.Page(i)
80+
if page.V.IsNull() {
81+
continue
82+
}
83+
text, err := page.GetPlainText(nil)
84+
if err == nil {
85+
totalText.WriteString(text)
86+
}
87+
}
88+
89+
// Distribute tokens across top-level sections proportionally
90+
allText := totalText.String()
91+
totalTokens := estimateTokens(allText)
92+
93+
if len(doc.Sections) > 0 && totalTokens > 0 {
94+
tokensPerSection := totalTokens / len(doc.Sections)
95+
for _, section := range doc.Sections {
96+
distributeTokens(section, tokensPerSection)
97+
}
98+
}
99+
}
100+
101+
// distributeTokens assigns token estimates to a section and its children
102+
func distributeTokens(section *Section, tokens int) {
103+
if len(section.Children) == 0 {
104+
section.Tokens = tokens
105+
return
106+
}
107+
108+
// Give parent a portion, distribute rest to children
109+
childCount := len(section.Children)
110+
perChild := tokens / (childCount + 1)
111+
section.Tokens = perChild
112+
113+
for _, child := range section.Children {
114+
distributeTokens(child, perChild)
115+
}
116+
117+
// Recalculate cumulative tokens
118+
calculateCumulativeTokens(section)
119+
}
120+
121+
// parseByPage creates a page-based structure for PDFs without outlines
122+
func parseByPage(r *pdf.Reader) []*Section {
123+
numPages := r.NumPage()
124+
if numPages == 0 {
125+
return nil
126+
}
127+
128+
var sections []*Section
129+
var hasContent bool
130+
131+
for i := 1; i <= numPages; i++ {
132+
page := r.Page(i)
133+
if page.V.IsNull() {
134+
continue
135+
}
136+
137+
text, err := page.GetPlainText(nil)
138+
if err != nil {
139+
continue
140+
}
141+
142+
text = strings.TrimSpace(text)
143+
if text == "" {
144+
continue
145+
}
146+
147+
hasContent = true
148+
section := &Section{
149+
Level: 1,
150+
Title: fmt.Sprintf("Page %d", i),
151+
Content: text,
152+
Tokens: estimateTokens(text),
153+
LineStart: i, // Page number
154+
LineEnd: i,
155+
}
156+
sections = append(sections, section)
157+
}
158+
159+
// If no text content was extracted, return a warning section
160+
if !hasContent && numPages > 0 {
161+
return []*Section{{
162+
Level: 1,
163+
Title: fmt.Sprintf("(%d pages - no extractable text)", numPages),
164+
Content: "",
165+
Tokens: 0,
166+
LineStart: 1,
167+
LineEnd: numPages,
168+
}}
169+
}
170+
171+
return sections
172+
}

0 commit comments

Comments
 (0)