diff --git a/.agents/summary/.last_commit b/.agents/summary/.last_commit new file mode 100644 index 00000000..f8cbf9ca --- /dev/null +++ b/.agents/summary/.last_commit @@ -0,0 +1 @@ +8d6102bc644641c94f5a695a32ea50c19b3c8d68 diff --git a/.agents/summary/architecture.md b/.agents/summary/architecture.md new file mode 100644 index 00000000..f005f8c2 --- /dev/null +++ b/.agents/summary/architecture.md @@ -0,0 +1,411 @@ +# System Architecture + +## High-Level Overview + +The PDF Accessibility Solutions system provides two independent but complementary approaches to making PDF documents accessible: + +1. **PDF-to-PDF Remediation**: Maintains PDF format while adding accessibility features +2. **PDF-to-HTML Remediation**: Converts PDFs to accessible HTML + +Both solutions are serverless, event-driven architectures deployed on AWS. + +## Architecture Diagram + +```mermaid +graph TB + subgraph "PDF-to-PDF Solution" + S3_PDF[S3 Bucket
pdf/ folder] + Splitter[Lambda: PDF Splitter] + StepFn[Step Functions<br/>Orchestrator] + Adobe[ECS: Adobe Autotag] + AltText[ECS: Alt Text Generator] + TitleGen[Lambda: Title Generator] + PreCheck[Lambda: Pre-Check] + PostCheck[Lambda: Post-Check] + Merger[Lambda: PDF Merger<br/>Java] + S3_Result[S3: result/ folder] + + S3_PDF -->|S3 Event| Splitter + Splitter -->|Trigger| StepFn + StepFn --> PreCheck + PreCheck --> Adobe + Adobe --> AltText + AltText --> TitleGen + TitleGen --> PostCheck + PostCheck --> Merger + Merger --> S3_Result + end + + subgraph "PDF-to-HTML Solution" + S3_HTML[S3 Bucket<br/>uploads/ folder] + Lambda_HTML[Lambda: PDF2HTML<br/>Container] + BDA[Bedrock Data<br/>Automation] + Bedrock[Bedrock<br/>Nova Pro] + S3_Remediated[S3: remediated/ folder] + + S3_HTML -->|S3 Event| Lambda_HTML + Lambda_HTML --> BDA + BDA -->|Parse PDF| Lambda_HTML + Lambda_HTML -->|Audit| Lambda_HTML + Lambda_HTML -->|Remediate| Bedrock + Bedrock -->|AI Fixes| Lambda_HTML + Lambda_HTML --> S3_Remediated + end + + subgraph "Shared Services" + CW[CloudWatch<br/>Metrics & Logs] + Secrets[Secrets Manager<br/>Adobe Credentials] + Tagger[Lambda: S3 Tagger
User Attribution] + end + + StepFn -.->|Metrics| CW + Lambda_HTML -.->|Metrics| CW + Adobe -.->|Credentials| Secrets + S3_PDF -.->|Tag Objects| Tagger + S3_HTML -.->|Tag Objects| Tagger +``` + +## PDF-to-PDF Solution Architecture + +### Workflow + +```mermaid +sequenceDiagram + participant User + participant S3 + participant Splitter + participant StepFn + participant Adobe + participant AltText + participant TitleGen + participant Merger + + User->>S3: Upload PDF to pdf/ folder + S3->>Splitter: S3 Event Notification + Splitter->>S3: Split into chunks (temp/) + Splitter->>StepFn: Start workflow + + loop For each chunk + StepFn->>Adobe: ECS Task (Fargate) + Adobe->>Adobe: Auto-tag PDF structure + Adobe->>AltText: Pass tagged PDF + AltText->>AltText: Generate alt text (Bedrock) + AltText->>S3: Save processed chunk + end + + StepFn->>TitleGen: Generate document title + TitleGen->>Merger: Merge all chunks + Merger->>S3: Save to result/ folder + S3->>User: Download compliant PDF +``` + +### Components + +#### 1. PDF Splitter Lambda +- **Runtime**: Python 3.12 +- **Trigger**: S3 PUT event on `pdf/` folder +- **Function**: Splits large PDFs into manageable chunks (pages) +- **Output**: Individual page PDFs in `temp/` folder +- **Metrics**: Pages processed, file sizes + +#### 2. Step Functions Orchestrator +- **Type**: Standard workflow +- **Purpose**: Coordinates parallel processing of PDF chunks +- **Features**: + - Parallel execution for multiple chunks + - Error handling and retries + - Progress tracking +- **Timeout**: Configurable (default: 1 hour) + +#### 3. Adobe Autotag ECS Task +- **Platform**: ECS Fargate +- **Container**: Python-based +- **Function**: + - Calls Adobe PDF Services API + - Adds PDF structure tags (headings, lists, tables) + - Extracts images and metadata +- **API**: Adobe PDF Extract API +- **Credentials**: Stored in Secrets Manager + +#### 4. 
Alt Text Generator ECS Task +- **Platform**: ECS Fargate +- **Container**: Node.js-based +- **Function**: + - Generates alt text for images using Bedrock + - Embeds alt text into PDF structure + - Uses vision-capable models +- **Model**: Amazon Nova Pro (multimodal) + +#### 5. Title Generator Lambda +- **Runtime**: Python 3.12 +- **Function**: Generates descriptive PDF title using Bedrock +- **Model**: Amazon Nova Pro +- **Input**: PDF text content +- **Output**: Metadata with generated title + +#### 6. PDF Merger Lambda +- **Runtime**: Java 11 +- **Function**: Merges processed chunks into single PDF +- **Library**: Apache PDFBox +- **Output**: Final compliant PDF with "COMPLIANT" prefix + +#### 7. Accessibility Checkers +- **Pre-Remediation**: Audits original PDF +- **Post-Remediation**: Validates compliance +- **Output**: JSON reports with WCAG issues + +### Infrastructure + +#### VPC Configuration +- **Subnets**: Public and Private with NAT Gateway +- **VPC Endpoints**: ECR, ECR Docker, S3 (reduces cold start by 10-15s) +- **Security**: Private subnets for ECS tasks + +#### ECS Cluster +- **Launch Type**: Fargate +- **CPU**: 2 vCPU (configurable) +- **Memory**: 4 GB (configurable) +- **Networking**: Private subnets with NAT egress + +#### S3 Bucket Structure +``` +pdfaccessibility-{id}/ +├── pdf/ # Input PDFs (trigger) +├── temp/ # Intermediate chunks +└── result/ # Final compliant PDFs +``` + +## PDF-to-HTML Solution Architecture + +### Workflow + +```mermaid +sequenceDiagram + participant User + participant S3 + participant Lambda + participant BDA + participant Bedrock + + User->>S3: Upload PDF to uploads/ + S3->>Lambda: S3 Event Notification + Lambda->>BDA: Create parsing job + BDA->>BDA: Parse PDF structure + BDA->>Lambda: Return structured data + Lambda->>Lambda: Convert to HTML + Lambda->>Lambda: Audit accessibility + + loop For each issue + Lambda->>Bedrock: Generate fix + Bedrock->>Lambda: AI-generated solution + Lambda->>Lambda: Apply remediation + end 
+ + Lambda->>Lambda: Generate report + Lambda->>S3: Save to remediated/ + S3->>User: Download ZIP file +``` + +### Components + +#### 1. PDF2HTML Lambda Function +- **Runtime**: Python 3.12 (container) +- **Trigger**: S3 PUT event on `uploads/` folder +- **Timeout**: 15 minutes +- **Memory**: 3 GB +- **Container**: Custom Docker image with dependencies + +#### 2. Bedrock Data Automation (BDA) +- **Service**: Amazon Bedrock Data Automation +- **Function**: + - Parses PDF structure (text, images, tables) + - Extracts layout information + - Identifies document elements +- **Output**: Structured JSON with page elements + +#### 3. Accessibility Auditor +- **Module**: `audit/auditor.py` +- **Checks**: + - WCAG 2.1 Level AA criteria + - Document structure (headings, landmarks) + - Images (alt text, decorative vs. informative) + - Forms (labels, fieldsets) + - Tables (headers, captions, scope) + - Links (descriptive text) + - Color contrast +- **Output**: Detailed issue list with locations + +#### 4. Remediation Engine +- **Module**: `remediate/remediation_manager.py` +- **Strategies**: + - Image remediation (alt text generation) + - Heading hierarchy fixes + - Table structure improvements + - Form label associations + - Landmark additions (main, nav, header, footer) + - Link text improvements + - Color contrast adjustments +- **AI Integration**: Bedrock Nova Pro for complex fixes + +#### 5. 
Report Generator +- **Formats**: HTML, JSON, CSV, TXT +- **Content**: + - Issues found and fixed + - WCAG criteria mapping + - Before/after comparisons + - Usage statistics (tokens, API calls, costs) + +### Infrastructure + +#### Lambda Container +- **Base Image**: `public.ecr.aws/lambda/python:3.12` +- **Dependencies**: + - `beautifulsoup4`, `lxml` (HTML parsing) + - `boto3` (AWS SDK) + - `Pillow` (image processing) + - Custom accessibility utility library + +#### S3 Bucket Structure +``` +pdf2html-bucket-{id}/ +├── uploads/ # Input PDFs (trigger) +├── output/ # Temporary processing files +└── remediated/ # Final ZIP files + └── final_{filename}.zip + ├── remediated.html + ├── result.html + ├── images/ + ├── remediation_report.html + └── usage_data.json +``` + +## Shared Infrastructure + +### CloudWatch Monitoring + +```mermaid +graph LR + Lambda[Lambda Functions] -->|Logs| CW[CloudWatch Logs] + ECS[ECS Tasks] -->|Logs| CW + Lambda -->|Metrics| CWM[CloudWatch Metrics] + ECS -->|Metrics| CWM + CWM --> Dashboard[Usage Dashboard] + + Dashboard --> Pages[Pages Processed] + Dashboard --> Costs[Cost Estimates] + Dashboard --> Errors[Error Rates] + Dashboard --> Tokens[Token Usage] +``` + +#### Custom Metrics Namespace: `PDFAccessibility` + +**Metrics Published**: +- `PagesProcessed`: Total pages remediated +- `AdobeAPICalls`: Adobe API invocations +- `BedrockInvocations`: Bedrock API calls +- `BedrockTokensUsed`: Input/output tokens +- `ProcessingDuration`: End-to-end time +- `ErrorCount`: Failures by type +- `FileSizeBytes`: Input/output file sizes +- `EstimatedCost`: Per-user cost tracking + +**Dimensions**: +- `Solution`: `PDF2PDF` or `PDF2HTML` +- `UserId`: Cognito user ID (from S3 tags) +- `Operation`: Specific operation type + +### S3 Object Tagging +- **Lambda**: `s3_object_tagger` +- **Purpose**: Attribute usage to individual users +- **Tags**: `user-id`, `upload-timestamp` +- **Integration**: Cognito user pools (when UI deployed) + +### Secrets Manager +- 
**Secret**: `adobe-pdf-services-credentials` +- **Contents**: + - `client_id`: Adobe API client ID + - `client_secret`: Adobe API client secret +- **Access**: Adobe Autotag ECS task only + +## Design Patterns + +### Event-Driven Architecture +- S3 events trigger processing pipelines +- Loose coupling between components +- Asynchronous processing + +### Serverless-First +- Lambda for lightweight operations +- ECS Fargate for heavy processing +- No server management + +### Infrastructure as Code +- AWS CDK for all resources +- Version-controlled infrastructure +- Repeatable deployments + +### Observability +- Comprehensive CloudWatch logging +- Custom metrics for business KPIs +- Cost tracking per user + +### Security +- Least privilege IAM roles +- VPC isolation for ECS tasks +- Secrets Manager for credentials +- SSL/TLS enforcement on S3 + +## Scalability Considerations + +### PDF-to-PDF +- **Parallel Processing**: Step Functions processes chunks concurrently +- **ECS Auto-scaling**: Fargate scales based on task count +- **Bottleneck**: Adobe API rate limits + +### PDF-to-HTML +- **Lambda Concurrency**: Configurable (default: 10) +- **BDA Limits**: Project-level quotas +- **Bedrock Throttling**: Model-specific limits + +### Cost Optimization +- **VPC Endpoints**: Reduce data transfer costs +- **zstd Compression**: Faster container startup (2-3x vs gzip) +- **Spot Instances**: Not used (Fargate on-demand for reliability) +- **S3 Lifecycle**: Automatic cleanup of temp files (optional) + +## Deployment Architecture + +```mermaid +graph TB + Dev[Developer] -->|git push| Repo[GitHub Repo] + Repo -->|webhook| CodeBuild[CodeBuild Project] + CodeBuild -->|cdk synth| CFN[CloudFormation] + CFN -->|deploy| Stack1[PDF-to-PDF Stack] + CFN -->|deploy| Stack2[PDF-to-HTML Stack] + CFN -->|deploy| Stack3[Metrics Stack] + + CodeBuild -->|docker build| ECR[ECR Repositories] + ECR --> Stack1 + ECR --> Stack2 +``` + +### Deployment Options +1. 
**One-Click**: `deploy.sh` script (CloudShell) +2. **CodeBuild**: Automated CI/CD pipeline +3. **Manual**: `cdk deploy` commands +4. **Local**: `deploy-local.sh` for development + +## Disaster Recovery + +### Backup Strategy +- **S3 Versioning**: Enabled on all buckets +- **CloudFormation**: Infrastructure recreatable from code +- **Secrets**: Backed up in Secrets Manager + +### Recovery Time Objective (RTO) +- **Infrastructure**: ~15 minutes (CDK redeploy) +- **Data**: Immediate (S3 versioning) + +### Recovery Point Objective (RPO) +- **Processing State**: Lost (stateless architecture) +- **Input Files**: Zero data loss (S3 durability) diff --git a/.agents/summary/codebase_info.md b/.agents/summary/codebase_info.md new file mode 100644 index 00000000..91a19530 --- /dev/null +++ b/.agents/summary/codebase_info.md @@ -0,0 +1,110 @@ +# Codebase Information + +## Overview + +**Project**: PDF Accessibility Solutions +**Organization**: Arizona State University's AI Cloud Innovation Center (AI CIC) +**Purpose**: Automated PDF accessibility remediation using AWS services and generative AI + +## Statistics + +- **Total Files**: 140 +- **Lines of Code**: 27,949 +- **Primary Languages**: Python (95 files), JavaScript (3 files), Java (2 files), Shell (2 files) +- **Size Category**: Medium (M) + +## Language Distribution + +| Language | Files | Functions | Classes | LOC | +|----------|-------|-----------|---------|-----| +| Python | 95 | 457 | 74 | ~25,000 | +| JavaScript | 3 | 5 | 1 | ~700 | +| Java | 2 | 7 | 2 | ~200 | +| Shell | 2 | 16 | 0 | ~1,300 | + +## Technology Stack + +### Infrastructure & Deployment +- **AWS CDK** (Python & JavaScript): Infrastructure as Code +- **AWS CloudFormation**: Stack deployment +- **CodeBuild**: CI/CD pipeline + +### AWS Services +- **Compute**: Lambda, ECS Fargate, Step Functions +- **Storage**: S3 +- **AI/ML**: Bedrock (Nova Pro model), Bedrock Data Automation +- **Monitoring**: CloudWatch, CloudWatch Dashboards +- **Security**: Secrets 
Manager, IAM +- **Networking**: VPC, VPC Endpoints + +### External Services +- **Adobe PDF Services API**: PDF auto-tagging and extraction + +### Python Dependencies +- `aws-cdk-lib==2.147.2` +- `boto3`: AWS SDK +- `beautifulsoup4`: HTML parsing +- `lxml`: XML/HTML processing +- `pypdf`: PDF manipulation +- `PyMuPDF (fitz)`: PDF text extraction +- `Pillow`: Image processing + +### JavaScript Dependencies +- `@aws-cdk/aws-lambda-python-alpha`: Lambda Python constructs +- `pdf-lib`: PDF manipulation +- `@aws-sdk/client-bedrock-runtime`: Bedrock API client + +## Repository Structure + +``` +PDF_Accessibility/ +├── .agents/ # AI assistant documentation +├── cdk/ # CDK infrastructure (Python) +│ ├── usage_metrics_stack.py # CloudWatch metrics dashboard +│ └── cdk_stack.py # Base stack definition +├── lambda/ # Lambda functions +│ ├── pdf-splitter-lambda/ # Splits PDFs into chunks +│ ├── pdf-merger-lambda/ # Merges processed PDFs (Java) +│ ├── title-generator-lambda/ # Generates PDF titles +│ ├── pre-remediation-accessibility-checker/ +│ ├── post-remediation-accessibility-checker/ +│ ├── s3_object_tagger/ # Tags S3 objects with user metadata +│ └── shared/ # Shared utilities (metrics) +├── pdf2html/ # PDF-to-HTML solution +│ ├── cdk/ # CDK infrastructure (JavaScript) +│ ├── content_accessibility_utility_on_aws/ # Core library +│ │ ├── audit/ # Accessibility auditing +│ │ ├── remediate/ # Accessibility remediation +│ │ ├── pdf2html/ # PDF to HTML conversion +│ │ ├── batch/ # Batch processing +│ │ └── utils/ # Utilities +│ ├── lambda_function.py # Lambda entry point +│ └── Dockerfile # Lambda container image +├── adobe-autotag-container/ # ECS container for Adobe API +├── alt-text-generator-container/ # ECS container for alt text (Node.js) +├── docs/ # Documentation +├── app.py # Main CDK app (PDF-to-PDF) +├── deploy.sh # Unified deployment script +└── deploy-local.sh # Local deployment script + +``` + +## Supported Standards + +- **WCAG 2.1 Level AA**: Web Content 
Accessibility Guidelines +- **PDF/UA**: PDF Universal Accessibility (ISO 14289) + +## Development Environment + +- **Python**: 3.9+ (Lambda runtime: 3.12) +- **Node.js**: 18+ (for the alt text generator container and the PDF-to-HTML CDK app) +- **Java**: 11+ (for PDF merger Lambda) +- **Docker**: Required for container builds +- **AWS CLI**: Required for deployment + +## Build Artifacts + +- **CDK Output**: `cdk.out/` directory +- **Docker Images**: Pushed to ECR +- **Lambda Packages**: Zipped and uploaded to S3 +- **CloudFormation Templates**: Generated in `cdk.out/` diff --git a/.agents/summary/components.md b/.agents/summary/components.md new file mode 100644 index 00000000..a95e9487 --- /dev/null +++ b/.agents/summary/components.md @@ -0,0 +1,634 @@ +# System Components + +## Component Catalog + +This document provides detailed information about each major component in the PDF Accessibility Solutions system. + +## PDF-to-PDF Solution Components + +### 1. PDF Splitter Lambda + +**Location**: `lambda/pdf-splitter-lambda/main.py` + +**Purpose**: Splits large PDF files into individual pages for parallel processing. + +**Key Functions**: +- `lambda_handler()`: Entry point, processes S3 events +- `split_pdf_into_pages()`: Splits PDF using pypdf library +- `log_chunk_created()`: Tracks chunk creation metrics + +**Dependencies**: +- `pypdf`: PDF manipulation +- `boto3`: S3 operations +- `metrics_helper`: CloudWatch metrics + +**Metrics Published**: +- `PagesProcessed`: Number of pages split +- `FileSizeBytes`: Input file size +- `ProcessingDuration`: Split operation time + +**Triggers**: S3 PUT event on `pdf/` folder + +**Output**: Individual page PDFs in `temp/` folder with naming pattern: `{original_name}_page_{n}.pdf` + +**Error Handling**: +- Retries with exponential backoff +- Logs errors to CloudWatch +- Publishes error metrics + +--- + +### 2. 
Adobe Autotag Container + +**Location**: `adobe-autotag-container/adobe_autotag_processor.py` + +**Purpose**: Adds accessibility tags to PDFs using Adobe PDF Services API. + +**Key Functions**: +- `main()`: Entry point for ECS task +- `autotag_pdf_with_options()`: Calls Adobe API +- `extract_api()`: Extracts images and structure +- `add_toc_to_pdf()`: Adds table of contents +- `set_language_comprehend()`: Detects document language +- `extract_images_from_extract_api()`: Extracts images for alt text + +**Adobe API Operations**: +- **Autotag**: Adds structure tags (headings, paragraphs, lists, tables) +- **Extract**: Extracts images, text, and layout information + +**Dependencies**: +- `adobe.pdfservices.operation`: Adobe SDK +- `boto3`: S3 and Secrets Manager +- `openpyxl`: Excel parsing for image metadata +- `sqlite3`: Image metadata database + +**Configuration**: +- Credentials from Secrets Manager +- Language detection via AWS Comprehend +- Configurable tagging options + +**Metrics**: +- Adobe API calls +- Processing duration +- File sizes +- Error tracking + +**Container Specs**: +- **Base Image**: `python:3.9-slim` +- **CPU**: 2 vCPU +- **Memory**: 4 GB +- **Timeout**: 30 minutes + +--- + +### 3. Alt Text Generator Container + +**Location**: `alt-text-generator-container/alt_text_generator.js` + +**Purpose**: Generates alt text for images using Amazon Bedrock. + +**Key Functions**: +- `startProcess()`: Entry point +- `modifyPDF()`: Embeds alt text into PDF +- `generateAltText()`: Calls Bedrock for image description +- `generateAltTextForLink()`: Handles linked images + +**AI Model**: Amazon Nova Pro (multimodal vision model) + +**Process**: +1. Reads PDF with existing tags +2. Identifies images without alt text +3. Extracts image context (surrounding text) +4. Generates descriptive alt text via Bedrock +5. Embeds alt text into PDF structure +6. 
Saves modified PDF + +**Dependencies**: +- `pdf-lib`: PDF manipulation +- `@aws-sdk/client-bedrock-runtime`: Bedrock API +- `@aws-sdk/client-s3`: S3 operations + +**Prompt Engineering**: +- Includes image context from surrounding text +- Distinguishes decorative vs. informative images +- Generates concise, descriptive alt text + +**Container Specs**: +- **Base Image**: `node:18-alpine` +- **CPU**: 2 vCPU +- **Memory**: 4 GB +- **Timeout**: 30 minutes + +--- + +### 4. Title Generator Lambda + +**Location**: `lambda/title-generator-lambda/title_generator.py` + +**Purpose**: Generates descriptive PDF titles using AI. + +**Key Functions**: +- `lambda_handler()`: Entry point +- `generate_title()`: Calls Bedrock for title generation +- `extract_text_from_pdf()`: Extracts text using PyMuPDF +- `set_custom_metadata()`: Embeds title in PDF metadata + +**AI Model**: Amazon Nova Pro + +**Process**: +1. Extracts first few pages of text +2. Sends to Bedrock with prompt +3. Receives generated title +4. Embeds in PDF metadata +5. Saves updated PDF + +**Prompt**: Instructs model to create concise, descriptive title based on content + +**Dependencies**: +- `pymupdf (fitz)`: PDF text extraction +- `pypdf`: PDF metadata modification +- `boto3`: S3 and Bedrock + +**Metrics**: Bedrock invocations, token usage, processing time + +--- + +### 5. PDF Merger Lambda + +**Location**: `lambda/pdf-merger-lambda/PDFMergerLambda/src/main/java/com/example/App.java` + +**Purpose**: Merges processed PDF chunks into single compliant PDF. + +**Key Functions**: +- `handleRequest()`: Lambda entry point +- `downloadPDF()`: Downloads chunks from S3 +- `mergePDFs()`: Merges using Apache PDFBox +- `uploadPDF()`: Uploads final PDF + +**Technology**: Java 11 with Apache PDFBox + +**Process**: +1. Receives list of processed chunks +2. Downloads all chunks from S3 +3. Merges in correct page order +4. Adds "COMPLIANT" prefix to filename +5. 
Uploads to `result/` folder + +**Dependencies**: +- `org.apache.pdfbox:pdfbox`: PDF merging +- `com.amazonaws:aws-lambda-java-core`: Lambda runtime +- `software.amazon.awssdk:s3`: S3 operations + +- **Memory**: 1 GB +- **Timeout**: 5 minutes + +--- + +### 6. Pre/Post Remediation Accessibility Checkers + +**Locations**: +- `lambda/pre-remediation-accessibility-checker/main.py` +- `lambda/post-remediation-accessibility-checker/main.py` + +**Purpose**: Audit PDF accessibility before and after remediation. + +**Key Functions**: +- `lambda_handler()`: Entry point +- Calls external accessibility checking service/library +- Generates JSON report with WCAG issues + +**Output**: JSON file with: +- List of accessibility issues +- WCAG criteria violations +- Issue severity levels +- Suggested fixes + +**Use Case**: +- Pre-check: Baseline audit +- Post-check: Validation of remediation + +--- + +### 7. Step Functions Orchestrator + +**Definition**: `app.py` CDK stack + +**Purpose**: Coordinates parallel processing of PDF chunks. + +**Workflow**: +```mermaid +graph TD + Start[Start] --> PreCheck[Pre-Remediation Check] + PreCheck --> Map[Map State: Process Chunks] + Map --> Adobe[Adobe Autotag Task] + Adobe --> AltText[Alt Text Generator Task] + AltText --> MapEnd[Map Complete] + MapEnd --> TitleGen[Title Generator] + TitleGen --> PostCheck[Post-Remediation Check] + PostCheck --> Merge[PDF Merger] + Merge --> End[End] +``` + +**Features**: +- **Map State**: Parallel execution of chunks +- **Error Handling**: Retry logic with exponential backoff +- **Timeouts**: Configurable per task +- **Logging**: CloudWatch Logs integration + +**Configuration**: +- Max concurrency: 10 (configurable) +- Retry attempts: 3 +- Backoff rate: 2.0 + +--- + +## PDF-to-HTML Solution Components + +### 8. PDF2HTML Lambda Function + +**Location**: `pdf2html/lambda_function.py` + +**Purpose**: Converts PDFs to accessible HTML with full remediation. 
+ +**Key Functions**: +- `lambda_handler()`: Entry point, orchestrates entire pipeline +- Calls `process_pdf_accessibility()` from main API + +**Pipeline Stages**: +1. **Conversion**: PDF → HTML via Bedrock Data Automation +2. **Audit**: Identify accessibility issues +3. **Remediation**: Fix issues using AI +4. **Report Generation**: Create detailed reports +5. **Packaging**: ZIP all outputs + +**Dependencies**: +- `content_accessibility_utility_on_aws`: Core library +- `boto3`: AWS services +- `beautifulsoup4`, `lxml`: HTML processing + +**Container**: Custom Docker image with all dependencies + +- **Timeout**: 15 minutes +- **Memory**: 3 GB + +--- + +### 9. Bedrock Data Automation Client + +**Location**: `pdf2html/content_accessibility_utility_on_aws/pdf2html/services/bedrock_client.py` + +**Purpose**: Interface to Amazon Bedrock Data Automation for PDF parsing. + +**Key Classes**: +- `BDAClient`: Base client for BDA operations +- `ExtendedBDAClient`: Enhanced client with additional features + +**Key Functions**: +- `create_project()`: Creates BDA project +- `process_and_retrieve()`: Submits PDF and retrieves results +- `_extract_html_from_result_json()`: Parses BDA output + +**BDA Capabilities**: +- PDF structure parsing +- Text extraction with layout preservation +- Image extraction +- Table detection +- Element positioning + +**Output**: Structured JSON with page elements and HTML fragments + +--- + +### 10. Accessibility Auditor + +**Location**: `pdf2html/content_accessibility_utility_on_aws/audit/auditor.py` + +**Purpose**: Comprehensive WCAG 2.1 Level AA accessibility audit. 
+ +**Key Class**: `AccessibilityAuditor` + +**Key Functions**: +- `audit()`: Main audit entry point +- `_audit_page()`: Audits single HTML page +- `_check_text_alternatives()`: Image alt text checks +- `_generate_report()`: Creates audit report + +**Audit Checks** (from `audit/checks/`): + +#### Image Checks +- Missing alt text +- Empty alt text +- Generic alt text (e.g., "image", "picture") +- Long alt text (>150 characters) +- Decorative image identification +- Figure structure (figcaption) + +#### Heading Checks +- Missing H1 +- Skipped heading levels +- Empty heading content +- Heading hierarchy + +#### Table Checks +- Missing headers +- Missing caption +- Missing scope attributes +- Irregular header structure +- Missing thead/tbody + +#### Form Checks +- Missing labels +- Missing fieldsets for radio/checkbox groups +- Missing required field indicators + +#### Link Checks +- Empty link text +- Generic link text ("click here", "read more") +- URL as link text +- New window without warning + +#### Structure Checks +- Missing document language +- Missing document title +- Missing landmarks (main, nav, header, footer) +- Missing skip links + +#### Color Contrast Checks +- Insufficient contrast ratios +- WCAG AA compliance (4.5:1 normal, 3:1 large text) + +**Output**: `AuditReport` object with: +- List of issues with locations +- WCAG criteria mapping +- Severity levels (critical, serious, moderate, minor) +- Element selectors for precise location + +--- + +### 11. Remediation Manager + +**Location**: `pdf2html/content_accessibility_utility_on_aws/remediate/remediation_manager.py` + +**Purpose**: Applies fixes to accessibility issues. 
+ +**Key Class**: `RemediationManager` + +**Key Functions**: +- `remediate_issues()`: Processes all issues +- `remediate_issue()`: Fixes single issue +- `_get_remediation_strategies()`: Maps issues to strategies + +**Remediation Strategies** (from `remediate/remediation_strategies/`): + +#### Image Remediation +- `remediate_missing_alt_text()`: Generates alt text via Bedrock +- `remediate_empty_alt_text()`: Adds descriptive alt text +- `remediate_generic_alt_text()`: Improves generic descriptions +- `remediate_long_alt_text()`: Shortens verbose alt text +- `_is_decorative_image()`: Identifies decorative images + +#### Heading Remediation +- `remediate_missing_h1()`: Adds H1 based on content +- `remediate_skipped_heading_level()`: Fixes hierarchy +- `remediate_empty_heading_content()`: Adds content or removes +- `remediate_missing_headings()`: Adds structure + +#### Table Remediation +- `remediate_table_missing_headers()`: Adds th elements +- `remediate_table_missing_caption()`: Generates caption +- `remediate_table_missing_scope()`: Adds scope attributes +- `remediate_table_missing_thead()`: Adds thead structure +- `remediate_table_irregular_headers()`: Fixes complex tables + +#### Form Remediation +- `remediate_missing_form_labels()`: Associates labels +- `remediate_missing_fieldsets()`: Groups related fields +- `remediate_missing_required_indicators()`: Adds required markers + +#### Link Remediation +- `remediate_empty_link_text()`: Adds descriptive text +- `remediate_generic_link_text()`: Improves link text +- `remediate_url_as_link_text()`: Replaces URLs with descriptions +- `remediate_new_window_link_no_warning()`: Adds warnings + +#### Landmark Remediation +- `remediate_missing_main_landmark()`: Adds main element +- `remediate_missing_navigation_landmark()`: Adds nav +- `remediate_missing_header_landmark()`: Adds header +- `remediate_missing_footer_landmark()`: Adds footer +- `remediate_missing_skip_link()`: Adds skip navigation + +#### Document Structure 
Remediation +- `remediate_missing_document_title()`: Generates title +- `remediate_missing_language()`: Adds lang attribute + +#### Color Contrast Remediation +- `remediate_insufficient_color_contrast()`: Adjusts colors + +**AI Integration**: +- Uses Bedrock Nova Pro for complex remediations +- Prompt engineering for context-aware fixes +- Fallback to rule-based fixes + +--- + +### 12. Bedrock Client (Remediation) + +**Location**: `pdf2html/content_accessibility_utility_on_aws/remediate/services/bedrock_client.py` + +**Purpose**: Interface to Amazon Bedrock for AI-powered remediation. + +**Key Class**: `BedrockClient` + +**Key Functions**: +- `generate_text()`: Text generation for fixes +- `generate_alt_text_for_image()`: Image description generation + +**Models Used**: +- Amazon Nova Pro (default) +- Configurable model selection + +**Prompt Engineering**: +- Context-aware prompts +- Element context inclusion +- WCAG criteria guidance + +--- + +### 13. Report Generator + +**Location**: `pdf2html/content_accessibility_utility_on_aws/utils/report_generator.py` + +**Purpose**: Generates comprehensive accessibility reports. + +**Key Functions**: +- `generate_report()`: Main entry point +- `generate_html_report()`: Interactive HTML report +- `generate_json_report()`: Machine-readable JSON +- `generate_csv_report()`: Spreadsheet format +- `generate_text_report()`: Plain text summary + +**HTML Report Features**: +- Issue summary with counts +- WCAG criteria breakdown +- Before/after comparisons +- Interactive filtering +- Severity color coding + +**Report Contents**: +- Total issues found +- Issues fixed automatically +- Issues requiring manual review +- WCAG 2.1 criteria mapping +- Element locations with selectors +- Remediation actions taken +- Usage statistics (tokens, costs) + +--- + +### 14. Usage Tracker + +**Location**: `pdf2html/content_accessibility_utility_on_aws/utils/usage_tracker.py` + +**Purpose**: Tracks API usage and estimates costs. 
+ +**Key Class**: `SessionUsageTracker` (Singleton) + +**Tracked Metrics**: +- Bedrock invocations +- Token usage (input/output) +- BDA processing time +- API call counts +- Estimated costs + +**Cost Estimation**: +- Bedrock: $0.0008/1K input tokens, $0.0032/1K output tokens +- BDA: Per-page pricing +- Lambda: Per-GB-second +- S3: Storage and requests + +**Output**: `usage_data.json` with detailed breakdown + +--- + +## Shared Components + +### 15. Metrics Helper + +**Location**: `lambda/shared/metrics_helper.py` + +**Purpose**: Centralized CloudWatch metrics publishing. + +**Key Class**: `MetricsContext` (Context Manager) + +**Key Functions**: +- `emit_metric()`: Publishes metric to CloudWatch +- `track_pages_processed()`: Pages metric +- `track_adobe_api_call()`: Adobe API tracking +- `track_bedrock_invocation()`: Bedrock tracking +- `track_processing_duration()`: Timing metrics +- `track_error()`: Error tracking +- `track_file_size()`: File size metrics +- `estimate_cost()`: Cost calculation + +**Usage Pattern**: +```python +with MetricsContext(user_id="user123", solution="PDF2PDF") as metrics: + metrics.track_pages_processed(10) + metrics.track_adobe_api_call() + # ... processing ... + metrics.estimate_cost(adobe_calls=1, pages=10) +``` + +**Namespace**: `PDFAccessibility` + +**Dimensions**: `Solution`, `UserId`, `Operation` + +--- + +### 16. S3 Object Tagger + +**Location**: `lambda/s3_object_tagger/main.py` + +**Purpose**: Tags S3 objects with user metadata for attribution. + +**Key Functions**: +- `lambda_handler()`: Processes S3 events +- Tags objects with `user-id` and `upload-timestamp` + +**Integration**: Cognito user pools (when UI deployed) + +**Use Case**: Per-user usage tracking and cost allocation + +--- + +### 17. CloudWatch Dashboard + +**Location**: `cdk/usage_metrics_stack.py` + +**Purpose**: Visualizes usage metrics and costs. 
+ +**Dashboard Name**: `PDF-Accessibility-Usage-Metrics` + +**Widgets**: +- Pages processed over time +- Adobe API calls +- Bedrock invocations +- Token usage (input/output) +- Error rates by type +- Estimated costs by user +- Processing duration percentiles + +**Refresh**: Real-time (1-minute intervals) + +--- + +## Component Dependencies + +```mermaid +graph TD + Splitter[PDF Splitter] --> StepFn[Step Functions] + StepFn --> Adobe[Adobe Autotag] + StepFn --> AltText[Alt Text Generator] + StepFn --> TitleGen[Title Generator] + StepFn --> Merger[PDF Merger] + + Adobe --> Secrets[Secrets Manager] + Adobe --> Metrics[Metrics Helper] + AltText --> Bedrock[Bedrock] + AltText --> Metrics + TitleGen --> Bedrock + TitleGen --> Metrics + + PDF2HTML[PDF2HTML Lambda] --> BDA[BDA Client] + PDF2HTML --> Auditor[Auditor] + PDF2HTML --> Remediator[Remediation Manager] + + Auditor --> Checks[Audit Checks] + Remediator --> Strategies[Remediation Strategies] + Remediator --> BedrockClient[Bedrock Client] + + PDF2HTML --> Reporter[Report Generator] + PDF2HTML --> UsageTracker[Usage Tracker] + + Metrics --> CloudWatch[CloudWatch] + UsageTracker --> CloudWatch +``` + +## Component Communication + +### Synchronous +- Lambda → S3 (direct API calls) +- Lambda → Bedrock (direct API calls) +- Lambda → Secrets Manager (direct API calls) + +### Asynchronous +- S3 → Lambda (event notifications) +- Step Functions → ECS (task invocation) +- Lambda → CloudWatch (metrics/logs) + +### Data Flow +- **Input**: S3 buckets +- **Processing**: Lambda/ECS +- **Output**: S3 buckets +- **Monitoring**: CloudWatch diff --git a/.agents/summary/data_models.md b/.agents/summary/data_models.md new file mode 100644 index 00000000..fde475d5 --- /dev/null +++ b/.agents/summary/data_models.md @@ -0,0 +1,629 @@ +# Data Models and Structures + +## Core Data Models + +### 1. 
Audit Models + +#### AuditReport +**Location**: `pdf2html/content_accessibility_utility_on_aws/utils/report_models.py` + +```python +@dataclass +class AuditReport(BaseReport): + summary: AuditSummary + issues: List[AuditIssue] + wcag_summary: Dict[str, Dict[str, Any]] + config: Config +``` + +**Fields**: +- `summary`: High-level statistics +- `issues`: List of accessibility issues found +- `wcag_summary`: Issues grouped by WCAG criteria +- `config`: Audit configuration used + +--- + +#### AuditSummary +```python +@dataclass +class AuditSummary(BaseSummary): + total_issues: int + by_severity: Dict[Severity, int] + by_wcag_level: Dict[str, int] + pages_audited: int + elements_checked: int +``` + +**Severity Levels**: +- `CRITICAL`: Blocks accessibility (e.g., missing alt text) +- `SERIOUS`: Major barrier (e.g., skipped heading levels) +- `MODERATE`: Significant issue (e.g., generic link text) +- `MINOR`: Minor improvement (e.g., missing lang attribute) + +--- + +#### AuditIssue +```python +@dataclass +class AuditIssue(BaseIssue): + id: str + type: str + severity: Severity + wcag_criteria: List[str] + element: str + selector: str + location: Location + message: str + suggestion: str + context: Optional[str] + status: IssueStatus +``` + +**Issue Types**: +- `missing_alt_text` +- `empty_alt_text` +- `generic_alt_text` +- `long_alt_text` +- `missing_h1` +- `skipped_heading_level` +- `empty_heading_content` +- `table_missing_headers` +- `table_missing_caption` +- `table_missing_scope` +- `form_missing_label` +- `form_missing_fieldset` +- `empty_link_text` +- `generic_link_text` +- `url_as_link_text` +- `missing_main_landmark` +- `missing_document_language` +- `insufficient_color_contrast` + +--- + +#### Location +```python +@dataclass +class Location: + page: int + line: Optional[int] + column: Optional[int] + xpath: Optional[str] +``` + +--- + +### 2. 
Remediation Models + +#### RemediationReport +```python +@dataclass +class RemediationReport(BaseReport): + summary: RemediationSummary + fixes: List[RemediationFix] + manual_review_items: List[ManualReviewItem] + config: Config +``` + +--- + +#### RemediationSummary +```python +@dataclass +class RemediationSummary(BaseSummary): + total_issues: int + fixed_automatically: int + requires_manual_review: int + failed: int + by_method: Dict[str, int] # ai_generated, rule_based, manual +``` + +--- + +#### RemediationFix +```python +@dataclass +class RemediationFix: + issue_id: str + issue_type: str + status: RemediationStatus + method: str + original_element: str + fixed_element: str + details: RemediationDetails + timestamp: datetime +``` + +**RemediationStatus**: +- `FIXED`: Successfully remediated +- `FAILED`: Remediation failed +- `MANUAL_REVIEW`: Requires human review +- `SKIPPED`: Intentionally skipped + +--- + +#### RemediationDetails +```python +@dataclass +class RemediationDetails: + ai_prompt: Optional[str] + ai_response: Optional[str] + ai_model: Optional[str] + tokens_used: Optional[int] + confidence: Optional[float] + fallback_used: bool + error_message: Optional[str] +``` + +--- + +#### ManualReviewItem +```python +@dataclass +class ManualReviewItem: + issue_id: str + issue_type: str + reason: str + element: str + selector: str + suggestion: str + priority: str # high, medium, low +``` + +--- + +### 3. Configuration Models + +#### Config +```python +@dataclass +class Config: + wcag_level: str = "AA" # AA or AAA + include_warnings: bool = True + check_color_contrast: bool = True + auto_remediate: bool = True + use_ai: bool = True + bedrock_model: str = "amazon.nova-pro-v1:0" + max_retries: int = 3 + timeout_seconds: int = 300 + output_formats: List[str] = field(default_factory=lambda: ["html", "json"]) +``` + +--- + +### 4. 
Usage Tracking Models + +#### UsageData +**Location**: `pdf2html/content_accessibility_utility_on_aws/utils/usage_tracker.py` + +```python +{ + "session_id": "uuid", + "user_id": "user123", + "solution": "PDF2HTML", + "timestamp": "2026-03-02T15:00:00Z", + "pdf_info": { + "filename": "document.pdf", + "size_bytes": 1024000, + "pages": 10 + }, + "bedrock_usage": { + "invocations": 15, + "input_tokens": 5000, + "output_tokens": 2000, + "models_used": ["amazon.nova-pro-v1:0"] + }, + "bda_usage": { + "pages_processed": 10, + "processing_time_seconds": 45 + }, + "processing_metrics": { + "total_duration_seconds": 120, + "conversion_time": 45, + "audit_time": 20, + "remediation_time": 55 + }, + "cost_estimates": { + "bedrock": 0.0104, + "bda": 0.50, + "lambda": 0.0015, + "s3": 0.0001, + "total": 0.512 + } +} +``` + +--- + +### 5. BDA Models + +#### BDAElement +**Location**: `pdf2html/content_accessibility_utility_on_aws/remediate/bda_integration/element_parser.py` + +```python +{ + "id": "element-001", + "type": "text" | "image" | "table" | "heading", + "page": 1, + "content": "Element content", + "bounding_box": { + "x": 100, + "y": 200, + "width": 300, + "height": 50 + }, + "confidence": 0.95, + "attributes": { + "font_size": 12, + "font_family": "Arial", + "color": "#000000" + }, + "children": [] # Nested elements +} +``` + +--- + +#### BDAPage +```python +{ + "page_number": 1, + "width": 612, + "height": 792, + "elements": [BDAElement], + "images": [ + { + "id": "img-001", + "s3_path": "s3://bucket/images/img-001.png", + "bounding_box": {...}, + "alt_text": None + } + ] +} +``` + +--- + +### 6. 
Metrics Models + +#### MetricData +**Location**: `lambda/shared/metrics_helper.py` + +```python +{ + "namespace": "PDFAccessibility", + "metric_name": "PagesProcessed", + "value": 10, + "unit": "Count", + "timestamp": "2026-03-02T15:00:00Z", + "dimensions": [ + {"name": "Solution", "value": "PDF2PDF"}, + {"name": "UserId", "value": "user123"}, + {"name": "Operation", "value": "adobe_autotag"} + ] +} +``` + +--- + +### 7. Step Functions State + +#### ChunkProcessingState +```python +{ + "chunk_id": "chunk-001", + "s3_key": "temp/document_page_1.pdf", + "page_number": 1, + "status": "processing" | "completed" | "failed", + "adobe_output": "temp/document_page_1_tagged.pdf", + "alttext_output": "temp/document_page_1_final.pdf", + "errors": [] +} +``` + +--- + +#### WorkflowState +```python +{ + "execution_id": "exec-uuid", + "original_file": "pdf/document.pdf", + "user_id": "user123", + "chunks": [ChunkProcessingState], + "pre_check_results": {...}, + "post_check_results": {...}, + "final_output": "result/COMPLIANT_document.pdf", + "metrics": { + "total_pages": 10, + "processing_time": 120, + "adobe_calls": 10, + "bedrock_calls": 50 + } +} +``` + +--- + +## WCAG Criteria Mapping + +### WCAG 2.1 Level AA Criteria + +```python +WCAG_CRITERIA = { + "1.1.1": { + "name": "Non-text Content", + "level": "A", + "description": "All non-text content has text alternative", + "issue_types": ["missing_alt_text", "empty_alt_text"] + }, + "1.3.1": { + "name": "Info and Relationships", + "level": "A", + "description": "Information, structure, and relationships can be programmatically determined", + "issue_types": ["table_missing_headers", "form_missing_label", "missing_headings"] + }, + "1.3.2": { + "name": "Meaningful Sequence", + "level": "A", + "description": "Correct reading sequence can be programmatically determined", + "issue_types": ["skipped_heading_level"] + }, + "1.4.3": { + "name": "Contrast (Minimum)", + "level": "AA", + "description": "Text has contrast ratio of at least 
4.5:1", + "issue_types": ["insufficient_color_contrast"] + }, + "2.4.1": { + "name": "Bypass Blocks", + "level": "A", + "description": "Mechanism to bypass blocks of repeated content", + "issue_types": ["missing_skip_link"] + }, + "2.4.2": { + "name": "Page Titled", + "level": "A", + "description": "Web pages have titles that describe topic or purpose", + "issue_types": ["missing_document_title"] + }, + "2.4.4": { + "name": "Link Purpose (In Context)", + "level": "A", + "description": "Purpose of each link can be determined from link text", + "issue_types": ["empty_link_text", "generic_link_text", "url_as_link_text"] + }, + "2.4.6": { + "name": "Headings and Labels", + "level": "AA", + "description": "Headings and labels describe topic or purpose", + "issue_types": ["empty_heading_content", "generic_heading_text"] + }, + "3.1.1": { + "name": "Language of Page", + "level": "A", + "description": "Default human language can be programmatically determined", + "issue_types": ["missing_document_language"] + }, + "4.1.2": { + "name": "Name, Role, Value", + "level": "A", + "description": "Name and role can be programmatically determined", + "issue_types": ["missing_aria_labels", "invalid_aria_attributes"] + } +} +``` + +--- + +## File Formats + +### 1. 
Audit Report JSON +```json +{ + "version": "1.0", + "timestamp": "2026-03-02T15:00:00Z", + "html_file": "document.html", + "summary": { + "total_issues": 42, + "by_severity": { + "critical": 5, + "serious": 15, + "moderate": 18, + "minor": 4 + }, + "by_wcag_level": { + "A": 25, + "AA": 17 + }, + "pages_audited": 10, + "elements_checked": 523 + }, + "issues": [ + { + "id": "img-001", + "type": "missing_alt_text", + "severity": "critical", + "wcag_criteria": ["1.1.1"], + "element": "", + "selector": "body > div.content > img:nth-child(3)", + "location": { + "page": 1, + "line": 45, + "column": 12 + }, + "message": "Image is missing alt attribute", + "suggestion": "Add descriptive alt text that conveys the purpose of the image", + "context": "Surrounding text: Lorem ipsum...", + "status": "open" + } + ], + "wcag_summary": { + "1.1.1": { + "count": 5, + "description": "Non-text Content", + "level": "A" + } + }, + "config": { + "wcag_level": "AA", + "include_warnings": true, + "check_color_contrast": true + } +} +``` + +--- + +### 2. 
Remediation Report JSON +```json +{ + "version": "1.0", + "timestamp": "2026-03-02T15:00:00Z", + "html_file": "document_remediated.html", + "summary": { + "total_issues": 42, + "fixed_automatically": 35, + "requires_manual_review": 7, + "failed": 0, + "by_method": { + "ai_generated": 20, + "rule_based": 15, + "manual": 0 + } + }, + "fixes": [ + { + "issue_id": "img-001", + "issue_type": "missing_alt_text", + "status": "fixed", + "method": "ai_generated", + "original_element": "", + "fixed_element": "A graph showing sales trends over time", + "details": { + "ai_prompt": "Generate alt text for this image...", + "ai_response": "A graph showing sales trends over time", + "ai_model": "amazon.nova-pro-v1:0", + "tokens_used": 150, + "confidence": 0.92, + "fallback_used": false + }, + "timestamp": "2026-03-02T15:01:23Z" + } + ], + "manual_review_items": [ + { + "issue_id": "table-005", + "issue_type": "table_irregular_headers", + "reason": "Complex table structure with merged cells", + "element": "...
", + "selector": "body > table:nth-child(5)", + "suggestion": "Manually verify header associations and add scope attributes", + "priority": "high" + } + ] +} +``` + +--- + +### 3. Usage Data JSON +```json +{ + "session_id": "550e8400-e29b-41d4-a716-446655440000", + "user_id": "user123", + "solution": "PDF2HTML", + "timestamp": "2026-03-02T15:00:00Z", + "pdf_info": { + "filename": "document.pdf", + "size_bytes": 1024000, + "pages": 10 + }, + "bedrock_usage": { + "invocations": 15, + "input_tokens": 5000, + "output_tokens": 2000, + "models_used": ["amazon.nova-pro-v1:0"] + }, + "bda_usage": { + "pages_processed": 10, + "processing_time_seconds": 45 + }, + "processing_metrics": { + "total_duration_seconds": 120, + "conversion_time": 45, + "audit_time": 20, + "remediation_time": 55 + }, + "cost_estimates": { + "bedrock": 0.0104, + "bda": 0.50, + "lambda": 0.0015, + "s3": 0.0001, + "total": 0.512 + } +} +``` + +--- + +## Database Schemas + +### Image Metadata SQLite (Adobe Container) + +**Table**: `image_metadata` + +```sql +CREATE TABLE image_metadata ( + id INTEGER PRIMARY KEY AUTOINCREMENT, + image_path TEXT NOT NULL, + page_number INTEGER, + bounding_box TEXT, -- JSON string + alt_text TEXT, + is_decorative BOOLEAN DEFAULT 0, + context TEXT, + confidence REAL, + created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP +); +``` + +**Usage**: Stores extracted image information from Adobe Extract API for alt text generation. 
+ +--- + +## Enumerations + +### Severity +```python +class Severity(Enum): + CRITICAL = "critical" + SERIOUS = "serious" + MODERATE = "moderate" + MINOR = "minor" +``` + +### IssueStatus +```python +class IssueStatus(Enum): + OPEN = "open" + FIXED = "fixed" + MANUAL_REVIEW = "manual_review" + SKIPPED = "skipped" +``` + +### RemediationStatus +```python +class RemediationStatus(Enum): + FIXED = "fixed" + FAILED = "failed" + MANUAL_REVIEW = "manual_review" + SKIPPED = "skipped" +``` + +### RemediationMethod +```python +class RemediationMethod(Enum): + AI_GENERATED = "ai_generated" + RULE_BASED = "rule_based" + MANUAL = "manual" +``` diff --git a/.agents/summary/dependencies.md b/.agents/summary/dependencies.md new file mode 100644 index 00000000..1dd13c72 --- /dev/null +++ b/.agents/summary/dependencies.md @@ -0,0 +1,540 @@ +# Dependencies and External Services + +## External Service Dependencies + +### 1. Adobe PDF Services API + +**Purpose**: PDF structure tagging and content extraction + +**Service Type**: Third-party REST API + +**Authentication**: OAuth 2.0 client credentials + +**Required Credentials**: +- Client ID +- Client Secret + +**Pricing Model**: Enterprise contract or trial account + +**Rate Limits**: Contract-dependent + +**APIs Used**: +- **Autotag API**: Adds accessibility tags +- **Extract API**: Extracts images and structure + +**Failure Impact**: +- **Critical**: PDF-to-PDF solution cannot function without it +- **Mitigation**: Retry logic with exponential backoff + +**Documentation**: https://developer.adobe.com/document-services/docs/overview/pdf-services-api/ + +--- + +### 2. 
AWS Bedrock + +**Purpose**: AI-powered content generation + +**Service Type**: AWS managed service + +**Authentication**: IAM role-based + +**Models Used**: +- **Amazon Nova Pro** (`amazon.nova-pro-v1:0`) + - Multimodal (text + vision) + - Alt text generation + - Title generation + - Remediation suggestions + +**Pricing**: +- Input tokens: $0.0008 per 1K tokens +- Output tokens: $0.0032 per 1K tokens + +**Rate Limits**: +- Requests per minute: Model-dependent +- Tokens per minute: Model-dependent + +**Failure Impact**: +- **High**: AI-powered features unavailable +- **Mitigation**: Fall back to rule-based fixes + +**Required Permissions**: +```json +{ + "Effect": "Allow", + "Action": [ + "bedrock:InvokeModel" + ], + "Resource": "arn:aws:bedrock:*::foundation-model/amazon.nova-pro-v1:0" +} +``` + +--- + +### 3. AWS Bedrock Data Automation + +**Purpose**: PDF parsing and structure extraction + +**Service Type**: AWS managed service + +**Authentication**: IAM role-based + +**Pricing**: Per-page processing fee + +**Rate Limits**: Project-level quotas + +**Failure Impact**: +- **Critical**: PDF-to-HTML solution cannot function without it +- **Mitigation**: Retry logic, timeout handling + +**Required Permissions**: +```json +{ + "Effect": "Allow", + "Action": [ + "bedrock:CreateDataAutomationProject", + "bedrock:InvokeDataAutomationAsync", + "bedrock:GetDataAutomationStatus" + ], + "Resource": "*" +} +``` + +--- + +## AWS Service Dependencies + +### 4. 
Amazon S3 + +**Purpose**: Object storage for PDFs and outputs + +**Pricing**: +- Storage: $0.023 per GB/month (Standard) +- PUT requests: $0.005 per 1,000 requests +- GET requests: $0.0004 per 1,000 requests + +**Features Used**: +- Event notifications +- Versioning +- Server-side encryption (SSE-S3) +- Object tagging +- Lifecycle policies (optional) + +**Required Permissions**: +```json +{ + "Effect": "Allow", + "Action": [ + "s3:GetObject", + "s3:PutObject", + "s3:DeleteObject", + "s3:ListBucket", + "s3:PutObjectTagging" + ], + "Resource": [ + "arn:aws:s3:::bucket-name", + "arn:aws:s3:::bucket-name/*" + ] +} +``` + +--- + +### 5. AWS Lambda + +**Purpose**: Serverless compute for lightweight operations + +**Runtimes Used**: +- Python 3.12 +- Java 11 +- Node.js 18 (via container) + +**Pricing**: +- Requests: $0.20 per 1M requests +- Duration: $0.0000166667 per GB-second + +**Limits**: +- Timeout: 15 minutes (max) +- Memory: 10 GB (max) +- Deployment package: 250 MB (unzipped) +- Container image: 10 GB + +**Functions Deployed**: +- PDF Splitter (Python) +- PDF Merger (Java) +- Title Generator (Python) +- Pre/Post Accessibility Checkers (Python) +- S3 Object Tagger (Python) +- PDF2HTML Pipeline (Python container) + +--- + +### 6. Amazon ECS Fargate + +**Purpose**: Containerized compute for heavy processing + +**Pricing**: +- vCPU: $0.04048 per vCPU per hour +- Memory: $0.004445 per GB per hour + +**Configuration**: +- CPU: 2 vCPU +- Memory: 4 GB +- Platform: Linux/AMD64 + +**Containers Deployed**: +- Adobe Autotag Processor (Python) +- Alt Text Generator (Node.js) + +**Cold Start Optimization**: +- VPC endpoints for ECR (reduces 10-15s) +- zstd compression (2-3x faster than gzip) + +--- + +### 7. 
AWS Step Functions + +**Purpose**: Workflow orchestration + +**Pricing**: +- State transitions: $0.025 per 1,000 transitions + +**Features Used**: +- Map state (parallel execution) +- Error handling and retries +- CloudWatch integration + +**Workflow**: PDF-to-PDF chunk processing + +--- + +### 8. Amazon ECR + +**Purpose**: Container image registry + +**Pricing**: +- Storage: $0.10 per GB/month +- Data transfer: Standard AWS rates + +**Images Stored**: +- Adobe Autotag container +- Alt Text Generator container +- PDF2HTML Lambda container + +--- + +### 9. AWS Secrets Manager + +**Purpose**: Secure credential storage + +**Pricing**: +- Secret: $0.40 per secret per month +- API calls: $0.05 per 10,000 calls + +**Secrets Stored**: +- Adobe PDF Services credentials + +--- + +### 10. Amazon CloudWatch + +**Purpose**: Monitoring, logging, and metrics + +**Pricing**: +- Logs ingestion: $0.50 per GB +- Logs storage: $0.03 per GB/month +- Custom metrics: $0.30 per metric per month +- Dashboard: $3.00 per dashboard per month + +**Features Used**: +- Log groups for all Lambda/ECS +- Custom metrics namespace: `PDFAccessibility` +- Usage metrics dashboard + +--- + +### 11. Amazon VPC + +**Purpose**: Network isolation for ECS tasks + +**Pricing**: +- NAT Gateway: $0.045 per hour + $0.045 per GB processed +- VPC Endpoints: $0.01 per hour per AZ + +**Configuration**: +- 2 Availability Zones +- Public and private subnets +- NAT Gateway for egress +- VPC endpoints for ECR and S3 + +--- + +### 12. AWS IAM + +**Purpose**: Access control and permissions + +**Pricing**: Free + +**Roles Created**: +- Lambda execution roles +- ECS task roles +- ECS task execution roles +- Step Functions execution role + +--- + +### 13. 
AWS CodeBuild + +**Purpose**: CI/CD pipeline for deployment + +**Pricing**: +- Build minutes: $0.005 per minute (general1.small) + +**Usage**: Automated deployment via `deploy.sh` + +--- + +## Python Dependencies + +### Core Libraries + +#### boto3 +- **Version**: Latest +- **Purpose**: AWS SDK for Python +- **Used By**: All Python components +- **License**: Apache 2.0 + +#### aws-cdk-lib +- **Version**: 2.147.2 +- **Purpose**: AWS CDK framework +- **Used By**: Infrastructure code +- **License**: Apache 2.0 + +### PDF Processing + +#### pypdf +- **Version**: 4.3.1 +- **Purpose**: PDF manipulation +- **Used By**: PDF Splitter, Title Generator +- **License**: BSD + +#### PyMuPDF (fitz) +- **Version**: 1.24.14 +- **Purpose**: PDF text extraction +- **Used By**: Title Generator +- **License**: AGPL + +### HTML Processing + +#### beautifulsoup4 +- **Version**: Latest +- **Purpose**: HTML parsing +- **Used By**: PDF2HTML, Auditor, Remediator +- **License**: MIT + +#### lxml +- **Version**: Latest +- **Purpose**: XML/HTML processing +- **Used By**: PDF2HTML, Auditor +- **License**: BSD + +### Image Processing + +#### Pillow +- **Version**: Latest +- **Purpose**: Image manipulation +- **Used By**: PDF2HTML, Alt Text Generator +- **License**: HPND + +### Adobe SDK + +#### pdfservices-sdk +- **Version**: 4.1.0 +- **Purpose**: Adobe PDF Services API client +- **Used By**: Adobe Autotag container +- **License**: Proprietary (Adobe) + +### Utilities + +#### openpyxl +- **Version**: Latest +- **Purpose**: Excel file parsing +- **Used By**: Adobe Autotag container +- **License**: MIT + +#### requests +- **Version**: 2.31.0 +- **Purpose**: HTTP client +- **Used By**: Adobe SDK, BDA client +- **License**: Apache 2.0 + +--- + +## JavaScript Dependencies + +### AWS SDK + +#### @aws-sdk/client-bedrock-runtime +- **Version**: Latest +- **Purpose**: Bedrock API client +- **Used By**: Alt Text Generator +- **License**: Apache 2.0 + +#### @aws-sdk/client-s3 +- **Version**: Latest +- 
**Purpose**: S3 API client +- **Used By**: Alt Text Generator, PDF2HTML CDK +- **License**: Apache 2.0 + +### PDF Processing + +#### pdf-lib +- **Version**: Latest +- **Purpose**: PDF manipulation +- **Used By**: Alt Text Generator +- **License**: MIT + +### CDK + +#### aws-cdk-lib +- **Version**: Latest +- **Purpose**: AWS CDK framework +- **Used By**: PDF2HTML CDK stack +- **License**: Apache 2.0 + +#### @aws-cdk/aws-lambda-python-alpha +- **Version**: Latest +- **Purpose**: Python Lambda constructs +- **Used By**: PDF2HTML CDK stack +- **License**: Apache 2.0 + +--- + +## Java Dependencies + +### PDF Processing + +#### org.apache.pdfbox:pdfbox +- **Version**: Latest +- **Purpose**: PDF merging +- **Used By**: PDF Merger Lambda +- **License**: Apache 2.0 + +### AWS SDK + +#### software.amazon.awssdk:s3 +- **Version**: Latest +- **Purpose**: S3 operations +- **Used By**: PDF Merger Lambda +- **License**: Apache 2.0 + +#### com.amazonaws:aws-lambda-java-core +- **Version**: Latest +- **Purpose**: Lambda runtime +- **Used By**: PDF Merger Lambda +- **License**: Apache 2.0 + +--- + +## Development Dependencies + +### Python + +#### pytest +- **Purpose**: Testing framework +- **License**: MIT + +#### black +- **Purpose**: Code formatting +- **License**: MIT + +#### mypy +- **Purpose**: Type checking +- **License**: MIT + +### Node.js + +#### eslint +- **Purpose**: Linting +- **License**: MIT + +#### prettier +- **Purpose**: Code formatting +- **License**: MIT + +--- + +## Dependency Management + +### Python +- **File**: `requirements.txt` +- **Tool**: pip +- **Virtual Environment**: venv + +### JavaScript +- **File**: `package.json`, `package-lock.json` +- **Tool**: npm + +### Java +- **File**: `pom.xml` +- **Tool**: Maven + +--- + +## Security Considerations + +### Dependency Scanning +- Regular updates for security patches +- Vulnerability scanning with AWS Inspector +- Dependabot alerts (GitHub) + +### License Compliance +- Most dependencies use permissive licenses (MIT, BSD, Apache 2.0, HPND); the exceptions are noted below 
+- Adobe SDK requires enterprise contract +- AGPL license (PyMuPDF) - consider alternatives for commercial use + +### Supply Chain Security +- Pin dependency versions +- Use official package repositories +- Verify package signatures + +--- + +## Version Compatibility + +### Python +- **Minimum**: 3.9 +- **Recommended**: 3.12 +- **Lambda Runtime**: 3.12 + +### Node.js +- **Minimum**: 18 +- **Recommended**: 18 LTS +- **Lambda Runtime**: 18 + +### Java +- **Minimum**: 11 +- **Recommended**: 11 +- **Lambda Runtime**: 11 + +### AWS CDK +- **Version**: 2.147.2 +- **Compatibility**: AWS CDK v2 + +--- + +## Dependency Update Strategy + +### Regular Updates +- Monthly security patch review +- Quarterly minor version updates +- Annual major version updates + +### Testing +- Unit tests after updates +- Integration tests with AWS services +- End-to-end workflow validation + +### Rollback Plan +- Version pinning in requirements files +- CDK snapshot testing +- Blue/green deployment for major changes diff --git a/.agents/summary/index.md b/.agents/summary/index.md new file mode 100644 index 00000000..33eeef08 --- /dev/null +++ b/.agents/summary/index.md @@ -0,0 +1,454 @@ +# PDF Accessibility Solutions - Knowledge Base Index + +## 🤖 Instructions for AI Assistants + +This index serves as your primary entry point for understanding the PDF Accessibility Solutions codebase. Each document below contains rich metadata and detailed information about specific aspects of the system. + +**How to Use This Index**: +1. **Start Here**: Read the summaries below to understand which documents contain relevant information +2. **Navigate Efficiently**: Use the metadata tags to quickly find specific topics +3. **Deep Dive**: Reference the full documents only when you need detailed implementation information +4. 
**Cross-Reference**: Documents are interconnected - follow references between them + +**Key Principle**: This index contains sufficient metadata for you to answer most questions without reading full documents. Only access detailed documents when you need specific implementation details, code examples, or technical specifications. + +--- + +## 📚 Document Catalog + +### 1. Codebase Information +**File**: `codebase_info.md` +**Purpose**: High-level overview of the codebase structure, statistics, and technology stack +**When to Use**: Understanding project scope, technology choices, repository organization + +**Key Topics**: +- Project statistics (140 files, 27,949 LOC) +- Language distribution (Python 95 files, JavaScript 3, Java 2, Shell 2) +- Technology stack (AWS CDK, Lambda, ECS, Bedrock, S3) +- Repository structure and organization +- Development environment requirements +- Supported standards (WCAG 2.1 Level AA, PDF/UA) + +**Metadata Tags**: `#overview` `#statistics` `#technology-stack` `#repository-structure` + +**Quick Facts**: +- Two main solutions: PDF-to-PDF and PDF-to-HTML +- Built by Arizona State University's AI Cloud Innovation Center +- Serverless architecture on AWS +- Python 3.12, Node.js 18, Java 11 runtimes + +--- + +### 2. 
Architecture +**File**: `architecture.md` +**Purpose**: System architecture, component interactions, and design patterns +**When to Use**: Understanding system design, data flow, infrastructure, scalability + +**Key Topics**: +- High-level architecture diagrams (Mermaid) +- PDF-to-PDF solution workflow (S3 → Lambda → Step Functions → ECS → Merger) +- PDF-to-HTML solution workflow (S3 → Lambda → BDA → Bedrock → Remediation) +- VPC configuration and networking +- ECS Fargate setup +- CloudWatch monitoring architecture +- Deployment architecture +- Scalability and cost optimization strategies + +**Metadata Tags**: `#architecture` `#design-patterns` `#infrastructure` `#workflows` `#scalability` + +**Key Diagrams**: +- Overall system architecture +- PDF-to-PDF sequence diagram +- PDF-to-HTML sequence diagram +- Monitoring architecture +- Deployment flow + +**Design Patterns**: +- Event-driven architecture +- Serverless-first +- Infrastructure as Code (CDK) +- Observability-first + +--- + +### 3. 
Components +**File**: `components.md` +**Purpose**: Detailed descriptions of all system components, their responsibilities, and interactions +**When to Use**: Understanding specific components, debugging, extending functionality + +**Key Topics**: +- **PDF-to-PDF Components**: + - PDF Splitter Lambda (splits PDFs into pages) + - Adobe Autotag Container (adds accessibility tags) + - Alt Text Generator Container (generates image descriptions) + - Title Generator Lambda (creates document titles) + - PDF Merger Lambda (Java, merges processed chunks) + - Accessibility Checkers (pre/post validation) + - Step Functions Orchestrator (workflow coordination) + +- **PDF-to-HTML Components**: + - PDF2HTML Lambda Function (main pipeline) + - Bedrock Data Automation Client (PDF parsing) + - Accessibility Auditor (WCAG compliance checking) + - Remediation Manager (fixes accessibility issues) + - Report Generator (creates detailed reports) + - Usage Tracker (cost and metrics tracking) + +- **Shared Components**: + - Metrics Helper (CloudWatch metrics) + - S3 Object Tagger (user attribution) + - CloudWatch Dashboard (visualization) + +**Metadata Tags**: `#components` `#lambda` `#ecs` `#step-functions` `#auditing` `#remediation` + +**Component Dependencies**: Includes dependency graph showing relationships between components + +--- + +### 4. 
Interfaces and APIs +**File**: `interfaces.md` +**Purpose**: API specifications, data contracts, and integration points +**When to Use**: Integrating with the system, understanding API contracts, debugging API calls + +**Key Topics**: +- **External APIs**: + - Adobe PDF Services API (Autotag, Extract) + - AWS Bedrock API (Nova Pro model) + - AWS Bedrock Data Automation API + +- **Internal APIs**: + - Content Accessibility Utility API + - Audit API + - Remediation API + +- **AWS Service Interfaces**: + - S3 operations + - CloudWatch metrics and logs + - Secrets Manager + - Step Functions + +- **Data Models**: AuditReport, RemediationReport, AuditIssue, RemediationFix +- **Event Schemas**: S3 events, Step Functions input/output +- **Error Responses**: Standard error format and codes + +**Metadata Tags**: `#apis` `#interfaces` `#data-contracts` `#integration` `#events` + +**API Examples**: Includes request/response examples for all major APIs + +--- + +### 5. Data Models +**File**: `data_models.md` +**Purpose**: Data structures, schemas, and type definitions +**When to Use**: Understanding data formats, implementing new features, parsing outputs + +**Key Topics**: +- **Audit Models**: AuditReport, AuditSummary, AuditIssue, Location +- **Remediation Models**: RemediationReport, RemediationSummary, RemediationFix, RemediationDetails, ManualReviewItem +- **Configuration Models**: Config with WCAG levels and options +- **Usage Tracking Models**: UsageData with cost estimates +- **BDA Models**: BDAElement, BDAPage +- **Metrics Models**: MetricData with dimensions +- **WCAG Criteria Mapping**: Mapping of selected WCAG 2.1 Level A and AA success criteria to issue types +- **File Formats**: JSON schemas for reports and usage data +- **Database Schemas**: SQLite schema for image metadata +- **Enumerations**: Severity, IssueStatus, RemediationStatus, RemediationMethod + +**Metadata Tags**: `#data-models` `#schemas` `#types` `#wcag` `#reports` + +**Issue Types**: Complete list of 20+ accessibility issue types with WCAG mappings + +--- + +### 
6. Workflows +**File**: `workflows.md` +**Purpose**: End-to-end process flows and operational procedures +**When to Use**: Understanding process flows, troubleshooting, optimizing performance + +**Key Topics**: +- **PDF-to-PDF Workflow**: 8-step process from upload to compliant PDF + - Upload → Split → Pre-check → Parallel Processing → Title → Post-check → Merge → Output + - Processing time: 3-60 minutes depending on size + +- **PDF-to-HTML Workflow**: 7-step process from upload to remediated HTML + - Upload → BDA Conversion → Audit → Remediation → Report → Package → Output + - Processing time: 1-20 minutes depending on size + +- **Deployment Workflow**: One-click and manual deployment processes +- **Error Handling Workflows**: Retry logic and recovery procedures +- **Monitoring Workflow**: Metrics collection and log aggregation +- **Cost Tracking Workflow**: Per-user cost attribution + +**Metadata Tags**: `#workflows` `#processes` `#deployment` `#error-handling` `#monitoring` + +**Timing Information**: Detailed timing for each workflow step + +--- + +### 7. 
Dependencies +**File**: `dependencies.md` +**Purpose**: External services, libraries, and version requirements +**When to Use**: Setting up development environment, troubleshooting dependency issues, updating versions + +**Key Topics**: +- **External Services**: + - Adobe PDF Services API (enterprise contract required) + - AWS Bedrock (IAM-based, Nova Pro model) + - AWS Bedrock Data Automation (per-page pricing) + +- **AWS Services**: S3, Lambda, ECS, Step Functions, ECR, Secrets Manager, CloudWatch, VPC, IAM, CodeBuild + +- **Python Dependencies**: boto3, aws-cdk-lib, pypdf, PyMuPDF, beautifulsoup4, lxml, Pillow, pdfservices-sdk + +- **JavaScript Dependencies**: AWS SDK, pdf-lib, CDK libraries + +- **Java Dependencies**: Apache PDFBox, AWS SDK + +- **Version Compatibility**: Python 3.9+, Node.js 18+, Java 11+ + +- **Security Considerations**: Dependency scanning, license compliance, supply chain security + +**Metadata Tags**: `#dependencies` `#libraries` `#versions` `#external-services` `#security` + +**Pricing Information**: Detailed pricing for all AWS services and external APIs + +--- + +### 8. 
Review Notes +**File**: `review_notes.md` +**Purpose**: Documentation quality assessment, identified gaps, and recommendations +**When to Use**: Understanding documentation completeness, planning improvements + +**Key Topics**: +- **Consistency Check**: Identified inconsistencies (language diversity, metrics duplication) +- **Completeness Check**: Well-documented areas and gaps +- **Language Support**: All languages fully supported +- **Documentation Quality**: Strengths and areas for improvement +- **Recommendations**: Short, medium, and long-term improvements +- **Validation Checklist**: Coverage assessment +- **Priority Gaps**: Testing strategy, security best practices, troubleshooting + +**Metadata Tags**: `#review` `#quality` `#gaps` `#recommendations` `#maintenance` + +**Action Items**: Prioritized list of documentation improvements needed + +--- + +## 🔍 Quick Reference Guide + +### For Understanding the System +1. Start with **Codebase Information** for overview +2. Read **Architecture** for system design +3. Review **Components** for detailed component information + +### For Development +1. Check **Dependencies** for setup requirements +2. Review **Components** for implementation details +3. Reference **Data Models** for data structures +4. Follow **Workflows** for process understanding + +### For Integration +1. Read **Interfaces and APIs** for API contracts +2. Review **Data Models** for data formats +3. Check **Dependencies** for external service requirements + +### For Operations +1. Review **Workflows** for operational procedures +2. Check **Architecture** for infrastructure details +3. Reference **Components** for troubleshooting + +### For Troubleshooting +1. Check **Review Notes** for known issues +2. Review **Workflows** for error handling +3. Reference **Components** for component-specific issues +4. 
Check **Dependencies** for version compatibility + +--- + +## 🏷️ Metadata Tag Index + +### By Topic +- **Architecture**: `architecture.md` +- **Components**: `components.md` +- **APIs**: `interfaces.md` +- **Data**: `data_models.md` +- **Processes**: `workflows.md` +- **Dependencies**: `dependencies.md` +- **Quality**: `review_notes.md` + +### By Technology +- **AWS Services**: `architecture.md`, `components.md`, `dependencies.md` +- **Python**: `codebase_info.md`, `components.md`, `dependencies.md` +- **JavaScript**: `codebase_info.md`, `components.md`, `dependencies.md` +- **Java**: `codebase_info.md`, `components.md`, `dependencies.md` + +### By Use Case +- **Development**: `codebase_info.md`, `components.md`, `data_models.md`, `dependencies.md` +- **Operations**: `architecture.md`, `workflows.md`, `components.md` +- **Integration**: `interfaces.md`, `data_models.md`, `dependencies.md` +- **Troubleshooting**: `review_notes.md`, `workflows.md`, `components.md` + +--- + +## 📊 Key Statistics + +- **Total Files**: 140 +- **Lines of Code**: 27,949 +- **Components**: 17 major components +- **AWS Services**: 13 services used +- **External APIs**: 3 (Adobe, Bedrock, BDA) +- **Supported Languages**: Python, JavaScript, Java, Shell +- **WCAG Criteria**: 10+ Level AA criteria supported +- **Issue Types**: 20+ accessibility issue types + +--- + +## 🔗 Cross-References + +### Architecture ↔ Components +- Architecture describes high-level design +- Components provide implementation details +- Both reference same component names + +### Components ↔ Interfaces +- Components describe functionality +- Interfaces define API contracts +- Both use same data models + +### Interfaces ↔ Data Models +- Interfaces reference data models +- Data Models define structures used in APIs +- Both include JSON examples + +### Workflows ↔ Components +- Workflows describe process flows +- Components implement workflow steps +- Both reference same operations + +### Dependencies ↔ All Documents +- 
Dependencies lists all external requirements +- All documents reference dependencies +- Version compatibility documented + +--- + +## 💡 Tips for AI Assistants + +### Answering Architecture Questions +→ Start with `architecture.md` for system design +→ Reference `components.md` for specific component details +→ Check `workflows.md` for process flows + +### Answering API Questions +→ Start with `interfaces.md` for API specifications +→ Reference `data_models.md` for data structures +→ Check `components.md` for implementation details + +### Answering Development Questions +→ Start with `codebase_info.md` for overview +→ Reference `dependencies.md` for setup requirements +→ Check `components.md` for implementation guidance + +### Answering Operational Questions +→ Start with `workflows.md` for procedures +→ Reference `architecture.md` for infrastructure +→ Check `review_notes.md` for known issues + +### Answering Troubleshooting Questions +→ Start with `review_notes.md` for known issues +→ Reference `workflows.md` for error handling +→ Check `components.md` for component-specific details + +--- + +## 📝 Document Relationships + +```mermaid +graph TD + Index[index.md
YOU ARE HERE] --> Info[codebase_info.md
Overview] + Index --> Arch[architecture.md
System Design] + Index --> Comp[components.md
Implementation] + Index --> API[interfaces.md
APIs] + Index --> Data[data_models.md
Data Structures] + Index --> Work[workflows.md
Processes] + Index --> Deps[dependencies.md
Requirements] + Index --> Review[review_notes.md
Quality] + + Arch --> Comp + Comp --> API + API --> Data + Work --> Comp + Deps --> Comp + Review --> Index +``` + +--- + +## 🎯 Common Questions and Where to Find Answers + +**Q: How does the PDF-to-PDF solution work?** +→ `architecture.md` (high-level) → `workflows.md` (detailed process) → `components.md` (implementation) + +**Q: What accessibility checks are performed?** +→ `components.md` (Auditor section) → `data_models.md` (issue types) → `interfaces.md` (API) + +**Q: How do I deploy the system?** +→ `workflows.md` (deployment workflow) → `dependencies.md` (requirements) → `codebase_info.md` (overview) + +**Q: What AWS services are used?** +→ `dependencies.md` (complete list) → `architecture.md` (how they're used) → `components.md` (specific usage) + +**Q: How is cost tracked?** +→ `workflows.md` (cost tracking workflow) → `components.md` (Usage Tracker) → `data_models.md` (UsageData) + +**Q: What are the data models?** +→ `data_models.md` (complete definitions) → `interfaces.md` (API usage) → `components.md` (implementation) + +**Q: How do I troubleshoot errors?** +→ `review_notes.md` (known issues) → `workflows.md` (error handling) → `components.md` (component details) + +**Q: What external APIs are used?** +→ `dependencies.md` (service list) → `interfaces.md` (API specs) → `components.md` (usage) + +--- + +## 📅 Documentation Metadata + +**Generated**: 2026-03-02 +**Generator**: AI Documentation System +**Codebase Version**: Git commit `8d6102bc644641c94f5a695a32ea50c19b3c8d68` +**Documentation Version**: 1.0 +**Last Updated**: 2026-03-02 +**Next Review**: Recommended within 30 days + +--- + +## 🚀 Getting Started Paths + +### Path 1: New Developer +1. Read `codebase_info.md` - Understand the project +2. Read `architecture.md` - Learn the system design +3. Review `dependencies.md` - Set up your environment +4. Explore `components.md` - Understand the code + +### Path 2: Integration Developer +1. Read `interfaces.md` - Understand the APIs +2. 
Review `data_models.md` - Learn the data formats +3. Check `dependencies.md` - Understand requirements +4. Reference `workflows.md` - Understand processes + +### Path 3: Operations Engineer +1. Read `architecture.md` - Understand infrastructure +2. Review `workflows.md` - Learn operational procedures +3. Check `components.md` - Understand components +4. Reference `review_notes.md` - Know the issues + +### Path 4: AI Assistant +1. Read this index completely +2. Use metadata tags to navigate +3. Reference specific documents only when needed +4. Cross-reference between documents for complete answers + +--- + +**Remember**: This index is designed to minimize the need to read full documents. Use the summaries and metadata to answer questions efficiently, and only dive into detailed documents when you need specific implementation information. diff --git a/.agents/summary/interfaces.md b/.agents/summary/interfaces.md new file mode 100644 index 00000000..a059982b --- /dev/null +++ b/.agents/summary/interfaces.md @@ -0,0 +1,661 @@ +# Interfaces and APIs + +## External APIs + +### 1. 
Adobe PDF Services API + +**Purpose**: PDF structure tagging and content extraction + +**Authentication**: OAuth 2.0 with client credentials + +**Credentials Storage**: AWS Secrets Manager (`adobe-pdf-services-credentials`) + +**Operations Used**: + +#### Autotag API +- **Endpoint**: Adobe PDF Services REST API +- **Method**: POST +- **Function**: Adds accessibility tags to PDF +- **Input**: PDF file +- **Output**: Tagged PDF with structure tree +- **Tags Added**: + - Headings (H1-H6) + - Paragraphs (P) + - Lists (L, LI) + - Tables (Table, TR, TH, TD) + - Figures (Figure) + - Links (Link) + +**Options**: +```python +{ + "generate_report": True, + "shift_headings": False +} +``` + +#### Extract API +- **Endpoint**: Adobe PDF Services REST API +- **Method**: POST +- **Function**: Extracts content and structure +- **Input**: PDF file +- **Output**: ZIP file containing: + - `structuredData.json`: Document structure + - `images/`: Extracted images + - Excel file with image metadata + +**Rate Limits**: Enterprise contract dependent + +**Error Handling**: +- Exponential backoff retry +- CloudWatch error logging +- Fallback to basic processing + +--- + +### 2. 
AWS Bedrock API + +**Purpose**: AI-powered content generation and image analysis + +**Authentication**: IAM role-based + +**Models Used**: + +#### Amazon Nova Pro +- **Model ID**: `amazon.nova-pro-v1:0` +- **Capabilities**: + - Text generation + - Image analysis (multimodal) + - Context understanding +- **Use Cases**: + - Alt text generation + - Title generation + - Remediation suggestions + - Table caption generation + +**API Operations**: + +#### InvokeModel +```python +{ + "modelId": "amazon.nova-pro-v1:0", + "contentType": "application/json", + "accept": "application/json", + "body": { + "messages": [ + { + "role": "user", + "content": [ + {"text": "prompt"}, + {"image": {"format": "png", "source": {"bytes": image_bytes}}} # format is required: jpeg | png | gif | webp + ] + } + ], + "inferenceConfig": { + "max_new_tokens": 512, + "temperature": 0.7 + } + } +} +``` + +**Response**: +```json +{ + "output": { + "message": { + "content": [{"text": "generated text"}] + } + }, + "usage": { + "inputTokens": 100, + "outputTokens": 50 + } +} +``` + +**Pricing**: +- Input: $0.0008 per 1K tokens +- Output: $0.0032 per 1K tokens + +**Rate Limits**: +- Requests per minute: Model-dependent +- Tokens per minute: Model-dependent + +--- + +### 3. 
AWS Bedrock Data Automation API + +**Purpose**: PDF parsing and structure extraction + +**Authentication**: IAM role-based + +**Operations**: + +#### CreateDataAutomationProject +```python +{ + "projectName": "pdf-accessibility-project", + "projectStage": "LIVE" +} +``` + +#### InvokeDataAutomationAsync +```python +{ + "projectArn": "arn:aws:bedrock:region:account:data-automation-project/name", + "inputConfiguration": { + "s3Uri": "s3://bucket/input.pdf" + }, + "outputConfiguration": { + "s3Uri": "s3://bucket/output/" + } +} +``` + +**Output Structure**: +```json +{ + "pages": [ + { + "pageNumber": 1, + "elements": [ + { + "type": "text", + "content": "...", + "boundingBox": {...}, + "confidence": 0.95 + }, + { + "type": "image", + "s3Path": "s3://...", + "boundingBox": {...} + } + ] + } + ] +} +``` + +**Capabilities**: +- Text extraction with layout +- Image extraction +- Table detection +- Element positioning +- Confidence scores + +--- + +## Internal APIs + +### 4. Content Accessibility Utility API + +**Location**: `pdf2html/content_accessibility_utility_on_aws/api.py` + +**Purpose**: Main entry point for PDF accessibility processing + +#### process_pdf_accessibility() +```python +def process_pdf_accessibility( + pdf_path: str, + output_dir: str, + config: Optional[Dict] = None +) -> Dict[str, Any] +``` + +**Parameters**: +- `pdf_path`: Path to input PDF +- `output_dir`: Directory for outputs +- `config`: Configuration options + +**Returns**: +```python +{ + "html_path": "path/to/remediated.html", + "report_path": "path/to/report.html", + "audit_results": {...}, + "remediation_results": {...}, + "usage_data": {...} +} +``` + +**Process Flow**: +1. Convert PDF to HTML +2. Audit HTML for accessibility +3. Remediate issues +4. Generate reports +5. 
Package outputs + +--- + +#### convert_pdf_to_html() +```python +def convert_pdf_to_html( + pdf_path: str, + output_dir: str, + bda_project_arn: Optional[str] = None +) -> str +``` + +**Purpose**: Converts PDF to HTML using BDA + +**Returns**: Path to generated HTML file + +--- + +#### audit_html_accessibility() +```python +def audit_html_accessibility( + html_path: str, + output_dir: Optional[str] = None +) -> AuditReport +``` + +**Purpose**: Audits HTML for WCAG compliance + +**Returns**: `AuditReport` object with issues + +--- + +#### remediate_html_accessibility() +```python +def remediate_html_accessibility( + html_path: str, + audit_report: AuditReport, + output_dir: str, + config: Optional[Dict] = None +) -> RemediationReport +``` + +**Purpose**: Fixes accessibility issues + +**Returns**: `RemediationReport` object with fixes applied + +--- + +#### generate_remediation_report() +```python +def generate_remediation_report( + audit_report: AuditReport, + remediation_report: RemediationReport, + output_path: str, + format: str = "html" +) -> str +``` + +**Purpose**: Generates accessibility report + +**Formats**: `html`, `json`, `csv`, `txt` + +**Returns**: Path to generated report + +--- + +### 5. 
Audit API + +**Location**: `pdf2html/content_accessibility_utility_on_aws/audit/api.py` + +#### audit_html_accessibility() +```python +def audit_html_accessibility( + html_path: str, + config: Optional[Dict] = None +) -> AuditReport +``` + +**Configuration Options**: +```python +{ + "wcag_level": "AA", # AA or AAA + "include_warnings": True, + "check_color_contrast": True, + "context_lines": 3 +} +``` + +**AuditReport Structure**: +```python +{ + "summary": { + "total_issues": 42, + "critical": 5, + "serious": 15, + "moderate": 18, + "minor": 4 + }, + "issues": [ + { + "id": "img-001", + "type": "missing_alt_text", + "severity": "critical", + "wcag_criteria": ["1.1.1"], + "element": "<img src=\"image.jpg\">", + "selector": "body > div > img:nth-child(2)", + "location": {"page": 1, "line": 45}, + "message": "Image missing alt attribute", + "suggestion": "Add descriptive alt text" + } + ], + "wcag_summary": { + "1.1.1": {"count": 5, "description": "Non-text Content"}, + "1.3.1": {"count": 8, "description": "Info and Relationships"} + } +} +``` + +--- + +### 6. Remediation API + +**Location**: `pdf2html/content_accessibility_utility_on_aws/remediate/api.py` + +#### remediate_html_accessibility() +```python +def remediate_html_accessibility( + html_path: str, + audit_report: AuditReport, + output_path: str, + config: Optional[Dict] = None +) -> RemediationReport +``` + +**Configuration Options**: +```python +{ + "auto_remediate": True, + "use_ai": True, + "bedrock_model": "amazon.nova-pro-v1:0", + "max_retries": 3, + "skip_manual_review": False +} +``` + +**RemediationReport Structure**: +```python +{ + "summary": { + "total_issues": 42, + "fixed_automatically": 35, + "requires_manual_review": 7, + "failed": 0 + }, + "fixes": [ + { + "issue_id": "img-001", + "status": "fixed", + "method": "ai_generated", + "original": "<img src=\"image.jpg\">", + "fixed": "<img src=\"image.jpg\" alt=\"Description\">", + "ai_prompt": "...", + "ai_response": "..." 
+ } + ], + "manual_review_items": [ + { + "issue_id": "table-005", + "reason": "Complex table structure", + "suggestion": "Manually verify header associations" + } + ] +} +``` + +--- + +## AWS Service Interfaces + +### 7. S3 Interface + +**Operations Used**: + +#### GetObject +```python +s3_client.get_object( + Bucket='bucket-name', + Key='path/to/file.pdf' +) +``` + +#### PutObject +```python +s3_client.put_object( + Bucket='bucket-name', + Key='path/to/output.pdf', + Body=file_content, + ServerSideEncryption='AES256', + Metadata={'user-id': 'user123'} +) +``` + +#### PutObjectTagging +```python +s3_client.put_object_tagging( + Bucket='bucket-name', + Key='path/to/file.pdf', + Tagging={ + 'TagSet': [ + {'Key': 'user-id', 'Value': 'user123'}, + {'Key': 'upload-timestamp', 'Value': '2026-03-02T15:00:00Z'} + ] + } +) +``` + +--- + +### 8. CloudWatch Interface + +**Metrics**: + +#### PutMetricData +```python +cloudwatch_client.put_metric_data( + Namespace='PDFAccessibility', + MetricData=[ + { + 'MetricName': 'PagesProcessed', + 'Value': 10, + 'Unit': 'Count', + 'Timestamp': datetime.utcnow(), + 'Dimensions': [ + {'Name': 'Solution', 'Value': 'PDF2PDF'}, + {'Name': 'UserId', 'Value': 'user123'} + ] + } + ] +) +``` + +**Logs**: + +#### PutLogEvents +```python +logs_client.put_log_events( + logGroupName='/aws/lambda/function-name', + logStreamName='stream-name', + logEvents=[ + { + 'timestamp': int(time.time() * 1000), + 'message': 'Processing PDF: file.pdf' + } + ] +) +``` + +--- + +### 9. Secrets Manager Interface + +#### GetSecretValue +```python +secrets_client.get_secret_value( + SecretId='adobe-pdf-services-credentials' +) +``` + +**Response**: +```json +{ + "SecretString": "{\"client_id\":\"...\",\"client_secret\":\"...\"}" +} +``` + +--- + +### 10. 
Step Functions Interface + +#### StartExecution +```python +sfn_client.start_execution( + stateMachineArn='arn:aws:states:...', + input=json.dumps({ + 'bucket': 'bucket-name', + 'key': 'path/to/file.pdf', + 'chunks': ['chunk1.pdf', 'chunk2.pdf'] + }) +) +``` + +--- + +## Data Models + +### AuditReport +```python +@dataclass +class AuditReport: + summary: AuditSummary + issues: List[AuditIssue] + wcag_summary: Dict[str, WCAGCriterion] + timestamp: datetime + html_path: str +``` + +### AuditIssue +```python +@dataclass +class AuditIssue: + id: str + type: str + severity: Severity # CRITICAL, SERIOUS, MODERATE, MINOR + wcag_criteria: List[str] + element: str + selector: str + location: Location + message: str + suggestion: str + context: Optional[str] +``` + +### RemediationReport +```python +@dataclass +class RemediationReport: + summary: RemediationSummary + fixes: List[RemediationFix] + manual_review_items: List[ManualReviewItem] + timestamp: datetime + html_path: str +``` + +### RemediationFix +```python +@dataclass +class RemediationFix: + issue_id: str + status: RemediationStatus # FIXED, FAILED, MANUAL_REVIEW + method: str # ai_generated, rule_based, manual + original: str + fixed: str + ai_prompt: Optional[str] + ai_response: Optional[str] + error: Optional[str] +``` + +--- + +## Event Schemas + +### S3 Event (Lambda Trigger) +```json +{ + "Records": [ + { + "eventVersion": "2.1", + "eventSource": "aws:s3", + "eventName": "ObjectCreated:Put", + "s3": { + "bucket": { + "name": "bucket-name" + }, + "object": { + "key": "pdf/document.pdf", + "size": 1024000 + } + } + } + ] +} +``` + +### Step Functions Input +```json +{ + "bucket": "pdfaccessibility-bucket", + "original_key": "pdf/document.pdf", + "chunks": [ + "temp/document_page_1.pdf", + "temp/document_page_2.pdf", + "temp/document_page_3.pdf" + ], + "user_id": "user123", + "timestamp": "2026-03-02T15:00:00Z" +} +``` + +### Step Functions Output +```json +{ + "status": "SUCCESS", + "result_key": 
"result/COMPLIANT_document.pdf", + "pages_processed": 3, + "audit_results": { + "pre_remediation": {...}, + "post_remediation": {...} + }, + "metrics": { + "adobe_api_calls": 3, + "bedrock_invocations": 15, + "processing_duration_seconds": 120 + } +} +``` + +--- + +## Error Responses + +### Standard Error Format +```json +{ + "error": { + "code": "ERROR_CODE", + "message": "Human-readable error message", + "details": { + "file": "document.pdf", + "operation": "adobe_autotag", + "timestamp": "2026-03-02T15:00:00Z" + }, + "retry_after": 60 + } +} +``` + +### Common Error Codes +- `INVALID_PDF`: PDF file is corrupted or invalid +- `ADOBE_API_ERROR`: Adobe API call failed +- `BEDROCK_THROTTLING`: Bedrock rate limit exceeded +- `BDA_TIMEOUT`: BDA processing timeout +- `INSUFFICIENT_PERMISSIONS`: IAM permissions issue +- `S3_ACCESS_DENIED`: S3 access error +- `PROCESSING_TIMEOUT`: Overall timeout exceeded diff --git a/.agents/summary/review_notes.md b/.agents/summary/review_notes.md new file mode 100644 index 00000000..287420f1 --- /dev/null +++ b/.agents/summary/review_notes.md @@ -0,0 +1,330 @@ +# Documentation Review Notes + +## Consistency Check Results + +### ✅ Consistent Areas + +1. **Architecture Patterns** + - Event-driven architecture consistently applied + - Serverless-first approach throughout + - IAM role-based security model + +2. **Naming Conventions** + - S3 bucket naming: `{solution}-{resource}-{id}` + - Lambda functions: Descriptive names with hyphens + - Metrics namespace: `PDFAccessibility` + +3. **Error Handling** + - Exponential backoff retry logic + - CloudWatch error logging + - Metrics publishing for failures + +4. **Monitoring** + - CloudWatch Logs for all components + - Custom metrics with consistent dimensions + - Usage tracking across both solutions + +### ⚠️ Inconsistencies Found + +1. 
**Language Diversity** + - **Issue**: PDF Merger uses Java while other Lambdas use Python + - **Impact**: Different deployment processes, dependencies + - **Recommendation**: Consider migrating to Python for consistency, provided a Python library can match the merge quality of the current implementation + - **Justification for current design**: Apache PDFBox (Java) may offer better PDF merging capabilities + +2. **Container Base Images** + - **Issue**: Adobe container uses `python:3.9-slim`, Alt Text uses `node:18-alpine` + - **Impact**: Different security patching schedules + - **Recommendation**: Standardize on specific base image versions + +3. **Metrics Helper Duplication** + - **Issue**: `metrics_helper.py` exists in multiple locations: + - `lambda/shared/metrics_helper.py` + - `lambda/shared/python/metrics_helper.py` + - `adobe-autotag-container/metrics_helper.py` + - `pdf2html/metrics_helper.py` + - **Impact**: Maintenance burden, potential version drift + - **Recommendation**: Consolidate into a single shared module + +4. **Configuration Management** + - **Issue**: PDF-to-PDF uses environment variables, PDF-to-HTML uses config files + - **Impact**: Different configuration approaches + - **Recommendation**: Standardize on a single configuration method + +--- + +## Completeness Check Results + +### ✅ Well-Documented Areas + +1. **Architecture**: Comprehensive diagrams and explanations +2. **Components**: Detailed component descriptions +3. **Workflows**: Clear process flows +4. **APIs**: Well-defined interfaces +5. **Data Models**: Complete structure definitions + +### 📝 Areas Needing More Detail + +#### 1. Testing Strategy +- **Gap**: No documentation on testing approach +- **Missing**: + - Unit test structure + - Integration test scenarios + - End-to-end test procedures + - Test data requirements +- **Recommendation**: Add `testing.md` with: + - Test framework setup + - Sample test cases + - Mocking strategies for AWS services + - CI/CD test integration + +#### 2. 
Security Best Practices +- **Gap**: Limited security documentation +- **Missing**: + - IAM policy details + - Encryption at rest/in transit + - Secret rotation procedures + - Security audit procedures +- **Recommendation**: Add `security.md` with: + - Least privilege IAM policies + - Encryption configuration + - Secret management best practices + - Security checklist + +#### 3. Performance Optimization +- **Gap**: Limited performance tuning guidance +- **Missing**: + - Lambda memory optimization + - ECS task sizing guidelines + - Bedrock prompt optimization + - Cost optimization strategies +- **Recommendation**: Add `performance.md` with: + - Benchmarking results + - Tuning recommendations + - Cost vs. performance tradeoffs + +#### 4. Disaster Recovery +- **Gap**: Basic DR mentioned but not detailed +- **Missing**: + - Backup procedures + - Recovery testing + - Failover scenarios + - Data retention policies +- **Recommendation**: Add `disaster_recovery.md` with: + - Backup schedules + - Recovery procedures + - RTO/RPO definitions + - DR testing plan + +#### 5. Troubleshooting Guide +- **Gap**: README has basic troubleshooting, needs expansion +- **Missing**: + - Common error messages and solutions + - Debug logging procedures + - Performance issue diagnosis + - Support escalation paths +- **Recommendation**: Expand existing troubleshooting docs + +#### 6. API Rate Limiting +- **Gap**: Rate limits mentioned but not detailed +- **Missing**: + - Adobe API rate limit specifics + - Bedrock throttling handling + - BDA quota management + - Backpressure strategies +- **Recommendation**: Add rate limiting section to interfaces.md + +#### 7. Multi-Region Deployment +- **Gap**: No documentation on multi-region setup +- **Missing**: + - Cross-region replication + - Regional failover + - Latency optimization +- **Recommendation**: Add if multi-region support is planned + +#### 8. 
Monitoring and Alerting +- **Gap**: Metrics documented but alerting not detailed +- **Missing**: + - Alert thresholds + - Notification channels + - On-call procedures + - Runbook for common alerts +- **Recommendation**: Add `monitoring.md` with: + - Alert definitions + - Response procedures + - Dashboard usage guide + +--- + +## Language Support Limitations + +### Supported Languages +- **Python**: Fully supported (95 files) + - Comprehensive analysis + - All functions and classes documented +- **JavaScript**: Fully supported (3 files) + - Complete coverage +- **Java**: Fully supported (2 files) + - Complete coverage +- **Shell**: Fully supported (2 files) + - All functions documented + +### No Gaps Identified +All languages in the codebase are well-supported and documented. + +--- + +## Documentation Quality Assessment + +### Strengths +1. **Comprehensive Coverage**: All major components documented +2. **Visual Aids**: Mermaid diagrams for architecture and workflows +3. **Structured Organization**: Clear hierarchy and navigation +4. **Practical Examples**: Code snippets and data structures +5. **WCAG Compliance**: Detailed accessibility standards mapping + +### Areas for Improvement + +#### 1. Code Examples +- **Current**: Limited inline code examples +- **Recommendation**: Add more code snippets showing: + - Lambda handler patterns + - Bedrock API calls + - Error handling examples + - Configuration examples + +#### 2. Deployment Variations +- **Current**: Focuses on one-click deployment +- **Recommendation**: Document: + - Local development setup + - CI/CD pipeline configuration + - Multi-account deployment + - Environment-specific configurations + +#### 3. Migration Guide +- **Current**: No migration documentation +- **Recommendation**: Add guide for: + - Upgrading between versions + - Migrating from other solutions + - Data migration procedures + +#### 4. 
API Versioning +- **Current**: No versioning strategy documented +- **Recommendation**: Define: + - API version scheme + - Backward compatibility policy + - Deprecation process + +#### 5. Contribution Guidelines +- **Current**: Basic "Contributing" section in README +- **Recommendation**: Expand with: + - Code style guide + - PR review process + - Development workflow + - Testing requirements + +--- + +## Recommendations for Documentation Maintenance + +### Short-Term (1-3 months) +1. Add testing documentation +2. Expand security best practices +3. Create troubleshooting runbook +4. Add code examples to existing docs + +### Medium-Term (3-6 months) +1. Create performance optimization guide +2. Document disaster recovery procedures +3. Add monitoring and alerting guide +4. Create migration guide + +### Long-Term (6-12 months) +1. Establish documentation review cycle +2. Create video tutorials +3. Build interactive documentation site +4. Develop certification program + +--- + +## Documentation Gaps by Priority + +### High Priority +1. **Testing Strategy**: Critical for development workflow +2. **Security Best Practices**: Essential for production deployment +3. **Troubleshooting Guide**: Needed for operational support + +### Medium Priority +1. **Performance Optimization**: Important for cost management +2. **Monitoring and Alerting**: Needed for production operations +3. **API Rate Limiting**: Important for reliability + +### Low Priority +1. **Multi-Region Deployment**: Only if required +2. **Migration Guide**: Needed when versions diverge +3. 
**API Versioning**: Future consideration + +--- + +## Validation Checklist + +### Architecture Documentation +- [x] High-level overview +- [x] Component diagrams +- [x] Data flow diagrams +- [x] Deployment architecture +- [ ] Multi-region architecture (if applicable) + +### Component Documentation +- [x] All major components described +- [x] Dependencies documented +- [x] Configuration options listed +- [ ] Performance characteristics +- [ ] Scaling considerations + +### API Documentation +- [x] External APIs documented +- [x] Internal APIs documented +- [x] Data models defined +- [x] Error responses documented +- [ ] Rate limits detailed +- [ ] API versioning strategy + +### Operational Documentation +- [x] Deployment procedures +- [x] Monitoring setup +- [ ] Alerting configuration +- [ ] Troubleshooting procedures +- [ ] Disaster recovery plan +- [ ] Security procedures + +### Development Documentation +- [x] Repository structure +- [x] Technology stack +- [x] Dependencies +- [ ] Development setup +- [ ] Testing procedures +- [ ] Contribution guidelines + +--- + +## Next Steps + +1. **Review with Team**: Share documentation with development team for feedback +2. **Prioritize Gaps**: Determine which gaps to address first +3. **Assign Owners**: Assign documentation tasks to team members +4. **Set Timeline**: Create schedule for documentation completion +5. 
**Establish Process**: Define ongoing documentation maintenance process + +--- + +## Feedback and Updates + +**Last Review**: 2026-03-02 +**Reviewer**: AI Documentation Generator +**Next Review**: Recommended within 30 days + +**How to Provide Feedback**: +- Create GitHub issue with label `documentation` +- Email: ai-cic@amazon.com +- Submit PR with documentation improvements diff --git a/.agents/summary/workflows.md b/.agents/summary/workflows.md new file mode 100644 index 00000000..92faf4f0 --- /dev/null +++ b/.agents/summary/workflows.md @@ -0,0 +1,481 @@ +# Key Workflows and Processes + +## PDF-to-PDF Remediation Workflow + +### End-to-End Process + +```mermaid +flowchart TD + Start([User Uploads PDF]) --> S3Upload[PDF saved to S3 pdf/ folder] + S3Upload --> S3Event[S3 Event Notification] + S3Event --> Splitter[PDF Splitter Lambda] + + Splitter --> Split{Split into
pages} + Split --> Chunk1[Page 1 PDF] + Split --> Chunk2[Page 2 PDF] + Split --> ChunkN[Page N PDF] + + Chunk1 & Chunk2 & ChunkN --> StepFn[Step Functions
Orchestrator] + + StepFn --> PreCheck[Pre-Remediation
Accessibility Check] + PreCheck --> MapState[Map State:
Parallel Processing] + + MapState --> Adobe1[Adobe Autotag
ECS Task 1] + MapState --> Adobe2[Adobe Autotag
ECS Task 2] + MapState --> AdobeN[Adobe Autotag
ECS Task N] + + Adobe1 --> Alt1[Alt Text Generator
ECS Task 1] + Adobe2 --> Alt2[Alt Text Generator
ECS Task 2] + AdobeN --> AltN[Alt Text Generator
ECS Task N] + + Alt1 & Alt2 & AltN --> MapComplete[All Chunks
Processed] + + MapComplete --> TitleGen[Title Generator
Lambda] + TitleGen --> PostCheck[Post-Remediation
Accessibility Check] + PostCheck --> Merger[PDF Merger
Lambda] + Merger --> Result[Save to S3
result/ folder] + Result --> End([User Downloads
Compliant PDF]) +``` + +### Detailed Steps + +#### 1. Upload and Trigger (0-5 seconds) +- User uploads PDF to S3 `pdf/` folder +- S3 generates PUT event notification +- Event triggers PDF Splitter Lambda +- S3 Object Tagger adds user metadata + +#### 2. PDF Splitting (5-30 seconds) +- Lambda downloads PDF from S3 +- Splits PDF into individual pages using pypdf +- Uploads each page to `temp/` folder +- Publishes metrics (pages processed, file size) +- Triggers Step Functions with chunk list + +#### 3. Pre-Remediation Check (10-20 seconds) +- Lambda downloads original PDF +- Runs accessibility audit +- Generates baseline report +- Saves report to S3 + +#### 4. Parallel Chunk Processing (2-10 minutes per chunk) + +**Map State Configuration**: +- Max concurrency: 10 +- Retry attempts: 3 +- Timeout: 30 minutes per chunk + +**For Each Chunk**: + +##### 4a. Adobe Autotag (1-5 minutes) +- ECS Fargate task starts +- Downloads chunk from S3 +- Retrieves Adobe credentials from Secrets Manager +- Calls Adobe Autotag API + - Adds structure tags (headings, lists, tables) + - Identifies reading order +- Calls Adobe Extract API + - Extracts images + - Generates image metadata Excel file +- Creates SQLite database with image info +- Uploads tagged PDF to S3 +- Publishes metrics (API calls, duration) + +##### 4b. Alt Text Generation (1-5 minutes) +- ECS Fargate task starts +- Downloads tagged PDF and image metadata +- For each image: + - Extracts surrounding text context + - Determines if decorative or informative + - If informative: + - Encodes image as base64 + - Calls Bedrock Nova Pro with image + context + - Receives AI-generated alt text + - Embeds alt text in PDF structure +- Uploads final PDF to S3 +- Publishes metrics (Bedrock calls, tokens) + +#### 5. 
Title Generation (30-60 seconds) +- Lambda downloads first processed chunk +- Extracts text from first few pages +- Calls Bedrock Nova Pro with prompt +- Receives generated title +- Embeds title in PDF metadata +- Saves updated PDF + +#### 6. Post-Remediation Check (10-20 seconds) +- Lambda downloads processed PDF +- Runs accessibility audit +- Compares with pre-check results +- Generates compliance report +- Saves report to S3 + +#### 7. PDF Merging (30-120 seconds) +- Java Lambda starts +- Downloads all processed chunks +- Merges in correct page order using Apache PDFBox +- Adds "COMPLIANT" prefix to filename +- Uploads to `result/` folder +- Publishes completion metrics + +#### 8. Notification and Cleanup +- User receives notification (if UI deployed) +- Temporary files remain in `temp/` folder +- Optional: S3 lifecycle policy cleans up temp files after 7 days + +### Total Processing Time +- **Small PDF (1-10 pages)**: 3-8 minutes +- **Medium PDF (11-50 pages)**: 8-20 minutes +- **Large PDF (51-200 pages)**: 20-60 minutes + +--- + +## PDF-to-HTML Remediation Workflow + +### End-to-End Process + +```mermaid +flowchart TD + Start([User Uploads PDF]) --> S3Upload[PDF saved to S3
uploads/ folder] + S3Upload --> S3Event[S3 Event Notification] + S3Event --> Lambda[PDF2HTML Lambda] + + Lambda --> BDACreate[Create BDA Job] + BDACreate --> BDAProcess[BDA Parses PDF] + BDAProcess --> BDAWait{Wait for
Completion} + BDAWait -->|Polling| BDACheck[Check Status] + BDACheck -->|Processing| BDAWait + BDACheck -->|Complete| BDAResult[Retrieve Results] + + BDAResult --> Convert[Convert to HTML] + Convert --> Audit[Audit Accessibility] + + Audit --> IssueLoop{For Each
Issue} + IssueLoop --> CheckType{Issue Type} + + CheckType -->|Simple| RuleBased[Rule-Based Fix] + CheckType -->|Complex| AIFix[AI-Generated Fix] + + AIFix --> Bedrock[Call Bedrock
Nova Pro] + Bedrock --> ApplyFix[Apply Fix to HTML] + RuleBased --> ApplyFix + + ApplyFix --> MoreIssues{More
Issues?} + MoreIssues -->|Yes| IssueLoop + MoreIssues -->|No| Report[Generate Reports] + + Report --> Package[Package Outputs] + Package --> ZIP[Create ZIP File] + ZIP --> S3Save[Save to S3
remediated/ folder] + S3Save --> End([User Downloads ZIP]) +``` + +### Detailed Steps + +#### 1. Upload and Trigger (0-5 seconds) +- User uploads PDF to S3 `uploads/` folder +- S3 generates PUT event notification +- Event triggers PDF2HTML Lambda (container) +- S3 Object Tagger adds user metadata + +#### 2. PDF to HTML Conversion (30-120 seconds) + +##### 2a. BDA Job Creation +- Lambda calls Bedrock Data Automation API +- Creates async parsing job +- Receives job ID + +##### 2b. BDA Processing +- BDA parses PDF structure +- Extracts text with layout information +- Identifies images, tables, headings +- Generates structured JSON output +- Saves to S3 output location + +##### 2c. Status Polling +- Lambda polls BDA job status every 5 seconds +- Timeout: 5 minutes +- On completion, retrieves results + +##### 2d. HTML Generation +- Lambda processes BDA JSON output +- Builds HTML structure from elements +- Preserves layout and styling +- Copies images to output directory +- Saves initial HTML to `output/result.html` + +#### 3. Accessibility Audit (10-30 seconds) + +##### 3a. HTML Parsing +- Loads HTML with BeautifulSoup +- Builds DOM tree + +##### 3b. Check Execution +- Runs all accessibility checks: + - Image checks (alt text) + - Heading checks (hierarchy) + - Table checks (headers, captions) + - Form checks (labels, fieldsets) + - Link checks (descriptive text) + - Structure checks (landmarks, language) + - Color contrast checks + +##### 3c. Issue Collection +- Collects all issues with: + - Element selector + - WCAG criteria + - Severity level + - Suggested fix +- Generates audit report + +#### 4. Remediation (1-5 minutes) + +##### 4a. Issue Prioritization +- Groups issues by type +- Prioritizes critical issues +- Determines remediation strategy + +##### 4b. 
Rule-Based Fixes (Simple Issues) +**Examples**: +- Add missing `lang` attribute +- Add `main` landmark +- Fix heading hierarchy +- Add table `scope` attributes +- Associate form labels + +**Process**: +- Apply predefined transformation +- Update HTML DOM +- Mark issue as fixed + +##### 4c. AI-Generated Fixes (Complex Issues) +**Examples**: +- Generate alt text for images +- Create table captions +- Improve link text +- Generate document title + +**Process**: +1. Extract element and context +2. Build AI prompt with: + - Issue description + - Element HTML + - Surrounding context + - WCAG guidance +3. Call Bedrock Nova Pro +4. Parse AI response +5. Apply fix to HTML +6. Validate fix +7. Mark issue as fixed or manual review + +##### 4d. Manual Review Items +**Flagged for Manual Review**: +- Complex table structures +- Ambiguous image context +- Color contrast requiring design changes +- Structural changes affecting layout + +#### 5. Report Generation (5-15 seconds) + +##### 5a. HTML Report +- Interactive report with: + - Summary statistics + - Issue breakdown by severity + - WCAG criteria mapping + - Before/after comparisons + - Manual review items +- Styled with CSS +- JavaScript for filtering + +##### 5b. JSON Report +- Machine-readable format +- Complete issue details +- Remediation actions +- Usage statistics + +##### 5c. Usage Data +- Bedrock invocations and tokens +- BDA processing time +- Cost estimates +- Processing metrics + +#### 6. Packaging and Output (5-10 seconds) + +##### 6a. File Collection +- `remediated.html`: Final accessible HTML +- `result.html`: Original conversion (before remediation) +- `images/`: Extracted images with alt text +- `remediation_report.html`: Detailed report +- `usage_data.json`: Usage statistics + +##### 6b. ZIP Creation +- Creates `final_{filename}.zip` +- Includes all output files +- Preserves directory structure + +##### 6c. 
S3 Upload +- Uploads ZIP to `remediated/` folder +- Sets appropriate metadata +- Publishes completion metrics + +#### 7. Cleanup +- Removes temporary files +- Logs completion +- Returns success response + +### Total Processing Time +- **Small PDF (1-10 pages)**: 1-3 minutes +- **Medium PDF (11-50 pages)**: 3-8 minutes +- **Large PDF (51-200 pages)**: 8-20 minutes + +--- + +## Deployment Workflow + +### One-Click Deployment (deploy.sh) + +```mermaid +flowchart TD + Start([Run deploy.sh]) --> Check[Check Prerequisites] + Check --> Region[Select AWS Region] + Region --> Solution{Select Solution} + + Solution -->|PDF-to-PDF| Adobe[Enter Adobe Credentials] + Solution -->|PDF-to-HTML| BDA[Check BDA Access] + Solution -->|Both| Adobe + + Adobe --> Secrets[Store in Secrets Manager] + BDA --> Project[Create BDA Project] + Secrets & Project --> CodeBuild[Create CodeBuild Project] + + CodeBuild --> Build[Start Build] + Build --> CDKSynth[CDK Synth] + CDKSynth --> CDKDeploy[CDK Deploy] + + CDKDeploy --> Stack1[Deploy PDF-to-PDF Stack] + CDKDeploy --> Stack2[Deploy PDF-to-HTML Stack] + CDKDeploy --> Stack3[Deploy Metrics Stack] + + Stack1 & Stack2 & Stack3 --> Verify[Verify Deployment] + Verify --> UI{Deploy UI?} + + UI -->|Yes| UIBuild[Build UI Stack] + UI -->|No| Complete + UIBuild --> Complete[Deployment Complete] + Complete --> End([Show Testing Instructions]) +``` + +### Manual Deployment + +```mermaid +flowchart TD + Start([Developer]) --> Clone[Clone Repository] + Clone --> Install[Install Dependencies] + Install --> Config[Configure AWS CLI] + Config --> Secrets[Create Secrets] + Secrets --> Synth[cdk synth] + Synth --> Deploy[cdk deploy --all] + Deploy --> Verify[Verify Resources] + Verify --> Test[Run Tests] + Test --> End([Deployment Complete]) +``` + +--- + +## Error Handling Workflows + +### Retry Logic + +```mermaid +flowchart TD + Start[Operation Starts] --> Try[Attempt Operation] + Try --> Success{Success?} + Success -->|Yes| End([Complete]) + Success -->|No| 
CheckRetries{Retries
Remaining?} + CheckRetries -->|Yes| Wait[Exponential Backoff] + Wait --> Retry[Retry Attempt] + Retry --> Try + CheckRetries -->|No| Error[Log Error] + Error --> Metric[Publish Error Metric] + Metric --> Fail([Fail]) +``` + +**Retry Configuration**: +- Max attempts: 3 +- Backoff rate: 2.0 +- Initial delay: 1 second +- Max delay: 60 seconds + +### Error Recovery + +#### Adobe API Failure +1. Log error to CloudWatch +2. Publish error metric +3. Retry with exponential backoff +4. If all retries fail: + - Mark chunk as failed + - Continue with other chunks + - Generate partial result + +#### Bedrock Throttling +1. Detect throttling error +2. Implement exponential backoff +3. Reduce request rate +4. Retry operation +5. If persistent: + - Fall back to rule-based fixes + - Flag for manual review + +#### BDA Timeout +1. Cancel BDA job +2. Retry with smaller page range +3. If timeout persists: + - Process pages individually + - Combine results + +--- + +## Monitoring Workflow + +### Metrics Collection + +```mermaid +flowchart LR + Lambda[Lambda/ECS] --> Emit[Emit Metrics] + Emit --> CW[CloudWatch Metrics] + CW --> Dashboard[Dashboard] + CW --> Alarms[CloudWatch Alarms] + Alarms --> SNS[SNS Notifications] + SNS --> Email[Email/SMS] +``` + +### Log Aggregation + +```mermaid +flowchart LR + Components[All Components] --> Logs[CloudWatch Logs] + Logs --> Insights[CloudWatch Insights] + Insights --> Queries[Custom Queries] + Queries --> Analysis[Analysis & Debugging] +``` + +--- + +## Cost Tracking Workflow + +```mermaid +flowchart TD + Upload[User Uploads PDF] --> Tag[S3 Object Tagger] + Tag --> Process[Processing Pipeline] + Process --> Track[Usage Tracker] + Track --> Metrics[Publish Cost Metrics] + Metrics --> Dashboard[Cost Dashboard] + Dashboard --> Report[Per-User Cost Report] +``` + +**Cost Attribution**: +1. S3 object tagged with user ID +2. All operations track user ID +3. Metrics published with user dimension +4. Dashboard aggregates by user +5. 
Monthly cost reports generated diff --git a/.gitignore b/.gitignore index a140b69d..cba911f9 100644 --- a/.gitignore +++ b/.gitignore @@ -15,6 +15,7 @@ cdk.out/ __pycache__/ *.pyc javascript_docker/node_modules +lambda/title-generator-lambda/venv lambda/add_title/venv PDF_accessability_UI # IDE and editor files @@ -25,3 +26,9 @@ PDF_accessability_UI # PDF UI (separate repo) PDF_accessability_UI/ + +# Stack export files +existing-stack.json +# Pipeline config files (may contain credentials) +pipeline.conf +*.conf diff --git a/AGENTS.md b/AGENTS.md new file mode 100644 index 00000000..889ad4d1 --- /dev/null +++ b/AGENTS.md @@ -0,0 +1,731 @@ +# PDF Accessibility Solutions - AI Assistant Guide + +**Version**: 1.0 +**Last Updated**: 2026-03-02 +**Codebase Commit**: `8d6102bc644641c94f5a695a32ea50c19b3c8d68` + +## Purpose + +This document provides AI coding assistants with essential context about the PDF Accessibility Solutions codebase. It focuses on information not typically found in README.md or CONTRIBUTING.md, including file organization, coding patterns, testing procedures, and package-specific guidance. + +--- + +## Table of Contents + +1. [Project Overview](#project-overview) +2. [Directory Structure](#directory-structure) +3. [Coding Patterns and Conventions](#coding-patterns-and-conventions) +4. [Development Workflow](#development-workflow) +5. [Testing Guidelines](#testing-guidelines) +6. [Package-Specific Guidance](#package-specific-guidance) +7. [Common Tasks](#common-tasks) +8. [Troubleshooting](#troubleshooting) + +--- + +## Project Overview + +### What This Project Does + +PDF Accessibility Solutions provides two complementary approaches to making PDF documents accessible according to WCAG 2.1 Level AA standards: + +1. **PDF-to-PDF Remediation**: Maintains PDF format while adding accessibility features (tags, alt text, structure) +2. 
**PDF-to-HTML Remediation**: Converts PDFs to accessible HTML with full WCAG compliance + +### Key Technologies + +- **Infrastructure**: AWS CDK (Python & JavaScript) +- **Compute**: AWS Lambda (Python, Java, Node.js), ECS Fargate +- **AI/ML**: Amazon Bedrock (Nova Pro), Bedrock Data Automation +- **Storage**: Amazon S3 +- **Orchestration**: AWS Step Functions +- **Monitoring**: CloudWatch Logs & Metrics + +### Architecture Pattern + +Event-driven, serverless architecture: +- S3 events trigger processing pipelines +- Step Functions orchestrate parallel processing +- ECS Fargate handles heavy compute tasks +- Lambda handles lightweight operations + +--- + +## Directory Structure + +``` +PDF_Accessibility/ +├── .agents/summary/ # AI assistant documentation (this guide's source) +├── cdk/ # CDK infrastructure (Python) +│ ├── usage_metrics_stack.py +│ └── cdk_stack.py +├── lambda/ # Lambda functions +│ ├── pdf-splitter-lambda/ # Python: Splits PDFs into pages +│ ├── pdf-merger-lambda/ # Java: Merges processed PDFs +│ ├── title-generator-lambda/ # Python: Generates titles +│ ├── pre-remediation-accessibility-checker/ # Python +│ ├── post-remediation-accessibility-checker/ # Python +│ ├── s3_object_tagger/ # Python: Tags S3 objects +│ └── shared/ # Shared utilities (metrics_helper.py) +├── pdf2html/ # PDF-to-HTML solution +│ ├── cdk/ # CDK infrastructure (JavaScript) +│ ├── content_accessibility_utility_on_aws/ # Core library +│ │ ├── audit/ # Accessibility auditing +│ │ ├── remediate/ # Accessibility remediation +│ │ ├── pdf2html/ # PDF to HTML conversion +│ │ ├── batch/ # Batch processing +│ │ └── utils/ # Utilities +│ ├── lambda_function.py # Lambda entry point +│ ├── metrics_helper.py # Metrics tracking +│ └── Dockerfile # Lambda container image +├── adobe-autotag-container/ # ECS: Adobe API integration (Python) +├── alt-text-generator-container/ # ECS: Alt text generation (Node.js) +├── docs/ # Documentation +├── app.py # Main CDK app (PDF-to-PDF) +├── deploy.sh # 
Unified deployment script +└── deploy-local.sh # Local deployment script +``` + +### Key File Locations + +**Infrastructure**: +- PDF-to-PDF CDK: `app.py`, `cdk/usage_metrics_stack.py` +- PDF-to-HTML CDK: `pdf2html/cdk/lib/pdf2html-stack.js` + +**Core Logic**: +- PDF-to-PDF: Lambda functions in `lambda/` + ECS containers +- PDF-to-HTML: `pdf2html/content_accessibility_utility_on_aws/` + +**Shared Code**: +- Metrics: `lambda/shared/metrics_helper.py` (duplicated in containers) +- Configuration: `pdf2html/content_accessibility_utility_on_aws/utils/config.py` + +**Deployment**: +- One-click: `deploy.sh` +- Local: `deploy-local.sh` +- CI/CD: `buildspec-unified.yml` + +--- + +## Coding Patterns and Conventions + +### Python Code Style + +**Formatting**: +- Follow PEP 8 +- Use 4 spaces for indentation +- Max line length: 100 characters (flexible) +- Use type hints where practical + +**Naming Conventions**: +- Functions: `snake_case` +- Classes: `PascalCase` +- Constants: `UPPER_SNAKE_CASE` +- Private methods: `_leading_underscore` + +**Example Pattern**: +```python +from typing import Any, Dict, List, Optional +import boto3 +from metrics_helper import MetricsContext + +def process_pdf_document( + bucket: str, + key: str, + user_id: Optional[str] = None +) -> Dict[str, Any]: + """Process a PDF document for accessibility. 
+ + Args: + bucket: S3 bucket name + key: S3 object key + user_id: Optional user identifier for metrics + + Returns: + Dictionary with processing results + """ + with MetricsContext(user_id=user_id, solution="PDF2PDF") as metrics: + try: + # Processing logic + metrics.track_pages_processed(page_count) + return {"status": "success"} + except Exception as e: + metrics.track_error(str(e)) + raise +``` + +### JavaScript Code Style + +**Formatting**: +- Use 2 spaces for indentation +- Semicolons required +- Use `const` by default, `let` when needed +- Async/await for asynchronous code + +**Example Pattern**: +```javascript +const { S3Client, GetObjectCommand } = require('@aws-sdk/client-s3'); +const { BedrockRuntimeClient, InvokeModelCommand } = require('@aws-sdk/client-bedrock-runtime'); + +async function generateAltText(imageBuffer, context) { + const client = new BedrockRuntimeClient({ region: process.env.AWS_REGION }); + + const payload = { + messages: [{ + role: 'user', + content: [ + { text: `Generate alt text for this image. 
Context: ${context}` }, + { image: { source: { bytes: imageBuffer } } } + ] + }], + inferenceConfig: { maxTokens: 512, temperature: 0.7 } + }; + + const response = await client.send(new InvokeModelCommand({ + modelId: 'amazon.nova-pro-v1:0', + body: JSON.stringify(payload) + })); + + return JSON.parse(response.body).output.message.content[0].text; +} +``` + +### Java Code Style + +**Formatting**: +- Follow Google Java Style Guide +- Use 4 spaces for indentation +- Braces on same line + +**Example Pattern** (PDF Merger): +```java +public class App implements RequestHandler<Map<String, Object>, Map<String, Object>> { + private final S3Client s3Client = S3Client.builder().build(); + + @Override + public Map<String, Object> handleRequest(Map<String, Object> input, Context context) { + String bucket = (String) input.get("bucket"); + List<String> chunks = (List<String>) input.get("chunks"); + + try { + PDDocument mergedDoc = new PDDocument(); + for (String chunk : chunks) { + PDDocument doc = downloadPDF(bucket, chunk); + for (PDPage page : doc.getPages()) { + mergedDoc.addPage(page); + } + doc.close(); + } + + String outputKey = uploadPDF(bucket, mergedDoc); + return Map.of("status", "success", "output_key", outputKey); + } catch (IOException e) { + context.getLogger().log("Error: " + e.getMessage()); + throw new RuntimeException(e); + } + } +} +``` + +### Error Handling Pattern + +**Consistent Error Handling**: +```python +def operation_with_retry(max_retries=3, backoff_rate=2.0): + """Decorator for operations with exponential backoff retry.""" + def decorator(func): + def wrapper(*args, **kwargs): + delay = 1 + for attempt in range(max_retries): + try: + return func(*args, **kwargs) + except Exception as e: + if attempt == max_retries - 1: + logger.error(f"Failed after {max_retries} attempts: {e}") + raise + logger.warning(f"Attempt {attempt + 1} failed, retrying in {delay}s") + time.sleep(delay) + delay *= backoff_rate + return wrapper + return decorator +``` + +### Metrics Publishing Pattern + +**Always use MetricsContext**: +```python +with 
MetricsContext(user_id=user_id, solution="PDF2PDF") as metrics: + start_time = time.time() + + # Track input + metrics.track_file_size(file_size_bytes) + + # Perform operation + result = process_document() + + # Track output + metrics.track_pages_processed(page_count) + metrics.track_processing_duration(time.time() - start_time) + + # Track API calls + metrics.track_adobe_api_call() + metrics.track_bedrock_invocation(input_tokens, output_tokens) + + # Estimate costs + metrics.estimate_cost( + adobe_calls=1, + bedrock_input_tokens=input_tokens, + bedrock_output_tokens=output_tokens + ) +``` + +--- + +## Development Workflow + +### Setting Up Local Environment + +**Prerequisites**: +- Python 3.9+ (recommend 3.12) +- Node.js 18+ +- Java 11+ (for PDF merger) +- Docker (for container builds) +- AWS CLI configured + +**Setup Steps**: +```bash +# Clone repository +git clone https://github.com/ASUCICREPO/PDF_Accessibility.git +cd PDF_Accessibility + +# Python setup +python3 -m venv venv +source venv/bin/activate # On Windows: venv\Scripts\activate +pip install -r requirements.txt + +# CDK setup +npm install -g aws-cdk +cdk bootstrap # First time only + +# PDF2HTML setup +cd pdf2html +pip install -e . +cd .. +``` + +### Making Changes + +**Workflow**: +1. Create feature branch: `git checkout -b feature/your-feature` +2. Make changes +3. Test locally (see Testing Guidelines) +4. Commit with descriptive message +5. 
Push and create PR + +**Commit Message Format**: +``` +type(scope): description + +feat(pdf-splitter): add support for encrypted PDFs +fix(remediation): correct heading hierarchy detection +docs(readme): update deployment instructions +``` + +### Local Testing + +**Test Lambda Locally**: +```bash +# PDF Splitter example +cd lambda/pdf-splitter-lambda +python -c "from main import lambda_handler; lambda_handler({'Records': [...]}, None)" +``` + +**Test CDK Synth**: +```bash +cdk synth # Generates CloudFormation templates +cdk diff # Shows changes before deployment +``` + +**Test PDF2HTML Library**: +```bash +cd pdf2html +python -m content_accessibility_utility_on_aws.cli convert input.pdf output/ +``` + +--- + +## Testing Guidelines + +### Unit Testing + +**Framework**: pytest + +**Test Structure**: +``` +tests/ +├── unit/ +│ ├── test_pdf_splitter.py +│ ├── test_auditor.py +│ └── test_remediation.py +├── integration/ +│ └── test_end_to_end.py +└── fixtures/ + └── sample.pdf +``` + +**Example Unit Test**: +```python +import pytest +from unittest.mock import Mock, patch +from pdf_splitter import split_pdf_into_pages + +@pytest.fixture +def mock_s3_client(): + with patch('boto3.client') as mock: + yield mock.return_value + +def test_split_pdf_into_pages(mock_s3_client): + # Arrange + bucket = "test-bucket" + key = "test.pdf" + + # Act + result = split_pdf_into_pages(bucket, key) + + # Assert + assert result['page_count'] > 0 + assert mock_s3_client.put_object.called +``` + +### Integration Testing + +**Test with LocalStack** (for AWS services): +```bash +# Start LocalStack +docker run -d -p 4566:4566 localstack/localstack + +# Run integration tests +AWS_ENDPOINT_URL=http://localhost:4566 pytest tests/integration/ +``` + +### End-to-End Testing + +**Manual E2E Test**: +1. Deploy to test environment +2. Upload test PDF to S3 +3. Monitor CloudWatch Logs +4. Verify output in S3 +5. 
Check CloudWatch metrics + +**Test PDFs**: Use `tests/fixtures/` for consistent test data + +--- + +## Package-Specific Guidance + +### lambda/pdf-splitter-lambda + +**Purpose**: Splits large PDFs into individual pages for parallel processing + +**Key Functions**: +- `lambda_handler()`: Entry point +- `split_pdf_into_pages()`: Core splitting logic + +**Dependencies**: `pypdf`, `boto3` + +**Testing**: Mock S3 operations, use small test PDFs + +**Common Issues**: +- Memory limits with large PDFs → Increase Lambda memory +- Timeout → Increase timeout or split into smaller chunks + +--- + +### adobe-autotag-container + +**Purpose**: Adds accessibility tags using Adobe PDF Services API + +**Key Functions**: +- `main()`: Entry point +- `autotag_pdf_with_options()`: Calls Adobe API +- `extract_images_from_extract_api()`: Extracts images + +**Dependencies**: `pdfservices-sdk`, `boto3` + +**Configuration**: +- Adobe credentials from Secrets Manager +- Environment variables: `AWS_REGION`, `BUCKET_NAME` + +**Testing**: Use Adobe trial account, mock Secrets Manager + +**Common Issues**: +- Adobe API rate limits → Implement backoff +- Credential errors → Verify Secrets Manager access + +--- + +### alt-text-generator-container + +**Purpose**: Generates alt text for images using Bedrock + +**Key Functions**: +- `startProcess()`: Entry point +- `generateAltText()`: Calls Bedrock +- `modifyPDF()`: Embeds alt text + +**Dependencies**: `pdf-lib`, `@aws-sdk/client-bedrock-runtime` + +**Configuration**: +- Bedrock model: `amazon.nova-pro-v1:0` +- Environment variables: `AWS_REGION`, `BUCKET_NAME` + +**Testing**: Mock Bedrock responses, use sample images + +**Common Issues**: +- Bedrock throttling → Implement rate limiting +- Large images → Resize before sending to Bedrock + +--- + +### pdf2html/content_accessibility_utility_on_aws + +**Purpose**: Core library for PDF-to-HTML conversion and remediation + +**Key Modules**: +- `audit/`: Accessibility auditing (WCAG checks) +- 
`remediate/`: Accessibility remediation (fixes) +- `pdf2html/`: PDF to HTML conversion (BDA integration) +- `utils/`: Shared utilities + +**Entry Points**: +- CLI: `cli.py` +- API: `api.py` +- Lambda: `../lambda_function.py` + +**Configuration**: `utils/config.py` with environment variables + +**Testing**: Use pytest with fixtures in `tests/` + +**Common Issues**: +- BDA timeouts → Increase polling timeout +- Complex tables → May require manual review +- Image context → Improve surrounding text extraction + +--- + +### cdk/ + +**Purpose**: Infrastructure as Code for PDF-to-PDF solution + +**Key Files**: +- `../app.py`: Main CDK app (located at the repository root) +- `usage_metrics_stack.py`: CloudWatch dashboard + +**Deployment**: +```bash +cdk synth # Generate CloudFormation +cdk deploy --all # Deploy all stacks +``` + +**Testing**: `cdk synth` to validate, `cdk diff` to preview changes + +**Common Issues**: +- Resource limits → Request quota increases +- VPC configuration → Verify subnet availability + +--- + +## Common Tasks + +### Adding a New Lambda Function + +1. Create directory in `lambda/` +2. Add `main.py` (or `App.java`) with handler +3. Add `requirements.txt` (or `pom.xml`) +4. Add `Dockerfile` if using container +5. Update `app.py` to define Lambda resource +6. Add IAM permissions +7. Add CloudWatch log group +8. Deploy with `cdk deploy` + +**Example CDK Code**: +```python +new_lambda = lambda_.Function( + self, "NewFunction", + runtime=lambda_.Runtime.PYTHON_3_12, + handler="main.lambda_handler", + code=lambda_.Code.from_asset("lambda/new-function"), + timeout=Duration.minutes(5), + memory_size=1024, + environment={ + "BUCKET_NAME": bucket.bucket_name + } +) +bucket.grant_read_write(new_lambda) +``` + +### Adding a New Accessibility Check + +1. Create check class in `pdf2html/content_accessibility_utility_on_aws/audit/checks/` +2. Inherit from `AccessibilityCheck` +3. Implement `check()` method +4. Register in `audit/checks/__init__.py` +5. 
Add test in `tests/unit/audit/checks/` + +**Example Check**: +```python +from audit.base_check import AccessibilityCheck + +class NewCheck(AccessibilityCheck): + def check(self, soup): + issues = [] + elements = soup.find_all('element-type') + for elem in elements: + if not self._meets_criteria(elem): + issues.append(self._create_issue( + type='new_issue_type', + severity='serious', + wcag_criteria=['X.X.X'], + element=elem, + message='Issue description', + suggestion='How to fix' + )) + return issues +``` + +### Adding a New Remediation Strategy + +1. Create strategy file in `pdf2html/content_accessibility_utility_on_aws/remediate/remediation_strategies/` +2. Implement remediation function +3. Register in `remediate/remediation_strategies/__init__.py` +4. Map issue type to strategy in `remediate/remediation_manager.py` +5. Add test + +**Example Strategy**: +```python +def remediate_new_issue(html_updater, issue, bedrock_client=None): + """Remediate new issue type.""" + element = html_updater.get_element_by_selector(issue.selector) + + if bedrock_client: + # AI-powered fix + fix = bedrock_client.generate_fix(element, issue) + html_updater.update_element_content(issue.selector, fix) + else: + # Rule-based fix + html_updater.update_element_attribute(issue.selector, 'attr', 'value') + + return RemediationFix( + issue_id=issue.id, + status=RemediationStatus.FIXED, + method='ai_generated' if bedrock_client else 'rule_based', + original_element=str(element), + fixed_element=str(html_updater.get_element_by_selector(issue.selector)) + ) +``` + +### Updating Dependencies + +**Python**: +```bash +pip install --upgrade package-name +pip freeze > requirements.txt +``` + +**JavaScript**: +```bash +npm update package-name +npm audit fix +``` + +**Java**: +Update version in `pom.xml`, then: +```bash +mvn clean install +``` + +### Adding CloudWatch Metrics + +1. Use `MetricsContext` in your code +2. Call appropriate tracking method +3. 
Metrics automatically published to `PDFAccessibility` namespace +4. Update dashboard in `cdk/usage_metrics_stack.py` if needed + +--- + +## Troubleshooting + +### Common Issues + +**Issue**: Lambda timeout +**Solution**: Increase timeout in CDK, optimize code, or split into smaller operations + +**Issue**: ECS task fails to start +**Solution**: Check VPC endpoints, verify ECR image exists, check IAM permissions + +**Issue**: Adobe API errors +**Solution**: Verify credentials in Secrets Manager, check API rate limits + +**Issue**: Bedrock throttling +**Solution**: Implement exponential backoff, reduce request rate, request quota increase + +**Issue**: BDA timeout +**Solution**: Increase polling timeout, process smaller page ranges + +**Issue**: S3 access denied +**Solution**: Verify IAM permissions, check bucket policy + +### Debugging Tips + +**Enable Debug Logging**: +```python +import logging +logging.basicConfig(level=logging.DEBUG) +``` + +**Check CloudWatch Logs**: +```bash +aws logs tail /aws/lambda/function-name --follow +``` + +**Test IAM Permissions**: +```bash +aws iam simulate-principal-policy \ + --policy-source-arn arn:aws:iam::account:role/role-name \ + --action-names s3:GetObject \ + --resource-arns arn:aws:s3:::bucket/key +``` + +**Validate CDK**: +```bash +cdk synth --strict # Strict validation +cdk doctor # Check CDK environment +``` + +--- + +## Additional Resources + +**Detailed Documentation**: See `.agents/summary/` directory: +- `index.md`: Knowledge base index +- `architecture.md`: System architecture +- `components.md`: Component details +- `interfaces.md`: API specifications +- `data_models.md`: Data structures +- `workflows.md`: Process flows +- `dependencies.md`: External dependencies + +**External Documentation**: +- [AWS CDK Documentation](https://docs.aws.amazon.com/cdk/) +- [Adobe PDF Services API](https://developer.adobe.com/document-services/docs/) +- [AWS Bedrock Documentation](https://docs.aws.amazon.com/bedrock/) +- [WCAG 2.1 
Guidelines](https://www.w3.org/WAI/WCAG21/quickref/) + +**Support**: +- Email: ai-cic@amazon.com +- GitHub Issues: https://github.com/ASUCICREPO/PDF_Accessibility/issues + +--- + +**Last Updated**: 2026-03-02 +**Maintained By**: Arizona State University's AI Cloud Innovation Center diff --git a/README.md b/README.md index 63df9a23..ee00d642 100644 --- a/README.md +++ b/README.md @@ -218,6 +218,16 @@ This solution converts PDF documents to accessible HTML format while preserving - **S3 Events**: Monitor file processing status - **CloudWatch Metrics**: Track function performance +### Usage Metrics & Observability + +Custom metrics are published to the `PDFAccessibility` CloudWatch namespace, providing real-time visibility into usage, costs, and performance: + +- **Per-user tracking**: Usage attributed to individual Cognito users via S3 object tagging +- **Cost estimation**: Automated cost calculation for Adobe API, Bedrock, Lambda, and ECS +- **Usage dashboard**: Dedicated `PDF-Accessibility-Usage-Metrics` CloudWatch dashboard with pages processed, API calls, token usage, error rates, and cost breakdowns + +For full details, see [docs/OBSERVABILITY.md](docs/OBSERVABILITY.md). 
+ ## Troubleshooting ### Common Issues diff --git a/adobe-autotag-container/adobe_autotag_processor.py b/adobe-autotag-container/adobe_autotag_processor.py index 0afd7185..7971c6b0 100644 --- a/adobe-autotag-container/adobe_autotag_processor.py +++ b/adobe-autotag-container/adobe_autotag_processor.py @@ -59,6 +59,7 @@ import openpyxl import ast import os +import io import boto3 import logging import json @@ -86,6 +87,18 @@ from adobe.pdfservices.operation.pdfjobs.params.autotag_pdf.autotag_pdf_params import AutotagPDFParams from adobe.pdfservices.operation.pdfjobs.result.autotag_pdf_result import AutotagPDFResult +# Import metrics helper +try: + from metrics_helper import track_adobe_api_call, track_bedrock_invocation, MetricsContext +except ImportError: + print("Warning: metrics_helper not available") + track_adobe_api_call = lambda *args, **kwargs: None + track_bedrock_invocation = lambda *args, **kwargs: None + class MetricsContext: + def __init__(self, *args, **kwargs): pass + def __enter__(self): return self + def __exit__(self, *args): return False + logger = logging.getLogger(__name__) logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s") @@ -177,7 +190,7 @@ def add_viewer_preferences(pdf_path, filename): writer.write(f) logger.info(f'Filename : {filename} | Viewer preferences added to the PDF') -def autotag_pdf_with_options(filename, client_id, client_secret): +def autotag_pdf_with_options(filename, client_id, client_secret, user_id=None, file_name=None): """ Auto-tags a PDF for accessibility using Adobe PDF Services. @@ -185,6 +198,8 @@ def autotag_pdf_with_options(filename, client_id, client_secret): filename (str): The path to the PDF file. client_id (str): Adobe API client ID. client_secret (str): Adobe API client secret. + user_id (str, optional): User ID for metrics tracking. + file_name (str, optional): File name for metrics tracking. Raises: ServiceApiException: If Adobe API returns an error. 
@@ -195,6 +210,8 @@ def autotag_pdf_with_options(filename, client_id, client_secret): with open(filename, 'rb') as file: input_stream = file.read() + page_count = len(PdfReader(io.BytesIO(input_stream)).pages) + track_adobe_api_call("AutoTag", page_count, user_id, file_name) # Initial setup, create credentials instance credentials = ServicePrincipalCredentials( @@ -248,7 +265,7 @@ def autotag_pdf_with_options(filename, client_id, client_secret): except (ServiceApiException, ServiceUsageException, SdkException) as e: logging.error(f'Filename : {filename} | Adobe Autotag API failed: {e}') raise # Re-raise to stop the container -def extract_api(filename, client_id, client_secret): +def extract_api(filename, client_id, client_secret, user_id=None, file_name=None): """ Extracts text, tables, and figures from a PDF using Adobe PDF Services. @@ -256,6 +273,8 @@ def extract_api(filename, client_id, client_secret): filename (str): The path to the PDF file. client_id (str): Adobe API client ID. client_secret (str): Adobe API client secret. + user_id (str, optional): User ID for metrics tracking. + file_name (str, optional): File name for metrics tracking. Raises: ServiceApiException: If Adobe API returns an error. 
"""
CloudWatch Metrics Helper for PDF Accessibility Platform

This module provides utilities for emitting custom CloudWatch metrics
to track usage, costs, and performance across the PDF accessibility platform.

Metric emission is best-effort: a failure to publish is printed and
swallowed so that observability problems never interrupt PDF processing.
"""

import time
from typing import Dict, List, Optional
from datetime import datetime, timezone

NAMESPACE = "PDFAccessibility"

# Shared CloudWatch client. It is created on first use rather than at import
# time: building a boto3 client can fail for reasons other than ImportError
# (for example when no AWS region is configured), and callers that guard the
# import of this module only catch ImportError. Assigning a client (or a test
# double) to this name before the first emit is still honoured.
cloudwatch = None


def _get_cloudwatch_client():
    """Return the shared CloudWatch client, creating it on first use."""
    global cloudwatch
    if cloudwatch is None:
        # Function-scope import keeps module import free of AWS side effects.
        import boto3
        cloudwatch = boto3.client('cloudwatch')
    return cloudwatch


def emit_metric(
    metric_name: str,
    value: float,
    unit: str = "None",
    dimensions: Optional[Dict[str, str]] = None,
    timestamp: Optional[datetime] = None
):
    """
    Emit a single metric to CloudWatch.

    Any exception raised while creating the client or publishing the data
    point is caught and printed; this function never raises.

    Args:
        metric_name: Name of the metric
        value: Metric value
        unit: CloudWatch unit (Count, Milliseconds, Bytes, etc.)
        dimensions: Dict of dimension name/value pairs. Entries whose value
            is None or an empty string are skipped; remaining values are
            converted to strings.
        timestamp: Metric timestamp (defaults to the current UTC time)
    """
    metric_data = {
        'MetricName': metric_name,
        'Value': value,
        'Unit': unit,
        # Timezone-aware "now"; datetime.utcnow() is deprecated and naive.
        'Timestamp': timestamp or datetime.now(timezone.utc)
    }

    if dimensions:
        # Drop unusable dimension values instead of losing the whole metric.
        cleaned = [
            {'Name': k, 'Value': str(v)}
            for k, v in dimensions.items()
            if v is not None and v != ""
        ]
        if cleaned:
            metric_data['Dimensions'] = cleaned

    try:
        _get_cloudwatch_client().put_metric_data(
            Namespace=NAMESPACE,
            MetricData=[metric_data]
        )
    except Exception as e:
        # Best-effort by design: report and carry on.
        print(f"Failed to emit metric {metric_name}: {e}")


def track_pages_processed(
    page_count: int,
    user_id: Optional[str] = None,
    file_name: Optional[str] = None,
    service: str = "pdf2pdf"
):
    """Emit "PagesProcessed" (Count) for a service and, if given, a user.

    file_name is accepted for signature parity with the other trackers but
    is not used as a dimension; metrics aggregate at service/user level.
    """
    dimensions = {"Service": service}
    if user_id:
        dimensions["UserId"] = user_id

    emit_metric("PagesProcessed", page_count, "Count", dimensions)


def track_adobe_api_call(
    operation: str,
    page_count: int = 0,
    user_id: Optional[str] = None,
    file_name: Optional[str] = None
):
    """Track Adobe API calls and estimated Document Transactions.

    Always emits "AdobeAPICalls" = 1. When page_count > 0 it also emits
    "AdobeDocTransactions" using the licensing rules below.

    Adobe licensing:
    - AutoTag: 10 Document Transactions per page
    - ExtractPDF: 1 Document Transaction per 5 pages
    - any other operation: counted as 1 transaction

    Args:
        operation: Adobe operation name (e.g. "AutoTag", "ExtractPDF")
        page_count: Number of pages sent to the API
        user_id: Optional user to attribute the call to
        file_name: Accepted but not used as a dimension
    """
    dimensions = {
        "Service": "pdf2pdf",
        "Operation": operation
    }
    if user_id:
        dimensions["UserId"] = user_id

    emit_metric("AdobeAPICalls", 1, "Count", dimensions)

    # Calculate Document Transactions per Adobe licensing
    if page_count > 0:
        if operation == "AutoTag":
            doc_transactions = page_count * 10
        elif operation == "ExtractPDF":
            doc_transactions = -(-page_count // 5)  # ceiling division
        else:
            doc_transactions = 1
        emit_metric("AdobeDocTransactions", doc_transactions, "Count", dimensions)


def track_bedrock_invocation(
    model_id: str,
    input_tokens: int,
    output_tokens: int,
    user_id: Optional[str] = None,
    file_name: Optional[str] = None,
    service: str = "pdf2pdf"
):
    """Track Bedrock model invocations and token usage.

    Emits "BedrockInvocations" (1), "BedrockInputTokens" and
    "BedrockOutputTokens", all dimensioned by Service, Model and, if given,
    UserId. file_name is accepted but not used as a dimension.
    """
    dimensions = {
        "Service": service,
        "Model": model_id
    }
    if user_id:
        dimensions["UserId"] = user_id

    emit_metric("BedrockInvocations", 1, "Count", dimensions)
    emit_metric("BedrockInputTokens", input_tokens, "Count", dimensions)
    emit_metric("BedrockOutputTokens", output_tokens, "Count", dimensions)


def track_processing_duration(
    stage: str,
    duration_ms: float,
    user_id: Optional[str] = None,
    file_name: Optional[str] = None,
    service: str = "pdf2pdf"
):
    """Emit "ProcessingDuration" (Milliseconds) for a processing stage.

    file_name is accepted but not used as a dimension.
    """
    dimensions = {
        "Service": service,
        "Stage": stage
    }
    if user_id:
        dimensions["UserId"] = user_id

    emit_metric("ProcessingDuration", duration_ms, "Milliseconds", dimensions)


def track_error(
    error_type: str,
    stage: str,
    user_id: Optional[str] = None,
    file_name: Optional[str] = None,
    service: str = "pdf2pdf"
):
    """Emit "ErrorCount" = 1, dimensioned by Service, Stage and ErrorType.

    file_name is accepted but not used as a dimension.
    """
    dimensions = {
        "Service": service,
        "Stage": stage,
        "ErrorType": error_type
    }
    if user_id:
        dimensions["UserId"] = user_id

    emit_metric("ErrorCount", 1, "Count", dimensions)


def track_file_size(
    size_bytes: int,
    user_id: Optional[str] = None,
    file_name: Optional[str] = None,
    service: str = "pdf2pdf"
):
    """Emit "FileSize" (Bytes) for a service and, if given, a user.

    file_name is accepted but not used as a dimension.
    """
    dimensions = {"Service": service}
    if user_id:
        dimensions["UserId"] = user_id

    emit_metric("FileSize", size_bytes, "Bytes", dimensions)


def estimate_cost(
    pages: int = 0,
    adobe_calls: int = 0,
    bedrock_input_tokens: int = 0,
    bedrock_output_tokens: int = 0,
    lambda_duration_ms: int = 0,
    lambda_memory_mb: int = 1024,
    ecs_duration_ms: int = 0,
    ecs_vcpu: float = 0.25,
    ecs_memory_gb: float = 1.0,
    user_id: Optional[str] = None,
    file_name: Optional[str] = None,
    service: str = "pdf2pdf"
) -> float:
    """
    Estimate cost for a processing job and emit it as "EstimatedCost".

    Pricing (approximate, as of 2024):
    - Adobe API: ~$0.05 per operation
    - Bedrock Claude Haiku: $0.00025/1K input, $0.00125/1K output
    - Bedrock Claude Sonnet: $0.003/1K input, $0.015/1K output
    - Lambda: $0.0000166667/GB-sec
    - ECS Fargate: $0.04048/vCPU-hr + $0.004445/GB-hr
    - BDA: ~$0.01 per page

    Only the Haiku token rates are applied to Bedrock usage, whichever
    model actually ran. The per-page BDA charge is added only when
    service == "pdf2html". file_name is accepted but unused.

    Returns:
        Estimated cost in USD
    """
    cost = 0.0

    # Adobe API cost
    cost += adobe_calls * 0.05

    # Bedrock cost (assuming Haiku for estimation)
    cost += (bedrock_input_tokens / 1000) * 0.00025
    cost += (bedrock_output_tokens / 1000) * 0.00125

    # Lambda cost: memory in GB multiplied by duration in seconds
    gb_seconds = (lambda_memory_mb / 1024) * (lambda_duration_ms / 1000)
    cost += gb_seconds * 0.0000166667

    # ECS cost: vCPU-hours plus GB-hours
    if ecs_duration_ms > 0:
        hours = ecs_duration_ms / (1000 * 3600)
        cost += (ecs_vcpu * hours * 0.04048) + (ecs_memory_gb * hours * 0.004445)

    # BDA cost (for pdf2html)
    if service == "pdf2html":
        cost += pages * 0.01

    # Emit cost metric
    dimensions = {"Service": service}
    if user_id:
        dimensions["UserId"] = user_id

    emit_metric("EstimatedCost", cost, "None", dimensions)

    return cost


class MetricsContext:
    """Context manager for tracking operation metrics.

    On exit it emits the elapsed wall-clock time of the ``with`` body as
    "ProcessingDuration" and, if the body raised, an "ErrorCount" whose
    ErrorType is the exception class name. Exceptions are never suppressed.
    """

    def __init__(self, stage: str, user_id: Optional[str] = None,
                 file_name: Optional[str] = None, service: str = "pdf2pdf"):
        self.stage = stage
        self.user_id = user_id
        self.file_name = file_name
        self.service = service
        self.start_time = None  # set by __enter__

    def __enter__(self):
        self.start_time = time.time()
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        duration_ms = (time.time() - self.start_time) * 1000
        track_processing_duration(
            self.stage, duration_ms,
            self.user_id, self.file_name, self.service
        )

        if exc_type:
            track_error(
                exc_type.__name__, self.stage,
                self.user_id, self.file_name, self.service
            )

        # Returning False lets any exception from the body propagate.
        return False
/**
 * Publish one data point to the PDFAccessibility CloudWatch namespace.
 *
 * Best-effort: any failure while building or sending the request is logged
 * as a warning and swallowed, so the returned promise never rejects.
 *
 * @param {string} metricName - CloudWatch metric name.
 * @param {number} value - Value of the data point.
 * @param {string} [unit] - CloudWatch unit; falls back to 'Count' when falsy.
 * @param {Object} [dimensions] - Dimension name -> value map.
 */
async function emitMetric(metricName, value, unit, dimensions) {
  try {
    const datum = {
      MetricName: metricName,
      Value: value,
      Unit: unit || 'Count',
      Timestamp: new Date(),
    };
    if (dimensions) {
      datum.Dimensions = Object.keys(dimensions).map((key) => ({
        Name: key,
        Value: dimensions[key],
      }));
    }
    const command = new PutMetricDataCommand({
      Namespace: 'PDFAccessibility',
      MetricData: [datum],
    });
    await cloudwatchClient.send(command);
  } catch (err) {
    logger.warn(`Failed to emit metric ${metricName}: ${err.message}`);
  }
}

/**
 * Record one Bedrock invocation together with its token usage.
 *
 * Emits BedrockInvocations (1), BedrockInputTokens and BedrockOutputTokens
 * concurrently, all sharing the same Service / Model (/ UserId) dimensions.
 *
 * @param {string} modelId - Bedrock model identifier.
 * @param {number} inputTokens - Input tokens reported for the call.
 * @param {number} outputTokens - Output tokens reported for the call.
 * @param {string} [userId] - Added as the UserId dimension when truthy.
 * @param {string} [service] - Service dimension; defaults to 'pdf2pdf'.
 */
async function trackBedrockInvocation(modelId, inputTokens, outputTokens, userId, service) {
  const dimensions = { Service: service || 'pdf2pdf', Model: modelId };
  if (userId) {
    dimensions.UserId = userId;
  }
  const samples = [
    ['BedrockInvocations', 1],
    ['BedrockInputTokens', inputTokens],
    ['BedrockOutputTokens', outputTokens],
  ];
  await Promise.all(
    samples.map(([name, amount]) => emitMetric(name, amount, 'Count', dimensions))
  );
}

// ============================================================================
// MODEL CONFIGURATION - Edit these values to change the AI models used
// ============================================================================
// Vision-capable model used to generate alt text for images.
const MODEL_ID_ALT_TEXT = "us.amazon.nova-pro-v1:0";

// Text-only model used to generate alt text for hyperlinks (lighter model).
const MODEL_ID_LINK_ALT_TEXT = "us.amazon.nova-lite-v1:0";
// ============================================================================
{}; + await trackBedrockInvocation( + MODEL_ID_ALT_TEXT, + usage.inputTokens || 0, + usage.outputTokens || 0, + null, + "pdf2pdf" + ); + } catch (e) { + logger.warn(`Failed to track Bedrock metrics: ${e.message}`); + } + return responseBody.output.message; }; @@ -240,11 +303,10 @@ async function generateAltText(imageObject, imageBuffer) { * @throws {Error} - Throws an error if invoking the model fails. */ const invokeModel_alt_text_links = async ( - prompt = "Generate alt text for this link" + prompt = "Generate alt text for this link", ) => { logger.info(`generating link alt text`); const client = new BedrockRuntimeClient({ region: AWS_REGION }); - const payload = { system: [ { @@ -283,6 +345,21 @@ const invokeModel_alt_text_links = async ( const decodedResponseBody = new TextDecoder().decode(apiResponse.body); const responseBody = JSON.parse(decodedResponseBody); logger.info(`response of alt text: ${responseBody.output.message.content[0].text}`); + + // Track Bedrock metrics for link alt-text generation + try { + const usage = responseBody.usage || {}; + await trackBedrockInvocation( + MODEL_ID_LINK_ALT_TEXT, + usage.inputTokens || 0, + usage.outputTokens || 0, + null, + "pdf2pdf" + ); + } catch (e) { + logger.warn(`Failed to track Bedrock metrics: ${e.message}`); + } + return responseBody.output.message.content[0].text; } catch (error) { console.error(`Error invoking model: ${error}`); diff --git a/alt-text-generator-container/package.json b/alt-text-generator-container/package.json index 52640421..d87c70f5 100644 --- a/alt-text-generator-container/package.json +++ b/alt-text-generator-container/package.json @@ -1,7 +1,11 @@ { "dependencies": { "@aws-sdk/client-bedrock-runtime": "^3.632.0", + "@aws-sdk/client-cloudwatch": "^3.632.0", "@aws-sdk/client-s3": "^3.633.0", + "@aws-sdk/util-buffer-from": "^3.374.0", + "aws-sdk": "^2.1678.0", + "pdfjs-dist": "^4.6.82", "better-sqlite3": "^11.8.1", "pdf-lib": "^1.17.1", "winston": "^3.14.2" diff --git a/app.py b/app.py 
index 1148d0bd..e5b7a7f9 100644 --- a/app.py +++ b/app.py @@ -15,6 +15,7 @@ aws_logs as logs, aws_ecr_assets as ecr_assets, aws_cloudwatch as cloudwatch, + aws_secretsmanager as secretsmanager, ) from constructs import Construct import platform @@ -24,6 +25,14 @@ class PDFAccessibility(Stack): def __init__(self, scope: Construct, construct_id: str, **kwargs) -> None: super().__init__(scope, construct_id, **kwargs) + # Create Lambda Layer for metrics + metrics_layer = lambda_.LayerVersion( + self, "MetricsLayer", + code=lambda_.Code.from_asset("lambda/shared"), + compatible_runtimes=[lambda_.Runtime.PYTHON_3_12], + description="Metrics helper utilities" + ) + # S3 Bucket pdf_processing_bucket = s3.Bucket(self, "pdfaccessibilitybucket1", encryption=s3.BucketEncryption.S3_MANAGED, @@ -31,6 +40,9 @@ def __init__(self, scope: Construct, construct_id: str, **kwargs) -> None: versioned=True, removal_policy=cdk.RemovalPolicy.RETAIN) + # Expose bucket for other stacks + self.bucket = pdf_processing_bucket + # Get account and region for use throughout the stack account_id = Stack.of(self).account region = Stack.of(self).region @@ -130,6 +142,12 @@ def __init__(self, scope: Construct, construct_id: str, **kwargs) -> None: actions=["secretsmanager:GetSecretValue"], resources=[f"arn:aws:secretsmanager:{region}:{account_id}:secret:/myapp/*"], )) + + # CloudWatch metrics permissions for observability + ecs_task_role.add_to_policy(iam.PolicyStatement( + actions=["cloudwatch:PutMetricData"], + resources=["*"], + )) # Grant S3 read/write access to ECS Task Role pdf_processing_bucket.grant_read_write(ecs_task_execution_role) # Create ECS Task Log Groups explicitly @@ -194,6 +212,10 @@ def __init__(self, scope: Construct, construct_id: str, **kwargs) -> None: name="AWS_REGION", value=region ), + tasks.TaskEnvironmentVariable( + name="USER_ID", + value=sfn.JsonPath.string_at("$.user_id") + ), ] )], launch_target=tasks.EcsFargateLaunchTarget( @@ -281,6 +303,7 @@ def __init__(self, 
scope: Construct, construct_id: str, **kwargs) -> None: memory_size=1024, # architecture=lambda_.Architecture.ARM_64 architecture=lambda_arch, + layers=[metrics_layer], ) # Grant the Lambda function read/write permissions to the S3 bucket @@ -315,6 +338,7 @@ def __init__(self, scope: Construct, construct_id: str, **kwargs) -> None: timeout=Duration.seconds(900), memory_size=512, architecture=lambda_arch, + layers=[metrics_layer], ) pre_remediation_accessibility_checker.add_to_role_policy( @@ -341,6 +365,7 @@ def __init__(self, scope: Construct, construct_id: str, **kwargs) -> None: timeout=Duration.seconds(900), memory_size=512, architecture=lambda_arch, + layers=[metrics_layer], ) post_remediation_accessibility_checker.add_to_role_policy( @@ -388,7 +413,8 @@ def __init__(self, scope: Construct, construct_id: str, **kwargs) -> None: handler='main.lambda_handler', code=lambda_.Code.from_docker_build("lambda/pdf-splitter-lambda"), timeout=Duration.seconds(900), - memory_size=1024 + memory_size=1024, + layers=[metrics_layer], ) pdf_splitter_lambda.add_to_role_policy(cloudwatch_metrics_policy) @@ -396,6 +422,12 @@ def __init__(self, scope: Construct, construct_id: str, **kwargs) -> None: # S3 Permissions for Lambda pdf_processing_bucket.grant_read_write(pdf_splitter_lambda) + # Grant tagging permissions for user attribution + pdf_splitter_lambda.add_to_role_policy(iam.PolicyStatement( + actions=["s3:GetObjectTagging", "s3:PutObjectTagging"], + resources=[f"{pdf_processing_bucket.bucket_arn}/*"] + )) + # Trigger Lambda on S3 Event pdf_processing_bucket.add_event_notification( s3.EventType.OBJECT_CREATED, @@ -414,7 +446,7 @@ def __init__(self, scope: Construct, construct_id: str, **kwargs) -> None: pdf_merger_lambda_log_group_name = f"/aws/lambda/{pdf_merger_lambda.function_name}" title_generator_lambda_log_group_name = f"/aws/lambda/{title_generator_lambda.function_name}" pre_remediation_checker_log_group_name = 
import aws_cdk as cdk
from aws_cdk import (
    Stack,
    aws_cloudwatch as cloudwatch,
    Duration,
)
from constructs import Construct


class UsageMetricsDashboard(Stack):
    """Stack holding the "PDF-Accessibility-Usage-Metrics" CloudWatch dashboard.

    The dashboard is built row by row: aggregate totals, per-user Logs
    Insights tables, Bedrock usage, Adobe API usage (only when
    ``pdf2pdf_bucket`` is truthy), processing performance and error
    monitoring. A ``DashboardURL`` output links to the console page.

    Args:
        scope: Parent construct.
        construct_id: Logical ID of this stack.
        pdf2pdf_bucket: When truthy, enables the Adobe section and switches
            the second performance widget to ECS CPU utilization.
        pdf2html_bucket: Accepted but not referenced by this stack.
        split_pdf_log_group, python_container_log_group,
        javascript_container_log_group, pdf2html_log_group: Log group names
            queried by the per-user Logs Insights widgets; falsy ones are
            left out.
    """

    def __init__(self, scope: Construct, construct_id: str,
                 pdf2pdf_bucket: str = None,
                 pdf2html_bucket: str = None,
                 split_pdf_log_group: str = None,
                 python_container_log_group: str = None,
                 javascript_container_log_group: str = None,
                 pdf2html_log_group: str = None,
                 **kwargs) -> None:
        super().__init__(scope, construct_id, **kwargs)

        region = Stack.of(self).region

        dashboard = cloudwatch.Dashboard(
            self, "UsageMetricsDashboard",
            dashboard_name="PDF-Accessibility-Usage-Metrics"
        )

        # --- small local builders for the repeated widget shapes -----------
        def banner(markdown):
            """Full-width, one-row text widget."""
            return cloudwatch.TextWidget(markdown=markdown, width=24, height=1)

        def expression_graph(title, expression, label, width, **options):
            """Graph plotting a single CloudWatch math/search expression."""
            series = cloudwatch.MathExpression(expression=expression, label=label)
            return cloudwatch.GraphWidget(
                title=title, left=[series], width=width, height=6, **options
            )

        def metric_graph(title, namespace, metric_name, statistic, period):
            """Half-width graph of one dimensionless metric."""
            series = cloudwatch.Metric(
                namespace=namespace, metric_name=metric_name,
                statistic=statistic, period=period
            )
            return cloudwatch.GraphWidget(
                title=title, left=[series], width=12, height=6
            )

        def insights_table(title, sources, query):
            """Half-width Logs Insights widget over the given log groups."""
            return cloudwatch.LogQueryWidget(
                title=title, log_group_names=sources,
                query_string=query, width=12, height=6
            )

        # === HEADER ===
        dashboard.add_widgets(
            banner("# PDF Accessibility Platform - Usage & Cost Metrics")
        )

        # === SECTION 1: AGGREGATE TOTALS ===
        # SUM wraps SEARCH to collapse all users into one line
        dashboard.add_widgets(
            expression_graph(
                "Pages Processed (Hourly)",
                "SUM(SEARCH('{PDFAccessibility,Service,UserId} MetricName=\"PagesProcessed\"', 'Sum', 3600))",
                "Total Pages", 12
            ),
            expression_graph(
                "Files Processed (Hourly)",
                "SUM(SEARCH('{PDFAccessibility,Service,UserId} MetricName=\"PagesProcessed\"', 'SampleCount', 3600))",
                "Total Files", 12
            )
        )

        # === SECTION 2: PER-USER BREAKDOWN ===
        # Logs Insights tables over structured "file_processed" log events
        dashboard.add_widgets(banner("## Per-User Usage"))

        candidates = (
            split_pdf_log_group,
            python_container_log_group,
            javascript_container_log_group,
            pdf2html_log_group,
        )
        log_groups = [name for name in candidates if name]

        per_user_query = (
            'filter event = "file_processed"\n'
            '| stats count() as files, sum(pageCount) as pages by userId\n'
            '| sort files desc'
        )
        recent_activity_query = (
            'filter event = "file_processed"\n'
            '| fields @timestamp, userId, fileName, pageCount, service\n'
            '| sort @timestamp desc\n'
            '| limit 20'
        )
        dashboard.add_widgets(
            insights_table("Files & Pages Processed by User", log_groups, per_user_query),
            insights_table("Recent Processing Activity", log_groups, recent_activity_query)
        )

        # === SECTION 3: BEDROCK METRICS ===
        dashboard.add_widgets(banner("## Amazon Bedrock Usage"))
        dashboard.add_widgets(
            expression_graph(
                "Bedrock Model Invocations",
                "SUM(SEARCH('{PDFAccessibility,Service,Model} MetricName=\"BedrockInvocations\"', 'Sum', 3600))",
                "Total Invocations", 8
            ),
            expression_graph(
                "Bedrock Input Tokens",
                "SUM(SEARCH('{PDFAccessibility,Service,Model} MetricName=\"BedrockInputTokens\"', 'Sum', 3600))",
                "Total Input Tokens", 8
            ),
            expression_graph(
                "Bedrock Output Tokens",
                "SUM(SEARCH('{PDFAccessibility,Service,Model} MetricName=\"BedrockOutputTokens\"', 'Sum', 3600))",
                "Total Output Tokens", 8
            )
        )

        # === SECTION 4: ADOBE API (PDF-to-PDF only) ===
        if pdf2pdf_bucket:
            dashboard.add_widgets(
                banner(
                    "## Adobe PDF Services API Usage\n\n"
                    "AutoTag: 10 Document Transactions/page | "
                    "ExtractPDF: 1 Document Transaction/5 pages"
                )
            )
            transactions_24h = cloudwatch.MathExpression(
                expression="SUM(SEARCH('{PDFAccessibility,Service,Operation,UserId} MetricName=\"AdobeDocTransactions\"', 'Sum', 86400))",
                label="Doc Transactions"
            )
            dashboard.add_widgets(
                expression_graph(
                    "Adobe API Calls by Operation",
                    "SEARCH('{PDFAccessibility,Service,Operation,UserId} MetricName=\"AdobeAPICalls\"', 'Sum', 3600)",
                    "", 8,
                    legend_position=cloudwatch.LegendPosition.RIGHT
                ),
                expression_graph(
                    "Adobe Document Transactions (Quota Usage)",
                    "SEARCH('{PDFAccessibility,Service,Operation,UserId} MetricName=\"AdobeDocTransactions\"', 'Sum', 3600)",
                    "", 8,
                    legend_position=cloudwatch.LegendPosition.RIGHT
                ),
                cloudwatch.SingleValueWidget(
                    title="Document Transactions (24h)",
                    metrics=[transactions_24h],
                    width=8, height=6
                )
            )

        # === SECTION 5: PROCESSING PERFORMANCE ===
        dashboard.add_widgets(banner("## Processing Performance"))

        # The second widget depends on whether the PDF-to-PDF bucket was given.
        if pdf2pdf_bucket:
            second_title = "ECS Task CPU Utilization"
            second_namespace = "AWS/ECS"
            second_metric = "CPUUtilization"
        else:
            second_title = "Lambda Concurrent Executions"
            second_namespace = "AWS/Lambda"
            second_metric = "ConcurrentExecutions"

        dashboard.add_widgets(
            metric_graph(
                "Lambda Processing Duration (avg ms)",
                "AWS/Lambda", "Duration", "Average", Duration.minutes(5)
            ),
            metric_graph(
                second_title,
                second_namespace, second_metric, "Average", Duration.minutes(5)
            )
        )

        # === SECTION 6: ERROR MONITORING ===
        dashboard.add_widgets(banner("## Error Monitoring"))
        dashboard.add_widgets(
            metric_graph(
                "Lambda Errors",
                "AWS/Lambda", "Errors", "Sum", Duration.hours(1)
            ),
            metric_graph(
                "Step Function Failed Executions",
                "AWS/States", "ExecutionsFailed", "Sum", Duration.hours(1)
            )
        )

        # Output dashboard URL
        cdk.CfnOutput(
            self, "DashboardURL",
            value=f"https://console.aws.amazon.com/cloudwatch/home?region={region}#dashboards:name=PDF-Accessibility-Usage-Metrics",
            description="CloudWatch Dashboard URL for Usage Metrics"
        )
+ echo "" + echo "Examples:" + echo " $0 # Interactive mode" + echo " $0 --all # Deploy everything" + echo " $0 --pdf2html # Deploy only pdf2html" + echo " $0 --profile myprofile # Use specific AWS profile" +} + +# Parse arguments +DEPLOY_PDF2PDF=false +DEPLOY_PDF2HTML=false +AWS_PROFILE_ARG="" +REGION_ARG="" + +while [[ $# -gt 0 ]]; do + case $1 in + --pdf2pdf) DEPLOY_PDF2PDF=true; shift ;; + --pdf2html) DEPLOY_PDF2HTML=true; shift ;; + --all) DEPLOY_PDF2PDF=true; DEPLOY_PDF2HTML=true; shift ;; + --profile) AWS_PROFILE_ARG="--profile $2"; export AWS_PROFILE="$2"; shift 2 ;; + --region) REGION_ARG="$2"; shift 2 ;; + -h|--help) usage; exit 0 ;; + *) print_error "Unknown option: $1"; usage; exit 1 ;; + esac +done + +# ============================================================ +# Interactive selection if no flags given +# ============================================================ +if [ "$DEPLOY_PDF2PDF" = false ] && [ "$DEPLOY_PDF2HTML" = false ]; then + echo "" + print_header "Welcome to PDF Accessibility Solutions Local Deployment" + print_header "=======================================================" + echo "" + echo "This tool deploys PDF accessibility solutions from your local repo:" + echo "" + echo "1. PDF-to-PDF Remediation" + echo " - Maintains original PDF format" + echo " - Uses Adobe PDF Services API" + echo " - ECS + Step Functions processing" + echo "" + echo "2. PDF-to-HTML Remediation" + echo " - Converts PDFs to accessible HTML" + echo " - Uses AWS Bedrock Data Automation" + echo " - Serverless Lambda-based processing" + echo "" + + while true; do + echo "Which solution would you like to deploy?" + echo "1) PDF-to-PDF Remediation" + echo "2) PDF-to-HTML Remediation" + echo "3) Both" + echo "" + read -p "Enter your choice (1, 2, or 3): " SOLUTION_CHOICE + + case $SOLUTION_CHOICE in + 1) DEPLOY_PDF2PDF=true; break ;; + 2) DEPLOY_PDF2HTML=true; break ;; + 3) DEPLOY_PDF2PDF=true; DEPLOY_PDF2HTML=true; break ;; + *) print_error "Invalid choice. 
Please enter 1, 2, or 3."; echo "" ;; + esac + done +fi + +# ============================================================ +# Resolve AWS account and region +# ============================================================ +print_status "Verifying AWS credentials..." +ACCOUNT_ID=$(aws sts get-caller-identity $AWS_PROFILE_ARG --query "Account" --output text 2>/dev/null || { + print_error "Failed to get AWS account ID. Please ensure AWS CLI is configured." + exit 1 +}) + +REGION=${REGION_ARG:-${AWS_DEFAULT_REGION:-$(aws configure get region $AWS_PROFILE_ARG 2>/dev/null || echo "")}} + +if [ -z "$REGION" ]; then + print_error "Could not determine AWS region. Please set your region:" + print_error " export AWS_DEFAULT_REGION=us-east-1" + print_error " OR use: $0 --region us-east-1" + exit 1 +fi + +# If Pdf2HtmlStack exists, use its region for consistency +if [ -z "$REGION_ARG" ]; then + EXISTING_REGION=$(aws cloudformation describe-stacks --stack-name Pdf2HtmlStack $AWS_PROFILE_ARG --region "$REGION" --query 'Stacks[0].StackId' --output text 2>/dev/null | grep -oP ':\K[a-z]+-[a-z]+-[0-9]+' | head -1) + if [ -n "$EXISTING_REGION" ] && [ "$EXISTING_REGION" != "$REGION" ]; then + print_warning "Profile region is $REGION but Pdf2HtmlStack exists in $EXISTING_REGION" + print_warning "Using $EXISTING_REGION for consistency. Override with --region if needed." 
+ REGION="$EXISTING_REGION" + fi +fi + +print_success "Account: $ACCOUNT_ID, Region: $REGION" + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +cd "$SCRIPT_DIR" + +# ============================================================ +# PDF-to-PDF Deployment +# ============================================================ +if [ "$DEPLOY_PDF2PDF" = true ]; then + print_header "=== Deploying PDF-to-PDF Remediation ===" + + # --- Adobe credentials setup --- + # Check if secret already exists in Secrets Manager + EXISTING_SECRET=$(aws secretsmanager get-secret-value --secret-id /myapp/client_credentials \ + $AWS_PROFILE_ARG --region "$REGION" --query 'SecretString' --output text 2>/dev/null || echo "") + + if [ -z "$EXISTING_SECRET" ] || [ "$EXISTING_SECRET" = "None" ]; then + print_status "Adobe PDF Services API credentials are required." + print_status "(These will be stored securely in AWS Secrets Manager)" + echo "" + + if [ -z "$ADOBE_CLIENT_ID" ]; then + read -p " Enter Adobe API Client ID: " ADOBE_CLIENT_ID + fi + if [ -z "$ADOBE_CLIENT_SECRET" ]; then + read -sp " Enter Adobe API Client Secret: " ADOBE_CLIENT_SECRET + echo + fi + + JSON_TEMPLATE='{"client_credentials":{"PDF_SERVICES_CLIENT_ID":"","PDF_SERVICES_CLIENT_SECRET":""}}' + SECRET_JSON=$(echo "$JSON_TEMPLATE" | jq --arg cid "$ADOBE_CLIENT_ID" --arg csec "$ADOBE_CLIENT_SECRET" \ + '.client_credentials.PDF_SERVICES_CLIENT_ID = $cid | .client_credentials.PDF_SERVICES_CLIENT_SECRET = $csec') + + if aws secretsmanager create-secret --name /myapp/client_credentials \ + --secret-string "$SECRET_JSON" $AWS_PROFILE_ARG --region "$REGION" 2>/dev/null; then + print_success "Adobe credentials stored in Secrets Manager" + else + aws secretsmanager update-secret --secret-id /myapp/client_credentials \ + --secret-string "$SECRET_JSON" $AWS_PROFILE_ARG --region "$REGION" + print_success "Adobe credentials updated in Secrets Manager" + fi + else + print_success "Adobe credentials already configured in Secrets 
Manager" + fi + + # --- Install Python CDK dependencies --- + print_status "Installing Python dependencies..." + pip3 install -r requirements.txt -q + + # --- Bootstrap CDK if needed --- + print_status "Ensuring CDK is bootstrapped..." + cdk bootstrap aws://$ACCOUNT_ID/$REGION $AWS_PROFILE_ARG 2>/dev/null || true + + # --- Sync shared files to Docker build contexts --- + print_status "Syncing shared files to Docker build contexts..." + cp lambda/shared/python/metrics_helper.py adobe-autotag-container/metrics_helper.py + + # --- Deploy CDK stacks --- + print_status "Deploying CDK stacks (PDFAccessibility + UsageMetrics)..." + print_status " CDK will automatically build adobe-autotag-container and alt-text-generator-container images" + print_status " This may take 3-5 minutes..." + + for i in {1..3}; do + print_status "CDK deploy attempt $i/3..." + if cdk deploy PDFAccessibility PDFAccessibilityUsageMetrics --require-approval never $AWS_PROFILE_ARG; then + print_success "PDF-to-PDF deployed successfully" + break + else + if [ $i -eq 3 ]; then + print_error "All CDK deploy attempts failed" + exit 1 + fi + print_warning "CDK deploy failed on attempt $i, retrying in 30s..." + sleep 30 + fi + done +fi + +# ============================================================ +# PDF-to-HTML Deployment +# ============================================================ +if [ "$DEPLOY_PDF2HTML" = true ]; then + print_header "=== Deploying PDF-to-HTML Remediation ===" + + BUCKET_NAME="pdf2html-bucket-$ACCOUNT_ID-$REGION" + REPO_NAME="pdf2html-lambda" + REPO_URI="$ACCOUNT_ID.dkr.ecr.$REGION.amazonaws.com/$REPO_NAME" + + # --- Create ECR repository if it doesn't exist --- + if ! 
aws ecr describe-repositories --repository-names $REPO_NAME --region "$REGION" $AWS_PROFILE_ARG 2>/dev/null; then + print_status "Creating ECR repository: $REPO_NAME" + aws ecr create-repository --repository-name $REPO_NAME --region "$REGION" $AWS_PROFILE_ARG + print_success "ECR repository created" + else + print_success "ECR repository $REPO_NAME already exists" + fi + + # --- Create S3 bucket if it doesn't exist --- + if ! aws s3api head-bucket --bucket $BUCKET_NAME $AWS_PROFILE_ARG 2>/dev/null; then + print_status "Creating S3 bucket: $BUCKET_NAME" + if [ "$REGION" = "us-east-1" ]; then + aws s3api create-bucket --bucket $BUCKET_NAME --region "$REGION" $AWS_PROFILE_ARG + else + aws s3api create-bucket --bucket $BUCKET_NAME --region "$REGION" $AWS_PROFILE_ARG \ + --create-bucket-configuration LocationConstraint=$REGION + fi + aws s3api put-bucket-versioning --bucket $BUCKET_NAME --versioning-configuration Status=Enabled $AWS_PROFILE_ARG + aws s3api put-object --bucket $BUCKET_NAME --key uploads/ $AWS_PROFILE_ARG + aws s3api put-object --bucket $BUCKET_NAME --key output/ $AWS_PROFILE_ARG + aws s3api put-object --bucket $BUCKET_NAME --key remediated/ $AWS_PROFILE_ARG + print_success "S3 bucket created" + else + print_success "S3 bucket $BUCKET_NAME already exists" + fi + + # Always apply CORS policy + aws s3api put-bucket-cors --bucket $BUCKET_NAME $AWS_PROFILE_ARG --cors-configuration '{ + "CORSRules": [{"AllowedHeaders": ["*"], "AllowedMethods": ["GET", "HEAD", "PUT", "POST", "DELETE"], "AllowedOrigins": ["*"], "ExposeHeaders": []}] + }' + + # --- Resolve BDA project ARN --- + # Try existing CloudFormation stack first + BDA_PROJECT_ARN=$(aws cloudformation describe-stacks --stack-name Pdf2HtmlStack \ + $AWS_PROFILE_ARG --region "$REGION" \ + --query 'Stacks[0].Parameters[?ParameterKey==`BdaProjectArn`].ParameterValue' \ + --output text 2>/dev/null || echo "") + + # Fallback: most recent BDA project + if [ -z "$BDA_PROJECT_ARN" ] || [ "$BDA_PROJECT_ARN" = "None" 
]; then + BDA_PROJECT_ARN=$(aws bedrock-data-automation list-data-automation-projects \ + $AWS_PROFILE_ARG --region "$REGION" \ + --query 'projects | sort_by(@, &creationTime) | [-1].projectArn' --output text 2>/dev/null || echo "") + fi + + # If no BDA project exists, create one + if [ -z "$BDA_PROJECT_ARN" ] || [ "$BDA_PROJECT_ARN" = "None" ]; then + print_status "No existing BDA project found. Creating one..." + BDA_PROJECT_NAME="pdf2html-bda-project-$(date +%Y%m%d-%H%M%S)" + BDA_RESPONSE=$(aws bedrock-data-automation create-data-automation-project \ + --project-name "$BDA_PROJECT_NAME" \ + --standard-output-configuration '{ + "document": { + "extraction": { + "granularity": {"types": ["DOCUMENT", "PAGE", "ELEMENT"]}, + "boundingBox": {"state": "ENABLED"} + }, + "generativeField": {"state": "DISABLED"}, + "outputFormat": { + "textFormat": {"types": ["HTML"]}, + "additionalFileFormat": {"state": "ENABLED"} + } + } + }' $AWS_PROFILE_ARG --region "$REGION" 2>/dev/null || { + print_error "Failed to create BDA project. Ensure you have bedrock-data-automation permissions." + exit 1 + }) + BDA_PROJECT_ARN=$(echo $BDA_RESPONSE | jq -r '.projectArn') + print_success "BDA project created: $BDA_PROJECT_ARN" + else + print_success "Using existing BDA project: $BDA_PROJECT_ARN" + fi + + # --- Sync shared files into Docker build context --- + print_status "Syncing shared files to Docker build context..." + cp lambda/shared/python/metrics_helper.py pdf2html/metrics_helper.py + + # --- Build and push Docker image --- + print_status "Building pdf2html Docker image..." + aws ecr get-login-password --region "$REGION" $AWS_PROFILE_ARG | \ + docker login --username AWS --password-stdin $ACCOUNT_ID.dkr.ecr.$REGION.amazonaws.com + + docker build --platform linux/amd64 -t $REPO_URI:latest pdf2html/ + + print_status "Pushing Docker image to ECR..." + if ! docker push $REPO_URI:latest; then + print_warning "Push failed, refreshing ECR login and retrying..." 
+ aws ecr get-login-password --region "$REGION" $AWS_PROFILE_ARG | \ + docker login --username AWS --password-stdin $ACCOUNT_ID.dkr.ecr.$REGION.amazonaws.com + docker push $REPO_URI:latest + fi + print_success "Docker image pushed to ECR" + + # Verify image exists + print_status "Verifying image in ECR..." + aws ecr describe-images --repository-name $REPO_NAME --region "$REGION" $AWS_PROFILE_ARG \ + --image-ids imageTag=latest > /dev/null + print_success "Image verified in ECR" + + # --- Bootstrap CDK and deploy pdf2html stack --- + print_status "Deploying Pdf2HtmlStack..." + cd pdf2html/cdk + npm install --silent + + export CDK_DEFAULT_ACCOUNT=$ACCOUNT_ID + export CDK_DEFAULT_REGION=$REGION + + npx cdk bootstrap aws://$ACCOUNT_ID/$REGION $AWS_PROFILE_ARG 2>/dev/null || true + npx cdk deploy --app "node bin/app.js" \ + --parameters BdaProjectArn=$BDA_PROJECT_ARN \ + --parameters BucketName=$BUCKET_NAME \ + --require-approval never + cd ../.. + + # --- Force Lambda to pick up the new image digest --- + print_status "Updating Lambda to use new image..." 
+ IMAGE_DIGEST=$(aws ecr describe-images --repository-name $REPO_NAME --region "$REGION" $AWS_PROFILE_ARG \ + --query 'imageDetails | sort_by(@, &imagePushedAt) | [-1].imageDigest' --output text) + aws lambda update-function-code \ + --function-name Pdf2HtmlPipeline \ + --image-uri "$REPO_URI@$IMAGE_DIGEST" \ + $AWS_PROFILE_ARG --region "$REGION" > /dev/null + print_success "PDF-to-HTML deployed and Lambda updated" +fi + +# ============================================================ +# Summary +# ============================================================ +print_header "=== Deployment Complete ===" +[ "$DEPLOY_PDF2PDF" = true ] && print_success "PDF-to-PDF: deployed (CDK stacks + Docker images)" +[ "$DEPLOY_PDF2HTML" = true ] && print_success "PDF-to-HTML: deployed (Docker image + CDK stack + Lambda)" +echo "" +print_status "Monitor your deployment in the AWS Console:" +print_status " https://console.aws.amazon.com/cloudformation" +echo "" diff --git a/deploy-private.sh b/deploy-private.sh new file mode 100755 index 00000000..243869e5 --- /dev/null +++ b/deploy-private.sh @@ -0,0 +1,982 @@ +#!/usr/bin/env bash +# ============================================================================= +# deploy-private.sh — Private CI/CD Pipeline Setup for PDF Accessibility +# ============================================================================= +# Configures AWS CodeBuild to deploy from a private repository with support +# for multi-environment branch-based deployments, non-interactive mode, +# and cleanup/teardown. 
+# ============================================================================= + +set -euo pipefail + +# Resolve script directory and source helpers +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +source "$SCRIPT_DIR/lib/pipeline-helpers.sh" + +# --------------------------------------------------------------------------- +# Colors +# --------------------------------------------------------------------------- +RED='\033[0;31m' +GREEN='\033[0;32m' +YELLOW='\033[1;33m' +BLUE='\033[0;34m' +CYAN='\033[0;36m' +NC='\033[0m' + +print_status() { echo -e "${BLUE}[INFO]${NC} $1"; } +print_success() { echo -e "${GREEN}[SUCCESS]${NC} $1"; } +print_warning() { echo -e "${YELLOW}[WARNING]${NC} $1"; } +print_error() { echo -e "${RED}[ERROR]${NC} $1"; } +print_header() { echo -e "${CYAN}$1${NC}"; } + +# --------------------------------------------------------------------------- +# Global State +# --------------------------------------------------------------------------- +NON_INTERACTIVE="false" +CONFIG_FILE="" +CLI_BUILDSPEC="" +CLI_PROJECT_NAME="" +CLI_BRANCH_ENV_MAP="" +CLI_PROFILE="" +DO_CLEANUP="false" +CLI_ENVIRONMENT="" +DEPLOYED_SOLUTIONS=() +PDF2PDF_BUCKET="" +PDF2HTML_BUCKET="" + +# --------------------------------------------------------------------------- +# Usage +# --------------------------------------------------------------------------- +show_help() { + cat <<'EOF' +Usage: deploy-private.sh [OPTIONS] + +Deploy PDF Accessibility solutions from a private repository. 
+ +Options: + --config Path to key-value config file + --non-interactive Fail with error instead of prompting + --buildspec Custom buildspec file (default: buildspec-unified.yml) + --project-name Custom CodeBuild project name (default: pdfremediation-{timestamp}) + --branch-env-map JSON mapping of branches to environments + Example: '{"main":"prod","dev":"dev","feature/*":"dev"}' + --profile AWS CLI named profile to use for all AWS operations + --cleanup List and delete pipeline resources + --environment Target a specific environment for cleanup (with --cleanup) + --help Show this help message + +Environment Variables (non-interactive mode): + PRIVATE_REPO_URL Git repository URL (required) + SOURCE_PROVIDER github, codecommit, bitbucket, or gitlab (required) + DEPLOYMENT_TYPE pdf2pdf or pdf2html (required) + TARGET_BRANCH Branch name (default: main) + CONNECTION_ARN CodeConnections ARN (required for non-CodeCommit) + ADOBE_CLIENT_ID Adobe API Client ID (required for pdf2pdf) + ADOBE_CLIENT_SECRET Adobe API Client Secret (required for pdf2pdf) + BUCKET_NAME Override S3 bucket name for pdf2html + BDA_PROJECT_ARN Use existing BDA project for pdf2html + BRANCH_ENV_MAP JSON branch-to-environment mapping + AWS_PROFILE AWS CLI named profile (same as --profile flag) + +Config File Format: + PRIVATE_REPO_URL=https://github.com/myorg/my-fork.git + SOURCE_PROVIDER=github + DEPLOYMENT_TYPE=pdf2pdf + TARGET_BRANCH=main + CONNECTION_ARN=arn:aws:codeconnections:us-east-1:123456789:connection/abc-123 +EOF +} + +# --------------------------------------------------------------------------- +# Argument Parsing +# --------------------------------------------------------------------------- +parse_args() { + while [[ $# -gt 0 ]]; do + case "$1" in + --config) + CONFIG_FILE="$2"; shift 2 ;; + --non-interactive) + NON_INTERACTIVE="true"; shift ;; + --buildspec) + CLI_BUILDSPEC="$2"; shift 2 ;; + --project-name) + CLI_PROJECT_NAME="$2"; shift 2 ;; + --branch-env-map) + 
CLI_BRANCH_ENV_MAP="$2"; shift 2 ;; + --profile) + CLI_PROFILE="$2"; shift 2 ;; + --cleanup) + DO_CLEANUP="true"; shift ;; + --environment) + CLI_ENVIRONMENT="$2"; shift 2 ;; + --help) + show_help; exit 0 ;; + *) + print_error "Unknown option: $1" + show_help; exit 1 ;; + esac + done + + # Validate --environment requires --cleanup + if [[ -n "$CLI_ENVIRONMENT" && "$DO_CLEANUP" != "true" ]]; then + print_error "--environment requires --cleanup" + exit 1 + fi +} + +# --------------------------------------------------------------------------- +# Interactive Prompts +# --------------------------------------------------------------------------- +prompt_or_fail() { + local param_name="$1" + local prompt_text="$2" + local current_value="${3:-}" + + if [[ -n "$current_value" ]]; then + echo "$current_value" + return 0 + fi + + if [[ "$NON_INTERACTIVE" == "true" ]]; then + print_error "Missing required parameter: $param_name" + exit 1 + fi + + local response + read -rp "$prompt_text" response + echo "$response" +} + +collect_parameters() { + PRIVATE_REPO_URL="$(prompt_or_fail "PRIVATE_REPO_URL" \ + "Enter your private repository URL: " "${PRIVATE_REPO_URL:-}")" + + if [[ -z "${SOURCE_PROVIDER:-}" ]]; then + if [[ "$NON_INTERACTIVE" == "true" ]]; then + print_error "Missing required parameter: SOURCE_PROVIDER" + exit 1 + fi + echo "" + echo "Select your source provider:" + echo " 1) github" + echo " 2) codecommit" + echo " 3) bitbucket" + echo " 4) gitlab" + local choice + read -rp "Enter choice (1-4): " choice + case "$choice" in + 1) SOURCE_PROVIDER="github" ;; + 2) SOURCE_PROVIDER="codecommit" ;; + 3) SOURCE_PROVIDER="bitbucket" ;; + 4) SOURCE_PROVIDER="gitlab" ;; + *) print_error "Invalid choice: $choice"; exit 1 ;; + esac + fi + + if [[ -z "${DEPLOYMENT_TYPE:-}" ]]; then + if [[ "$NON_INTERACTIVE" == "true" ]]; then + print_error "Missing required parameter: DEPLOYMENT_TYPE" + exit 1 + fi + echo "" + echo "Select deployment type:" + echo " 1) pdf2pdf — PDF-to-PDF 
Remediation" + echo " 2) pdf2html — PDF-to-HTML Remediation" + local dt_choice + read -rp "Enter choice (1-2): " dt_choice + case "$dt_choice" in + 1) DEPLOYMENT_TYPE="pdf2pdf" ;; + 2) DEPLOYMENT_TYPE="pdf2html" ;; + *) print_error "Invalid choice: $dt_choice"; exit 1 ;; + esac + fi + + TARGET_BRANCH="$(resolve_branch "${TARGET_BRANCH:-}")" + + # Conditional: Connection ARN for non-CodeCommit providers + if [[ "$SOURCE_PROVIDER" != "codecommit" && -z "${CONNECTION_ARN:-}" ]]; then + CONNECTION_ARN="$(prompt_or_fail "CONNECTION_ARN" \ + "Enter your AWS CodeConnections ARN: " "${CONNECTION_ARN:-}")" + fi + + # Conditional: Adobe credentials for pdf2pdf + if [[ "$DEPLOYMENT_TYPE" == "pdf2pdf" ]]; then + ADOBE_CLIENT_ID="$(prompt_or_fail "ADOBE_CLIENT_ID" \ + "Enter Adobe API Client ID: " "${ADOBE_CLIENT_ID:-}")" + ADOBE_CLIENT_SECRET="$(prompt_or_fail "ADOBE_CLIENT_SECRET" \ + "Enter Adobe API Client Secret: " "${ADOBE_CLIENT_SECRET:-}")" + fi +} + +# --------------------------------------------------------------------------- +# Input Validation +# --------------------------------------------------------------------------- +validate_inputs() { + # Validate provider + case "${SOURCE_PROVIDER:-}" in + github|codecommit|bitbucket|gitlab) ;; + *) print_error "Invalid source provider: '${SOURCE_PROVIDER:-}'. Supported: github, codecommit, bitbucket, gitlab"; exit 1 ;; + esac + + # Validate deployment type + case "${DEPLOYMENT_TYPE:-}" in + pdf2pdf|pdf2html) ;; + *) print_error "Invalid deployment type: '${DEPLOYMENT_TYPE:-}'. Supported: pdf2pdf, pdf2html"; exit 1 ;; + esac + + # Validate URL format + if ! 
validate_repo_url "$SOURCE_PROVIDER" "$PRIVATE_REPO_URL"; then + print_error "Invalid repository URL for provider '$SOURCE_PROVIDER': $PRIVATE_REPO_URL" + exit 1 + fi + + # Validate connection for non-CodeCommit + if [[ "$SOURCE_PROVIDER" != "codecommit" ]]; then + if [[ -z "${CONNECTION_ARN:-}" ]]; then + print_error "CONNECTION_ARN is required for provider '$SOURCE_PROVIDER'" + exit 1 + fi + # Validate ARN format + if [[ ! "$CONNECTION_ARN" =~ ^arn:aws:codeconnections:[a-z0-9-]+:[0-9]+:connection/.+$ ]]; then + print_error "Invalid Connection ARN format: $CONNECTION_ARN" + print_error "Expected format: arn:aws:codeconnections:{region}:{account}:connection/{id}" + exit 1 + fi + # Check connection status + local conn_status + conn_status="$(aws codeconnections get-connection \ + --connection-arn "$CONNECTION_ARN" \ + --query 'Connection.ConnectionStatus' \ + --output text 2>/dev/null)" || { + print_error "Failed to retrieve connection status for: $CONNECTION_ARN" + exit 1 + } + if ! validate_connection_status "$conn_status"; then + print_error "Connection is not AVAILABLE (current status: $conn_status)" + print_error "Complete the connection handshake in the AWS Console" + exit 1 + fi + print_success "Connection verified: AVAILABLE" + + # Ensure the connection is registered as a CodeBuild source credential. + # This is required for CodeBuild to use the connection when pulling source. + local existing_cred + existing_cred="$(aws codebuild list-source-credentials \ + --query "sourceCredentialsInfos[?resource=='${CONNECTION_ARN}'].arn" \ + --output text 2>/dev/null || echo "")" + + if [[ -z "$existing_cred" || "$existing_cred" == "None" ]]; then + print_status "Registering connection as CodeBuild source credential..." 
+ local server_type + case "$SOURCE_PROVIDER" in + github) server_type="GITHUB" ;; + bitbucket) server_type="BITBUCKET" ;; + gitlab) server_type="GITLAB" ;; + esac + aws codebuild import-source-credentials \ + --server-type "$server_type" \ + --auth-type CODECONNECTIONS \ + --token "$CONNECTION_ARN" > /dev/null 2>&1 || { + print_error "Failed to register connection as source credential" + exit 1 + } + print_success "Connection registered as source credential" + else + print_success "Connection already registered as source credential" + fi + fi +} + +# --------------------------------------------------------------------------- +# Solution-Specific Prerequisites (Task 11.1, 11.2) +# --------------------------------------------------------------------------- +setup_pdf2pdf_prereqs() { + print_status "Setting up Adobe API credentials in Secrets Manager..." + + local json_template='{ + "client_credentials": { + "PDF_SERVICES_CLIENT_ID": "", + "PDF_SERVICES_CLIENT_SECRET": "" + } + }' + + local secret_json + secret_json="$(echo "$json_template" | jq \ + --arg cid "$ADOBE_CLIENT_ID" \ + --arg csec "$ADOBE_CLIENT_SECRET" \ + '.client_credentials.PDF_SERVICES_CLIENT_ID = $cid | + .client_credentials.PDF_SERVICES_CLIENT_SECRET = $csec')" + + local tmp_file + tmp_file="$(mktemp)" + echo "$secret_json" > "$tmp_file" + + if aws secretsmanager create-secret \ + --name /myapp/client_credentials \ + --description "Client credentials for PDF services" \ + --secret-string "file://$tmp_file" 2>/dev/null; then + print_success "Secret created in Secrets Manager" + else + aws secretsmanager update-secret \ + --secret-id /myapp/client_credentials \ + --secret-string "file://$tmp_file" 2>/dev/null + print_success "Secret updated in Secrets Manager" + fi + rm -f "$tmp_file" +} + +setup_pdf2html_prereqs() { + print_status "Setting up PDF-to-HTML prerequisites..." 
+ + # Create BDA project + local bda_name="pdf2html-bda-project-$(date +%Y%m%d-%H%M%S)" + print_status "Creating Bedrock Data Automation project: $bda_name" + + local bda_response + bda_response="$(aws bedrock-data-automation create-data-automation-project \ + --project-name "$bda_name" \ + --standard-output-configuration '{ + "document": { + "extraction": { + "granularity": { "types": ["DOCUMENT", "PAGE", "ELEMENT"] }, + "boundingBox": { "state": "ENABLED" } + }, + "generativeField": { "state": "DISABLED" }, + "outputFormat": { + "textFormat": { "types": ["HTML"] }, + "additionalFileFormat": { "state": "ENABLED" } + } + } + }' \ + --region "$REGION" 2>/dev/null)" || { + print_error "Failed to create BDA project. Ensure you have bedrock-data-automation permissions." + exit 1 + } + + BDA_PROJECT_ARN="$(echo "$bda_response" | jq -r '.projectArn')" + BUCKET_NAME="${BUCKET_NAME:-$(generate_bucket_name "$ACCOUNT_ID" "$REGION")}" + + print_success "BDA project created: $BDA_PROJECT_ARN" + + # Create S3 bucket if needed + if ! 
aws s3api head-bucket --bucket "$BUCKET_NAME" 2>/dev/null; then + print_status "Creating S3 bucket: $BUCKET_NAME" + if [[ "$REGION" == "us-east-1" ]]; then + aws s3api create-bucket --bucket "$BUCKET_NAME" --region "$REGION" + else + aws s3api create-bucket --bucket "$BUCKET_NAME" --region "$REGION" \ + --create-bucket-configuration "LocationConstraint=$REGION" + fi + aws s3api put-bucket-versioning --bucket "$BUCKET_NAME" \ + --versioning-configuration Status=Enabled + aws s3api put-object --bucket "$BUCKET_NAME" --key uploads/ + aws s3api put-object --bucket "$BUCKET_NAME" --key output/ + aws s3api put-object --bucket "$BUCKET_NAME" --key remediated/ + print_success "S3 bucket created: $BUCKET_NAME" + else + print_success "S3 bucket already exists: $BUCKET_NAME" + fi +} + +# --------------------------------------------------------------------------- +# IAM Role and Policy (Task 11.3, 11.4) +# --------------------------------------------------------------------------- +create_iam_role() { + local role_name="$1" + + print_status "Setting up IAM role: $role_name" + if aws iam get-role --role-name "$role_name" >/dev/null 2>&1; then + print_success "Role '$role_name' already exists, reusing" + ROLE_ARN="$(aws iam get-role --role-name "$role_name" --output json | jq -r '.Role.Arn')" + return 0 + fi + + local trust_policy + trust_policy="$(generate_trust_policy)" + + local create_output + create_output="$(aws iam create-role \ + --role-name "$role_name" \ + --assume-role-policy-document "$trust_policy" \ + --output json)" || { + print_error "Failed to create IAM role: $role_name" + exit 1 + } + + ROLE_ARN="$(echo "$create_output" | jq -r '.Role.Arn')" + print_success "Role created: $ROLE_ARN" + + print_status "Waiting 15s for IAM propagation..." 
+ sleep 15 +} + +create_iam_policy() { + local policy_name="$1" + local deployment_type="$2" + local env_prefix="${3:-}" + + local policy_doc + if [[ "$deployment_type" == "pdf2pdf" ]]; then + policy_doc='{ + "Version":"2012-10-17", + "Statement":[ + {"Sid":"S3","Effect":"Allow","Action":"s3:*","Resource":["arn:aws:s3:::cdk-*","arn:aws:s3:::cdk-*/*","arn:aws:s3:::pdfaccessibility*","arn:aws:s3:::pdfaccessibility*/*"]}, + {"Sid":"ECR","Effect":"Allow","Action":"ecr:*","Resource":"arn:aws:ecr:*:*:repository/cdk-*"}, + {"Sid":"ECRAuth","Effect":"Allow","Action":"ecr:GetAuthorizationToken","Resource":"*"}, + {"Sid":"Lambda","Effect":"Allow","Action":"lambda:*","Resource":"arn:aws:lambda:*:*:function:*"}, + {"Sid":"ECS","Effect":"Allow","Action":"ecs:*","Resource":"*"}, + {"Sid":"EC2","Effect":"Allow","Action":"ec2:*","Resource":"*"}, + {"Sid":"SFN","Effect":"Allow","Action":"states:*","Resource":"arn:aws:states:*:*:stateMachine:*"}, + {"Sid":"IAMRole","Effect":"Allow","Action":["iam:CreateRole","iam:DeleteRole","iam:GetRole","iam:PassRole","iam:AttachRolePolicy","iam:DetachRolePolicy","iam:PutRolePolicy","iam:GetRolePolicy","iam:DeleteRolePolicy","iam:TagRole","iam:UntagRole","iam:ListRolePolicies","iam:ListAttachedRolePolicies","iam:UpdateAssumeRolePolicy","iam:ListRoleTags"],"Resource":["arn:aws:iam::*:role/PDFAccessibility*","arn:aws:iam::*:role/cdk-*"]}, + {"Sid":"IAMPolicy","Effect":"Allow","Action":["iam:CreatePolicy","iam:DeletePolicy","iam:GetPolicy","iam:GetPolicyVersion","iam:CreatePolicyVersion","iam:DeletePolicyVersion","iam:ListPolicyVersions"],"Resource":"arn:aws:iam::*:policy/*"}, + {"Sid":"CFN","Effect":"Allow","Action":"cloudformation:*","Resource":["arn:aws:cloudformation:*:*:stack/PDFAccessibility*/*","arn:aws:cloudformation:*:*:stack/CDKToolkit/*"]}, + 
{"Sid":"Logs","Effect":"Allow","Action":"logs:*","Resource":["arn:aws:logs:*:*:log-group:/aws/codebuild/*","arn:aws:logs:*:*:log-group:/aws/codebuild/*:*","arn:aws:logs:*:*:log-group:/aws/lambda/*","arn:aws:logs:*:*:log-group:/aws/lambda/*:*","arn:aws:logs:*:*:log-group:/ecs/*","arn:aws:logs:*:*:log-group:/ecs/*:*","arn:aws:logs:*:*:log-group:/aws/states/*","arn:aws:logs:*:*:log-group:/aws/states/*:*"]}, + {"Sid":"CW","Effect":"Allow","Action":["cloudwatch:PutMetricData","cloudwatch:PutDashboard","cloudwatch:DeleteDashboards","cloudwatch:GetDashboard"],"Resource":"*"}, + {"Sid":"SM","Effect":"Allow","Action":["secretsmanager:CreateSecret","secretsmanager:UpdateSecret","secretsmanager:GetSecretValue","secretsmanager:DescribeSecret"],"Resource":"arn:aws:secretsmanager:*:*:secret:/myapp/*"}, + {"Sid":"STS","Effect":"Allow","Action":["sts:GetCallerIdentity","sts:AssumeRole"],"Resource":"*"}, + {"Sid":"SSM","Effect":"Allow","Action":["ssm:GetParameter","ssm:GetParameters","ssm:PutParameter"],"Resource":"arn:aws:ssm:*:*:parameter/cdk-bootstrap/*"}, + {"Sid":"CC","Effect":"Allow","Action":["codeconnections:UseConnection","codeconnections:GetConnection","codeconnections:GetConnectionToken","codeconnections:PassConnectionToService"],"Resource":"arn:aws:codeconnections:*:*:connection/*"} + ] + }' + else + policy_doc='{ + "Version":"2012-10-17", + "Statement":[ + {"Sid":"S3","Effect":"Allow","Action":"s3:*","Resource":["arn:aws:s3:::cdk-*","arn:aws:s3:::cdk-*/*","arn:aws:s3:::pdf2html-*","arn:aws:s3:::pdf2html-*/*"]}, + {"Sid":"ECR","Effect":"Allow","Action":"ecr:*","Resource":["arn:aws:ecr:*:*:repository/cdk-*","arn:aws:ecr:*:*:repository/pdf2html-*"]}, + {"Sid":"ECRAuth","Effect":"Allow","Action":"ecr:GetAuthorizationToken","Resource":"*"}, + {"Sid":"Lambda","Effect":"Allow","Action":"lambda:*","Resource":["arn:aws:lambda:*:*:function:Pdf2Html*","arn:aws:lambda:*:*:function:pdf2html*"]}, + 
{"Sid":"IAMRole","Effect":"Allow","Action":["iam:CreateRole","iam:DeleteRole","iam:GetRole","iam:PassRole","iam:AttachRolePolicy","iam:DetachRolePolicy","iam:PutRolePolicy","iam:GetRolePolicy","iam:DeleteRolePolicy","iam:TagRole","iam:UntagRole","iam:ListRolePolicies","iam:ListAttachedRolePolicies","iam:UpdateAssumeRolePolicy","iam:ListRoleTags"],"Resource":["arn:aws:iam::*:role/Pdf2Html*","arn:aws:iam::*:role/pdf2html*","arn:aws:iam::*:role/cdk-*"]}, + {"Sid":"IAMPolicy","Effect":"Allow","Action":["iam:CreatePolicy","iam:DeletePolicy","iam:GetPolicy","iam:GetPolicyVersion","iam:CreatePolicyVersion","iam:DeletePolicyVersion","iam:ListPolicyVersions"],"Resource":"arn:aws:iam::*:policy/*"}, + {"Sid":"CFN","Effect":"Allow","Action":"cloudformation:*","Resource":["arn:aws:cloudformation:*:*:stack/Pdf2Html*/*","arn:aws:cloudformation:*:*:stack/pdf2html*/*","arn:aws:cloudformation:*:*:stack/CDKToolkit/*"]}, + {"Sid":"Bedrock","Effect":"Allow","Action":["bedrock:CreateDataAutomationProject","bedrock:GetDataAutomationProject","bedrock:DeleteDataAutomationProject","bedrock:UpdateDataAutomationProject","bedrock:ListDataAutomationProjects"],"Resource":"*"}, + {"Sid":"Logs","Effect":"Allow","Action":"logs:*","Resource":["arn:aws:logs:*:*:log-group:/aws/codebuild/*","arn:aws:logs:*:*:log-group:/aws/codebuild/*:*","arn:aws:logs:*:*:log-group:/aws/lambda/Pdf2Html*","arn:aws:logs:*:*:log-group:/aws/lambda/Pdf2Html*:*"]}, + {"Sid":"STS","Effect":"Allow","Action":["sts:GetCallerIdentity","sts:AssumeRole"],"Resource":"*"}, + {"Sid":"SSM","Effect":"Allow","Action":["ssm:GetParameter","ssm:GetParameters","ssm:PutParameter"],"Resource":"arn:aws:ssm:*:*:parameter/cdk-bootstrap/*"}, + {"Sid":"CC","Effect":"Allow","Action":["codeconnections:UseConnection","codeconnections:GetConnection","codeconnections:GetConnectionToken","codeconnections:PassConnectionToService"],"Resource":"arn:aws:codeconnections:*:*:connection/*"} + ] + }' + fi + + print_status "Creating IAM policy: $policy_name" + 
local policy_response + policy_response="$(aws iam create-policy \ + --policy-name "$policy_name" \ + --policy-document "$policy_doc" \ + --description "Scoped policy for $deployment_type CodeBuild deployment" 2>/dev/null || \ + aws iam get-policy --policy-arn "arn:aws:iam::${ACCOUNT_ID}:policy/$policy_name" 2>/dev/null)" || { + print_error "Failed to create or retrieve IAM policy: $policy_name" + exit 1 + } + + POLICY_ARN="$(echo "$policy_response" | jq -r '.Policy.Arn')" + print_success "Policy ready: $policy_name" + + aws iam attach-role-policy --role-name "$ROLE_NAME" --policy-arn "$POLICY_ARN" || { + print_error "Failed to attach policy to role" + exit 1 + } +} + +# --------------------------------------------------------------------------- +# CodeBuild Project Creation (Task 11.6) +# --------------------------------------------------------------------------- +create_codebuild_project() { + local project_name="$1" + local env_prefix="${2:-}" + local branch="${3:-$TARGET_BRANCH}" + local env_name="${4:-}" + local is_production="${5:-false}" + + print_status "Creating CodeBuild project: $project_name" + + # Build source JSON + local source_json + source_json="$(configure_source "$SOURCE_PROVIDER" "$PRIVATE_REPO_URL" \ + "$branch" "${CONNECTION_ARN:-}" "$BUILDSPEC_FILE")" + + # Build environment + local build_image compute_type + if [[ "$DEPLOYMENT_TYPE" == "pdf2pdf" ]]; then + build_image="aws/codebuild/amazonlinux-x86_64-standard:5.0" + compute_type="BUILD_GENERAL1_SMALL" + else + build_image="aws/codebuild/amazonlinux2-x86_64-standard:5.0" + compute_type="BUILD_GENERAL1_LARGE" + fi + + local env_json="{\"type\":\"LINUX_CONTAINER\",\"image\":\"$build_image\",\"computeType\":\"$compute_type\",\"privilegedMode\":true}" + + # Build environment variables + local env_vars="[{\"name\":\"DEPLOYMENT_TYPE\",\"value\":\"$DEPLOYMENT_TYPE\"}" + if [[ -n "$env_name" ]]; then + env_vars+=",{\"name\":\"TARGET_ENVIRONMENT\",\"value\":\"$env_name\"}" + fi + if [[ 
"$DEPLOYMENT_TYPE" == "pdf2html" ]]; then + env_vars+=",{\"name\":\"ACCOUNT_ID\",\"value\":\"$ACCOUNT_ID\"}" + env_vars+=",{\"name\":\"REGION\",\"value\":\"$REGION\"}" + env_vars+=",{\"name\":\"BUCKET_NAME\",\"value\":\"${BUCKET_NAME:-}\"}" + env_vars+=",{\"name\":\"BDA_PROJECT_ARN\",\"value\":\"${BDA_PROJECT_ARN:-}\"}" + fi + env_vars+="]" + + env_json="$(echo "$env_json" | jq --argjson ev "$env_vars" '.environmentVariables = $ev')" + + # Create project + local create_output + create_output="$(aws codebuild create-project \ + --name "$project_name" \ + --source "$source_json" \ + --source-version "$branch" \ + --artifacts '{"type":"NO_ARTIFACTS"}' \ + --environment "$env_json" \ + --service-role "$ROLE_ARN" \ + --output json 2>&1)" || { + # Check if it's a genuine "already exists" case + if echo "$create_output" | grep -qi "already exists"; then + print_warning "CodeBuild project '$project_name' already exists, reusing" + else + print_error "Failed to create CodeBuild project: $project_name" + print_error "$create_output" + exit 1 + fi + } + + # Verify the project actually exists before proceeding + if ! 
aws codebuild batch-get-projects --names "$project_name" \ + --query 'projects[0].name' --output text 2>/dev/null | grep -q "$project_name"; then + print_error "CodeBuild project '$project_name' not found after creation attempt" + print_error "Check IAM permissions and source configuration" + exit 1 + fi + + # Configure webhooks if branch-env-map is in use + if [[ -n "$env_name" ]]; then + local webhook_json + webhook_json="$(configure_webhooks "$branch" "$env_name" "$is_production")" + local filter_groups + filter_groups="$(echo "$webhook_json" | jq -c '.filterGroups')" + + aws codebuild create-webhook \ + --project-name "$project_name" \ + --filter-groups "$filter_groups" \ + --output json > /dev/null 2>&1 || { + print_warning "Webhook may already exist for $project_name" + } + print_success "Webhook configured for branch '$branch' → environment '$env_name'" + fi + + print_success "CodeBuild project ready: $project_name" +} + +# --------------------------------------------------------------------------- +# Build Monitoring (Task 12.1, 12.2) +# --------------------------------------------------------------------------- +show_build_logs() { + local project_name="$1" + local log_group="/aws/codebuild/$project_name" + + sleep 5 + local latest_stream + latest_stream="$(aws logs describe-log-streams \ + --log-group-name "$log_group" \ + --order-by LastEventTime --descending --max-items 1 \ + --query 'logStreams[0].logStreamName' --output text 2>/dev/null || echo "")" + + if [[ -n "$latest_stream" && "$latest_stream" != "None" ]]; then + print_error "Recent build logs:" + aws logs get-log-events \ + --log-group-name "$log_group" \ + --log-stream-name "$latest_stream" \ + --query 'events[-30:].message' --output text 2>/dev/null || \ + print_error "Could not retrieve logs" + else + print_error "Could not retrieve build logs. Check CodeBuild console." 
# Start a CodeBuild build and block until it reaches a terminal state.
#
# Arguments:
#   $1 - CodeBuild project name
#   $2 - source version (branch, tag, or commit) to build
# Returns:
#   0 when the build status is SUCCEEDED
#   1 when the build ends in FAILED/FAULT/STOPPED/TIMED_OUT (recent logs are
#     printed via show_build_logs first), or when the build status cannot be
#     determined for several consecutive polls
# Exits the script with status 1 if the build cannot be started at all.
start_and_monitor_build() {
    local project_name="$1"
    local source_version="$2"
    # Consecutive unusable status polls tolerated before giving up. Without a
    # cap, expired credentials or an unknown build ID would spin here forever.
    local max_poll_failures=5

    print_status "Starting build for project '$project_name'..."

    local build_response
    build_response="$(aws codebuild start-build \
        --project-name "$project_name" \
        --source-version "$source_version" \
        --output json)" || {
        print_error "Failed to start build"
        exit 1
    }

    # `// empty` maps a missing/null id to an empty string instead of the
    # literal text "null", so the guard below can detect it.
    local build_id
    build_id="$(echo "$build_response" | jq -r '.build.id // empty')" || build_id=""
    if [[ -z "$build_id" ]]; then
        print_error "Failed to start build: response did not contain a build ID"
        exit 1
    fi
    print_success "Build started: $build_id"

    print_status "Monitoring build progress..."
    local dots=0 last_status="" poll_failures=0 build_status
    while true; do
        build_status="$(aws codebuild batch-get-builds --ids "$build_id" \
            --query 'builds[0].buildStatus' --output text)" || build_status=""

        # An empty result (CLI error) or "None" (a null query result rendered
        # as text) carries no status; retry a bounded number of times.
        if [[ -z "$build_status" || "$build_status" == "None" ]]; then
            poll_failures=$((poll_failures + 1))
            if [[ $poll_failures -ge $max_poll_failures ]]; then
                echo ""
                print_error "Could not determine status of build '$build_id' after $max_poll_failures attempts"
                return 1
            fi
            sleep 3
            continue
        fi
        poll_failures=0

        # Announce each status transition once.
        if [[ "$build_status" != "$last_status" ]]; then
            echo ""
            print_status "Build status: $build_status"
            last_status="$build_status"
            dots=0
        fi

        case "$build_status" in
            SUCCEEDED)
                echo ""
                print_success "Build completed successfully!"
                return 0
                ;;
            FAILED|FAULT|STOPPED|TIMED_OUT)
                echo ""
                print_error "Build failed with status: $build_status"
                show_build_logs "$project_name"
                return 1
                ;;
            IN_PROGRESS)
                printf "."
                dots=$((dots + 1))
                # Break the dot line after 60 polls so the output stays readable.
                if [[ $dots -eq 60 ]]; then
                    echo ""
                    print_status "Still building..."
                    dots=0
                fi
                sleep 5
                ;;
            *)
                # Any other (non-terminal) status: keep polling.
                printf "."
                sleep 3
                ;;
        esac
    done
}
# Delete the CodeBuild projects created by this script, together with the IAM
# role and policies whose names are derived from each project name.
#
# Reads globals:
#   CLI_ENVIRONMENT  - when non-empty, only projects carrying that
#                      environment's prefix are matched
#   NON_INTERACTIVE  - "true" skips the confirmation prompt
#   ACCOUNT_ID       - used to build the policy ARNs
# Returns:
#   0 when nothing matched, the user declined, or every deletion succeeded
#   1 when at least one project or role could not be deleted
cleanup_resources() {
    print_header "Cleaning up pipeline resources..."

    local pattern="pdfremediation-*"
    if [[ -n "$CLI_ENVIRONMENT" ]]; then
        local env_prefix
        env_prefix="$(generate_env_prefix "$CLI_ENVIRONMENT")"
        pattern="${env_prefix}-pdfremediation-*"
        print_status "Filtering by environment: $CLI_ENVIRONMENT (prefix: $env_prefix)"
    fi

    # List matching CodeBuild projects (text output is tab-separated; put one
    # project name per line before filtering).
    local all_projects
    all_projects="$(aws codebuild list-projects --query 'projects' --output text 2>/dev/null | tr '\t' '\n')"
    local matching
    matching="$(filter_projects_by_pattern "$all_projects" "$pattern")"

    if [[ -z "$matching" ]]; then
        print_status "No matching resources found."
        return 0
    fi

    echo ""
    print_status "Resources to delete:"
    # Here-string instead of a pipe: no subshell, and the loop's exit status
    # no longer depends on whether the last line happened to be empty.
    local p
    while IFS= read -r p; do
        if [[ -n "$p" ]]; then
            print_status " - CodeBuild project: $p"
        fi
    done <<< "$matching"

    # Confirm unless non-interactive
    if [[ "$NON_INTERACTIVE" != "true" ]]; then
        local confirm=""
        # A failed read (EOF / closed stdin) counts as "no" rather than
        # aborting the script or proceeding with a stale value.
        read -rp "Proceed with deletion? (y/N): " confirm || confirm=""
        if [[ "$confirm" != "y" && "$confirm" != "Y" ]]; then
            print_status "Cleanup cancelled."
            return 0
        fi
    fi

    local failed=()
    # Declared local so the loop variables do not leak into the global scope.
    local project role_name policy_name policy_arn f
    while IFS= read -r project; do
        [[ -z "$project" ]] && continue
        print_status "Deleting: $project"

        # Delete CodeBuild project
        aws codebuild delete-project --name "$project" 2>/dev/null || {
            print_warning "Failed to delete project: $project"
            failed+=("$project")
        }

        # Derive and delete IAM resources. Only one of the two policy names is
        # expected to exist for a given project, so failures here are ignored.
        role_name="${project}-codebuild-service-role"
        for policy_name in "${project}-pdf2pdf-codebuild-policy" "${project}-pdf2html-codebuild-policy"; do
            policy_arn="arn:aws:iam::${ACCOUNT_ID}:policy/$policy_name"
            aws iam detach-role-policy --role-name "$role_name" --policy-arn "$policy_arn" 2>/dev/null || true
            aws iam delete-policy --policy-arn "$policy_arn" 2>/dev/null || true
        done

        aws iam delete-role --role-name "$role_name" 2>/dev/null || {
            print_warning "Failed to delete role: $role_name"
            failed+=("$role_name")
        }
    done <<< "$matching"

    if [[ ${#failed[@]} -gt 0 ]]; then
        echo ""
        print_warning "Failed to delete ${#failed[@]} resource(s):"
        for f in "${failed[@]}"; do print_warning " - $f"; done
        return 1
    fi

    print_success "Cleanup complete!"
}
+ fi + + # Welcome + echo "" + print_header "🔒 PDF Accessibility — Private Pipeline Setup" + print_header "================================================" + echo "" + + # Apply AWS profile if specified (CLI flag > env var > config file) + if [[ -n "$CLI_PROFILE" ]]; then + export AWS_PROFILE="$CLI_PROFILE" + print_status "Using AWS profile: $AWS_PROFILE" + elif [[ -n "${AWS_PROFILE:-}" ]]; then + print_status "Using AWS profile from environment: $AWS_PROFILE" + fi + + # Get AWS identity + print_status "Verifying AWS credentials..." + ACCOUNT_ID="$(aws sts get-caller-identity --query 'Account' --output text 2>/dev/null)" || { + print_error "AWS CLI not configured. Run 'aws configure' first." + exit 1 + } + REGION="$(aws configure get region 2>/dev/null || echo "us-east-1")" + print_success "Account: $ACCOUNT_ID, Region: $REGION" + + # Load config file if provided + if [[ -n "$CONFIG_FILE" ]]; then + print_status "Loading config from: $CONFIG_FILE" + eval "$(parse_config_file "$CONFIG_FILE")" || exit 1 + # Apply profile from config if not already set by CLI flag + if [[ -z "$CLI_PROFILE" && -n "${AWS_PROFILE:-}" ]]; then + print_status "Using AWS profile from config: $AWS_PROFILE" + fi + fi + + # Also check BRANCH_ENV_MAP env var + if [[ -z "$CLI_BRANCH_ENV_MAP" && -n "${BRANCH_ENV_MAP:-}" ]]; then + CLI_BRANCH_ENV_MAP="$BRANCH_ENV_MAP" + fi + + # Collect parameters (interactive or from env) + collect_parameters + + # Validate + local missing_output + missing_output="$(check_required_params "$NON_INTERACTIVE")" || { + print_error "$missing_output" + exit 1 + } + + # Resolve CLI defaults + local cli_defaults + cli_defaults="$(resolve_cli_defaults "$CLI_BUILDSPEC" "$CLI_PROJECT_NAME")" + BUILDSPEC_FILE="$(echo "$cli_defaults" | head -1)" + PROJECT_NAME="$(echo "$cli_defaults" | tail -1)" + + # Validate inputs (URL, provider, connection) + validate_inputs + + # Setup prerequisites + print_header "Setting up prerequisites..." 
# Deploy one environment: create the IAM role and policy, create the CodeBuild
# project for TARGET_BRANCH, run a build, and record the resulting bucket.
#
# Reads globals:  PROJECT_NAME, DEPLOYMENT_TYPE, TARGET_BRANCH, BUCKET_NAME
# Writes globals: ROLE_NAME, DEPLOYED_SOLUTIONS, PDF2PDF_BUCKET or
#                 PDF2HTML_BUCKET
# Returns:
#   0 when the build succeeded
#   1 when the build failed, so the caller does not report a successful setup
deploy_single_environment() {
    local role_name="${PROJECT_NAME}-codebuild-service-role"
    ROLE_NAME="$role_name"

    create_iam_role "$role_name"

    local policy_name="${PROJECT_NAME}-${DEPLOYMENT_TYPE}-codebuild-policy"
    create_iam_policy "$policy_name" "$DEPLOYMENT_TYPE"

    create_codebuild_project "$PROJECT_NAME" "" "$TARGET_BRANCH"

    if ! start_and_monitor_build "$PROJECT_NAME" "$TARGET_BRANCH"; then
        # start_and_monitor_build has already printed the failure details.
        print_error "Deployment build failed for project '$PROJECT_NAME'"
        return 1
    fi

    DEPLOYED_SOLUTIONS+=("$DEPLOYMENT_TYPE")

    # Collect bucket info
    if [[ "$DEPLOYMENT_TYPE" == "pdf2pdf" ]]; then
        # Most recently created bucket whose name contains "pdfaccessibility".
        PDF2PDF_BUCKET="$(aws s3api list-buckets \
            --query 'Buckets[?contains(Name, `pdfaccessibility`)] | sort_by(@, &CreationDate) | [-1].Name' \
            --output text 2>/dev/null || echo "")"
        # With --output text a null query result is printed as the literal
        # "None"; store an empty value so it is not mistaken for a bucket name.
        if [[ "$PDF2PDF_BUCKET" == "None" ]]; then
            PDF2PDF_BUCKET=""
        fi
    else
        PDF2HTML_BUCKET="${BUCKET_NAME:-}"
    fi
}
+ + # Parse and validate branch-env-map + local map_lines + map_lines="$(parse_branch_env_map "$CLI_BRANCH_ENV_MAP")" || exit 1 + validate_branch_env_map "$map_lines" || exit 1 + + # Determine production branch (maps to "prod" environment) + local prod_branch="" + while IFS='=' read -r branch env_name; do + [[ -z "$branch" ]] && continue + if [[ "$env_name" == "prod" ]]; then + prod_branch="$branch" + fi + done <<< "$map_lines" + + # Create resources for each environment + while IFS='=' read -r branch env_name; do + [[ -z "$branch" ]] && continue + + local env_prefix + env_prefix="$(generate_env_prefix "$env_name")" + local is_production="false" + [[ "$env_name" == "prod" ]] && is_production="true" + + local env_project_name + env_project_name="$(generate_env_resource_name "$env_prefix" "$PROJECT_NAME")" + local env_role_name="${env_project_name}-codebuild-service-role" + ROLE_NAME="$env_role_name" + + print_header "Setting up environment: $env_name (branch: $branch)" + + create_iam_role "$env_role_name" + + local env_policy_name="${env_project_name}-${DEPLOYMENT_TYPE}-codebuild-policy" + create_iam_policy "$env_policy_name" "$DEPLOYMENT_TYPE" "$env_prefix" + + create_codebuild_project "$env_project_name" "$env_prefix" "$branch" "$env_name" "$is_production" + + print_success "Environment '$env_name' configured for branch '$branch'" + echo "" + done <<< "$map_lines" + + DEPLOYED_SOLUTIONS+=("$DEPLOYMENT_TYPE") + print_success "All environments configured!" + print_status "Builds will trigger automatically via webhooks on branch pushes." + if [[ -n "$prod_branch" ]]; then + print_status "Production deploys on PR merge to '$prod_branch'." 
+ fi +} + +# --------------------------------------------------------------------------- +# Entry Point +# --------------------------------------------------------------------------- +main "$@" diff --git a/docs/OBSERVABILITY.md b/docs/OBSERVABILITY.md new file mode 100644 index 00000000..8ff3b500 --- /dev/null +++ b/docs/OBSERVABILITY.md @@ -0,0 +1,137 @@ +# Observability & Usage Metrics + +This document describes the observability features added to the PDF Accessibility platform, including custom CloudWatch metrics, per-user usage tracking, cost estimation, and a dedicated monitoring dashboard. + +## Overview + +All metrics are published to the `PDFAccessibility` CloudWatch namespace. A dedicated dashboard (`PDF-Accessibility-Usage-Metrics`) provides real-time visibility into usage, costs, and performance across both the PDF-to-PDF and PDF-to-HTML solutions. + +### Key Principles + +- **Metrics over Log Insights** — CloudWatch metrics provide 15-month retention, real-time dashboards, and alarm support without parsing log formats. +- **Consistent dimensions** — All metrics use `Service` + `UserId` dimensions. `FileName` is intentionally excluded to avoid unbounded cardinality. +- **Graceful degradation** — Metrics emission failures are caught and logged without interrupting PDF processing. 
+ +## Metrics Reference + +| Metric | Unit | Description | +|--------|------|-------------| +| `PagesProcessed` | Count | Number of PDF pages processed | +| `AdobeAPICalls` | Count | Adobe API invocations | +| `AdobeDocTransactions` | Count | Adobe Document Transactions (10/page for AutoTag, 1/5 pages for ExtractPDF) | +| `BedrockInvocations` | Count | Bedrock model calls | +| `BedrockInputTokens` | Count | Tokens sent to Bedrock | +| `BedrockOutputTokens` | Count | Tokens received from Bedrock | +| `ProcessingDuration` | Milliseconds | Processing time per stage | +| `ErrorCount` | Count | Errors by type and stage | +| `FileSize` | Bytes | Input file sizes | +| `EstimatedCost` | None | Estimated USD cost per job | + +### Dimensions + +| Dimension | Values | Used By | +|-----------|--------|---------| +| `Service` | `pdf2pdf`, `pdf2html` | All metrics | +| `UserId` | Cognito sub or `anonymous` | All metrics | +| `Stage` | `split`, `autotag`, `alt-text`, `merge`, `title`, `a11y-check` | ProcessingDuration, ErrorCount | +| `Operation` | `AutoTag`, `ExtractPDF` | AdobeAPICalls, AdobeDocTransactions | +| `Model` | Bedrock model ID | BedrockInvocations, token metrics | +| `ErrorType` | Exception class name | ErrorCount | + +## Per-User Tracking + +User attribution works through S3 object tagging: + +1. **Cognito uploads** — The UI sets S3 metadata (`user-sub`, `user-groups`, `upload-timestamp`) on uploaded files. +2. **S3 Object Tagger Lambda** (`PDFAccessibility-S3ObjectTagger`) — Triggers on S3 `ObjectCreated` events in the `pdf/` prefix, reads the `user-sub` metadata, and applies a `UserId` S3 tag. +3. **Processing Lambdas/ECS** — Read the `UserId` tag from the S3 object and pass it as a dimension to all emitted metrics. +4. **Direct uploads** (no Cognito) — Default to `UserId: anonymous`. 
+ +## Cost Estimation + +Approximate pricing (2024): + +| Service | Rate | +|---------|------| +| Adobe API | $0.05 per operation | +| Bedrock Claude Haiku | $0.00025/1K input, $0.00125/1K output | +| Bedrock Claude Sonnet | $0.003/1K input, $0.015/1K output | +| Lambda | $0.0000166667/GB-sec | +| ECS Fargate | $0.04048/vCPU-hr + $0.004445/GB-hr | +| Bedrock Data Automation | $0.01 per page | + +The `estimate_cost()` function in `metrics_helper.py` calculates and emits an `EstimatedCost` metric per job. + +## Integration + +### Metrics Helper Library + +The shared library `lambda/shared/python/metrics_helper.py` provides: + +- `emit_metric()` — Low-level CloudWatch PutMetricData wrapper +- `track_pages_processed()` — Page count tracking +- `track_adobe_api_call()` — Adobe API call and Document Transaction tracking +- `track_bedrock_invocation()` — Bedrock model invocation and token tracking +- `track_processing_duration()` — Stage timing +- `track_error()` — Error tracking by type and stage +- `track_file_size()` — Input file size tracking +- `estimate_cost()` — Cost estimation and metric emission +- `MetricsContext` — Context manager for automatic duration and error tracking + +### Integrated Components + +| Component | File | Metrics Tracked | +|-----------|------|-----------------| +| PDF Splitter Lambda | `lambda/pdf-splitter-lambda/main.py` | PagesProcessed, FileSize, ProcessingDuration, ErrorCount | +| Adobe AutoTag (ECS) | `adobe-autotag-container/adobe_autotag_processor.py` | AdobeAPICalls, AdobeDocTransactions, BedrockInvocations | +| Alt Text Generator (ECS) | `alt-text-generator-container/alt_text_generator.js` | BedrockInvocations, BedrockInputTokens, BedrockOutputTokens | +| PDF-to-HTML Lambda | `pdf2html/lambda_function.py` | PagesProcessed, ProcessingDuration, EstimatedCost | +| S3 Object Tagger | `lambda/s3_object_tagger/main.py` | User attribution via S3 tags | + +### Lambda Layer Deployment + +The metrics helper is deployed as a Lambda layer. 
The `deploy-local.sh` script handles copying `lambda/shared/python/metrics_helper.py` to the build contexts that need it. + +## Dashboard + +The `PDFAccessibilityUsageMetrics` CDK stack (`cdk/usage_metrics_stack.py`) creates a CloudWatch dashboard with: + +- Total pages processed and documents by service +- Adobe API calls and Document Transactions +- Bedrock invocations and token usage +- Processing duration percentiles +- Error rates +- Estimated costs +- Per-user usage breakdown + +### Deployment + +The dashboard deploys automatically with the main stack: + +```bash +cdk deploy --all +``` + +Or deploy just the dashboard: + +```bash +cdk deploy PDFAccessibilityUsageMetrics +``` + +### Verification + +```bash +# Check metrics are flowing +aws cloudwatch list-metrics --namespace PDFAccessibility + +# Check dashboard exists +aws cloudwatch list-dashboards +``` + +## Recommended Alarms + +| Alarm | Metric | Threshold | +|-------|--------|-----------| +| High error rate | ErrorCount | > 5 per 5 minutes | +| Processing stalled | PagesProcessed | < 1 per hour (when expected) | +| Cost spike | EstimatedCost | > daily budget threshold | diff --git a/docs/PRIVATE_PIPELINE_GUIDE.md b/docs/PRIVATE_PIPELINE_GUIDE.md new file mode 100644 index 00000000..b384f77d --- /dev/null +++ b/docs/PRIVATE_PIPELINE_GUIDE.md @@ -0,0 +1,283 @@ +# Private CI/CD Pipeline — Getting Started Guide + +Deploy the PDF Accessibility solution from your own private repository with full CI/CD integration, multi-environment support, and automated branch-based deployments. 
+ +## Prerequisites + +| Tool | Minimum Version | Check Command | +|---|---|---| +| AWS CLI | v2.x | `aws --version` | +| AWS CDK | v2.x | `cdk --version` | +| Docker | 20.x+ | `docker --version` | +| jq | 1.6+ | `jq --version` | +| Git | 2.x | `git --version` | + +**IAM Permissions** needed to run the setup script: +- `sts:GetCallerIdentity` +- `iam:CreateRole`, `iam:CreatePolicy`, `iam:AttachRolePolicy`, `iam:GetRole` +- `codebuild:CreateProject`, `codebuild:StartBuild`, `codebuild:BatchGetBuilds`, `codebuild:CreateWebhook` +- `codeconnections:GetConnection` (for GitHub/Bitbucket/GitLab) +- `secretsmanager:CreateSecret`, `secretsmanager:UpdateSecret` (for pdf2pdf) +- `bedrock:CreateDataAutomationProject` (for pdf2html) +- `s3:CreateBucket`, `s3:PutBucketVersioning` (for pdf2html) +- `logs:DescribeLogStreams`, `logs:GetLogEvents` + +## Step 1: Clone the Public Repository + +### GitHub (Private) + +```bash +git clone https://github.com/ASUCICREPO/PDF_Accessibility.git my-pdf-accessibility +cd my-pdf-accessibility +git remote rename origin upstream +# Create your private repo on GitHub, then: +git remote add origin https://github.com/YOUR_ORG/your-private-repo.git +git push -u origin main +``` + +### AWS CodeCommit + +```bash +git clone https://github.com/ASUCICREPO/PDF_Accessibility.git my-pdf-accessibility +cd my-pdf-accessibility +git remote rename origin upstream +# Create a CodeCommit repo in your AWS account, then: +git remote add origin https://git-codecommit.us-east-1.amazonaws.com/v1/repos/your-repo +git push -u origin main +``` + +### Bitbucket + +```bash +git clone https://github.com/ASUCICREPO/PDF_Accessibility.git my-pdf-accessibility +cd my-pdf-accessibility +git remote rename origin upstream +git remote add origin https://bitbucket.org/YOUR_ORG/your-private-repo.git +git push -u origin main +``` + +### GitLab + +```bash +git clone https://github.com/ASUCICREPO/PDF_Accessibility.git my-pdf-accessibility +cd my-pdf-accessibility +git remote rename
origin upstream +git remote add origin https://gitlab.com/YOUR_ORG/your-private-repo.git +git push -u origin main +``` + +### Pulling Future Updates from Upstream + +```bash +git fetch upstream +git merge upstream/main --no-edit +git push origin main +``` + +## Step 2: Set Up AWS CodeConnections (GitHub/Bitbucket/GitLab only) + +CodeCommit uses IAM authentication natively — skip this step if using CodeCommit. + +### GitHub + +1. Open the [AWS CodeConnections console](https://console.aws.amazon.com/codesuite/settings/connections) +2. Click **Create connection** +3. Select **GitHub** as the provider +4. Name the connection (e.g., `my-github-connection`) +5. Click **Connect to GitHub** and authorize AWS in the OAuth flow +6. Complete the handshake — status should show **Available** +7. Copy the **Connection ARN** + +### Bitbucket + +1. Open the CodeConnections console +2. Click **Create connection** → select **Bitbucket** +3. Name the connection and authorize via OAuth +4. Verify status is **Available** +5. Copy the Connection ARN + +### GitLab + +1. Open the CodeConnections console +2. Click **Create connection** → select **GitLab** +3. Name the connection and authorize via OAuth +4. Verify status is **Available** +5. Copy the Connection ARN + +## Step 3: Deploy — Interactive Mode + +```bash +cd my-pdf-accessibility +./deploy-private.sh +``` + +The script will prompt you for: +1. **Repository URL** — your private repo URL +2. **Source provider** — github, codecommit, bitbucket, or gitlab +3. **Deployment type** — pdf2pdf or pdf2html +4. **Branch** — defaults to `main` +5. **Connection ARN** — (if not CodeCommit) +6. 
**Adobe credentials** — (if pdf2pdf) + +## Step 4: Deploy — Non-Interactive Mode + +Set environment variables and use the `--non-interactive` flag: + +```bash +export PRIVATE_REPO_URL="https://github.com/myorg/my-fork.git" +export SOURCE_PROVIDER="github" +export DEPLOYMENT_TYPE="pdf2pdf" +export TARGET_BRANCH="main" +export CONNECTION_ARN="arn:aws:codeconnections:us-east-1:123456789012:connection/abc-123" +export ADOBE_CLIENT_ID="your-client-id" +export ADOBE_CLIENT_SECRET="your-client-secret" + +./deploy-private.sh --non-interactive +``` + +Or use a config file: + +```bash +# Create pipeline.conf +cat > pipeline.conf << 'EOF' +PRIVATE_REPO_URL=https://github.com/myorg/my-fork.git +SOURCE_PROVIDER=github +DEPLOYMENT_TYPE=pdf2pdf +TARGET_BRANCH=main +CONNECTION_ARN=arn:aws:codeconnections:us-east-1:123456789012:connection/abc-123 +ADOBE_CLIENT_ID=your-client-id +ADOBE_CLIENT_SECRET=your-client-secret +EOF + +./deploy-private.sh --config pipeline.conf --non-interactive +``` + +## Step 5: Multi-Environment Deployment + +Deploy different branches to different environments with automatic webhook triggers: + +```bash +./deploy-private.sh \ + --branch-env-map '{"main":"prod","dev":"dev","staging":"staging","feature/*":"dev"}' \ + --non-interactive +``` + +This creates: +- **prod** environment — triggered by pushes to `main` and PR merges to `main` +- **dev** environment — triggered by pushes to `dev` and `feature/*` branches +- **staging** environment — triggered by pushes to `staging` + +Each environment gets isolated resources (separate CloudFormation stacks, S3 buckets, IAM roles) prefixed with the environment name.
+ +Default mapping (when `--branch-env-map` is not provided and multi-env is not used): +```json +{"main": "prod", "dev": "dev", "test": "test", "staging": "staging"} +``` + +## Customization + +### Custom Buildspec + +```bash +./deploy-private.sh --buildspec my-custom-buildspec.yml +``` + +### Custom Project Name + +```bash +./deploy-private.sh --project-name my-project-name +``` + +### Using a Named AWS CLI Profile + +```bash +./deploy-private.sh --profile my-aws-profile +``` + +Or via environment variable: +```bash +export AWS_PROFILE=my-aws-profile +./deploy-private.sh +``` + +Or in a config file: +``` +AWS_PROFILE=my-aws-profile +PRIVATE_REPO_URL=https://github.com/myorg/my-fork.git +... +``` + +### Modifying Infrastructure + +Edit CDK stack files in your private repo. On the next build (push or manual trigger), CodeBuild will deploy the updated stacks automatically. + +### Modifying Container Code + +Edit Docker container code in your private repo. CodeBuild rebuilds and pushes updated images to ECR on each build. + +## Cleanup + +### Delete All Pipeline Resources + +```bash +./deploy-private.sh --cleanup +``` + +### Delete a Specific Environment + +```bash +./deploy-private.sh --cleanup --environment dev +``` + +### Non-Interactive Cleanup + +```bash +./deploy-private.sh --cleanup --non-interactive +``` + +## Troubleshooting + +### Connection Not in AVAILABLE Status + +**Symptom:** `Connection is not AVAILABLE (current status: PENDING)` + +**Fix:** Complete the OAuth handshake in the AWS Console: +1. Go to CodeConnections console +2. Find your connection +3. Click **Update pending connection** +4. Complete the authorization flow + +### Insufficient IAM Permissions + +**Symptom:** `AccessDenied` errors during setup + +**Fix:** Ensure your AWS CLI user/role has the permissions listed in the Prerequisites section. The script creates IAM roles and policies, which requires `iam:CreateRole` and `iam:CreatePolicy`. 
+ +### CDK Bootstrap Failures + +**Symptom:** `CDKToolkit stack not found` or bootstrap errors + +**Fix:** The buildspec handles CDK bootstrap automatically. If it fails: +```bash +cdk bootstrap aws://ACCOUNT_ID/REGION +``` + +### Docker Build Failures + +**Symptom:** Build fails during Docker image creation + +**Fix:** +1. Ensure Docker is running locally if testing +2. Check that Dockerfiles exist in the expected paths +3. Verify ECR repository permissions +4. Check CodeBuild compute type — pdf2html requires `BUILD_GENERAL1_LARGE` for Docker builds + +### Build Fails with No Logs + +**Symptom:** Build status is FAILED but no logs are shown + +**Fix:** Check the CodeBuild console directly: +1. Go to AWS CodeBuild console +2. Find your project (name starts with `pdfremediation-`) +3. Click the failed build +4. Review the build logs in the **Build logs** tab diff --git a/docs/TROUBLESHOOTING_CDK_DEPLOY.md b/docs/TROUBLESHOOTING_CDK_DEPLOY.md index 9e8a50ab..308e9eb8 100644 --- a/docs/TROUBLESHOOTING_CDK_DEPLOY.md +++ b/docs/TROUBLESHOOTING_CDK_DEPLOY.md @@ -45,8 +45,8 @@ cdk deploy ### 3. Ensure Image is Properly Built and Pushed If the image size in ECR is `0.0`, try the following: -- Open `docker_autotag.py`, add an empty space or a newline, then save the file. -- Do the same for your `alt-text.js`. +- Open `adobe_autotag_processor.py`, add an empty space or a newline, then save the file. +- Do the same for your `alt_text_generator.js`. - Re-run `cdk deploy` to force rebuilding and pushing the images. ### 4. Verify ECR Image Sizes @@ -57,7 +57,7 @@ If the image size in ECR is `0.0`, try the following: If issues persist, retry the above steps in different orders: 1. Update CDK 2. Delete `cdk.out` & ECR assets -3. Modify `docker_autotag.py` & `alt-text.js` +3. Modify `adobe_autotag_processor.py` & `alt_text_generator.js` 4. 
Ensure images are properly built and pushed ## Expected Outcome diff --git a/lambda/pdf-splitter-lambda/main.py b/lambda/pdf-splitter-lambda/main.py index b9b0fcab..6aa1c7a3 100644 --- a/lambda/pdf-splitter-lambda/main.py +++ b/lambda/pdf-splitter-lambda/main.py @@ -14,6 +14,7 @@ import urllib.parse import io import os +import sys # Initialize AWS clients cloudwatch = boto3.client('cloudwatch') @@ -22,6 +23,18 @@ state_machine_arn = os.environ['STATE_MACHINE_ARN'] +# Import metrics helper +try: + from metrics_helper import track_pages_processed, track_file_size, MetricsContext +except ImportError: + print("Warning: metrics_helper not available, metrics will not be tracked") + track_pages_processed = lambda *args, **kwargs: None + track_file_size = lambda *args, **kwargs: None + class MetricsContext: + def __init__(self, *args, **kwargs): pass + def __enter__(self): return self + def __exit__(self, *args): return False + def log_chunk_created(filename): """ Logs the creation of a PDF chunk. @@ -120,8 +133,10 @@ def lambda_handler(event, context): Returns: dict: HTTP response indicating the success or failure of the Lambda function execution. """ + file_basename = None + pdf_file_key = None + try: - print("Received event: " + json.dumps(event, indent=2)) # Access the S3 event structure @@ -133,29 +148,85 @@ def lambda_handler(event, context): raise ValueError("Event does not contain 'Records'. 
Check the S3 event structure.") file_basename = pdf_file_key.split('/')[-1].rsplit('.', 1)[0] + # Apply user tags from metadata (for UI uploads) + try: + head_response = s3_client.head_object(Bucket=bucket_name, Key=pdf_file_key) + metadata = head_response.get('Metadata', {}) + user_sub = metadata.get('user-sub', 'anonymous') + + # Get existing tags + try: + existing_tags = s3_client.get_object_tagging(Bucket=bucket_name, Key=pdf_file_key) + tags = {tag['Key']: tag['Value'] for tag in existing_tags.get('TagSet', [])} + except Exception: + tags = {} + + # Add UserId tag if not already present + if 'UserId' not in tags: + tags['UserId'] = user_sub + if metadata.get('user-groups'): + tags['UserGroups'] = metadata.get('user-groups') + + s3_client.put_object_tagging( + Bucket=bucket_name, + Key=pdf_file_key, + Tagging={'TagSet': [{'Key': k, 'Value': v} for k, v in tags.items()]} + ) + print(f"Tagged object with UserId: {user_sub}") + except Exception as e: + print(f"Could not apply user tags: {e}") - s3 = boto3.client('s3') - stepfunctions = boto3.client('stepfunctions') + # Get user from S3 tags + user_id = None + try: + tags_response = s3_client.get_object_tagging(Bucket=bucket_name, Key=pdf_file_key) + for tag in tags_response.get('TagSet', []): + if tag['Key'] == 'UserId': + user_id = tag['Value'] + break + except Exception as e: + print(f"Could not get user tags: {e}") - # Get the PDF file from S3 - response = s3.get_object(Bucket=bucket_name, Key=pdf_file_key) - print(f'Filename - {pdf_file_key} | The response is: {response}') - pdf_file_content = response['Body'].read() - - # Split the PDF into pages and upload them to S3 - chunks = split_pdf_into_pages(pdf_file_content, pdf_file_key, s3, bucket_name, 200) - - log_chunk_created(file_basename) - - # Trigger Step Function with the list of chunks - response = stepfunctions.start_execution( - stateMachineArn=state_machine_arn, - input=json.dumps({"chunks": chunks, "s3_bucket": bucket_name}) - ) - print(f"Filename - 
{pdf_file_key} | Step Function started: {response['executionArn']}") + with MetricsContext("split", user_id, pdf_file_key, "pdf2pdf"): + s3 = boto3.client('s3') + stepfunctions = boto3.client('stepfunctions') + + # Get the PDF file from S3 + response = s3.get_object(Bucket=bucket_name, Key=pdf_file_key) + print(f'Filename - {pdf_file_key} | The response is: {response}') + pdf_file_content = response['Body'].read() + file_size = len(pdf_file_content) + + # Track file size + track_file_size(file_size, user_id, pdf_file_key, "pdf2pdf") + + # Split the PDF into pages and upload them to S3 + chunks = split_pdf_into_pages(pdf_file_content, pdf_file_key, s3, bucket_name, 200) + + # Track pages processed + from pypdf import PdfReader + reader = PdfReader(io.BytesIO(pdf_file_content)) + num_pages = len(reader.pages) + track_pages_processed(num_pages, user_id, pdf_file_key, "pdf2pdf") + + # Structured log for dashboard table queries + import json as _json + print(_json.dumps({"event": "file_processed", "userId": user_id or "anonymous", "fileName": pdf_file_key.split("/")[-1], "pageCount": num_pages, "service": "pdf2pdf"})) + + log_chunk_created(file_basename) + + # Trigger Step Function with the list of chunks + # Add user_id to each chunk so Map state can pass it to ECS + for chunk in chunks: + chunk["user_id"] = user_id or "" + + response = stepfunctions.start_execution( + stateMachineArn=state_machine_arn, + input=json.dumps({"chunks": chunks, "s3_bucket": bucket_name, "user_id": user_id, "file_name": pdf_file_key}) + ) + print(f"Filename - {pdf_file_key} | Step Function started: {response['executionArn']}") except KeyError as e: - print(f"File: {file_basename}, Status: Failed in split lambda function") print(f"Filename - {pdf_file_key} | KeyError: {str(e)}") return { diff --git a/lambda/s3_object_tagger/main.py b/lambda/s3_object_tagger/main.py new file mode 100644 index 00000000..6e0329b0 --- /dev/null +++ b/lambda/s3_object_tagger/main.py @@ -0,0 +1,67 @@ +""" +Lambda 
function to tag S3 objects with user information for metrics tracking. + +Triggered on S3 ObjectCreated events to: +1. Extract user info from object metadata (UI uploads) +2. Apply UserId tag for consistent metrics tracking +3. Support both Cognito-authenticated and direct uploads +""" +import json +import boto3 +import os +from urllib.parse import unquote_plus + +s3_client = boto3.client('s3') + +def lambda_handler(event, context): + """ + Tag S3 objects with UserId for metrics tracking. + + Extracts user-sub from object metadata (set by UI) and applies as UserId tag. + Falls back to 'anonymous' for direct uploads without metadata. + """ + + for record in event['Records']: + bucket = record['s3']['bucket']['name'] + key = unquote_plus(record['s3']['object']['key']) + + print(f"Processing: s3://{bucket}/{key}") + + try: + # Get object metadata + response = s3_client.head_object(Bucket=bucket, Key=key) + metadata = response.get('Metadata', {}) + + # Extract user identifier + user_id = metadata.get('user-sub', 'anonymous') + user_groups = metadata.get('user-groups', '') + + # Get existing tags + try: + existing_tags = s3_client.get_object_tagging(Bucket=bucket, Key=key) + tags = existing_tags.get('TagSet', []) + except Exception: + tags = [] + + # Add/update UserId tag + tag_dict = {tag['Key']: tag['Value'] for tag in tags} + tag_dict['UserId'] = user_id + + if user_groups: + tag_dict['UserGroups'] = user_groups + + # Apply tags + new_tags = [{'Key': k, 'Value': v} for k, v in tag_dict.items()] + s3_client.put_object_tagging( + Bucket=bucket, + Key=key, + Tagging={'TagSet': new_tags} + ) + + print(f"Tagged with UserId: {user_id}") + + except Exception as e: + print(f"Error tagging object: {e}") + # Don't fail - let processing continue + + return {'statusCode': 200} diff --git a/lambda/shared/metrics_helper.py b/lambda/shared/metrics_helper.py new file mode 100644 index 00000000..6b3d18fe --- /dev/null +++ b/lambda/shared/metrics_helper.py @@ -0,0 +1,257 @@ +""" 
+CloudWatch Metrics Helper for PDF Accessibility Platform + +This module provides utilities for emitting custom CloudWatch metrics +to track usage, costs, and performance across the PDF accessibility platform. +""" + +import boto3 +import time +from typing import Dict, List, Optional +from datetime import datetime + +cloudwatch = boto3.client('cloudwatch') + +NAMESPACE = "PDFAccessibility" + +def emit_metric( + metric_name: str, + value: float, + unit: str = "None", + dimensions: Optional[Dict[str, str]] = None, + timestamp: Optional[datetime] = None +): + """ + Emit a single metric to CloudWatch. + + Args: + metric_name: Name of the metric + value: Metric value + unit: CloudWatch unit (Count, Milliseconds, Bytes, etc.) + dimensions: Dict of dimension name/value pairs + timestamp: Metric timestamp (defaults to now) + """ + metric_data = { + 'MetricName': metric_name, + 'Value': value, + 'Unit': unit, + 'Timestamp': timestamp or datetime.utcnow() + } + + if dimensions: + metric_data['Dimensions'] = [ + {'Name': k, 'Value': v} for k, v in dimensions.items() + ] + + try: + cloudwatch.put_metric_data( + Namespace=NAMESPACE, + MetricData=[metric_data] + ) + except Exception as e: + print(f"Failed to emit metric {metric_name}: {e}") + +def track_pages_processed( + page_count: int, + user_id: Optional[str] = None, + file_name: Optional[str] = None, + service: str = "pdf2pdf" +): + """Track number of pages processed.""" + dimensions = {"Service": service} + if user_id: + dimensions["UserId"] = user_id + # Don't include FileName - aggregate at service/user level only + + emit_metric("PagesProcessed", page_count, "Count", dimensions) + +def track_adobe_api_call( + operation: str, + page_count: int = 0, + user_id: Optional[str] = None, + file_name: Optional[str] = None +): + """Track Adobe API calls and estimated Document Transactions. 
+ + Adobe licensing: + - AutoTag: 10 Document Transactions per page + - ExtractPDF: 1 Document Transaction per 5 pages + """ + dimensions = { + "Service": "pdf2pdf", + "Operation": operation + } + if user_id: + dimensions["UserId"] = user_id + + emit_metric("AdobeAPICalls", 1, "Count", dimensions) + + # Calculate Document Transactions per Adobe licensing + if page_count > 0: + if operation == "AutoTag": + doc_transactions = page_count * 10 + elif operation == "ExtractPDF": + doc_transactions = -(-page_count // 5) # ceiling division + else: + doc_transactions = 1 + emit_metric("AdobeDocTransactions", doc_transactions, "Count", dimensions) + +def track_bedrock_invocation( + model_id: str, + input_tokens: int, + output_tokens: int, + user_id: Optional[str] = None, + file_name: Optional[str] = None, + service: str = "pdf2pdf" +): + """Track Bedrock model invocations and token usage.""" + dimensions = { + "Service": service, + "Model": model_id + } + if user_id: + dimensions["UserId"] = user_id + + emit_metric("BedrockInvocations", 1, "Count", dimensions) + emit_metric("BedrockInputTokens", input_tokens, "Count", dimensions) + emit_metric("BedrockOutputTokens", output_tokens, "Count", dimensions) + +def track_processing_duration( + stage: str, + duration_ms: float, + user_id: Optional[str] = None, + file_name: Optional[str] = None, + service: str = "pdf2pdf" +): + """Track processing duration for a specific stage.""" + dimensions = { + "Service": service, + "Stage": stage + } + if user_id: + dimensions["UserId"] = user_id + # Don't include FileName + + emit_metric("ProcessingDuration", duration_ms, "Milliseconds", dimensions) + +def track_error( + error_type: str, + stage: str, + user_id: Optional[str] = None, + file_name: Optional[str] = None, + service: str = "pdf2pdf" +): + """Track errors by type and stage.""" + dimensions = { + "Service": service, + "Stage": stage, + "ErrorType": error_type + } + if user_id: + dimensions["UserId"] = user_id + # Don't include 
FileName + + emit_metric("ErrorCount", 1, "Count", dimensions) + +def track_file_size( + size_bytes: int, + user_id: Optional[str] = None, + file_name: Optional[str] = None, + service: str = "pdf2pdf" +): + """Track file size.""" + dimensions = {"Service": service} + if user_id: + dimensions["UserId"] = user_id + # Don't include FileName + + emit_metric("FileSize", size_bytes, "Bytes", dimensions) + +def estimate_cost( + pages: int = 0, + adobe_calls: int = 0, + bedrock_input_tokens: int = 0, + bedrock_output_tokens: int = 0, + lambda_duration_ms: int = 0, + lambda_memory_mb: int = 1024, + ecs_duration_ms: int = 0, + ecs_vcpu: float = 0.25, + ecs_memory_gb: float = 1.0, + user_id: Optional[str] = None, + file_name: Optional[str] = None, + service: str = "pdf2pdf" +) -> float: + """ + Estimate cost for a processing job and emit metric. + + Pricing (approximate, as of 2024): + - Adobe API: ~$0.05 per operation + - Bedrock Claude Haiku: $0.00025/1K input, $0.00125/1K output + - Bedrock Claude Sonnet: $0.003/1K input, $0.015/1K output + - Lambda: $0.0000166667/GB-sec + - ECS Fargate: $0.04048/vCPU-hr + $0.004445/GB-hr + - BDA: ~$0.01 per page + + Returns: + Estimated cost in USD + """ + cost = 0.0 + + # Adobe API cost + cost += adobe_calls * 0.05 + + # Bedrock cost (assuming Haiku for estimation) + cost += (bedrock_input_tokens / 1000) * 0.00025 + cost += (bedrock_output_tokens / 1000) * 0.00125 + + # Lambda cost + gb_seconds = (lambda_memory_mb / 1024) * (lambda_duration_ms / 1000) + cost += gb_seconds * 0.0000166667 + + # ECS cost + if ecs_duration_ms > 0: + hours = ecs_duration_ms / (1000 * 3600) + cost += (ecs_vcpu * hours * 0.04048) + (ecs_memory_gb * hours * 0.004445) + + # BDA cost (for pdf2html) + if service == "pdf2html": + cost += pages * 0.01 + + # Emit cost metric + dimensions = {"Service": service} + if user_id: + dimensions["UserId"] = user_id + + emit_metric("EstimatedCost", cost, "None", dimensions) + + return cost + +class MetricsContext: + """Context 
manager for tracking operation metrics.""" + + def __init__(self, stage: str, user_id: Optional[str] = None, + file_name: Optional[str] = None, service: str = "pdf2pdf"): + self.stage = stage + self.user_id = user_id + self.file_name = file_name + self.service = service + self.start_time = None + + def __enter__(self): + self.start_time = time.time() + return self + + def __exit__(self, exc_type, exc_val, exc_tb): + duration_ms = (time.time() - self.start_time) * 1000 + track_processing_duration( + self.stage, duration_ms, + self.user_id, self.file_name, self.service + ) + + if exc_type: + track_error( + exc_type.__name__, self.stage, + self.user_id, self.file_name, self.service + ) + + return False diff --git a/lambda/shared/python/metrics_helper.py b/lambda/shared/python/metrics_helper.py new file mode 100644 index 00000000..6b3d18fe --- /dev/null +++ b/lambda/shared/python/metrics_helper.py @@ -0,0 +1,257 @@ +""" +CloudWatch Metrics Helper for PDF Accessibility Platform + +This module provides utilities for emitting custom CloudWatch metrics +to track usage, costs, and performance across the PDF accessibility platform. +""" + +import boto3 +import time +from typing import Dict, List, Optional +from datetime import datetime + +cloudwatch = boto3.client('cloudwatch') + +NAMESPACE = "PDFAccessibility" + +def emit_metric( + metric_name: str, + value: float, + unit: str = "None", + dimensions: Optional[Dict[str, str]] = None, + timestamp: Optional[datetime] = None +): + """ + Emit a single metric to CloudWatch. + + Args: + metric_name: Name of the metric + value: Metric value + unit: CloudWatch unit (Count, Milliseconds, Bytes, etc.) 
+ dimensions: Dict of dimension name/value pairs + timestamp: Metric timestamp (defaults to now) + """ + metric_data = { + 'MetricName': metric_name, + 'Value': value, + 'Unit': unit, + 'Timestamp': timestamp or datetime.utcnow() + } + + if dimensions: + metric_data['Dimensions'] = [ + {'Name': k, 'Value': v} for k, v in dimensions.items() + ] + + try: + cloudwatch.put_metric_data( + Namespace=NAMESPACE, + MetricData=[metric_data] + ) + except Exception as e: + print(f"Failed to emit metric {metric_name}: {e}") + +def track_pages_processed( + page_count: int, + user_id: Optional[str] = None, + file_name: Optional[str] = None, + service: str = "pdf2pdf" +): + """Track number of pages processed.""" + dimensions = {"Service": service} + if user_id: + dimensions["UserId"] = user_id + # Don't include FileName - aggregate at service/user level only + + emit_metric("PagesProcessed", page_count, "Count", dimensions) + +def track_adobe_api_call( + operation: str, + page_count: int = 0, + user_id: Optional[str] = None, + file_name: Optional[str] = None +): + """Track Adobe API calls and estimated Document Transactions. 
+ + Adobe licensing: + - AutoTag: 10 Document Transactions per page + - ExtractPDF: 1 Document Transaction per 5 pages + """ + dimensions = { + "Service": "pdf2pdf", + "Operation": operation + } + if user_id: + dimensions["UserId"] = user_id + + emit_metric("AdobeAPICalls", 1, "Count", dimensions) + + # Calculate Document Transactions per Adobe licensing + if page_count > 0: + if operation == "AutoTag": + doc_transactions = page_count * 10 + elif operation == "ExtractPDF": + doc_transactions = -(-page_count // 5) # ceiling division + else: + doc_transactions = 1 + emit_metric("AdobeDocTransactions", doc_transactions, "Count", dimensions) + +def track_bedrock_invocation( + model_id: str, + input_tokens: int, + output_tokens: int, + user_id: Optional[str] = None, + file_name: Optional[str] = None, + service: str = "pdf2pdf" +): + """Track Bedrock model invocations and token usage.""" + dimensions = { + "Service": service, + "Model": model_id + } + if user_id: + dimensions["UserId"] = user_id + + emit_metric("BedrockInvocations", 1, "Count", dimensions) + emit_metric("BedrockInputTokens", input_tokens, "Count", dimensions) + emit_metric("BedrockOutputTokens", output_tokens, "Count", dimensions) + +def track_processing_duration( + stage: str, + duration_ms: float, + user_id: Optional[str] = None, + file_name: Optional[str] = None, + service: str = "pdf2pdf" +): + """Track processing duration for a specific stage.""" + dimensions = { + "Service": service, + "Stage": stage + } + if user_id: + dimensions["UserId"] = user_id + # Don't include FileName + + emit_metric("ProcessingDuration", duration_ms, "Milliseconds", dimensions) + +def track_error( + error_type: str, + stage: str, + user_id: Optional[str] = None, + file_name: Optional[str] = None, + service: str = "pdf2pdf" +): + """Track errors by type and stage.""" + dimensions = { + "Service": service, + "Stage": stage, + "ErrorType": error_type + } + if user_id: + dimensions["UserId"] = user_id + # Don't include 
FileName + + emit_metric("ErrorCount", 1, "Count", dimensions) + +def track_file_size( + size_bytes: int, + user_id: Optional[str] = None, + file_name: Optional[str] = None, + service: str = "pdf2pdf" +): + """Track file size.""" + dimensions = {"Service": service} + if user_id: + dimensions["UserId"] = user_id + # Don't include FileName + + emit_metric("FileSize", size_bytes, "Bytes", dimensions) + +def estimate_cost( + pages: int = 0, + adobe_calls: int = 0, + bedrock_input_tokens: int = 0, + bedrock_output_tokens: int = 0, + lambda_duration_ms: int = 0, + lambda_memory_mb: int = 1024, + ecs_duration_ms: int = 0, + ecs_vcpu: float = 0.25, + ecs_memory_gb: float = 1.0, + user_id: Optional[str] = None, + file_name: Optional[str] = None, + service: str = "pdf2pdf" +) -> float: + """ + Estimate cost for a processing job and emit metric. + + Pricing (approximate, as of 2024): + - Adobe API: ~$0.05 per operation + - Bedrock Claude Haiku: $0.00025/1K input, $0.00125/1K output + - Bedrock Claude Sonnet: $0.003/1K input, $0.015/1K output + - Lambda: $0.0000166667/GB-sec + - ECS Fargate: $0.04048/vCPU-hr + $0.004445/GB-hr + - BDA: ~$0.01 per page + + Returns: + Estimated cost in USD + """ + cost = 0.0 + + # Adobe API cost + cost += adobe_calls * 0.05 + + # Bedrock cost (assuming Haiku for estimation) + cost += (bedrock_input_tokens / 1000) * 0.00025 + cost += (bedrock_output_tokens / 1000) * 0.00125 + + # Lambda cost + gb_seconds = (lambda_memory_mb / 1024) * (lambda_duration_ms / 1000) + cost += gb_seconds * 0.0000166667 + + # ECS cost + if ecs_duration_ms > 0: + hours = ecs_duration_ms / (1000 * 3600) + cost += (ecs_vcpu * hours * 0.04048) + (ecs_memory_gb * hours * 0.004445) + + # BDA cost (for pdf2html) + if service == "pdf2html": + cost += pages * 0.01 + + # Emit cost metric + dimensions = {"Service": service} + if user_id: + dimensions["UserId"] = user_id + + emit_metric("EstimatedCost", cost, "None", dimensions) + + return cost + +class MetricsContext: + """Context 
manager for tracking operation metrics.""" + + def __init__(self, stage: str, user_id: Optional[str] = None, + file_name: Optional[str] = None, service: str = "pdf2pdf"): + self.stage = stage + self.user_id = user_id + self.file_name = file_name + self.service = service + self.start_time = None + + def __enter__(self): + self.start_time = time.time() + return self + + def __exit__(self, exc_type, exc_val, exc_tb): + duration_ms = (time.time() - self.start_time) * 1000 + track_processing_duration( + self.stage, duration_ms, + self.user_id, self.file_name, self.service + ) + + if exc_type: + track_error( + exc_type.__name__, self.stage, + self.user_id, self.file_name, self.service + ) + + return False diff --git a/lambda/title-generator-lambda/title_generator.py b/lambda/title-generator-lambda/title_generator.py index 9aa9ac0c..b51887d4 100644 --- a/lambda/title-generator-lambda/title_generator.py +++ b/lambda/title-generator-lambda/title_generator.py @@ -5,6 +5,13 @@ import random import fitz # PyMuPDF +# Import metrics helper +try: + from metrics_helper import track_bedrock_invocation +except ImportError: + print("Warning: metrics_helper not available") + track_bedrock_invocation = lambda *args, **kwargs: None + # Helper function for exponential backoff and retry def exponential_backoff_retry( func, @@ -195,6 +202,16 @@ def generate_title(extracted_text, current_title): # Extract and return the generated title generated_title = response['output']['message']['content'][0]['text'] + + # Track Bedrock usage metrics + try: + usage = response.get('usage', {}) + input_tokens = usage.get('inputTokens', 0) + output_tokens = usage.get('outputTokens', 0) + track_bedrock_invocation(model_name, input_tokens, output_tokens, service="pdf2pdf") + except Exception as e: + print(f"(generate_title) Failed to track Bedrock metrics: {e}") + return generated_title.strip('"') diff --git a/lib/pipeline-helpers.sh b/lib/pipeline-helpers.sh new file mode 100644 index 00000000..bc7b8fe2 --- 
/dev/null +++ b/lib/pipeline-helpers.sh @@ -0,0 +1,454 @@ +#!/usr/bin/env bash +# ============================================================================= +# pipeline-helpers.sh — Pure functions for the private CI/CD pipeline setup +# ============================================================================= +# Sourced by deploy-private.sh. Contains testable logic with no side effects +# (no AWS CLI calls, no prompts). AWS-interacting wrappers live in the main script. +# ============================================================================= + +# Guard against direct execution +if [[ "${BASH_SOURCE[0]}" == "${0}" ]]; then + echo "ERROR: This file should be sourced, not executed directly." >&2 + exit 1 +fi + +# --------------------------------------------------------------------------- +# URL Validation +# --------------------------------------------------------------------------- + +# Validate a repository URL against the expected format for a given provider. +# Arguments: $1 = provider (github|codecommit|bitbucket|gitlab), $2 = url +# Returns: 0 if valid, 1 if invalid +validate_repo_url() { + local provider="$1" + local url="$2" + + case "$provider" in + github) + [[ "$url" =~ ^https://github\.com/[a-zA-Z0-9._-]+/[a-zA-Z0-9._-]+(\.git)?$ ]] && return 0 + ;; + codecommit) + [[ "$url" =~ ^https://git-codecommit\.[a-z0-9-]+\.amazonaws\.com/v1/repos/[a-zA-Z0-9._-]+$ ]] && return 0 + ;; + bitbucket) + [[ "$url" =~ ^https://bitbucket\.org/[a-zA-Z0-9._-]+/[a-zA-Z0-9._-]+(\.git)?$ ]] && return 0 + ;; + gitlab) + [[ "$url" =~ ^https://gitlab\.com/[a-zA-Z0-9._/-]+(\.git)?$ ]] && return 0 + ;; + esac + return 1 +} + +# --------------------------------------------------------------------------- +# Branch Resolution +# --------------------------------------------------------------------------- + +# Resolve a branch name, defaulting to "main" when input is empty. 
# Arguments: $1 = branch (may be empty)
# Outputs: resolved branch name to stdout
resolve_branch() {
    # printf instead of echo: echo treats values such as "-n" or "-e" as
    # options and would print nothing for them.
    printf '%s\n' "${1:-main}"
}

# ---------------------------------------------------------------------------
# Connection Validation
# ---------------------------------------------------------------------------

# Check whether a CodeConnections connection status is AVAILABLE.
# Arguments: $1 = status string
# Returns: 0 if AVAILABLE, 1 otherwise (including when no status is given)
validate_connection_status() {
    local status="${1:-}"
    [[ "$status" == "AVAILABLE" ]]
}

# ---------------------------------------------------------------------------
# IAM
# ---------------------------------------------------------------------------

# Generate the JSON trust policy for a CodeBuild service role.
# Arguments: none
# Outputs: JSON string to stdout
generate_trust_policy() {
    # Quoted delimiter keeps the document literal (no expansion).
    cat <<'EOF'
{
  "Version": "2012-10-17",
  "Statement": [
    {
      "Effect": "Allow",
      "Principal": { "Service": "codebuild.amazonaws.com" },
      "Action": "sts:AssumeRole"
    }
  ]
}
EOF
}

# ---------------------------------------------------------------------------
# S3 Bucket Naming
# ---------------------------------------------------------------------------

# Generate the S3 bucket name for pdf2html deployments.
# Arguments: $1 = account_id, $2 = region
# Outputs: bucket name to stdout
generate_bucket_name() {
    local account_id="$1"
    local region="$2"
    printf 'pdf2html-bucket-%s-%s\n' "$account_id" "$region"
}

# ---------------------------------------------------------------------------
# Build Status
# ---------------------------------------------------------------------------

# Check whether a build status indicates failure.
# Arguments: $1 = status string
# Returns: 0 if failure status (FAILED|FAULT|STOPPED|TIMED_OUT), 1 otherwise
is_failure_status() {
    case "${1:-}" in
        FAILED|FAULT|STOPPED|TIMED_OUT) return 0 ;;
        *) return 1 ;;
    esac
}

# ---------------------------------------------------------------------------
# Parameter Resolution
# ---------------------------------------------------------------------------

# Merge parameters from env vars, config file, and CLI args.
# Precedence: CLI > env > config > defaults
# Arguments: none
# Globals read: CONFIG_FILE (optional path), CLI_* override variables, and
#   any parameter variables already present in the environment
# Globals set (exported): each config-file KEY that is not already set by a
#   CLI_* variable or the environment; PRIVATE_REPO_URL, SOURCE_PROVIDER,
#   DEPLOYMENT_TYPE, TARGET_BRANCH, CONNECTION_ARN, ADOBE_CLIENT_ID and
#   ADOBE_CLIENT_SECRET when the matching CLI_* variable is non-empty
# Outputs: nothing on stdout (parse errors go to stderr)
# Returns: 0 on success, 1 if the config file cannot be parsed
resolve_params() {
    local config_file="${CONFIG_FILE:-}"
    local cli_var

    # Load config file values (lowest precedence after defaults)
    if [[ -n "$config_file" && -f "$config_file" ]]; then
        local config_output entry key value
        config_output="$(parse_config_file "$config_file")" || return 1
        while IFS= read -r entry; do
            [[ -z "$entry" ]] && continue
            # Split on the first '=' only, so the value keeps every '='
            # it contains, including a trailing one.
            key="${entry%%=*}"
            value="${entry#*=}"
            [[ -z "$key" ]] && continue
            # Only set if not already set by env var or CLI
            cli_var="CLI_${key}"
            if [[ -z "${!cli_var:-}" && -z "${!key:-}" ]]; then
                export "$key=$value"
            fi
        done <<< "$config_output"
    fi

    # CLI overrides take highest precedence
    local param
    for param in PRIVATE_REPO_URL SOURCE_PROVIDER DEPLOYMENT_TYPE \
                 TARGET_BRANCH CONNECTION_ARN ADOBE_CLIENT_ID \
                 ADOBE_CLIENT_SECRET; do
        cli_var="CLI_${param}"
        if [[ -n "${!cli_var:-}" ]]; then
            export "$param=${!cli_var}"
        fi
    done

    # Apply defaults
    TARGET_BRANCH="$(resolve_branch "${TARGET_BRANCH:-}")"
    export TARGET_BRANCH
}

# Parse a key-value config file, skipping comments and blank lines.
# Lines are trimmed before they are classified, so whitespace-only lines and
# indented comments are skipped rather than reported as malformed.
# Arguments: $1 = file path
# Outputs: KEY=VALUE pairs to stdout (one per line, surrounding whitespace
#   removed); error messages to stderr
# Returns: 0 on success, 1 if the file is missing or a line is not KEY=VALUE
parse_config_file() {
    local path="$1"
    if [[ ! -f "$path" ]]; then
        echo "ERROR: Config file not found: $path" >&2
        return 1
    fi
    local line
    local line_num=0
    # The "|| [[ -n ... ]]" clause keeps a final line without a newline.
    while IFS= read -r line || [[ -n "$line" ]]; do
        line_num=$((line_num + 1))
        # Strip leading/trailing whitespace
        line="${line#"${line%%[![:space:]]*}"}"
        line="${line%"${line##*[![:space:]]}"}"
        # Skip blank lines and comments
        [[ -z "$line" || "$line" == \#* ]] && continue
        # Validate KEY=VALUE format
        if [[ "$line" =~ ^[A-Za-z_][A-Za-z0-9_]*=.* ]]; then
            printf '%s\n' "$line"
        else
            echo "ERROR: Malformed line $line_num: $line" >&2
            return 1
        fi
    done < "$path"
}

# Validate that all required parameters are present.
# Arguments: $1 = non_interactive flag (true/false); accepted for interface
#   compatibility, not consulted by this function
# Globals: reads PRIVATE_REPO_URL, SOURCE_PROVIDER, DEPLOYMENT_TYPE
# Outputs: list of missing params to stdout
# Returns: 0 if all present, 1 if any missing
check_required_params() {
    local missing=()
    local name
    for name in PRIVATE_REPO_URL SOURCE_PROVIDER DEPLOYMENT_TYPE; do
        if [[ -z "${!name:-}" ]]; then
            missing+=("$name")
        fi
    done
    if [[ ${#missing[@]} -gt 0 ]]; then
        echo "Missing required parameters: ${missing[*]}"
        return 1
    fi
    return 0
}

# Resolve CLI flag defaults for buildspec and project name.
# Arguments: $1 = buildspec_flag (may be empty), $2 = project_name_flag (may be empty)
# Outputs: two lines — resolved buildspec, resolved project name
resolve_cli_defaults() {
    # Line 1: the buildspec, or the unified buildspec when none was given.
    echo "${1:-buildspec-unified.yml}"
    # Line 2: the project name, or a name suffixed with the current epoch
    # seconds (date only runs when the default is needed).
    echo "${2:-pdfremediation-$(date +%s)}"
}

# ---------------------------------------------------------------------------
# UI Environment
# ---------------------------------------------------------------------------

# Build environment variable assignments for UI deployment.
# Arguments: $1 = pdf2pdf_bucket, $2 = pdf2html_bucket
#   (either falls back to the literal "Null" when empty or omitted)
# Outputs: two KEY=VALUE lines to stdout
build_ui_env() {
    printf 'PDF_TO_PDF_BUCKET=%s\n' "${1:-Null}"
    printf 'PDF_TO_HTML_BUCKET=%s\n' "${2:-Null}"
}

# ---------------------------------------------------------------------------
# Cleanup / Resource Filtering
# ---------------------------------------------------------------------------

# Filter a list of project names by a glob pattern.
# Arguments: $1 = newline-separated project list, $2 = glob pattern
# Outputs: matching project names to stdout, in input order
filter_projects_by_pattern() {
    local glob="$2"
    local -a candidates
    local candidate

    mapfile -t candidates <<< "$1"
    for candidate in "${candidates[@]}"; do
        # Blank entries are never reported.
        [[ -n "$candidate" ]] || continue
        # Use bash pattern matching (glob); the pattern is intentionally unquoted.
        # shellcheck disable=SC2254
        case "$candidate" in
            $glob) echo "$candidate" ;;
        esac
    done
}

# ---------------------------------------------------------------------------
# Source Configuration
# ---------------------------------------------------------------------------

# Build CodeBuild source JSON for a given provider.
+# Arguments: $1=provider, $2=url, $3=branch, $4=connection_arn, $5=buildspec +# Outputs: JSON string to stdout +configure_source() { + local provider="$1" + local url="$2" + local branch="$3" + local connection_arn="${4:-}" + local buildspec="${5:-buildspec-unified.yml}" + + if [[ "$provider" == "codecommit" ]]; then + cat <&2; return 1 ;; + esac + # Note: Do NOT include an "auth" block here. CodeBuild uses the + # account-level source credential registered via import-source-credentials. + # Including auth inline causes OAuthProviderException conflicts. + cat <&2 + return 1 + fi + # Validate JSON and extract key-value pairs + local parsed + parsed="$(echo "$json_string" | jq -r 'to_entries[] | "\(.key)=\(.value)"' 2>/dev/null)" || { + echo "ERROR: Invalid JSON in branch-env-map: $json_string" >&2 + return 1 + } + if [[ -z "$parsed" ]]; then + echo "ERROR: Empty branch-env-map" >&2 + return 1 + fi + echo "$parsed" +} + +# Validate a Branch_Environment_Map for duplicates and empty entries. +# Arguments: reads from stdin or associative array +# Returns: 0 if valid, 1 if invalid (outputs error message to stderr) +validate_branch_env_map() { + local map_lines="$1" + if [[ -z "$map_lines" ]]; then + echo "ERROR: Branch-env-map is empty" >&2 + return 1 + fi + # Collect environment prefixes and check for duplicates + local -A seen_prefixes=() + while IFS='=' read -r branch env_name; do + [[ -z "$branch" ]] && continue + if [[ -z "$env_name" ]]; then + echo "ERROR: Empty environment name for branch '$branch'" >&2 + return 1 + fi + local prefix + prefix="$(generate_env_prefix "$env_name")" + if [[ -n "${seen_prefixes[$prefix]:-}" ]]; then + echo "ERROR: Duplicate Environment_Prefix '$prefix' from branches '${seen_prefixes[$prefix]}' and '$branch'" >&2 + return 1 + fi + seen_prefixes["$prefix"]="$branch" + done <<< "$map_lines" + return 0 +} + +# Resolve a branch name to its target environment. 
# Arguments: $1 = branch name, $2 = serialized map (KEY=VALUE lines)
# Outputs: environment name to stdout (empty if no match)
# Returns: 0 when a map entry matches, 1 otherwise
resolve_environment() {
    local branch="$1"
    local map="${2:-}"

    # Fall back to the built-in branch-to-environment map.
    if [[ -z "$map" ]]; then
        map=$'main=prod\ndev=dev\ntest=test\nstaging=staging'
    fi

    local pattern env_name
    local glob_found=0
    local glob_env=""

    # Single pass: an exact match anywhere in the map wins immediately;
    # otherwise the first glob entry that matched is used afterwards.
    while IFS='=' read -r pattern env_name; do
        [[ -n "$pattern" ]] || continue

        if [[ "$branch" == "$pattern" ]]; then
            echo "$env_name"
            return 0
        fi

        # Only the first glob hit counts.
        if [[ "$glob_found" -eq 1 ]]; then
            continue
        fi
        # Entries without '*' or '?' are exact-only.
        if [[ "$pattern" != *'*'* && "$pattern" != *'?'* ]]; then
            continue
        fi
        # shellcheck disable=SC2254
        case "$branch" in
            $pattern)
                glob_found=1
                glob_env="$env_name"
                ;;
        esac
    done <<< "$map"

    if [[ "$glob_found" -eq 1 ]]; then
        echo "$glob_env"
        return 0
    fi

    # No match
    echo ""
    return 1
}

# Generate an Environment_Prefix from an environment name.
# Arguments: $1 = environment name (e.g., "prod", "dev")
# Outputs: prefix string to stdout
generate_env_prefix() {
    # Lowercase, then replace every character other than a-z, 0-9 and '-'
    # with a hyphen.
    echo "$1" \
        | tr '[:upper:]' '[:lower:]' \
        | sed -e 's/[^a-z0-9-]/-/g'
}

# Generate an environment-prefixed resource name.
# Arguments: $1 = env_prefix, $2 = base_name
# Outputs: prefixed name to stdout
generate_env_resource_name() {
    echo "${1}-${2}"
}

# Generate CodeBuild webhook filter JSON for a branch pattern.
+# Arguments: $1 = branch pattern, $2 = env_name, $3 = is_production (true/false) +# Outputs: JSON filter groups to stdout +configure_webhooks() { + local branch="$1" + local env_name="$2" + local is_production="${3:-false}" + + # Convert glob pattern to regex for HEAD_REF + local head_ref_pattern + head_ref_pattern="^refs/heads/$(echo "$branch" | sed 's/\*/.\*/g')$" + + if [[ "$is_production" == "true" ]]; then + cat < 0: + track_bedrock_invocation( + model_id, input_tokens, output_tokens, + user_id, key, "pdf2html" + ) + + # Estimate cost + total_input_tokens = sum(m.get('input_tokens', 0) for m in bedrock_usage.values()) + total_output_tokens = sum(m.get('output_tokens', 0) for m in bedrock_usage.values()) + + estimate_cost( + pages=page_count, + bedrock_input_tokens=total_input_tokens, + bedrock_output_tokens=total_output_tokens, + lambda_duration_ms=context.get_remaining_time_in_millis(), + lambda_memory_mb=context.memory_limit_in_mb, + user_id=user_id, + file_name=key, + service="pdf2html" + ) + except Exception as metrics_error: + print(f"[WARNING] Failed to track metrics: {metrics_error}") + + # Structured log — always emit, independent of metrics tracking + try: + if page_count == 0: + try: + from pypdf import PdfReader as _PdfReader + _reader = _PdfReader(local_in) + page_count = len(_reader.pages) + except Exception: + page_count = 1 + + if page_count > 0: + track_pages_processed(page_count, user_id, key, "pdf2html") + + print(json.dumps({"event": "file_processed", "userId": user_id or "anonymous", "fileName": key.split("/")[-1], "pageCount": page_count, "service": "pdf2html"})) + except Exception as log_error: + print(f"[WARNING] Failed to emit structured log: {log_error}") except Exception as e: print(f"[ERROR] Processing {key} failed: {e}") print(traceback.format_exc()) @@ -316,6 +430,7 @@ def lambda_handler(event, context): print(traceback.format_exc()) return {"status": "error", "message": f"Zip creation or upload failed: {e}"} + 
def _build_dimensions(
    service: str,
    user_id: Optional[str] = None,
    **extra: str
) -> Dict[str, str]:
    """Build the dimension dict shared by every tracker: Service, extras, then UserId if truthy."""
    dims = {"Service": service}
    dims.update(extra)
    if user_id:
        dims["UserId"] = user_id
    return dims


def emit_metric(
    metric_name: str,
    value: float,
    unit: str = "None",
    dimensions: Optional[Dict[str, str]] = None,
    timestamp: Optional[datetime] = None
):
    """
    Emit a single metric to CloudWatch on a best-effort basis.

    Any exception raised while sending is printed and swallowed, so metric
    failures never propagate to the caller.

    Args:
        metric_name: Name of the metric
        value: Metric value
        unit: CloudWatch unit (Count, Milliseconds, Bytes, etc.)
        dimensions: Dict of dimension name/value pairs. Values are coerced to
            str; entries whose value is None or empty are skipped, because a
            single invalid dimension would otherwise make the whole
            put_metric_data call fail and the metric would be lost.
        timestamp: Metric timestamp (defaults to the current time in UTC)

    Returns:
        None
    """
    # Function-scope import: the module-level import only brings in `datetime`.
    from datetime import timezone

    metric_data = {
        'MetricName': metric_name,
        'Value': value,
        'Unit': unit,
        # datetime.utcnow() is deprecated (Python 3.12+) and returns a naive
        # value; use an explicit timezone-aware UTC timestamp instead.
        'Timestamp': timestamp or datetime.now(timezone.utc),
    }

    if dimensions:
        cleaned = [
            {'Name': str(name), 'Value': str(val)}
            for name, val in dimensions.items()
            if val is not None and str(val) != ""
        ]
        if cleaned:
            metric_data['Dimensions'] = cleaned

    try:
        cloudwatch.put_metric_data(
            Namespace=NAMESPACE,
            MetricData=[metric_data]
        )
    except Exception as e:
        print(f"Failed to emit metric {metric_name}: {e}")


def track_pages_processed(
    page_count: int,
    user_id: Optional[str] = None,
    file_name: Optional[str] = None,
    service: str = "pdf2pdf"
):
    """Emit PagesProcessed (Count) for a service, optionally per user.

    file_name is accepted for signature parity but deliberately not used as a
    dimension: metrics are aggregated at service/user level only.
    """
    emit_metric(
        "PagesProcessed", page_count, "Count",
        _build_dimensions(service, user_id)
    )


def track_adobe_api_call(
    operation: str,
    page_count: int = 0,
    user_id: Optional[str] = None,
    file_name: Optional[str] = None
):
    """Track Adobe API calls and estimated Document Transactions.

    Always emits AdobeAPICalls = 1. When page_count > 0 it additionally emits
    AdobeDocTransactions, computed as:
    - AutoTag: 10 Document Transactions per page
    - ExtractPDF: 1 Document Transaction per 5 pages (rounded up)
    - any other operation: 1

    file_name is accepted but not used as a dimension.
    """
    dimensions = _build_dimensions("pdf2pdf", user_id, Operation=operation)

    emit_metric("AdobeAPICalls", 1, "Count", dimensions)

    if page_count > 0:
        if operation == "AutoTag":
            doc_transactions = page_count * 10
        elif operation == "ExtractPDF":
            doc_transactions = -(-page_count // 5)  # ceiling division
        else:
            doc_transactions = 1
        emit_metric("AdobeDocTransactions", doc_transactions, "Count", dimensions)


def track_bedrock_invocation(
    model_id: str,
    input_tokens: int,
    output_tokens: int,
    user_id: Optional[str] = None,
    file_name: Optional[str] = None,
    service: str = "pdf2pdf"
):
    """Emit BedrockInvocations (=1), BedrockInputTokens and BedrockOutputTokens.

    All three metrics share the Service/Model(/UserId) dimensions.
    file_name is accepted but not used as a dimension.
    """
    dimensions = _build_dimensions(service, user_id, Model=model_id)

    emit_metric("BedrockInvocations", 1, "Count", dimensions)
    emit_metric("BedrockInputTokens", input_tokens, "Count", dimensions)
    emit_metric("BedrockOutputTokens", output_tokens, "Count", dimensions)


def track_processing_duration(
    stage: str,
    duration_ms: float,
    user_id: Optional[str] = None,
    file_name: Optional[str] = None,
    service: str = "pdf2pdf"
):
    """Emit ProcessingDuration (Milliseconds) for one stage.

    file_name is accepted but not used as a dimension.
    """
    emit_metric(
        "ProcessingDuration", duration_ms, "Milliseconds",
        _build_dimensions(service, user_id, Stage=stage)
    )


def track_error(
    error_type: str,
    stage: str,
    user_id: Optional[str] = None,
    file_name: Optional[str] = None,
    service: str = "pdf2pdf"
):
    """Emit ErrorCount = 1, dimensioned by Service, Stage and ErrorType.

    file_name is accepted but not used as a dimension.
    """
    emit_metric(
        "ErrorCount", 1, "Count",
        _build_dimensions(service, user_id, Stage=stage, ErrorType=error_type)
    )


def track_file_size(
    size_bytes: int,
    user_id: Optional[str] = None,
    file_name: Optional[str] = None,
    service: str = "pdf2pdf"
):
    """Emit FileSize (Bytes) for a service, optionally per user.

    file_name is accepted but not used as a dimension.
    """
    emit_metric(
        "FileSize", size_bytes, "Bytes",
        _build_dimensions(service, user_id)
    )


def estimate_cost(
    pages: int = 0,
    adobe_calls: int = 0,
    bedrock_input_tokens: int = 0,
    bedrock_output_tokens: int = 0,
    lambda_duration_ms: int = 0,
    lambda_memory_mb: int = 1024,
    ecs_duration_ms: int = 0,
    ecs_vcpu: float = 0.25,
    ecs_memory_gb: float = 1.0,
    user_id: Optional[str] = None,
    file_name: Optional[str] = None,
    service: str = "pdf2pdf"
) -> float:
    """
    Estimate cost for a processing job and emit it as the EstimatedCost metric.

    Pricing (approximate, as of 2024):
    - Adobe API: ~$0.05 per operation
    - Bedrock Claude Haiku: $0.00025/1K input, $0.00125/1K output
    - Bedrock Claude Sonnet: $0.003/1K input, $0.015/1K output
    - Lambda: $0.0000166667/GB-sec
    - ECS Fargate: $0.04048/vCPU-hr + $0.004445/GB-hr
    - BDA: ~$0.01 per page

    Only the Haiku rates are applied to Bedrock tokens; the Sonnet rates are
    listed for reference.
    NOTE(review): callers may be using a different Bedrock model; confirm the
    token rates match the model actually invoked.

    Args:
        pages: Page count; only billed (as BDA pages) when service == "pdf2html"
        adobe_calls: Number of Adobe API operations
        bedrock_input_tokens: Total Bedrock input tokens
        bedrock_output_tokens: Total Bedrock output tokens
        lambda_duration_ms: Lambda execution time in milliseconds
        lambda_memory_mb: Lambda memory size in MB
        ecs_duration_ms: ECS task run time in milliseconds (0 = no ECS cost)
        ecs_vcpu: vCPU count of the ECS task
        ecs_memory_gb: Memory of the ECS task in GB
        user_id: Optional user id, added as a metric dimension when truthy
        file_name: Accepted but not used as a dimension
        service: Service name used as the metric dimension

    Returns:
        Estimated cost in USD
    """
    # Adobe API cost
    cost = adobe_calls * 0.05

    # Bedrock cost (assuming Haiku for estimation)
    cost += (bedrock_input_tokens / 1000) * 0.00025
    cost += (bedrock_output_tokens / 1000) * 0.00125

    # Lambda cost: GB-seconds = memory in GB * duration in seconds
    gb_seconds = (lambda_memory_mb / 1024) * (lambda_duration_ms / 1000)
    cost += gb_seconds * 0.0000166667

    # ECS cost
    if ecs_duration_ms > 0:
        hours = ecs_duration_ms / (1000 * 3600)
        cost += (ecs_vcpu * hours * 0.04048) + (ecs_memory_gb * hours * 0.004445)

    # BDA cost (for pdf2html)
    if service == "pdf2html":
        cost += pages * 0.01

    emit_metric("EstimatedCost", cost, "None", _build_dimensions(service, user_id))

    return cost


class MetricsContext:
    """Context manager that reports a stage's duration and any raised error.

    On exit it emits ProcessingDuration for the stage and, if an exception is
    propagating, ErrorCount keyed by the exception class name. Exceptions are
    never suppressed.
    """

    def __init__(self, stage: str, user_id: Optional[str] = None,
                 file_name: Optional[str] = None, service: str = "pdf2pdf"):
        self.stage = stage
        self.user_id = user_id
        self.file_name = file_name
        self.service = service
        # Wall-clock start (epoch seconds); kept for callers that read it.
        self.start_time = None
        # Monotonic start used for the duration, immune to clock adjustments.
        self._started = None

    def __enter__(self):
        self.start_time = time.time()
        self._started = time.monotonic()
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        # __exit__ may be invoked by hand; tolerate a missing __enter__
        # instead of raising TypeError on `time.time() - None`.
        started = getattr(self, "_started", None)
        if started is not None:
            duration_ms = (time.monotonic() - started) * 1000
        elif self.start_time is not None:
            # start_time was assigned directly without going through __enter__.
            duration_ms = (time.time() - self.start_time) * 1000
        else:
            duration_ms = None

        if duration_ms is not None:
            track_processing_duration(
                self.stage, duration_ms,
                self.user_id, self.file_name, self.service
            )

        if exc_type:
            track_error(
                exc_type.__name__, self.stage,
                self.user_id, self.file_name, self.service
            )

        # Returning False lets any in-flight exception propagate.
        return False
# Run a property test function N times with seeded randomness.
# Arguments: $1 = test function name
# The function is called with the current iteration index.
# Reads PBT_SEED and PBT_ITERATIONS and calls pbt_log_seed once before looping.
run_property_test() {
    local test_fn="$1"
    local i  # keep the loop counter out of the caller's scope
    RANDOM=$PBT_SEED
    pbt_log_seed
    for ((i = 0; i < PBT_ITERATIONS; i++)); do
        "$test_fn" "$i"
    done
}

# ---------------------------------------------------------------------------
# Random Generators
# ---------------------------------------------------------------------------

# Emit $2 random characters drawn from the tr character set $1 (no newline).
# Reads /dev/urandom, so the output is NOT governed by PBT_SEED.
# LC_ALL=C makes tr treat the input as raw bytes; stderr is silenced because
# tr may report a broken pipe once head has read enough.
_pbt_random_chars() {
    local charset="$1"
    local len="$2"
    LC_ALL=C tr -dc "$charset" < /dev/urandom 2> /dev/null | head -c "$len"
}

# Random alphanumeric string of given length (default 8)
random_string() {
    local len="${1:-8}"
    _pbt_random_chars 'a-zA-Z0-9' "$len"
}

# Random lowercase alphanumeric string of given length (default 8)
random_lower_string() {
    local len="${1:-8}"
    _pbt_random_chars 'a-z0-9' "$len"
}

# Random 12-digit AWS account ID.
# Built from three 4-digit groups: a single RANDOM * RANDOM product tops out
# near 1.07e9, which would always leave the two leading digits as "00".
random_account_id() {
    printf '%04d%04d%04d' \
        $((RANDOM % 10000)) $((RANDOM % 10000)) $((RANDOM % 10000))
}

# Random AWS region from a realistic set
random_region() {
    local regions=(
        "us-east-1" "us-east-2" "us-west-1" "us-west-2"
        "eu-west-1" "eu-west-2" "eu-central-1"
        "ap-southeast-1" "ap-southeast-2" "ap-northeast-1"
    )
    echo "${regions[$((RANDOM % ${#regions[@]}))]}"
}

# ---------------------------------------------------------------------------
# Random URL Generators (per provider)
# ---------------------------------------------------------------------------

# Generate a valid GitHub URL with random org/repo (".git" suffix half the time)
random_github_url() {
    local org
    org="$(random_lower_string 6)"
    local repo
    repo="$(random_lower_string 8)"
    local suffix=""
    if (( RANDOM % 2 == 0 )); then suffix=".git"; fi
    echo "https://github.com/${org}/${repo}${suffix}"
}

# Generate a valid CodeCommit URL with random region/repo
random_codecommit_url() {
    local region
    region="$(random_region)"
    local repo
    repo="$(random_lower_string 8)"
    echo "https://git-codecommit.${region}.amazonaws.com/v1/repos/${repo}"
}

# Generate a valid Bitbucket URL with random org/repo (".git" suffix half the time)
random_bitbucket_url() {
    local org
    org="$(random_lower_string 6)"
    local repo
    repo="$(random_lower_string 8)"
    local suffix=""
    if (( RANDOM % 2 == 0 )); then suffix=".git"; fi
    echo "https://bitbucket.org/${org}/${repo}${suffix}"
}

# Generate a valid GitLab URL with random org/repo (".git" suffix half the time)
random_gitlab_url() {
    local org
    org="$(random_lower_string 6)"
    local repo
    repo="$(random_lower_string 8)"
    local suffix=""
    if (( RANDOM % 2 == 0 )); then suffix=".git"; fi
    echo "https://gitlab.com/${org}/${repo}${suffix}"
}

# Generate a valid URL for the given provider ($1).
# Unknown providers yield a fixed placeholder URL.
random_valid_url() {
    local provider="$1"
    case "$provider" in
        github)     random_github_url ;;
        codecommit) random_codecommit_url ;;
        bitbucket)  random_bitbucket_url ;;
        gitlab)     random_gitlab_url ;;
        *)          echo "https://invalid.example.com/repo" ;;
    esac
}

# Generate a random invalid URL (not matching any provider pattern).
# The empty string is one of the possible results.
random_invalid_url() {
    local variants=(
        "http://github.com/org/repo"
        "https://github.com/"
        "https://github.com/org"
        "ftp://github.com/org/repo"
        "https://notgithub.com/org/repo"
        "git@github.com:org/repo.git"
        "https://bitbucket.org/"
        "https://codecommit.us-east-1.amazonaws.com/repos/test"
        ""
        "not-a-url"
    )
    echo "${variants[$((RANDOM % ${#variants[@]}))]}"
}

# ---------------------------------------------------------------------------
# Random Provider
# ---------------------------------------------------------------------------

random_provider() {
    local providers=("github" "codecommit" "bitbucket" "gitlab")
    echo "${providers[$((RANDOM % ${#providers[@]}))]}"
}

# ---------------------------------------------------------------------------
# Random Branch Names
# ---------------------------------------------------------------------------

random_branch() {
    local branches=("main" "dev" "test" "staging" "feature/my-thing" "release/1.0" "hotfix/bug-123")
    echo "${branches[$((RANDOM % ${#branches[@]}))]}"
}
# ---------------------------------------------------------------------------
# Random Environment Names
# ---------------------------------------------------------------------------

random_env_name() {
    local envs=("prod" "dev" "test" "staging" "qa" "uat")
    echo "${envs[$((RANDOM % ${#envs[@]}))]}"
}

# ---------------------------------------------------------------------------
# Temp File Helpers
# ---------------------------------------------------------------------------

# Create a temp file and echo its path. Caller is responsible for cleanup.
# Arguments: $1 = file name prefix (default "pbt-test")
# The file is created under $TMPDIR when set, otherwise under /tmp.
create_temp_file() {
    local prefix="${1:-pbt-test}"
    local tmp_root="${TMPDIR:-/tmp}"
    # Strip one trailing slash so the echoed path has no "//".
    mktemp "${tmp_root%/}/${prefix}.XXXXXX"
}

# Create a temp directory and echo its path. Caller is responsible for cleanup.
# Arguments: $1 = directory name prefix (default "pbt-test")
# The directory is created under $TMPDIR when set, otherwise under /tmp.
create_temp_dir() {
    local prefix="${1:-pbt-test}"
    local tmp_root="${TMPDIR:-/tmp}"
    mktemp -d "${tmp_root%/}/${prefix}.XXXXXX"
}

# ---------------------------------------------------------------------------
# Assertion Helpers
# ---------------------------------------------------------------------------

# Assert two values are equal, with descriptive failure message.
# Arguments: $1 = expected, $2 = actual, $3 = optional message
# Returns 1 and prints details to stderr on mismatch.
assert_equal() {
    local expected="$1"
    local actual="$2"
    local msg="${3:-}"
    if [[ "$expected" != "$actual" ]]; then
        echo "ASSERTION FAILED${msg:+: $msg}" >&2
        echo "  expected: '$expected'" >&2
        echo "  actual:   '$actual'" >&2
        return 1
    fi
}

# Assert a string starts with a prefix.
# Arguments: $1 = prefix, $2 = actual, $3 = optional message
# The prefix is quoted in the comparison, so it is matched literally.
assert_starts_with() {
    local prefix="$1"
    local actual="$2"
    local msg="${3:-}"
    if [[ "$actual" != "${prefix}"* ]]; then
        echo "ASSERTION FAILED${msg:+: $msg}" >&2
        echo "  expected to start with: '$prefix'" >&2
        echo "  actual: '$actual'" >&2
        return 1
    fi
}

# Assert a command exits with a specific code.
# Arguments: $1 = expected exit code, remaining args = command to run
# Returns 1 when the codes differ or when no command was supplied.
assert_exit_code() {
    local expected_code="$1"
    shift
    if [[ "$#" -eq 0 ]]; then
        # An empty "$@" would expand to nothing and always look like exit 0.
        echo "ASSERTION FAILED: assert_exit_code requires a command to run" >&2
        return 1
    fi
    local actual_code=0
    "$@" || actual_code=$?
    if [[ "$actual_code" -ne "$expected_code" ]]; then
        echo "ASSERTION FAILED: expected exit code $expected_code, got $actual_code" >&2
        echo "  command: $*" >&2
        return 1
    fi
}