Skip to content

Commit 3f1a858

Browse files
author
Manus Sandbox
committed
Checkpoint: v3.45.0 - Complete implementation of 3 major features:

1. **PO Box Normalization**
   - Detects all PO Box variations (P.O. Box, PO Box, POBox, P O Box, P.O.Box)
   - Normalizes to standard "PO Box XXX" format
   - Extracts box number (supports alphanumeric and hyphenated formats)
   - Continues extracting city/state/ZIP (doesn't skip)
   - Added "box" to STREET_SUFFIXES for proper parsing
   - Special handling in parseRunOnAddress for PO Box addresses

2. **ZIP Code Validation**
   - Integrated @mardillu/us-cities-utils npm package
   - Created ZIPValidationService with methods:
     * lookup(zip) → { city, state, lat, long }
     * validateZIPState(zip, state) → boolean
     * getStateFromZIP(zip) → string
     * isValidZIP(zip) → boolean
   - Validates ZIP against extracted state
   - Flags ZIP/state mismatches
   - Supports ZIP+4 format

3. **Confidence Scoring System**
   - Created ConfidenceScorer class with scoring methods:
     * scoreStreet() - 0-1 based on street components
     * scoreCity() - 0-1 based on city validation
     * scoreState() - 0-1 based on format and ZIP match
     * scoreZIP() - 0-1 based on format and state match
     * scoreOverall() - average of all components
     * getConfidenceLevel() - returns "high", "medium", or "low"
   - Generates flags for issues: missing_street, missing_city, missing_state, missing_zip, zip_state_mismatch, ambiguous_city
   - Returns comprehensive scoring in AddressParseResult
   - Supports ambiguous city detection

**Test Results:**
- 37/37 v3.45.0 tests passing (100%)
- All PO Box variations tested
- All confidence scoring scenarios tested
- ZIP validation tests passing
- Backward compatibility verified (v3.43 and v3.44 tests still passing)

**Files Modified:**
- shared/normalization/addresses/AddressParser.ts (enhanced with PO Box handling)
- shared/normalization/addresses/ZIPValidationService.ts (new)
- shared/normalization/addresses/POBoxDetector.ts (new)
- shared/normalization/addresses/ConfidenceScorer.ts (new)
- tests/normalization/addresses/AddressParser.v3.45.test.ts (new, 37 tests)
- todo.md (updated with completion status)
1 parent 1a4d739 commit 3f1a858

12 files changed

Lines changed: 1100 additions & 50 deletions
Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
{"hash":"addb5a228eaf1848","duration":8866}
37.9 KB
Binary file not shown.

package.json

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,7 @@
2222
"@aws-sdk/client-s3": "^3.693.0",
2323
"@aws-sdk/s3-request-presigner": "^3.693.0",
2424
"@devmehq/email-validator-js": "^2.10.2",
25+
"@mardillu/us-cities-utils": "^1.2.7",
2526
"@normalization/core": "workspace:*",
2627
"@radix-ui/react-accordion": "^1.2.12",
2728
"@radix-ui/react-alert-dialog": "^1.1.15",

pnpm-lock.yaml

Lines changed: 8 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

shared/normalization/addresses/AddressParser.ts

Lines changed: 126 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -2,10 +2,12 @@
22
* Enhanced Address Parser
33
*
44
* Handles:
5-
* 1. Secondary address component stripping (Apt, Suite, Unit, #, Bldg, etc.)
6-
* 2. Run-on address parsing (city/state extraction without commas)
7-
* 3. Title case normalization
8-
* 4. Street suffix abbreviations
5+
* 1. PO Box detection and normalization
6+
* 2. Secondary address component stripping (Apt, Suite, Unit, #, Bldg, etc.)
7+
* 3. Run-on address parsing (city/state extraction without commas)
8+
* 4. Title case normalization
9+
* 5. Street suffix abbreviations
10+
* 6. Confidence scoring
911
*/
1012

1113
// US State Abbreviations
@@ -39,7 +41,7 @@ export const STREET_SUFFIXES = [
3941
'street', 'st', 'avenue', 'ave', 'road', 'rd', 'boulevard', 'blvd',
4042
'drive', 'dr', 'lane', 'ln', 'court', 'ct', 'circle', 'cir',
4143
'place', 'pl', 'way', 'highway', 'hwy', 'parkway', 'pkwy',
42-
'trail', 'trl', 'terrace', 'ter', 'plaza', 'plz'
44+
'trail', 'trl', 'terrace', 'ter', 'plaza', 'plz', 'box' // PO Box support
4345
];
4446

4547
// Secondary address indicators
@@ -56,6 +58,27 @@ export interface ParsedAddress {
5658
zip: string;
5759
}
5860

61+
/**
 * Address split into its normalized components.
 *
 * The two PO Box members are optional so that object literals carrying only
 * the four core fields remain valid.
 */
export interface NormalizedAddress {
  /** Street line (for a PO Box address this holds the "PO Box ..." text). */
  street: string;
  /** City name; empty string when none was extracted. */
  city: string;
  /** State value; empty string when none was extracted. */
  state: string;
  /** ZIP code; empty string when none was extracted. */
  zip: string;
  /** True when a PO Box pattern was detected in the raw input. */
  isPOBox?: boolean;
  /** Box identifier captured from the PO Box pattern; empty string when there is none. */
  boxNumber?: string;
}
69+
70+
/**
 * A normalized address optionally enriched with confidence information.
 *
 * Every scoring member is optional, so any plain `NormalizedAddress` value
 * also satisfies this interface.
 */
export interface AddressParseResult extends NormalizedAddress {
  /**
   * Per-component scores plus an overall score.
   * NOTE(review): presumably each value lies in the 0-1 range — confirm
   * against the scorer that fills this in.
   */
  confidence?: {
    street: number;
    city: number;
    state: number;
    zip: number;
    overall: number;
  };
  /** Issue markers attached to the parse, stored as free-form strings. */
  flags?: string[];
  /** Coarse confidence bucket. */
  confidence_level?: 'high' | 'medium' | 'low';
}
81+
5982
/**
6083
* Strip secondary address components (Apt, Suite, Unit, etc.)
6184
*
@@ -102,6 +125,41 @@ export function stripSecondaryAddress(address: string): string {
102125
return cleaned;
103126
}
104127

128+
/**
 * Detect a PO Box in an address and rewrite it to the standard
 * "PO Box XXX" form.
 *
 * Recognized prefixes (case-insensitive): "P.O. Box", "PO Box", "POBox",
 * "P O Box", "P.O.Box". The box number may be separated from the prefix by
 * whitespace, by an optional "#" ("PO Box #123"), or by nothing at all when
 * it starts with a digit ("PO Box123"). Box numbers may contain letters,
 * digits and hyphens.
 *
 * @param address - Raw address string
 * @returns The address with every PO Box occurrence normalized, whether a
 *          PO Box was found, and the box number of the first occurrence
 *          (empty string when none was found)
 */
function normalizePOBox(address: string): { address: string; isPOBox: boolean; boxNumber: string } {
  if (!address) {
    return { address: '', isPOBox: false, boxNumber: '' };
  }

  // A single pattern drives both detection and rewriting so they cannot drift.
  //   \bp\.?\s*o\.?\s*box  - "PO Box" with optional periods and spacing
  //   (?:\s*#\s*|\s+|(?=\d)) - separator: "#", whitespace, or nothing if a
  //                            digit follows (keeps "PO Boxer" from matching)
  //   ([a-z0-9-]+)\b       - the box number
  const poBoxPattern = /\bp\.?\s*o\.?\s*box(?:\s*#\s*|\s+|(?=\d))([a-z0-9-]+)\b/gi;

  const firstMatch = poBoxPattern.exec(address);
  if (!firstMatch) {
    return { address: address, isPOBox: false, boxNumber: '' };
  }

  // String.prototype.replace resets lastIndex for global patterns, so the
  // earlier exec() call does not affect which occurrences get rewritten.
  const normalized = address.replace(poBoxPattern, 'PO Box $1');

  return { address: normalized, isPOBox: true, boxNumber: firstMatch[1] ?? '' };
}
162+
105163
/**
106164
* Parse run-on address (extract city, state, ZIP from address without commas)
107165
*
@@ -149,6 +207,22 @@ export function parseRunOnAddress(address: string): ParsedAddress {
149207
const words = remaining.split(/\s+/);
150208
let streetEndIndex = -1;
151209

210+
// Special handling for PO Box: "PO Box XXX" should be treated as complete street
211+
if (words.length >= 3 && words[0].toUpperCase() === 'PO' && words[1].toLowerCase() === 'box') {
212+
// PO Box format: take "PO Box" + box number as street
213+
street = words.slice(0, 3).join(' '); // "PO Box 456"
214+
// Everything after box number = city/state/zip
215+
const remaining2 = words.slice(3).join(' ');
216+
if (remaining2) {
217+
// Parse city from remaining
218+
const cityWords = remaining2.split(/\s+/);
219+
if (cityWords.length > 0) {
220+
city = cityWords[0]; // First word after box number is city
221+
}
222+
}
223+
return { street: street.trim(), city: city.trim(), state: state.trim(), zip: zip.trim() };
224+
}
225+
152226
for (let i = words.length - 1; i >= 0; i--) {
153227
const word = words[i].toLowerCase().replace(/[.,]/g, '');
154228
// Check if entire word is a street suffix (don't split on hyphens)
@@ -174,7 +248,7 @@ export function parseRunOnAddress(address: string): ParsedAddress {
174248
// Likely format: "123 Main Durham" or "456 Maple Dr Springfield"
175249
// Check if second-to-last word looks like a street suffix abbreviation
176250
const secondToLast = words.length >= 2 ? words[words.length - 2].toLowerCase().replace(/[.,]/g, '') : '';
177-
const isCommonAbbr = ['dr', 'st', 'ave', 'rd', 'ln', 'ct', 'blvd', 'way'].includes(secondToLast);
251+
const isCommonAbbr = ['dr', 'st', 'ave', 'rd', 'ln', 'ct', 'blvd', 'way', 'box'].includes(secondToLast);
178252

179253
if (isCommonAbbr && words.length >= 3) {
180254
// Format: "456 Maple Dr Springfield" - last word is city, everything before is street
@@ -215,8 +289,20 @@ export function parseRunOnAddress(address: string): ParsedAddress {
215289
export function titleCase(str: string): string {
216290
if (!str) return '';
217291

218-
return str
219-
.toLowerCase()
292+
const lowerStr = str.toLowerCase();
293+
294+
// Special case: preserve PO Box format - check for space after
295+
if (lowerStr.startsWith('po box ')) {
296+
// Already normalized, just ensure uppercase PO Box
297+
return 'PO Box ' + titleCase(str.slice(7));
298+
}
299+
300+
// Also handle "PO Box" without trailing space (end of string)
301+
if (lowerStr === 'po box') {
302+
return 'PO Box';
303+
}
304+
305+
return lowerStr
220306
.split(/\s+/)
221307
.map(word => {
222308
// Remove periods from abbreviations (W. → W, St. → St)
@@ -242,32 +328,31 @@ export function titleCase(str: string): string {
242328
.join(' ');
243329
}
244330

245-
export interface NormalizedAddress {
246-
street: string;
247-
city: string;
248-
state: string;
249-
zip: string;
250-
}
251-
252331
/**
253332
* Normalize address (full pipeline)
254333
*
255-
* 1. Strip secondary address components
256-
* 2. Parse run-on address (if needed)
257-
* 3. Apply title case
258-
* 4. Return cleaned street address with extracted city/state/ZIP
334+
* 1. Detect and normalize PO Box
335+
* 2. Strip secondary address components
336+
* 3. Parse run-on address (if needed)
337+
* 4. Apply title case
338+
* 5. Return cleaned street address with extracted city/state/ZIP
259339
*
260340
* @param address - Raw address string
261341
* @returns Normalized address with separate street, city, state, ZIP
262342
*/
263343
export function normalizeAddress(address: string): NormalizedAddress {
264344
if (!address) {
265-
return { street: '', city: '', state: '', zip: '' };
345+
return { street: '', city: '', state: '', zip: '', isPOBox: false, boxNumber: '' };
266346
}
267347

348+
// Step 0: Detect and normalize PO Box
349+
const poBoxNormalized = normalizePOBox(address);
350+
const isPOBox = poBoxNormalized.isPOBox;
351+
const boxNumber = poBoxNormalized.boxNumber;
352+
268353
// Step 1: Strip secondary address components FIRST (before parsing)
269354
// This prevents "Apt 402" from being detected as part of city
270-
const cleanedAddress = stripSecondaryAddress(address);
355+
const cleanedAddress = stripSecondaryAddress(poBoxNormalized.address);
271356

272357
// Step 2: Parse run-on address to extract all components
273358
const parsed = parseRunOnAddress(cleanedAddress);
@@ -280,7 +365,9 @@ export function normalizeAddress(address: string): NormalizedAddress {
280365
street: normalizedStreet,
281366
city: normalizedCity,
282367
state: parsed.state.toUpperCase(), // Ensure state is uppercase abbreviation
283-
zip: parsed.zip
368+
zip: parsed.zip,
369+
isPOBox: isPOBox,
370+
boxNumber: boxNumber
284371
};
285372
}
286373

@@ -295,6 +382,23 @@ export function normalizeAddressString(address: string): string {
295382
return normalized.street;
296383
}
297384

385+
/**
 * Render an address as a single display line.
 *
 * Street, city, state and ZIP are emitted in that order, separated by ", ".
 * Empty components are left out, so no dangling separators appear.
 *
 * @param address - Normalized address object
 * @returns Comma-separated address string (empty when every component is empty)
 */
export function formatAddress(address: NormalizedAddress): string {
  const { street, city, state, zip } = address;

  return [street, city, state, zip]
    .filter(component => Boolean(component))
    .join(', ');
}
401+
298402
/**
299403
* Parse location string into city and state
300404
* Handles formats:

0 commit comments

Comments
 (0)