Skip to content

Commit 277439c

Browse files
committed
Fix issue #1: Download US states data with Python and save as JSON.
1 parent 1f8b11e commit 277439c

5 files changed

Lines changed: 96 additions & 86 deletions

File tree

.gitignore

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,8 @@ data/*.json
44
data/*.txt
55
data/*.zip
66

7+
geonamescache/data/us_states.json
8+
79

810
# Byte-compiled / optimized / DLL files
911
__pycache__/

bin/us_states.js

Lines changed: 0 additions & 14 deletions
This file was deleted.

bin/us_states.py

Lines changed: 87 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,87 @@
1+
#!/usr/bin/env python
2+
from pathlib import Path
3+
import json
4+
import re
5+
6+
from bs4 import BeautifulSoup
7+
import httpx
8+
9+
10+
def extract_us_states_data():
    """Download the GeoNames US divisions page and extract state records.

    Fetches the GeoNames "administrative division" page for the United
    States, locates the table with id ``subdivtable1`` and reads one record
    per table row from the spans whose ids contain ``isoSpan``, ``fipsSpan``
    and ``nameSpan``.

    Returns:
        dict: Maps each state code to a dict with the keys ``code``,
        ``name``, ``fips`` and ``geonameid`` (an ``int``). An empty dict is
        returned when the download fails or the table is not found; a
        message is printed in both cases.
    """
    url = 'http://www.geonames.org/US/administrative-division-united-states.html'

    try:
        # httpx does not follow redirects by default, and raise_for_status()
        # treats a 3xx answer as an error, so follow them explicitly.
        response = httpx.get(url, follow_redirects=True)
        response.raise_for_status()
    except httpx.HTTPError as e:
        # HTTPError is the common base of RequestError (transport problems)
        # and HTTPStatusError (raised by raise_for_status()). Catching only
        # RequestError would let bad status codes escape as a traceback.
        print(f'Error downloading the page: {e}')
        return {}

    soup = BeautifulSoup(response.text, 'html.parser')

    table = soup.find('table', id='subdivtable1')
    if table is None:
        print("Table with id 'subdivtable1' not found")
        return {}

    # Compile the id filters once instead of once per table row.
    iso_id = re.compile(r'.*isoSpan.*')
    fips_id = re.compile(r'.*fipsSpan.*')
    name_id = re.compile(r'.*nameSpan.*')

    states_data = {}

    for row in table.find_all('tr'):
        # Rows without a non-empty ISO code span carry no state record.
        iso_span = row.find('span', id=iso_id)
        code = iso_span.get_text(strip=True) if iso_span else ''
        if not code:
            continue

        # The FIPS code is optional and falls back to an empty string.
        fips_span = row.find('span', id=fips_id)
        fips = fips_span.get_text(strip=True) if fips_span else ''

        # The state name and the geoname ID both come from the link that
        # sits inside the name span.
        name_span = row.find('span', id=name_id)
        if not name_span:
            continue
        link = name_span.find('a')
        if not link or not link.get('href'):
            continue

        # The geoname ID is taken from the fourth '/'-separated piece of the
        # href, e.g. "/path/path/ID/name" -> "ID".
        href_parts = link.get('href').split('/')
        if len(href_parts) <= 3:
            continue
        try:
            geoname_id = int(href_parts[3])
        except ValueError:
            # Skip rows whose href does not carry a numeric ID there.
            continue

        # Same structure the former JavaScript exporter produced.
        states_data[code] = {
            'code': code,
            'name': link.get_text(strip=True),
            'fips': fips,
            'geonameid': geoname_id,
        }

    return states_data
79+
80+
81+
if __name__ == '__main__':
    # Script entry point: scrape the state records and store them as JSON
    # in the package data directory (path is relative to the working dir).
    states_data = extract_us_states_data()
    if not states_data:
        print('No data extracted. Please check the URL and HTML structure.')
    else:
        print(f'Total states extracted: {len(states_data)}')
        serialized = json.dumps(states_data)
        output_file = Path('geonamescache/data/us_states.json')
        output_file.write_text(serialized)

geonamescache/__init__.py

Lines changed: 7 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
__title__ = 'geonamescache'
2-
__version__ = '2.0.0'
2+
__version__ = '3.0.0'
33
__author__ = 'Ramiro Gómez'
44
__license__ = 'MIT'
55

@@ -9,7 +9,6 @@
99
from collections.abc import Mapping
1010
from typing import Any, ClassVar, TypeVar
1111

12-
from geonamescache import geonamesdata
1312
from geonamescache.types import (
1413
City,
1514
CitySearchAttribute,
@@ -28,13 +27,13 @@
2827

2928

3029
class GeonamesCache:
31-
us_states: dict[USStateCode, USState] = geonamesdata.us_states
3230
continents: dict[ContinentCode, Continent] | None = None
3331
countries: dict[ISOStr, Country] | None = None
3432
cities: dict[GeoNameIdStr, City] | None = None
3533
cities_items: list[tuple[GeoNameIdStr, City]] | None = None
3634
cities_by_names: ClassVar[dict[str, list[dict[GeoNameIdStr, City]]]] = {}
3735
us_counties: list[USCounty] | None = None
36+
us_states: dict[USStateCode, USState] | None = None
3837

3938
def __init__(self, min_city_population: int = 15000):
4039
self.min_city_population = min_city_population
@@ -43,17 +42,13 @@ def get_dataset_by_key(self, dataset: dict[Any, TDict], key: str) -> dict[Any, T
4342
return {d[key]: d for c, d in list(dataset.items())}
4443

4544
def get_continents(self) -> dict[ContinentCode, Continent]:
46-
if self.continents is None:
47-
self.continents = self._load_data(self.continents, 'continents.json')
48-
return self.continents
45+
return self._load_data(self.continents, 'continents.json')
4946

5047
def get_countries(self) -> dict[ISOStr, Country]:
51-
if self.countries is None:
52-
self.countries = self._load_data(self.countries, 'countries.json')
53-
return self.countries
48+
return self._load_data(self.countries, 'countries.json')
5449

5550
def get_us_states(self) -> dict[USStateCode, USState]:
56-
return self.us_states
51+
return self._load_data(self.us_states, 'us_states.json')
5752

5853
def get_countries_by_names(self) -> dict[str, Country]:
5954
return self.get_dataset_by_key(self.get_countries(), 'name')
@@ -63,10 +58,7 @@ def get_us_states_by_names(self) -> dict[USStateName, USState]:
6358

6459
def get_cities(self) -> dict[GeoNameIdStr, City]:
6560
"""Get a dictionary of cities keyed by geonameid."""
66-
67-
if self.cities is None:
68-
self.cities = self._load_data(self.cities, f'cities{self.min_city_population}.json')
69-
return self.cities
61+
return self._load_data(self.cities, f'cities{self.min_city_population}.json')
7062

7163
def get_cities_by_name(self, name: str) -> list[dict[GeoNameIdStr, City]]:
7264
"""Get a list of city dictionaries with the given name.
@@ -81,9 +73,7 @@ def get_cities_by_name(self, name: str) -> list[dict[GeoNameIdStr, City]]:
8173
return self.cities_by_names[name]
8274

8375
def get_us_counties(self):
84-
if self.us_counties is None:
85-
self.us_counties = self._load_data(self.us_counties, 'us_counties.json')
86-
return self.us_counties
76+
return self._load_data(self.us_counties, 'us_counties.json')
8777

8878
def search_cities(
8979
self,

geonamescache/geonamesdata.py

Lines changed: 0 additions & 55 deletions
This file was deleted.

0 commit comments

Comments
 (0)